diff --git a/agent_loop.py b/agent_loop.py index dd8f548..63b9ea3 100644 --- a/agent_loop.py +++ b/agent_loop.py @@ -21,6 +21,7 @@ from modality import CognitiveEvent, CognitiveIntent, ModalityType from pipeline_state import PipelineState from providers import AGENT_TOOLS, InferenceProvider +from schemas.bargein import BargeinContext if TYPE_CHECKING: from channels import BrowserChannel @@ -149,6 +150,9 @@ def __init__( self.draft_queue = DraftQueue() self._speculative_context: list[dict[str, str]] = [] # Context for speculative inference self._human_speaking = False # Whether human is currently speaking + # A2: typed barge-in context prepared before the next turn, consumed by A3 + # for prompt injection. Set by _prepare_bargein_context() on the WS path. + self._pending_bargein: BargeinContext | None = None async def handle_event(self, event: CognitiveEvent) -> None: """Called when a CognitiveEvent arrives from the channel.""" @@ -175,12 +179,46 @@ async def handle_event(self, event: CognitiveEvent) -> None: async def _process(self, event: CognitiveEvent) -> None: """Core: event → provider → tool dispatch.""" - # Context stitching: inject interrupt context from dashboard path - # This closes the barge-in loop — the agent knows what was spoken, - # what was unsaid, and what the user interrupted with. - interrupt_context = self._build_interrupt_context(event.content) - if interrupt_context: - self.conversation.append({"role": "system", "content": interrupt_context}) + # A2: build typed BargeinContext from pipeline_state.last_interrupt (if any) + # and stash on self._pending_bargein. A3 will consume it for prompt injection. + self._prepare_bargein_context(user_text=event.content) + + # MOD3_USE_COGOS_AGENT fork: forward user turn to kernel bus instead of + # calling local provider. Response arrives asynchronously via the + # cogos_agent_bridge → BrowserChannel.broadcast_response_text path. 
+ from cogos_agent_bridge import is_enabled as _cogos_agent_enabled + from cogos_agent_bridge import post_user_message as _post_user_message + + if _cogos_agent_enabled(): + session_id = f"mod3:{self.channel_id or 'unknown'}" + # Fold any pending barge-in context into the forwarded text so the + # kernel cycle sees it. A full structured payload will come in a + # later iteration; for v1 we prepend the terse prompt renderer. + forwarded_text = event.content + pending = self._pending_bargein + if pending is not None: + self._pending_bargein = None + forwarded_text = ( + "[interrupted earlier] " + + pending.format_for_prompt() + + "\n" + + forwarded_text + ) + ok = await _post_user_message(forwarded_text, session_id=session_id) + if not ok and self._channel_ref: + try: + await self._channel_ref.send_response_text( + "[cogos-agent unreachable — check kernel]" + ) + await self._channel_ref.send_response_complete( + metrics={"provider": "cogos-agent", "error": "unreachable"} + ) + except Exception: + pass + # Track the user turn in history so subsequent turns carry it. + self.conversation.append({"role": "user", "content": event.content}) + self._trim_history() + return self.conversation.append({"role": "user", "content": event.content}) self._trim_history() @@ -190,6 +228,7 @@ async def _process(self, event: CognitiveEvent) -> None: # Assemble system prompt with kernel context (afferent path) kernel_ctx = _fetch_kernel_context() system_prompt = _BASE_SYSTEM_PROMPT + kernel_ctx + system_prompt = self._inject_pending_bargein(system_prompt) response = await self.provider.chat( messages=self.conversation, @@ -510,46 +549,51 @@ async def background_validate_drafts(self, latest_user_text: str) -> None: await self._push_draft_queue_state() - def _build_interrupt_context(self, user_text: str) -> str | None: - """Build context stitch from pipeline_state.last_interrupt. 
- - When the user barged in during TTS playback, captures what was - spoken vs unspoken and injects it as structured context for the - next inference call. Consumes the interrupt (clears it). + def _prepare_bargein_context(self, user_text: str | None) -> None: + """Read pipeline_state.last_interrupt and stash a typed BargeinContext. - Returns a context string, or None if no interrupt occurred. + Called at the top of each WS turn. If the previous assistant reply was + interrupted (and the interrupt is still fresh, < 30s), build a + BargeinContext via the A1 schema and store it on ``self._pending_bargein`` + for A3 to pick up during prompt construction. Clears last_interrupt so + the next turn does not re-consume a stale record. """ info = self.pipeline_state.last_interrupt if info is None: - return None + self._pending_bargein = None + return # Only use recent interrupts (within last 30 seconds) if time.time() - info.timestamp > 30: - return None + # Stale — clear and skip. + with self.pipeline_state._lock: + self.pipeline_state._last_interrupt = None + self._pending_bargein = None + return - # Clear the interrupt so we don't re-inject it + # Consume the interrupt so we don't re-inject it on subsequent turns. + # pipeline_state has no public consume helper yet; clear the private + # slot under its lock (matches the pre-existing pattern on this path). 
with self.pipeline_state._lock: self.pipeline_state._last_interrupt = None - # Compute unspoken remainder - unspoken = "" - if info.full_text and info.delivered_text: - if info.full_text.startswith(info.delivered_text): - unspoken = info.full_text[len(info.delivered_text) :].strip() - else: - # Fallback: everything after the delivered percentage - unspoken = info.full_text[len(info.delivered_text) :].strip() + self._pending_bargein = BargeinContext.from_interrupt_info( + info, + source="browser_vad", + user_said=user_text or None, + ) - parts = [] - parts.append("[Barge-in context — your previous response was interrupted]") - parts.append(f'spoken (user heard this): "{info.delivered_text}"') - if unspoken: - parts.append(f'unspoken (user did NOT hear this): "{unspoken}"') - parts.append(f"interrupted_at: {info.spoken_pct * 100:.0f}%") - parts.append(f'user_said: "{user_text}"') - parts.append("Acknowledge what was interrupted and respond to the user's new input.") - - return "\n".join(parts) + def _inject_pending_bargein(self, system_prompt: str) -> str: + """Append the pending BargeinContext (if any) to the system prompt. + + Consumes ``self._pending_bargein`` so it does not leak into subsequent + turns. Returns the prompt unchanged if no barge-in is pending. + """ + pending = self._pending_bargein + if pending is None: + return system_prompt + self._pending_bargein = None + return system_prompt + "\n\n" + pending.format_for_prompt() def _trim_history(self) -> None: """Keep conversation within MAX_HISTORY messages.""" diff --git a/bargein/__init__.py b/bargein/__init__.py new file mode 100644 index 0000000..068e18f --- /dev/null +++ b/bargein/__init__.py @@ -0,0 +1,296 @@ +"""Barge-in subsystem. + +This package owns the first-class barge-in primitive inside mod3. Sources +(SuperWhisper, browser VAD, MCP signals, etc.) register as +``BargeinProvider`` instances; each one emits ``BargeinEvent``s through a +callback. 
The registry below wires those callbacks into the shared consumer +helper ``handle_bargein_event``, which does the same work the legacy +``/tmp/mod3-barge-in.json`` file watcher in ``server.py`` does today: +interrupt in-progress playback via ``pipeline_state.interrupt()`` and log. + +Env-driven config: + MOD3_BARGEIN_PROVIDERS — comma-separated provider names (default: empty). + Example: ``MOD3_BARGEIN_PROVIDERS=superwhisper`` + +Default is empty so users without SuperWhisper installed see no behavior +change from the current setup — they can still run the standalone +``integrations/bargein-producer.py`` script and the legacy file watcher +in ``server.py`` keeps picking up its signals. +""" + +from __future__ import annotations + +import json +import logging +import os +import threading +from typing import Callable + +from pipeline_state import InterruptInfo, PipelineState +from schemas.bargein import BargeinSource + +from .providers.base import BargeinCallback, BargeinEvent, BargeinEventType, BargeinProvider + +log = logging.getLogger("bargein") + +# --------------------------------------------------------------------------- +# Shared consumer helper +# --------------------------------------------------------------------------- +# +# Both the legacy file watcher in server.py and the new provider registry +# call this when a "user is speaking" signal arrives. It is the single +# authoritative "barge-in start" handler. +# +# Returning the InterruptInfo (or None) lets the file watcher continue its +# extra work of writing the interrupt detail back into the signal file — +# cross-process coordination that only matters for the file-based IPC. +# In-process providers ignore the return. + + +def handle_bargein_start( + pipeline_state: PipelineState, + source: str, + metadata: dict | None = None, +) -> InterruptInfo | None: + """Attempt to interrupt in-progress TTS playback because the user began speaking. 
+ + Returns the ``InterruptInfo`` if playback was actually halted, or ``None`` + if nothing was speaking (or another process owns the speech — only the + file watcher can handle that via the cross-process lock). + """ + if not pipeline_state.is_speaking: + return None + info = pipeline_state.interrupt(reason="barge_in") + if info is not None: + log.info( + "Barge-in from %s: paused local playback (%.0f%% delivered)%s", + source, + info.spoken_pct * 100, + f" meta={metadata}" if metadata else "", + ) + return info + + +# --------------------------------------------------------------------------- +# Provider registry +# --------------------------------------------------------------------------- + + +PROVIDER_NAMES = ["superwhisper"] + + +def _build_provider(name: str, on_event: BargeinCallback) -> BargeinProvider | None: + """Instantiate a provider by name. Returns None if unknown or import fails.""" + name = name.strip().lower() + if not name: + return None + if name == "superwhisper": + from .providers.superwhisper import SuperWhisperProvider + + return SuperWhisperProvider(on_event=on_event) + log.warning("Unknown barge-in provider: %r (known: %s)", name, PROVIDER_NAMES) + return None + + +class BargeinRegistry: + """Owns the set of active barge-in providers and routes their events. + + Use: + registry = BargeinRegistry(pipeline_state) + registry.start_from_env() # or registry.register(SomeProvider(...)) + # ... later, on shutdown: + registry.stop_all() + + Tests can install their own dispatch by passing ``on_event`` to + ``register``; registry-level dispatch goes through ``_dispatch`` which + calls both ``handle_bargein_start`` and any extra subscribers. 
+ """ + + def __init__(self, pipeline_state: PipelineState): + self._pipeline_state = pipeline_state + self._providers: list[BargeinProvider] = [] + self._subscribers: list[Callable[[BargeinEvent], None]] = [] + self._lock = threading.Lock() + + # ------------------------------------------------------------------ + # Registration + # ------------------------------------------------------------------ + + def register(self, provider: BargeinProvider) -> None: + """Register a pre-built provider. Does NOT start it (see ``start_all``).""" + with self._lock: + self._providers.append(provider) + + def subscribe(self, callback: Callable[[BargeinEvent], None]) -> None: + """Register an additional event subscriber (fires after the consumer helper). + + Useful for tests and for future observers (metrics, bus emits, etc.). + """ + with self._lock: + self._subscribers.append(callback) + + def unsubscribe(self, callback: Callable[[BargeinEvent], None]) -> None: + """Remove a previously-registered subscriber. Idempotent.""" + with self._lock: + try: + self._subscribers.remove(callback) + except ValueError: + pass + + # ------------------------------------------------------------------ + # Synchronous wait primitive + # ------------------------------------------------------------------ + + def wait_for_event( + self, + event_type: BargeinEventType, + source: BargeinSource | None = None, + timeout: float | None = None, + ) -> BargeinEvent | None: + """Block until a matching event is dispatched, or until ``timeout``. + + Returns the matching ``BargeinEvent`` on success, or ``None`` on timeout. + Thread-safe; multiple waiters may run concurrently — each receives the + first matching event emitted after its wait began. + + Example:: + + event = registry.wait_for_event("user_speaking_end", timeout=180) + if event is None: + ... 
# timed out + """ + signal = threading.Event() + captured: list[BargeinEvent] = [] + + def _waiter(event: BargeinEvent) -> None: + if event.event_type != event_type: + return + if source is not None and event.source != source: + return + if signal.is_set(): + return + captured.append(event) + signal.set() + + self.subscribe(_waiter) + try: + if signal.wait(timeout): + return captured[0] + return None + finally: + self.unsubscribe(_waiter) + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def start_all(self) -> None: + """Start every registered provider.""" + with self._lock: + providers = list(self._providers) + for p in providers: + p.start() + + def stop_all(self, timeout: float = 2.0) -> None: + """Signal shutdown and (best-effort) join every provider thread.""" + with self._lock: + providers = list(self._providers) + for p in providers: + p.stop(timeout=timeout) + + def start_from_env(self, env_var: str = "MOD3_BARGEIN_PROVIDERS") -> list[str]: + """Instantiate and start providers listed in the env var. Returns started names. + + Providers already present on the registry are kept; we append whatever + the env var asks for that isn't already there. + """ + raw = os.environ.get(env_var, "").strip() + if not raw: + log.info("No barge-in providers configured (set %s=superwhisper to enable)", env_var) + return [] + + requested = [n.strip().lower() for n in raw.split(",") if n.strip()] + already = {type(p).__name__.lower() for p in self._providers} + started: list[str] = [] + for name in requested: + # Match by normalized class name (SuperWhisperProvider -> "superwhisperprovider") + # or the logical name the factory accepts. 
+ if f"{name}provider" in already: + continue + provider = _build_provider(name, self._dispatch) + if provider is None: + continue + self.register(provider) + provider.start() + started.append(name) + log.info("Barge-in providers started: %s", started) + return started + + # ------------------------------------------------------------------ + # Internals + # ------------------------------------------------------------------ + + def _dispatch(self, event: BargeinEvent) -> None: + """Route a provider event through the shared consumer + any subscribers.""" + try: + if event.event_type == "user_speaking_start": + handle_bargein_start( + self._pipeline_state, + source=event.source, + metadata=event.metadata, + ) + # user_speaking_end has no in-process consumer today (the legacy + # file watcher also only reacts to "start"). Subscribers still + # see it so future code can use it. + except Exception: + log.exception("consumer helper raised while handling %s", event) + + with self._lock: + subs = list(self._subscribers) + for cb in subs: + try: + cb(event) + except Exception: + log.exception("barge-in subscriber raised") + + +def make_file_mirror_subscriber(signal_path: str) -> Callable[[BargeinEvent], None]: + """Build a registry subscriber that mirrors events into the legacy signal file. + + The legacy ``/tmp/mod3-barge-in.json`` file is consumed by + out-of-process clients (e.g. ``mcp_shim.py``'s ``await_voice_input``) + that cannot subscribe to the in-process registry. Installing this + subscriber lets in-process providers reach those pollers. + + Writes are atomic (tmp + rename). ``OSError`` is swallowed and logged + at debug level — the file mirror is best-effort and must never break + in-process delivery. 
+ """ + + def _mirror(event: BargeinEvent) -> None: + try: + payload = { + "event": event.event_type, + "source": event.source, + "timestamp": event.timestamp.isoformat(), + "via": "bargein_registry", + **event.metadata, + } + tmp = signal_path + ".tmp" + with open(tmp, "w") as f: + json.dump(payload, f) + os.replace(tmp, signal_path) + except OSError: + log.debug("file mirror write failed", exc_info=True) + + return _mirror + + +__all__ = [ + "BargeinEvent", + "BargeinProvider", + "BargeinRegistry", + "handle_bargein_start", + "make_file_mirror_subscriber", + "PROVIDER_NAMES", +] diff --git a/bargein/providers/__init__.py b/bargein/providers/__init__.py new file mode 100644 index 0000000..664a82b --- /dev/null +++ b/bargein/providers/__init__.py @@ -0,0 +1,14 @@ +"""Barge-in providers. + +Each provider watches a different signal source (SuperWhisper, browser VAD, +hotkey, mic-level VAD, …) and emits ``BargeinEvent`` through a callback. +""" + +from .base import BargeinCallback, BargeinEvent, BargeinEventType, BargeinProvider + +__all__ = [ + "BargeinCallback", + "BargeinEvent", + "BargeinEventType", + "BargeinProvider", +] diff --git a/bargein/providers/base.py b/bargein/providers/base.py new file mode 100644 index 0000000..7c7cd1d --- /dev/null +++ b/bargein/providers/base.py @@ -0,0 +1,132 @@ +"""Barge-in provider base class + event shape. + +A provider watches some external signal source (SuperWhisper recordings, +browser VAD, a push-to-talk hotkey, a mic-level silero VAD, …) and emits +``BargeinEvent``s through an ``on_event`` callback supplied at construction. +The mod3 provider registry wires that callback to the shared consumer helper +(``bargein._handle_bargein_start``), which takes the same action the legacy +``/tmp/mod3-barge-in.json`` file watcher takes today. + +Concurrency: threads. Providers run their own polling loop on a daemon +thread started by ``start()`` and stopped by ``stop()``. This matches the +existing ``_bargein_watcher`` in server.py. 
The SuperWhisper provider's +inner loop does blocking filesystem + sqlite3 reads, so a thread is the +natural fit; an async shape would force every provider to wrap blocking +calls in ``asyncio.to_thread``. +""" + +from __future__ import annotations + +import threading +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from datetime import datetime, timezone +from typing import Callable, Literal + +from schemas.bargein import BargeinSource + +BargeinEventType = Literal["user_speaking_start", "user_speaking_end"] + + +@dataclass +class BargeinEvent: + """A single emission from a ``BargeinProvider``. + + ``metadata`` carries provider-specific detail (folder names, confidence + scores, etc.) that the consumer may log but must not depend on for + correctness. + """ + + source: BargeinSource + event_type: BargeinEventType + timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc)) + metadata: dict = field(default_factory=dict) + + +BargeinCallback = Callable[[BargeinEvent], None] + + +class BargeinProvider(ABC): + """Abstract barge-in provider. + + Subclasses implement ``_run`` as a blocking poll loop. ``start()`` spawns + it on a daemon thread; ``stop()`` sets the stop-event and (best-effort) + joins the thread. + """ + + source: BargeinSource # class-level — subclasses set this + + def __init__(self, on_event: BargeinCallback): + self._on_event = on_event + self._stop = threading.Event() + self._thread: threading.Thread | None = None + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def start(self) -> None: + """Start the provider's background thread. 
Idempotent.""" + if self._thread is not None and self._thread.is_alive(): + return + self._stop.clear() + self._thread = threading.Thread( + target=self._run_guarded, + name=f"bargein-{self.source}", + daemon=True, + ) + self._thread.start() + + def stop(self, timeout: float = 2.0) -> None: + """Signal shutdown and best-effort join the thread.""" + self._stop.set() + thread = self._thread + if thread is not None and thread.is_alive(): + thread.join(timeout=timeout) + self._thread = None + + @property + def is_running(self) -> bool: + return self._thread is not None and self._thread.is_alive() + + # ------------------------------------------------------------------ + # Subclass contract + # ------------------------------------------------------------------ + + @abstractmethod + def _run(self) -> None: + """Provider-specific poll loop. Must return when ``self._stop`` is set.""" + + def _emit( + self, + event_type: BargeinEventType, + metadata: dict | None = None, + ) -> None: + """Emit an event to the registered callback. Swallows callback errors.""" + try: + self._on_event( + BargeinEvent( + source=self.source, + event_type=event_type, + metadata=metadata or {}, + ) + ) + except Exception: + # Provider must not die because the consumer threw. 
+ import logging + + logging.getLogger(f"bargein.{self.source}").exception("barge-in callback raised; continuing") + + # ------------------------------------------------------------------ + # Internal + # ------------------------------------------------------------------ + + def _run_guarded(self) -> None: + """Wrap ``_run`` so an unexpected raise logs instead of vanishing silently.""" + import logging + + log = logging.getLogger(f"bargein.{self.source}") + try: + self._run() + except Exception: + log.exception("provider loop crashed") diff --git a/bargein/providers/superwhisper.py b/bargein/providers/superwhisper.py new file mode 100644 index 0000000..c95c2e3 --- /dev/null +++ b/bargein/providers/superwhisper.py @@ -0,0 +1,265 @@ +"""SuperWhisper barge-in provider. + +Watches the SuperWhisper recordings directory and its SQLite DB for +recording start/end, emitting ``BargeinEvent``s through the registered +callback. This is the in-process replacement for the standalone +``integrations/bargein-producer.py`` script: same detection logic, but +instead of writing ``/tmp/mod3-barge-in.json`` it calls directly into +mod3's barge-in consumer. + +Detection: + * Start: a new empty timestamped folder appears under the recordings dir. + * End (any of): + - ``output.wav`` or ``meta.json`` appears in that folder, OR + - a matching row appears in ``superwhisper.sqlite`` (structural ground + truth — written only after transcription completes), OR + - the folder disappears (cancellation), OR + - the staleness timeout elapses without the above (crash / sleep). 
+ +Environment variables: + SW_RECORDINGS_DIR — override recordings path + BARGEIN_POLL_MS — poll interval in ms (default: 150) +""" + +from __future__ import annotations + +import logging +import os +import time +from pathlib import Path + +from .base import BargeinProvider + +log = logging.getLogger("bargein.superwhisper") + + +class SuperWhisperProvider(BargeinProvider): + """Barge-in provider backed by SuperWhisper's recordings folder + DB.""" + + source = "superwhisper" + + # Default ~/Documents/superwhisper/recordings, overridable via env. + _DEFAULT_REC_DIR = os.path.expanduser("~/Documents/superwhisper/recordings") + # SuperWhisper SQLite DB — secondary "recording finished" signal. + _SW_DB = os.path.expanduser("~/Library/Application Support/SuperWhisper/database/superwhisper.sqlite") + # 2.5 minutes — recordings can legitimately run 60s+; be generous + # before declaring a stuck folder stale. + _STALE_TIMEOUT = 150 + _STARTUP_FRESH_SECS = 30 + + def __init__(self, on_event, recordings_dir: str | None = None, poll_ms: int | None = None): + super().__init__(on_event) + self.recordings_dir = Path(recordings_dir or os.environ.get("SW_RECORDINGS_DIR", self._DEFAULT_REC_DIR)) + poll_ms = poll_ms if poll_ms is not None else int(os.environ.get("BARGEIN_POLL_MS", "150")) + self._poll_interval = poll_ms / 1000.0 + + # Mutable state (touched only from the provider thread) + self._recording = False + self._active_folder: str | None = None + self._known_folders: set[str] = set() + self._last_dir_mtime: float = 0.0 + + # ------------------------------------------------------------------ + # State transitions (emit events through the callback) + # ------------------------------------------------------------------ + + def _start_recording(self, folder: str) -> None: + if self._recording and self._active_folder == folder: + return + self._recording = True + self._active_folder = folder + log.info("Recording started (folder=%s)", folder) + self._emit("user_speaking_start", 
{"folder": folder}) + + def _end_recording(self, reason: str) -> None: + if not self._recording: + return + folder = self._active_folder + self._recording = False + if folder: + self._known_folders.add(folder) + self._active_folder = None + log.info("Recording finished (folder=%s, reason=%s)", folder, reason) + self._emit("user_speaking_end", {"folder": folder, "reason": reason}) + + # ------------------------------------------------------------------ + # Detection helpers + # ------------------------------------------------------------------ + + @staticmethod + def _is_empty_dir(path: Path) -> bool: + try: + return path.is_dir() and not any(path.iterdir()) + except OSError: + return False + + @staticmethod + def _has_output(path: Path) -> bool: + return (path / "output.wav").exists() or (path / "meta.json").exists() + + @classmethod + def _is_in_db(cls, folder_name: str) -> bool: + """True if SuperWhisper's DB has a ``recording`` row for this folder. + + SuperWhisper writes the row only after transcription completes, so a + hit here is a definitive "recording is done" signal regardless of + filesystem state. + """ + try: + import sqlite3 + + conn = sqlite3.connect(f"file:{cls._SW_DB}?mode=ro", uri=True, timeout=1.0) + cursor = conn.execute( + "SELECT 1 FROM recording WHERE folderName = ? 
LIMIT 1", + (folder_name,), + ) + found = cursor.fetchone() is not None + conn.close() + return found + except Exception: + return False + + def _scan(self) -> None: + """One poll cycle: detect state changes in the recordings dir.""" + rec_dir = self.recordings_dir + + # Fast path: if we're tracking an active recording, check completion signals + if self._recording and self._active_folder: + active_path = rec_dir / self._active_folder + if self._has_output(active_path): + self._end_recording(reason="output_files") + return + if self._is_in_db(self._active_folder): + log.info("DB confirms recording complete (filesystem missed it)") + self._end_recording(reason="db") + return + if not active_path.exists(): + log.warning("Active recording folder disappeared, clearing state") + self._end_recording(reason="folder_gone") + return + # Fall through so we can detect a newer recording superseding this one + + # Stat-then-iterdir: skip the expensive scan if mtime is unchanged + try: + dir_mtime = os.stat(rec_dir).st_mtime + except OSError: + return + if dir_mtime == self._last_dir_mtime: + return + self._last_dir_mtime = dir_mtime + + try: + candidates: list[Path] = [] + for entry in rec_dir.iterdir(): + if entry.is_dir() and entry.name.isdigit() and entry.name not in self._known_folders: + candidates.append(entry) + except OSError: + return + + candidates.sort(key=lambda p: p.name, reverse=True) + for entry in candidates[:5]: + name = entry.name + if self._is_empty_dir(entry): + self._start_recording(name) + return + # Non-empty, previously unseen — completed recording we missed + self._known_folders.add(name) + + def _check_stale(self) -> None: + """Clear stuck recording state if the active folder has been empty too long. + + Before clearing, double-check the DB so legitimately long recordings + aren't thrown away when they finally land. 
+ """ + if not self._recording or not self._active_folder: + return + folder = self.recordings_dir / self._active_folder + try: + ctime = folder.stat().st_birthtime + except (OSError, AttributeError): + return + if time.time() - ctime <= self._STALE_TIMEOUT: + return + + if self._is_in_db(self._active_folder): + log.info("Stale timeout hit but DB confirms completion — ending normally") + self._end_recording(reason="db_after_stale") + elif self._has_output(folder): + log.info("Stale timeout hit but output files present — ending normally") + self._end_recording(reason="output_after_stale") + else: + log.warning( + "Stale recording (>%ds), no DB entry, no output files — clearing as cancelled/crashed", + self._STALE_TIMEOUT, + ) + self._end_recording(reason="stale") + + # ------------------------------------------------------------------ + # Startup scan: handle recordings that existed before we started + # ------------------------------------------------------------------ + + def _startup_scan(self) -> None: + now = time.time() + newest_empty: tuple[str, float] | None = None + try: + for entry in self.recordings_dir.iterdir(): + if not (entry.is_dir() and entry.name.isdigit()): + continue + if self._has_output(entry): + self._known_folders.add(entry.name) + elif self._is_empty_dir(entry): + try: + age = now - entry.stat().st_birthtime + except (OSError, AttributeError): + age = float("inf") + if age < self._STARTUP_FRESH_SECS: + if newest_empty is None or entry.name > newest_empty[0]: + newest_empty = (entry.name, age) + else: + self._known_folders.add(entry.name) + except OSError as e: + log.warning("Startup scan error: %s", e) + + if newest_empty: + log.info("Detected in-progress recording on startup (age=%.1fs)", newest_empty[1]) + self._start_recording(newest_empty[0]) + + # ------------------------------------------------------------------ + # Provider contract + # ------------------------------------------------------------------ + + def _run(self) -> None: + 
rec_dir = self.recordings_dir + if not rec_dir.is_dir(): + log.warning( + "SuperWhisper recordings directory not found: %s (provider inactive)", + rec_dir, + ) + return + + self._startup_scan() + log.info( + "SuperWhisper provider running (poll=%dms, recordings=%s, known=%d)", + self._poll_interval * 1000, + rec_dir, + len(self._known_folders), + ) + + stale_every = max(1, int(2.0 / self._poll_interval)) + stale_counter = 0 + while not self._stop.is_set(): + try: + self._scan() + stale_counter += 1 + if stale_counter >= stale_every: + self._check_stale() + stale_counter = 0 + except Exception: + log.exception("SuperWhisper poll cycle raised; continuing") + # Use Event.wait for responsive shutdown + if self._stop.wait(self._poll_interval): + break + + if self._recording: + # Emit a synthetic end so consumers don't stay in "speaking" forever + self._end_recording(reason="shutdown") diff --git a/bus_bridge.py b/bus_bridge.py new file mode 100644 index 0000000..ad2f014 --- /dev/null +++ b/bus_bridge.py @@ -0,0 +1,305 @@ +"""Kernel-bus SSE subscriber. + +Consumes http://localhost:6931/v1/events/stream and yields parsed bus events. +Reconnects on disconnect with exponential backoff. Tolerates unknown event kinds +per ADR-083 (cycle-trace event contract). + +C3 will consume this to broadcast CycleEvents to dashboard WebSocket clients. + +The kernel (see apps/cogos/bus_stream.go) emits SSE frames of the form: + + data: {"id":"live_*_42","type":"bus.event","timestamp":"...","data":{}}\\n\\n + +Heartbeats arrive as SSE comment lines: + + : keep-alive\\n\\n + +An initial frame of {"type":"connected","bus_id":"*","timestamp":"..."} is +sent on subscribe — we surface that as a BusEnvelope with kind="connected". 
+""" + +from __future__ import annotations + +import asyncio +import json +import logging +import os +from dataclasses import dataclass, field +from typing import Any, AsyncIterator, Optional + +import httpx + +logger = logging.getLogger("mod3.bus_bridge") + +# Path appended to ``COGOS_ENDPOINT`` (or the default below) to form the +# kernel SSE stream URL. +KERNEL_BUS_STREAM_PATH = "/v1/events/stream" + +_DEFAULT_KERNEL_BASE = "http://localhost:6931" + + +def default_stream_url() -> str: + """Build the kernel bus stream URL from ``COGOS_ENDPOINT`` (or the default). + + Resolved at call time, not at import time, so tests and runtime config + can override the env var before the bridge is constructed. + """ + base = os.environ.get("COGOS_ENDPOINT", _DEFAULT_KERNEL_BASE).rstrip("/") + return f"{base}{KERNEL_BUS_STREAM_PATH}" + + +# Back-compat module attribute. New code should call ``default_stream_url()`` +# so that ``COGOS_ENDPOINT`` overrides take effect at runtime. +KERNEL_BUS_STREAM_URL = default_stream_url() + + +@dataclass +class BusEnvelope: + """Raw bus-envelope record as received from the kernel SSE stream. + + `raw` is the full outer JSON (the bus.event envelope). `payload` is the + inner CogBlock dict (envelope["data"]) — may be {} for non-bus.event + frames (e.g. the initial "connected" frame). `kind` is the best-effort + event-kind string: preferring payload["kind"] (ADR-083 CycleEvent), then + payload["type"], then envelope["type"]. Consumers MUST tolerate unknown + kinds. + """ + + raw: dict + kind: str + payload: dict = field(default_factory=dict) + ts: Optional[str] = None + event_id: Optional[str] = None + + +def _extract_kind(envelope: dict, payload: dict) -> str: + for src in (payload, envelope): + for key in ("kind", "type"): + val = src.get(key) if isinstance(src, dict) else None + if isinstance(val, str) and val: + return val + return "unknown" + + +class KernelBusSubscriber: + """Async SSE subscriber for the cogos kernel bus stream. 
+ + Usage:: + + sub = KernelBusSubscriber() + async for env in sub.stream(): + handle(env) + + `stream()` yields indefinitely; on any transport error it reconnects + with exponential backoff clamped to [reconnect_min_s, reconnect_max_s]. + Call `close()` (or cancel the consuming task) to stop. + """ + + def __init__( + self, + url: Optional[str] = None, + *, + bus_filter: str = "*", + consumer_id: Optional[str] = None, + reconnect_min_s: float = 1.0, + reconnect_max_s: float = 30.0, + request_timeout_s: float = 10.0, + ) -> None: + # ``COGOS_ENDPOINT`` is honored at construction time when ``url`` is + # not explicitly provided, so the subscriber tracks whatever endpoint + # the rest of the cogos client code is using. + self._url = url or default_stream_url() + self._bus_filter = bus_filter + self._consumer_id = consumer_id + self._min_backoff = reconnect_min_s + self._max_backoff = reconnect_max_s + self._request_timeout = request_timeout_s + self._last_event_id: Optional[str] = None + self._closed = asyncio.Event() + self._client: Optional[httpx.AsyncClient] = None + + async def close(self) -> None: + self._closed.set() + if self._client is not None: + try: + await self._client.aclose() + except Exception: # pragma: no cover - best-effort + pass + self._client = None + + def _build_params(self) -> dict[str, str]: + params: dict[str, str] = {} + if self._bus_filter and self._bus_filter != "*": + params["bus_id"] = self._bus_filter + if self._consumer_id: + params["consumer"] = self._consumer_id + return params + + def _build_headers(self) -> dict[str, str]: + headers = {"Accept": "text/event-stream", "Cache-Control": "no-cache"} + if self._last_event_id: + # Harmless if the kernel doesn't honor it today; future protocol + # bump may use it for resume. 
+ headers["Last-Event-ID"] = self._last_event_id + return headers + + async def stream(self) -> AsyncIterator[BusEnvelope]: + backoff = self._min_backoff + # Generous read timeout — SSE is long-lived with 30s heartbeats. + timeout = httpx.Timeout(self._request_timeout, read=None) + while not self._closed.is_set(): + self._client = httpx.AsyncClient(timeout=timeout) + try: + async with self._client.stream( + "GET", + self._url, + params=self._build_params(), + headers=self._build_headers(), + ) as resp: + if resp.status_code != 200: + logger.info( + "bus-bridge: non-200 from %s: %s — backing off %.1fs", + self._url, + resp.status_code, + backoff, + ) + await self._sleep_or_close(backoff) + backoff = min(self._max_backoff, max(self._min_backoff, backoff * 2)) + continue + + logger.info("bus-bridge: connected to %s", self._url) + backoff = self._min_backoff # reset on successful connect + + async for envelope in self._iter_sse(resp): + yield envelope + except (httpx.HTTPError, asyncio.TimeoutError, ConnectionError) as e: + logger.info( + "bus-bridge: transport error (%s); reconnecting in %.1fs", + e.__class__.__name__, + backoff, + ) + await self._sleep_or_close(backoff) + backoff = min(self._max_backoff, max(self._min_backoff, backoff * 2)) + except asyncio.CancelledError: + await self.close() + raise + finally: + if self._client is not None: + try: + await self._client.aclose() + except Exception: # pragma: no cover + pass + self._client = None + + async def _sleep_or_close(self, seconds: float) -> None: + try: + await asyncio.wait_for(self._closed.wait(), timeout=seconds) + except asyncio.TimeoutError: + return + + async def _iter_sse(self, resp: httpx.Response) -> AsyncIterator[BusEnvelope]: + """Parse the SSE byte stream into BusEnvelope records. + + Minimal SSE parser: we accumulate field lines into the current event, + dispatch on blank-line boundaries, silently skip comment lines + (`: heartbeat`), and honor `data:`, `event:`, `id:` fields. 
+ """ + event_name: Optional[str] = None + data_lines: list[str] = [] + event_id: Optional[str] = None + + async for raw_line in resp.aiter_lines(): + if self._closed.is_set(): + return + # httpx strips the trailing \n but preserves empty lines. + if raw_line == "": + # Dispatch boundary. + if data_lines: + env = self._parse_event(event_name, "\n".join(data_lines), event_id) + if env is not None: + yield env + event_name = None + data_lines = [] + event_id = None + continue + if raw_line.startswith(":"): + # Comment line / heartbeat. + continue + field, _, value = raw_line.partition(":") + if value.startswith(" "): + value = value[1:] + if field == "data": + data_lines.append(value) + elif field == "event": + event_name = value + elif field == "id": + event_id = value + self._last_event_id = value + # retry / unknown fields: ignore + + def _parse_event(self, event_name: Optional[str], data: str, event_id: Optional[str]) -> Optional[BusEnvelope]: + try: + envelope: Any = json.loads(data) + except json.JSONDecodeError: + logger.debug("bus-bridge: non-JSON data frame dropped: %r", data[:200]) + return None + if not isinstance(envelope, dict): + logger.debug("bus-bridge: non-object data frame dropped: %r", envelope) + return None + + inner = envelope.get("data") + payload: dict = inner if isinstance(inner, dict) else {} + kind = _extract_kind(envelope, payload) + ts = envelope.get("timestamp") or payload.get("ts") or payload.get("timestamp") + eid = event_id or envelope.get("id") + if eid and not self._last_event_id: + self._last_event_id = eid + + if kind not in ("state_transition", "tool_dispatch", "assessment", "bus.event", "connected"): + # Tolerate unknowns — just log and forward. 
+ logger.debug("bus-bridge: forwarding unknown event kind=%r", kind) + + return BusEnvelope( + raw=envelope, + kind=kind, + payload=payload, + ts=ts if isinstance(ts, str) else None, + event_id=eid if isinstance(eid, str) else None, + ) + + +# --------------------------------------------------------------------------- +# Manual validation entry point +# --------------------------------------------------------------------------- + + +async def _main() -> None: + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s %(name)s: %(message)s", + ) + sub = KernelBusSubscriber() + print(f"bus-bridge: subscribing to {sub._url} (Ctrl-C to stop)") + try: + async for env in sub.stream(): + print( + json.dumps( + { + "kind": env.kind, + "ts": env.ts, + "id": env.event_id, + "payload_keys": sorted(env.payload.keys())[:12], + } + ) + ) + except (KeyboardInterrupt, asyncio.CancelledError): + pass + finally: + await sub.close() + + +if __name__ == "__main__": + try: + asyncio.run(_main()) + except KeyboardInterrupt: + pass diff --git a/bus_bridge_runner.py b/bus_bridge_runner.py new file mode 100644 index 0000000..aa788c4 --- /dev/null +++ b/bus_bridge_runner.py @@ -0,0 +1,154 @@ +"""Kernel-bus → dashboard bridge runner. + +Consumes `KernelBusSubscriber.stream()` (see `bus_bridge.py`) and fans the +ADR-083 cycle-trace events out to every connected dashboard WebSocket via +`BrowserChannel.broadcast_trace_event()` (see `channels.py`). + +Wiring: + + kernel (bus_cycle_trace) + └─► SSE /v1/events/stream?bus_id=bus_cycle_trace + └─► KernelBusSubscriber.stream() [C1] + └─► run_bridge() filter + forward + └─► BrowserChannel.broadcast_trace_event() [C2] + +The subscriber does its own reconnect with exponential backoff, so a kernel +that is temporarily unreachable does not affect server startup. Disable the +bridge entirely at process boot by setting env `MOD3_BUS_BRIDGE_DISABLED=1`. 
+""" + +from __future__ import annotations + +import asyncio +import logging +import os +from typing import Optional + +from bus_bridge import KernelBusSubscriber, default_stream_url +from channels import BrowserChannel + +logger = logging.getLogger("mod3.bus_bridge") + +# ADR-083 kinds the dashboard trace panel cares about. Kept as a module-level +# constant so tests and the lifespan wiring share one definition. +ADR083_KINDS: frozenset[str] = frozenset({"state_transition", "tool_dispatch", "assessment"}) + +# Kernel-side bus name (see apps/cogos/trace_emit.go:const traceBusID). +TRACE_BUS_ID = "bus_cycle_trace" + +# Env flag consulted at startup. +DISABLE_ENV = "MOD3_BUS_BRIDGE_DISABLED" + + +def is_disabled() -> bool: + """True when MOD3_BUS_BRIDGE_DISABLED is set to a truthy value.""" + v = os.environ.get(DISABLE_ENV, "").strip().lower() + return v in ("1", "true", "yes", "on") + + +async def run_bridge( + subscriber: KernelBusSubscriber, + *, + filter_kinds: Optional[set[str]] = None, +) -> None: + """Consume `subscriber` and broadcast cycle-trace events to dashboard clients. + + `filter_kinds`: + - `None`: forward everything (dev mode — useful when inspecting the raw + stream through a dashboard). + - set of kind strings: only forward envelopes whose `BusEnvelope.kind` + is in the set. Unknown kinds are tolerated per ADR-083 — they simply + won't pass this filter. + + `BrowserChannel.broadcast_trace_event()` is thread-safe and non-blocking: + it dispatches each WS send via `run_coroutine_threadsafe`. We call it + directly (no await). + """ + first_event_logged = False + forwarded = 0 + async for env in subscriber.stream(): + if filter_kinds is not None and env.kind not in filter_kinds: + continue + # The "connected" bootstrap frame has an empty payload; skip silently. 
+ if env.kind == "connected": + continue + if not first_event_logged: + logger.info( + "bridge: first event forwarded kind=%s event_id=%s", + env.kind, + env.event_id, + ) + first_event_logged = True + try: + BrowserChannel.broadcast_trace_event(env.payload) + forwarded += 1 + logger.debug( + "bridge: forwarded kind=%s event_id=%s (total=%d)", + env.kind, + env.event_id, + forwarded, + ) + except Exception as exc: # noqa: BLE001 — broadcaster is best-effort + logger.debug("bridge: broadcast failed: %s", exc) + + +async def start_bridge( + app_state: object, + *, + url: Optional[str] = None, + bus_filter: str = TRACE_BUS_ID, + filter_kinds: Optional[set[str]] = frozenset(ADR083_KINDS), +) -> None: + """Construct the subscriber + bridge task and store them on `app_state`. + + Startup is non-blocking: we don't await the task or probe the kernel. + The subscriber's own backoff loop handles reconnects. Logs a disabled + notice and returns cleanly when `MOD3_BUS_BRIDGE_DISABLED` is set. + + ``url`` defaults to ``COGOS_ENDPOINT`` (resolved at call time) so the + subscriber tracks whatever endpoint the rest of the cogos client code is + using. 
+ """ + if is_disabled(): + logger.info("bridge: disabled via %s=1", DISABLE_ENV) + setattr(app_state, "bus_bridge_subscriber", None) + setattr(app_state, "bus_bridge_task", None) + return + + resolved_url = url or default_stream_url() + subscriber = KernelBusSubscriber(url=resolved_url, bus_filter=bus_filter, consumer_id="mod3-dashboard") + task = asyncio.create_task( + run_bridge(subscriber, filter_kinds=set(filter_kinds) if filter_kinds else None), + name="mod3-bus-bridge", + ) + setattr(app_state, "bus_bridge_subscriber", subscriber) + setattr(app_state, "bus_bridge_task", task) + logger.info( + "bridge: started, target=%s bus_id=%s filter=%s", + resolved_url, + bus_filter, + sorted(filter_kinds) if filter_kinds else "*", + ) + + +async def stop_bridge(app_state: object, *, timeout_s: float = 2.0) -> None: + """Gracefully stop the bridge: close subscriber, await task, cancel on timeout.""" + subscriber: Optional[KernelBusSubscriber] = getattr(app_state, "bus_bridge_subscriber", None) + task: Optional[asyncio.Task] = getattr(app_state, "bus_bridge_task", None) + if subscriber is None and task is None: + return + if subscriber is not None: + try: + await subscriber.close() + except Exception: # pragma: no cover - best-effort + pass + if task is not None: + try: + await asyncio.wait_for(task, timeout=timeout_s) + except (asyncio.TimeoutError, asyncio.CancelledError): + task.cancel() + try: + await task + except (asyncio.CancelledError, Exception): # pragma: no cover + pass + logger.info("bridge: stopped") diff --git a/channels.py b/channels.py index c026e64..702dbc8 100644 --- a/channels.py +++ b/channels.py @@ -8,6 +8,12 @@ T1 (Whisper Base, ~31ms): per-chunk during speech T2 (Whisper Large, ~470ms): on natural pause T3 (Whisper Large, ~470ms): on end-of-utterance (final) + +Server→client WebSocket message types: + audio, response_text, response_complete, interrupted, + partial_transcript, transcript, + trace_event — kernel cycle-trace events (ADR-083), fanned 
out via + BrowserChannel.broadcast_trace_event(). """ from __future__ import annotations @@ -33,6 +39,12 @@ class BrowserChannel: """WebSocket-backed channel for the browser dashboard.""" + # Registry of currently-active dashboard channels. Used by + # broadcast_trace_event() to fan kernel cycle-trace events out to every + # connected dashboard client (see ADR-083). Populated in __init__, + # pruned in _cleanup. + _active_channels: "set[BrowserChannel]" = set() + def __init__( self, ws: WebSocket, @@ -71,6 +83,7 @@ def __init__( modalities=[ModalityType.VOICE, ModalityType.TEXT], deliver=self._deliver_sync, ) + BrowserChannel._active_channels.add(self) logger.info("BrowserChannel registered: %s", self.channel_id) # ------------------------------------------------------------------ @@ -424,6 +437,62 @@ async def send_response_complete(self, metrics: dict | None = None) -> None: except Exception: self._active = False + # ------------------------------------------------------------------ + # Trace event broadcast (kernel cycle-trace → dashboards) + # ------------------------------------------------------------------ + + @classmethod + def broadcast_trace_event(cls, event: dict) -> None: + """Fan a kernel cycle-trace event out to every connected dashboard. + + Per ADR-083, `event` is a pre-parsed CycleEvent dict + (id, ts, source, cycle_id, kind, payload). Wrapped in the + `{"type": "trace_event", "event": ...}` envelope and sent to each + active BrowserChannel's WebSocket. Clients whose send fails are + skipped silently (they will be pruned by their own disconnect path). 
+ """ + frame = {"type": "trace_event", "event": event} + for ch in list(cls._active_channels): + if not ch._active: + continue + try: + asyncio.run_coroutine_threadsafe(ch.ws.send_json(frame), ch._loop) + except Exception as exc: # noqa: BLE001 — disconnected clients are expected + logger.debug("trace_event send failed for %s: %s", ch.channel_id, exc) + + @classmethod + def broadcast_response_text(cls, text: str, session_id: str | None = None) -> None: + """Push an agent-reply text frame to dashboard WebSocket clients. + + Used by the MOD3_USE_COGOS_AGENT response bridge (see + `cogos_agent_bridge.run_response_bridge`). The frame matches the + existing text-response shape emitted by `_deliver_async` and + `send_response_text`: `{"type": "response_text", "text": }`. + + If `session_id` is None (default) the frame is broadcast to every + active dashboard channel. When provided, only channels whose + `channel_id` matches the `mod3:` convention from + `cogos_agent_bridge.post_user_message` receive the frame — this is + how future multi-user routing will land, but for v1 a None + broadcast is the common case (only one dashboard attached). + + Thread-safe: dispatches each WS send via `run_coroutine_threadsafe` + on the channel's own loop, matching `broadcast_trace_event`. 
+ """ + frame = {"type": "response_text", "text": text} + expected_channel = None + if session_id and session_id.startswith("mod3:"): + expected_channel = session_id[len("mod3:"):] + for ch in list(cls._active_channels): + if not ch._active: + continue + if expected_channel and ch.channel_id != expected_channel: + continue + try: + asyncio.run_coroutine_threadsafe(ch.ws.send_json(frame), ch._loop) + except Exception as exc: # noqa: BLE001 — disconnected clients are expected + logger.debug("response_text send failed for %s: %s", ch.channel_id, exc) + # ------------------------------------------------------------------ # Cleanup # ------------------------------------------------------------------ @@ -431,6 +500,7 @@ async def send_response_complete(self, metrics: dict | None = None) -> None: def _cleanup(self) -> None: """Deactivate channel and cancel pending TTS jobs on disconnect.""" self._active = False + BrowserChannel._active_channels.discard(self) ch = self.bus._channels.get(self.channel_id) if ch: ch.active = False diff --git a/cogos_agent_bridge.py b/cogos_agent_bridge.py new file mode 100644 index 0000000..c8616f2 --- /dev/null +++ b/cogos_agent_bridge.py @@ -0,0 +1,292 @@ +"""CogOS kernel agent bridge (MOD3_USE_COGOS_AGENT=1). + +When the env flag is set, Mod³'s agent loop forwards user turns to the +cogos kernel's metabolic cycle instead of the local inference provider: + + browser → WS turn → post_user_message() ─POST /v1/bus/send─► kernel + │ + ▼ + bus_dashboard_chat + │ + ▼ + kernel cycle → `respond` tool + │ + ▼ + bus_dashboard_response + │ + SSE /v1/events/stream + │ + ▼ + KernelBusSubscriber.stream() + │ + ▼ + run_response_bridge() + │ + ▼ + BrowserChannel.broadcast_response_text() + +The subscriber does its own reconnect with exponential backoff (see +`bus_bridge.py`). Disable the whole fork by leaving `MOD3_USE_COGOS_AGENT` +unset (default). 
+ +Note: the kernel's `POST /v1/bus/send` takes a flat `{bus_id, from, to, +message, type}` body — the inner JSON event is serialised into `message` +(matches the pattern used by other cogos producers). +""" + +from __future__ import annotations + +import asyncio +import json +import logging +import os +from datetime import datetime, timezone +from typing import Optional + +import httpx + +from bus_bridge import KernelBusSubscriber, default_stream_url +from channels import BrowserChannel + +logger = logging.getLogger("mod3.cogos_agent") + +# Bus names — contract with the kernel side (see ADR / c-agent subagent). +CHAT_BUS_ID = "bus_dashboard_chat" +RESPONSE_BUS_ID = "bus_dashboard_response" + + +def _kernel_base() -> str: + """Resolve the kernel base URL from ``COGOS_ENDPOINT`` at call time.""" + return os.environ.get("COGOS_ENDPOINT", "http://localhost:6931").rstrip("/") + + +def _bus_send_url() -> str: + """Build the kernel bus-send URL from the current ``COGOS_ENDPOINT``.""" + return f"{_kernel_base()}/v1/bus/send" + + +# Back-compat module attribute. Use ``_bus_send_url()`` for runtime resolution. +BUS_SEND_URL = _bus_send_url() + +# Env gate. +ENABLE_ENV = "MOD3_USE_COGOS_AGENT" + +_POST_TIMEOUT_S = 5.0 + + +def is_enabled() -> bool: + """True when MOD3_USE_COGOS_AGENT is set to a truthy value.""" + v = os.environ.get(ENABLE_ENV, "").strip().lower() + return v in ("1", "true", "yes", "on") + + +def _now_rfc3339() -> str: + return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + +async def post_user_message(text: str, session_id: str) -> bool: + """POST a user turn to the kernel's `bus_dashboard_chat` bus. + + Returns True if the send succeeded (kernel replied 2xx), False otherwise. + Logs at warning-level on failure but never raises — callers use graceful + degradation (e.g. show an error response frame to the dashboard). 
+ + The kernel's handleBusSend (see apps/cogos/bus_api.go) accepts + `{bus_id, from, to, message, type}` — we JSON-encode the full event dict + into `message` so the kernel's cycle receives the structured payload. + """ + event = { + "type": "user_message", + "text": text, + "session_id": session_id, + "ts": _now_rfc3339(), + } + body = { + "bus_id": CHAT_BUS_ID, + "from": "mod3-dashboard", + "type": "user_message", + "message": json.dumps(event, separators=(",", ":")), + } + url = _bus_send_url() + try: + async with httpx.AsyncClient(timeout=_POST_TIMEOUT_S) as client: + resp = await client.post(url, json=body) + except httpx.HTTPError as exc: + logger.warning("cogos-agent: post to %s failed: %s", url, exc) + return False + if resp.status_code // 100 != 2: + logger.warning( + "cogos-agent: post non-2xx: %d body=%r", + resp.status_code, + resp.text[:200], + ) + return False + logger.info( + "cogos-agent: forwarded user turn to kernel bus (session=%s)", + session_id, + ) + return True + + +def _extract_session_id(payload: dict) -> Optional[str]: + """Extract the ``session_id`` from a kernel reply payload, if present. + + Mirrors :func:`_extract_response_text`: checks the top-level shape and + the JSON-encoded ``content`` wrapper that ``handleBusSend`` produces. + Returns ``None`` for older kernels that don't include a session id, or + for non-session-scoped events. + + The downstream :meth:`BrowserChannel.broadcast_response_text` falls + back to broadcasting when ``session_id`` is ``None``, preserving the + backward-compat behavior. 
+ """ + if not isinstance(payload, dict): + return None + top = payload.get("session_id") + if isinstance(top, str) and top: + return top + content = payload.get("content") + if isinstance(content, str) and content: + try: + inner = json.loads(content) + except (TypeError, ValueError): + return None + if isinstance(inner, dict): + sid = inner.get("session_id") + if isinstance(sid, str) and sid: + return sid + return None + + +def _extract_response_text(payload: dict) -> Optional[str]: + """Dig the assistant reply out of the bus event payload. + + Kernel's `handleBusSend` wraps the sent `message` string inside a + `{"content": ""}` map. On SSE delivery, the envelope's `data` + field is that map. We look first for structured keys (`text`, direct + agent_response shape), then fall through to parsing `content` as JSON. + """ + if not isinstance(payload, dict): + return None + # Direct shape (if an upstream producer wrote the event dict at the top level). + for key in ("text", "reply", "response"): + val = payload.get(key) + if isinstance(val, str) and val: + return val + # Standard bus envelope: payload = {"content": ""} + content = payload.get("content") + if isinstance(content, str) and content: + try: + inner = json.loads(content) + except (TypeError, ValueError): + # Free-form string — treat the whole thing as the reply. + return content + if isinstance(inner, dict): + for key in ("text", "reply", "response"): + val = inner.get(key) + if isinstance(val, str) and val: + return val + elif isinstance(inner, str) and inner: + return inner + return None + + +async def run_response_bridge(subscriber: KernelBusSubscriber) -> None: + """Consume `subscriber` and broadcast agent replies to dashboard clients. + + `BrowserChannel.broadcast_response_text()` is thread-safe via + `run_coroutine_threadsafe`, matching the existing trace-event pattern. + Malformed events (no recoverable text) are logged at debug and skipped. 
+ """ + first_event_logged = False + forwarded = 0 + async for env in subscriber.stream(): + if env.kind == "connected": + continue + text = _extract_response_text(env.payload) + if not text: + logger.debug( + "cogos-agent: skip event with no text kind=%s id=%s", + env.kind, + env.event_id, + ) + continue + if not first_event_logged: + logger.info( + "cogos-agent: first response forwarded kind=%s event_id=%s", + env.kind, + env.event_id, + ) + first_event_logged = True + session_id = _extract_session_id(env.payload) + try: + BrowserChannel.broadcast_response_text(text, session_id=session_id) + forwarded += 1 + logger.debug( + "cogos-agent: forwarded response event_id=%s session=%s (total=%d)", + env.event_id, + session_id, + forwarded, + ) + except Exception as exc: # noqa: BLE001 — best-effort fan-out + logger.debug("cogos-agent: broadcast failed: %s", exc) + + +async def start_response_bridge( + app_state: object, + *, + url: Optional[str] = None, +) -> None: + """Construct the response subscriber + bridge task and store on `app_state`. + + No-op (logs once) when `MOD3_USE_COGOS_AGENT` is unset. + + ``url`` defaults to ``COGOS_ENDPOINT`` (resolved at call time) so the + subscriber tracks the same kernel endpoint as ``post_user_message``. 
+ """ + if not is_enabled(): + logger.debug("cogos-agent: response bridge disabled (%s unset)", ENABLE_ENV) + setattr(app_state, "cogos_agent_subscriber", None) + setattr(app_state, "cogos_agent_task", None) + return + + resolved_url = url or default_stream_url() + subscriber = KernelBusSubscriber( + url=resolved_url, + bus_filter=RESPONSE_BUS_ID, + consumer_id="mod3-dashboard-agent", + ) + task = asyncio.create_task( + run_response_bridge(subscriber), + name="mod3-cogos-agent-bridge", + ) + setattr(app_state, "cogos_agent_subscriber", subscriber) + setattr(app_state, "cogos_agent_task", task) + logger.info( + "cogos-agent: response bridge started, target=%s bus_id=%s", + resolved_url, + RESPONSE_BUS_ID, + ) + + +async def stop_response_bridge(app_state: object, *, timeout_s: float = 2.0) -> None: + """Gracefully stop the response bridge: close subscriber, await task, cancel on timeout.""" + subscriber: Optional[KernelBusSubscriber] = getattr(app_state, "cogos_agent_subscriber", None) + task: Optional[asyncio.Task] = getattr(app_state, "cogos_agent_task", None) + if subscriber is None and task is None: + return + if subscriber is not None: + try: + await subscriber.close() + except Exception: # pragma: no cover - best-effort + pass + if task is not None: + try: + await asyncio.wait_for(task, timeout=timeout_s) + except (asyncio.TimeoutError, asyncio.CancelledError): + task.cancel() + try: + await task + except (asyncio.CancelledError, Exception): # pragma: no cover + pass + logger.info("cogos-agent: response bridge stopped") diff --git a/dashboard/index.html b/dashboard/index.html index 914a2b0..ae166c5 100644 --- a/dashboard/index.html +++ b/dashboard/index.html @@ -171,6 +171,49 @@ 50% { opacity: 0.3; } } + /* Cycle Trace panel (bottom drawer) */ + #trace-panel { + position: fixed; left: 0; right: 0; bottom: 0; + background: var(--surface); border-top: 1px solid var(--border); + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; + z-index: 20; 
max-height: 40vh; display: flex; flex-direction: column;
  transition: max-height 0.2s ease;
}
#trace-panel.collapsed { max-height: 32px; }
#trace-panel .trace-header {
  display: flex; align-items: center; gap: 8px;
  padding: 6px 16px; border-bottom: 1px solid var(--border);
  background: var(--bg); cursor: pointer; user-select: none;
  font-size: 0.75rem; color: var(--muted); text-transform: uppercase;
  letter-spacing: 0.5px; flex-shrink: 0; height: 32px;
}
#trace-panel .trace-header .trace-title { font-weight: 600; }
#trace-panel .trace-header .trace-toggle {
  margin-left: auto; font-size: 0.7rem; color: var(--muted);
}
#trace-panel.collapsed #trace-entries { display: none; }
#trace-entries {
  overflow-y: auto; padding: 4px 0; flex: 1; min-height: 0;
  font-size: 0.75rem; line-height: 1.4;
}
.trace-entry {
  display: flex; gap: 8px; align-items: baseline;
  padding: 2px 16px; border-bottom: 1px solid rgba(48,54,61,0.3);
  white-space: nowrap; overflow: hidden;
}
.trace-entry:hover { background: rgba(88,166,255,0.05); }
.trace-time { color: var(--muted); flex-shrink: 0; font-variant-numeric: tabular-nums; }
.trace-source { color: var(--muted); flex-shrink: 0; font-size: 0.7rem; }
.trace-kind {
  flex-shrink: 0; font-size: 0.65rem; padding: 1px 6px;
  border: 1px solid var(--border); border-radius: 3px;
  text-transform: uppercase; letter-spacing: 0.5px;
}
.trace-summary { color: var(--text); overflow: hidden; text-overflow: ellipsis; }

/* Leave room at the bottom so the drawer doesn't cover the input */
body { padding-bottom: 32px; }

/* Responsive */
@media (max-width: 700px) {
  .main { padding: 12px 16px; }

/* @@ -300,6 +343,15 @@ — markup additions follow (tags stripped in this capture) */

Mod³

For voice, use headphones. Speak naturally — the system detects when you start and stop. Speak during playback to interrupt.
+ + + +