diff --git a/desktop/macos/Desktop/Sources/AudioCaptureService.swift b/desktop/macos/Desktop/Sources/AudioCaptureService.swift
index 816a54ac559..e0169257a0d 100644
--- a/desktop/macos/Desktop/Sources/AudioCaptureService.swift
+++ b/desktop/macos/Desktop/Sources/AudioCaptureService.swift
@@ -180,6 +180,15 @@ class AudioCaptureService: @unchecked Sendable {
         if let override = overrideDeviceID {
             inputDeviceID = override
             log("AudioCapture: Using override device ID \(override)")
+        } else if Self.isDefaultOutputBluetooth(), let builtIn = Self.findBuiltInMicDeviceID() {
+            // Output is a Bluetooth device (e.g. AirPods). Opening the BT *microphone* forces the
+            // headset out of high-quality A2DP into 16 kHz HFP "call" mode — which degrades ALL
+            // playback (including the hub's spoken reply) and frequently makes macOS deliver only
+            // silence (the A2DP/HFP profile conflict). That silence trips the silent-mic watchdog,
+            // which swaps devices and rebuilds the audio engine MID-REPLY, cutting the reply off.
+            // So capture from the built-in mic and leave Bluetooth in A2DP. Mirrors the PTT path.
+            inputDeviceID = builtIn
+            log("AudioCapture: output is Bluetooth — capturing from built-in mic id=\(builtIn) to keep A2DP")
         } else {
             var size = UInt32(MemoryLayout<AudioDeviceID>.size)
             var address = AudioObjectPropertyAddress(
diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarState.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarState.swift
index 8608cb771ec..71ebda347a1 100644
--- a/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarState.swift
+++ b/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarState.swift
@@ -94,6 +94,19 @@ class FloatingControlBarState: NSObject, ObservableObject {
     @Published var isVoiceLocked: Bool = false
     @Published var voiceTranscript: String = ""
 
+    /// Post-release state of a realtime-hub voice turn, so the bar never leaves the user
+    /// guessing whether a reply is coming after they let go of PTT. `.thinking` covers the
+    /// commit→first-audio gap (model latency / reconnect), `.speaking` while the reply plays,
+    /// `.failed` briefly when no reply arrived (hub error or the turn-completion watchdog
+    /// firing with no audio). `.none` = collapsed/idle. Driven by RealtimeHubController.
+    enum VoiceResponsePhase { case none, thinking, speaking, failed }
+    @Published var voiceResponsePhase: VoiceResponsePhase = .none
+
+    /// True whenever a voice turn owns the bar — actively listening (PTT-down) OR showing the
+    /// post-release status pill. One concept so bar-sizing/collapse guards don't each have to AND
+    /// `isVoiceListening` with `voiceResponsePhase` independently (and drift out of sync).
+    var voiceOwnsBar: Bool { isVoiceListening || voiceResponsePhase != .none }
+
     // Voice follow-up state (PTT while AI conversation is active)
     @Published var isVoiceFollowUp: Bool = false
     @Published var voiceFollowUpTranscript: String = ""
diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarView.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarView.swift
index 9e06e25dbb9..835bc573047 100644
--- a/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarView.swift
+++ b/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarView.swift
@@ -35,7 +35,7 @@ struct FloatingControlBarView: View {
 
     /// Whether the bar chrome should stretch to fill the window width
     private var barNeedsFullWidth: Bool {
-        isHovering || state.showingAIConversation || state.isVoiceListening
+        isHovering || state.showingAIConversation || state.voiceOwnsBar
     }
 
     private var barChrome: some View {
@@ -285,6 +285,12 @@ struct FloatingControlBarView: View {
                     .padding(.vertical, 3)
                     .frame(height: 50)
                     .transition(.opacity)
+            } else if state.voiceResponsePhase != .none && !state.isVoiceFollowUp {
+                voiceResponseView
+                    .padding(.horizontal, 6)
+                    .padding(.vertical, 3)
+                    .frame(height: 50)
+                    .transition(.opacity)
             } else if isHovering || state.showingAIConversation {
                 VStack(spacing: 1) {
                     compactButton(title: "Ask omi / Collapse", keys: shortcutSettings.askOmiShortcut.displayTokens) {
@@ -395,6 +401,52 @@ struct FloatingControlBarView: View {
         }
     }
 
+    /// Post-release status of a realtime-hub voice turn — shown after PTT-up so a slow or failed
+    /// reply never collapses to a blank bar: thinking → speaking → a brief "no response" hint.
+    /// While thinking/speaking the pill is tappable to STOP the reply (a push-to-talk tap also
+    /// interrupts). Speaking stays subtle — animated waveform + speaker glyph, no transcript.
+    private var voiceResponseView: some View {
+        let canStop =
+            state.voiceResponsePhase == .thinking || state.voiceResponsePhase == .speaking
+        return HStack(spacing: 8) {
+            // Lively while thinking/speaking; at rest on failure.
+            VoiceWaveformBars(isActive: canStop)
+
+            switch state.voiceResponsePhase {
+            case .thinking:
+                Image(systemName: "ellipsis.bubble.fill")
+                    .scaledFont(size: 14, weight: .semibold)
+                    .foregroundColor(.white)
+                Text("Thinking…")
+                    .scaledFont(size: 13)
+                    .foregroundColor(.white.opacity(0.7))
+            case .speaking:
+                // Subtle: the animated waveform conveys that it's speaking — no transcript text.
+                Image(systemName: "speaker.wave.2.fill")
+                    .scaledFont(size: 13, weight: .semibold)
+                    .foregroundColor(.white.opacity(0.85))
+                    .symbolEffect(.variableColor.iterative, options: .repeating)
+                // Subtle tap-to-stop affordance.
+                Image(systemName: "stop.circle.fill")
+                    .scaledFont(size: 13, weight: .regular)
+                    .foregroundColor(.white.opacity(0.45))
+            case .failed:
+                Image(systemName: "exclamationmark.triangle.fill")
+                    .scaledFont(size: 14, weight: .semibold)
+                    .foregroundColor(.orange)
+                Text("No response — hold \(shortcutSettings.pttShortcut.displayLabel) to retry")
+                    .scaledFont(size: 13)
+                    .foregroundColor(.white.opacity(0.7))
+                    .lineLimit(1)
+            case .none:
+                EmptyView()
+            }
+        }
+        // Tap the pill to stop the reply (in addition to a push-to-talk tap).
+        .contentShape(Rectangle())
+        .onTapGesture { if canStop { RealtimeHubController.shared.stopSpeaking() } }
+    }
+
     private var aiInputView: some View {
         AskAIInputView(
             userInput: Binding(
diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarWindow.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarWindow.swift
index bd6bdf2c1ec..2816d7594c0 100644
--- a/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarWindow.swift
+++ b/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarWindow.swift
@@ -619,7 +619,11 @@ class FloatingControlBarWindow: NSPanel, NSWindowDelegate {
 
     /// Resize for hover expand/collapse — anchored from center so the circle grows outward.
     func resizeForHover(expanded: Bool) {
-        guard !state.showingAIConversation, !state.isVoiceListening, !state.isShowingNotification, !suppressHoverResize else { return }
+        // Don't hover-resize while a voice turn owns the bar — listening OR the post-release
+        // thinking/speaking/failed status pill. Otherwise hovering fights the voice-phase
+        // sizing (bar jumps on hover-in, shrinks mid-reply on hover-out).
+        guard !state.showingAIConversation, !state.voiceOwnsBar,
+              !state.isShowingNotification, !suppressHoverResize else { return }
         resizeWorkItem?.cancel()
         resizeWorkItem = nil
 
@@ -628,7 +632,7 @@ class FloatingControlBarWindow: NSPanel, NSWindowDelegate {
         let doResize: () -> Void = { [weak self] in
             guard let self = self else { return }
             guard !self.state.showingAIConversation,
-                  !self.state.isVoiceListening,
+                  !self.state.voiceOwnsBar,
                   !self.state.isShowingNotification,
                   !self.suppressHoverResize
             else { return }
diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/PushToTalkManager.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/PushToTalkManager.swift
index 0e7a283bd02..657d2810e41 100644
--- a/desktop/macos/Desktop/Sources/FloatingControlBar/PushToTalkManager.swift
+++ b/desktop/macos/Desktop/Sources/FloatingControlBar/PushToTalkManager.swift
@@ -1055,7 +1055,12 @@ class PushToTalkManager: ObservableObject {
     guard !skipResize && !barState.isVoiceFollowUp && !barState.showingAIConversation && !isOnboarding else { return }
     if barState.isVoiceListening && !wasListening {
       FloatingControlBarManager.shared.resizeForPTT(expanded: true)
-    } else if !barState.isVoiceListening && wasListening {
+    } else if !barState.isVoiceListening && wasListening
+      && barState.voiceResponsePhase == .none
+    {
+      // Don't collapse on PTT-up when the hub is about to answer — the bar stays expanded
+      // through the thinking/speaking status (RealtimeHubController collapses it when the
+      // reply resolves). Without this guard the bar would blink shut then reopen.
       FloatingControlBarManager.shared.resizeForPTT(expanded: false)
     }
   }
diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift
index 5d8a4ddafdd..f874ed0b16a 100644
--- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift
+++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift
@@ -1,6 +1,7 @@
 import AVFoundation
 import CoreGraphics
 import Foundation
+import SwiftUI
 
 // MARK: - Realtime Hub Controller (Phase 1)
 //
@@ -163,7 +164,7 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate {
   }
 
   private func startSession(provider: RealtimeHubProvider, auth: HubAuth) {
-    let instructions = RealtimeHubTools.systemInstruction(aboutUser: aboutUserCard)
+    let instructions = RealtimeHubTools.systemInstruction(aboutUser: aboutUserCard, provider: provider)
     let s = RealtimeHubSession(provider: provider, auth: auth, instructions: instructions, delegate: self)
     session = s
     sessionProvider = provider
@@ -199,6 +200,10 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate {
     responding = false
     turnTranscript = ""
     assistantText = ""
+    // A new turn supersedes any prior reply's status indicator (the listening UI takes over);
+    // bump the token so a queued `.failed` auto-dismiss from the last turn can't fire now.
+    voicePhaseToken &+= 1
+    barState?.voiceResponsePhase = .none
     speculativeWarmDone = false
     speculativeScreenshot = nil
     audioReceivedThisTurn = false
@@ -245,6 +250,9 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate {
   /// PTT-up: end the turn; the model now responds (and may call tools).
   func commitTurn() {
     responding = true
+    // Keep the bar alive after release: show "thinking" until the first audio arrives, so the
+    // commit→reply gap (model latency / reconnect / a slow turn) never looks like a dead bar.
+    setVoicePhase(.thinking)
     // (The screen frame is sent at turn START — see beginTurn — so it has time to
     // upload/decode before the model answers. Nothing to attach here.)
     session?.commitInputTurn()
@@ -293,6 +301,8 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate {
 
   func hubDidReceiveAudio(_ pcm24k: Data) {
     audioReceivedThisTurn = true
+    // First audio of the turn → flip the status from "thinking" to "speaking".
+    if barState?.voiceResponsePhase == .thinking { setVoicePhase(.speaking) }
     pcmPlayer?.enqueue(pcm24k)  // native spoken audio (OpenAI + Gemini)
   }
 
@@ -539,6 +549,10 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate {
       turnRecorded = true
       FloatingControlBarManager.shared.recordVoiceTurn(userText: heard, assistantText: reply)
     }
+    // Resolve the status: a turn that produced audio just ends (collapse); a turn that
+    // finished with NO audio — e.g. the Gemini turn-completion watchdog fired on a dropped/
+    // empty reply — briefly tells the user it got nothing, instead of silently collapsing.
+    setVoicePhase(audioReceivedThisTurn ? .none : .failed)
     exitVoiceUI()
   }
 
@@ -546,11 +560,15 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate {
     // A socket we intentionally dropped is detached in teardownSession() before it's
     // released, so its death-rattle never reaches us — only the live session's errors
     // land here.
+    let wasInFlight = replyInFlight
     responding = false
     logError("RealtimeHub: session error — \(message)")
     // The reply is dead — stop any buffered audio before collapsing.
     pcmPlayer?.stop()
     if speech.isSpeaking { speech.stopSpeaking(at: .immediate) }
+    // If a turn was mid-flight when the socket died, tell the user it failed (then collapse)
+    // rather than vanishing mid-reply. A benign idle-close between turns shows nothing.
+    if wasInFlight { setVoicePhase(.failed) }
     exitVoiceUI()
     let aliveFor = lastWarmAt.map { Date().timeIntervalSince($0) } ?? 0
     teardownSession()
@@ -585,14 +603,81 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate {
     barState.isVoiceFollowUp = false
     // Collapse the bar ourselves in that case — guarded so we never shrink the bar out
     // from under an open conversation, response, notification, hover, or onboarding.
-    guard wasExpandedForVoice,
-      !barState.showingAIConversation, !barState.showingAIResponse,
-      barState.currentNotification == nil, !barState.isHoveringBar,
-      UserDefaults.standard.bool(forKey: "hasCompletedOnboarding")
-    else { return }
+    guard wasExpandedForVoice, barIsFreeToCollapse(barState) else { return }
     FloatingControlBarManager.shared.resizeForPTT(expanded: false)
   }
 
+  /// User-initiated "stop talking": halt the current spoken reply immediately, without starting
+  /// a new turn. Stops local playback and tells the session to cancel/gate the rest of the reply
+  /// (OpenAI: response.cancel; Gemini: drops the pending-reply gate so further audio is ignored —
+  /// the warm socket + conversation context survive). Tapping push-to-talk also interrupts (that
+  /// starts a fresh turn); this is the no-new-turn way to just make it stop.
+  func stopSpeaking() {
+    guard replyInFlight else { return }
+    log("RealtimeHub[\(providerTag)]: stop — user halted the reply")
+    responding = false
+    pcmPlayer?.stop()
+    if speech.isSpeaking { speech.stopSpeaking(at: .immediate) }
+    session?.cancelActiveResponse()
+    session?.abandonInputTurn()
+    setVoicePhase(.none)
+  }
+
+  // MARK: - Voice response status (so a slow/failed reply never leaves the user guessing)
+
+  /// Invalidates a pending `.failed` auto-dismiss when the phase changes again.
+  private var voicePhaseToken = 0
+
+  /// Drive the floating bar's post-release voice status. The hub speaks its reply as audio
+  /// with no inline UI, so after PTT-up the bar would otherwise collapse and the user would
+  /// be left wondering whether a reply is coming during the commit→first-audio gap (model
+  /// latency, a reconnect, or a dropped/late turn). This keeps the bar showing
+  /// thinking → speaking → (failed) until the turn resolves. `.failed` holds a 2.5s hint, then
+  /// goes to `.none`; `.none` collapses the bar right away when nothing else needs it open.
+  private func setVoicePhase(_ phase: FloatingControlBarState.VoiceResponsePhase) {
+    guard let barState else { return }
+    voicePhaseToken &+= 1  // any phase change invalidates a queued `.failed` auto-dismiss
+    let token = voicePhaseToken  // captured so the delayed block can detect a later change
+    withAnimation(.easeInOut(duration: 0.18)) {
+      barState.voiceResponsePhase = phase
+    }
+    switch phase {
+    case .failed:
+      DispatchQueue.main.asyncAfter(deadline: .now() + 2.5) { [weak self] in
+        guard let self, token == self.voicePhaseToken,
+          self.barState?.voiceResponsePhase == .failed
+        else { return }
+        self.setVoicePhase(.none)
+      }
+    case .none:
+      collapseVoiceBarIfIdle()
+    case .thinking, .speaking:
+      break  // the bar is already expanded from the listening turn (kept open in updateBarState)
+    }
+  }
+
+  /// Collapse the bar once the voice turn is fully resolved (no longer owns the bar) and nothing
+  /// else needs it open.
+  private func collapseVoiceBarIfIdle() {
+    guard let barState, !barState.voiceOwnsBar, barIsFreeToCollapse(barState) else { return }
+    FloatingControlBarManager.shared.resizeForPTT(expanded: false)
+  }
+
+  /// Shared collapse guard: true when nothing else owns the bar (no open conversation, response,
+  /// notification, or hover) and onboarding is done — so it's safe to shrink it.
+  private func barIsFreeToCollapse(_ barState: FloatingControlBarState) -> Bool {
+    !barState.showingAIConversation && !barState.showingAIResponse
+      && barState.currentNotification == nil && !barState.isHoveringBar
+      && UserDefaults.standard.bool(forKey: "hasCompletedOnboarding")
+  }
+
+  /// True while a hub reply is in flight — between commit and turn-done, or while the status pill
+  /// shows thinking/speaking. Used for barge-in / stop / failure decisions.
+  private var replyInFlight: Bool {
+    responding || barState?.voiceResponsePhase == .thinking
+      || barState?.voiceResponsePhase == .speaking
+  }
+
   // MARK: - Tools
 
   /// ask_higher_model — reuse the EXISTING prompt-cached /v2/chat/completions
diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSession.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSession.swift
index 86fe07fdc0b..b3ad18d875e 100644
--- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSession.swift
+++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSession.swift
@@ -104,6 +104,9 @@ final class RealtimeHubSession: NSObject {
   /// one. Set on activityEnd (commit); cleared on this turn's `turnComplete`, on a
   /// server `interrupted`, or when a new turn interrupts (beginInputTurn interrupting).
   private var geminiResponsePending = false
+  /// Single reschedulable timer for the Gemini turn-completion watchdog — re-armed in place
+  /// (no per-event closure pile-up). Created lazily, cancelled in stop(). See armGeminiWatchdog.
+  private var geminiWatchdog: DispatchSourceTimer?
 
   // Per-turn token usage for managed (ephemeral) billing — client-reported. Reset at
   // commit, reported at finishTurn (only for ephemeral sessions; BYOK pays the provider
@@ -183,6 +186,8 @@ final class RealtimeHubSession: NSObject {
       self.pendingActivityStart = false
       self.openAIResponseActive = false
       self.geminiResponsePending = false
+      self.geminiWatchdog?.cancel()
+      self.geminiWatchdog = nil
     }
   }
 
@@ -288,6 +293,10 @@ final class RealtimeHubSession: NSObject {
         self.send(json: ["realtimeInput": ["activityEnd": [:]]])
         self.activityOpen = false
         self.geminiResponsePending = true  // expect a spoken reply for THIS turn
+        // Recover if the reply never starts or never completes (Gemini Live drops the
+        // trailing turnComplete / truncates replies intermittently). 15s covers the
+        // worst-case think latency before the first audio chunk.
+        self.armGeminiWatchdog(15)
       // Gemini auto-responds at activityEnd; no explicit response request.
       }
     }
@@ -480,6 +489,7 @@ final class RealtimeHubSession: NSObject {
         send(json: ["realtimeInput": ["activityEnd": [:]]])
         activityOpen = false
         geminiResponsePending = true
+        armGeminiWatchdog(15)
       }
     }
     let d = delegate
@@ -558,6 +568,38 @@ final class RealtimeHubSession: NSObject {
     Task { @MainActor in d?.hubDidFinishTurn() }
   }
 
+  /// Safety net for Gemini's flaky turn completion. Gemini Live intermittently truncates a
+  /// reply or drops the trailing `turnComplete` — which would otherwise leave
+  /// `geminiResponsePending` stuck true forever, stranding the turn: the UI never resets,
+  /// the turn is never recorded, and the NEXT PTT press is mis-handled as a barge-in. This
+  /// arms a one-shot timer on `q`; if it fires while a turn is still pending (no turnComplete
+  /// arrived), we finish the turn cleanly. It's re-armed on `generationComplete` and on each
+  /// gated audio chunk, so it never cuts a healthy streaming reply; a normal turnComplete clears
+  /// `geminiResponsePending`, which turns a later fire into a no-op. Buffered audio keeps playing
+  /// (we don't stop the player), so a reply that lost its turnComplete is still heard in full.
+  /// OpenAI doesn't need this — its explicit `response.done` reliably ends every turn.
+  private func armGeminiWatchdog(_ seconds: Double) {
+    guard provider == .gemini else { return }
+    let timer = geminiWatchdog ?? makeGeminiWatchdog()
+    timer.schedule(deadline: .now() + seconds)  // re-arm in place; replaces any prior deadline
+  }
+
+  /// Lazily build the single watchdog timer (on `q`); `armGeminiWatchdog` re-arms it via `schedule`.
+  /// Its handler acts only while a turn is still pending, so a fire after the flag is cleared no-ops.
+  private func makeGeminiWatchdog() -> DispatchSourceTimer {
+    let timer = DispatchSource.makeTimerSource(queue: q)
+    timer.setEventHandler { [weak self] in
+      guard let self, self.geminiResponsePending else { return }
+      log("\(self.tag): turn watchdog — no turnComplete, finishing stranded turn")
+      self.geminiResponsePending = false  // clear the flag so a later fire hits the guard above
+      self.emitText("", isFinal: true)
+      self.finishTurn()
+    }
+    geminiWatchdog = timer  // cached so later arms reschedule this same source
+    timer.resume()  // a new source delivers no events until resumed; no deadline is set yet
+    return timer
+  }
+
   // MARK: - Usage (client-reported billing, managed sessions only)
 
   private func resetTurnUsage() {
@@ -728,7 +770,13 @@ final class RealtimeHubSession: NSObject {
     // NOTE: do NOT finish on generationComplete — Gemini sends it while the spoken audio
     // is still streaming, so finishing there truncates the reply and makes the next turn
     // interrupt the server's still-open turn. We finish on turnComplete (below), which
-    // arrives when the audio actually completes.
+    // arrives when the audio actually completes. But generationComplete does mean the
+    // model is done generating, so arm the watchdog: turnComplete normally follows within
+    // a few seconds, and if it's dropped (Gemini does this intermittently) the watchdog
+    // finishes the turn so it can't strand `geminiResponsePending`.
+    if (sc["generationComplete"] as? Bool) == true, geminiResponsePending {
+      armGeminiWatchdog(6)
+    }
     if let it = sc["inputTranscription"] as? [String: Any], let t = it["text"] as? String {
       emitTranscript(t, isFinal: false)
     }
@@ -736,15 +784,20 @@ final class RealtimeHubSession: NSObject {
       emitText(t, isFinal: false)  // the spoken reply's text, for logging / the bubble
     }
     if let parts = (sc["modelTurn"] as? [String: Any])?["parts"] as? [[String: Any]] {
+      var emittedAudio = false
       for p in parts {
         if let t = p["text"] as? String { emitText(t, isFinal: false) }
         if let inline = p["inlineData"] as? [String: Any],
           let mime = inline["mimeType"] as? String, mime.contains("audio/pcm"),
           let b64 = inline["data"] as? String, let d = Data(base64Encoded: b64)
         {
-          if geminiResponsePending { emitAudio(d) }  // gated: only the live turn's reply
+          if geminiResponsePending { emitAudio(d); emittedAudio = true }  // gated: live turn only
         }
       }
+      // Keep pushing the watchdog out while the reply streams. A healthy reply sends audio
+      // chunks <100ms apart, so it never fires mid-stream; it only fires if the stream
+      // truncates with no generationComplete/turnComplete (Gemini cut the reply server-side).
+      if emittedAudio, geminiResponsePending { armGeminiWatchdog(6) }
     }
     if (sc["turnComplete"] as? Bool) == true {
       // Only finish the turn we're actually awaiting a reply for. A turnComplete that
diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTestHarness.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTestHarness.swift
index 46384dd9caf..16c919f9912 100644
--- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTestHarness.swift
+++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTestHarness.swift
@@ -46,7 +46,7 @@ final class RealtimeHubTestHarness: NSObject, RealtimeHubSessionDelegate {
   func run(timeoutSeconds: Double) async -> [String: String] {
     let s = RealtimeHubSession(
       provider: provider, auth: auth,
-      instructions: RealtimeHubTools.systemInstruction(aboutUser: ""),
+      instructions: RealtimeHubTools.systemInstruction(aboutUser: "", provider: provider),
       delegate: self)
     session = s
     let rate = s.requiredInputSampleRate
diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift
index 98850b0be19..db2a752459c 100644
--- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift
+++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift
@@ -54,104 +54,194 @@ enum HubTool: String {
 
 enum RealtimeHubTools {
 
-  static func systemInstruction(aboutUser: String) -> String {
+  /// Builds the hub's system prompt for `provider`. Each realtime model gets its own, fully
+  /// independent prompt — OpenAI Realtime via `openAIInstruction`, Gemini Live via
+  /// `geminiInstruction` — so one can be tuned without touching the other.
+  ///
+  /// - Parameter aboutUser: Identity-card text interpolated verbatim into the prompt via
+  ///   `\(aboutUser)` (presumably the output of `AboutUserCard.build()` — confirm at the call site).
+  /// - Parameter provider: Selects which per-model prompt is returned.
+  static func systemInstruction(aboutUser: String, provider: RealtimeHubProvider) -> String {
+    let timestamp = ChatPromptBuilder.currentDatetimeString()
+    switch provider {
+    case .gemini: return geminiInstruction(aboutUser: aboutUser, now: timestamp)
+    case .openai: return openAIInstruction(aboutUser: aboutUser, now: timestamp)
+    }
+  }
+
+  // MARK: Per-model prompts
+
+  /// OpenAI Realtime (`gpt-realtime`) prompt, structured per OpenAI's realtime prompting guide:
+  /// labeled sections, per-situation length rules, varied sample-phrase tool preambles, capitalized
+  /// invariants, language pinning, and an unclear-audio block. Interpolates `aboutUser` and `now`.
+  private static func openAIInstruction(aboutUser: String, now: String) -> String {
     """
-    You are Omi, a fast spoken-voice assistant on the user's Mac and the single hub \
-    for their voice requests. You hear the user's microphone; reply by speaking, \
-    conversationally. Default to one or two sentences, but when the user asks for \
-    something longer or creative (a story, a detailed explanation, brainstorming), \
-    give the full answer yourself — don't shorten it and don't offload it. \
-    Reply in the same language the user is speaking.
-
-    \(aboutUser)
-
-    IMPORTANT: You CAN read the user's Omi data directly with fast tools — their tasks \
-    (get_tasks), what Omi knows about them / their memories & facts (get_memories, \
-    search_memories), their past conversations (search_conversations), what they DID on \
-    their Mac (get_daily_recap), and their on-screen history (search_screen_history) — and \
-    you can make simple task changes (create_action_item, update_action_item). For anything in \
-    their OTHER apps (calendar, notes, emails, messages, files, reminders, browser) or any \
-    multi-step "do X for me" work, use spawn_agent — it hands the request to a background \
-    agent that has those tools and can act in the user's apps.
-
-    Using tools: when a request needs a tool, ALWAYS give a short spoken heads-up first so the \
-    user knows you're on it and that it won't be instant — then call the tool and speak the \
-    result when it returns. Never go silent during a tool call; the user can't see what you're \
-    doing, so a quiet gap feels broken. The catch is variety: that heads-up must be SPECIFIC to \
-    what they actually asked and DIFFERENT every time. Name the real thing you're fetching — \
-    "Pulling up yesterday's activity…", "Scanning your task list…", "Digging through your notes \
-    on the launch…", "Checking your memories for that…", "Getting the latest on that, one \
-    sec…". The thing to avoid is repetition: do NOT reach for the same generic opener ("let me \
-    check", "let me look that up") turn after turn — it's what makes you sound robotic. Keep it \
-    to a few words, vary the wording each turn, and don't include any answer or data you don't \
-    have yet. For a slower step (ask_higher_model, spawn_agent) it's fine to signal it'll take a \
-    moment. NEVER speak an answer — real or guessed — before the tool returns, NEVER skip the \
-    tool call, and never read tool JSON or ids aloud. You cannot see the user's data or screen \
-    without calling a tool.
-
-    Decide what to do with each request:
-    - WHO the user is, what you ALREADY KNOW about them, and the ROUGH shape of their day \
-    ("who am I", "what do you know about me", "am I busy today", "much on my plate"): answer \
-    DIRECTLY from <about_user> above — do NOT call a tool and do NOT say "let me check". Only \
-    reach for a tool when they want an EXACT or SPECIFIC detail that isn't in the card.
-    - The user's TASKS / to-dos / what's due — a READ ("what are my tasks", "what's due \
-    today", "what's on my list", "do I have anything today"): you MUST call get_tasks and \
-    speak ONLY what it returns (the card's counts are a rough snapshot, not the list). Never \
-    guess or make up tasks. For COMPLETED tasks ("what did I finish"), a SPECIFIC due-date range \
-    ("what's due next week"), or the FULL list ("all my tasks"), call get_action_items instead.
-    - A SPECIFIC fact about the user that isn't already in <about_user> ("what's my dog's name", \
-    "where do I work"): call search_memories with a focused query. For the FULL set of what Omi \
-    knows when the card isn't enough, call get_memories (no query). NEVER answer "I don't know" \
-    or guess about the user without checking first.
-    - The user's MOST RECENT / latest / last conversation ("what was my most recent \
-    conversation", "what did we just talk about", "my recent conversations"): call \
-    get_conversations (newest first) — NOT search_conversations, which is semantic and does \
-    NOT sort by time. Speak the latest one.
-    - What the user DISCUSSED about a TOPIC ("what did I say about X", "what did we decide on \
-    Y", "find the conversation about Z"): call search_conversations with a focused query and \
-    speak the result.
-    - The user's own ACTIVITY / what they DID / how they spent their time ("what did I do \
-    yesterday", "what did I do today", "which apps did I use the most", "how did I spend my \
-    morning", "summarize my day"): you MUST call get_daily_recap (days_ago: 0 = today, 1 = \
-    yesterday) and speak a SHORT spoken summary of the highlights it returns — top apps, key \
-    conversations, tasks. Do NOT use search_conversations or spawn_agent for this, and never \
-    guess; this is exactly what get_daily_recap is for.
-    - What the user SAW / read / worked on ON SCREEN ("when was I looking at X", "find where I \
-    read about Y", "what was I doing in app Z"): call search_screen_history with a focused \
-    query and speak the result.
-    - ADVICE about the user's OWN productivity / workflow / habits / focus ("how can I improve \
-    my workflow", "how can I be more productive", "what should I change", "how am I doing", \
-    "where am I wasting time"): do NOT answer generically. FIRST call get_daily_recap (days_ago: \
-    1 for today, 7 for the week) — and get_action_items when tasks matter — then base EVERY \
-    suggestion on what they ACTUALLY did: their apps, distracted vs focused sessions, and \
-    overdue / duplicate tasks. Generic advice with no tool call is a failure here.
-    - ADD a task / to-do / reminder ("remind me to…", "add … to my list", "I need to…"): \
-    call create_action_item with a clear `description` (and `due_at` if a time was given), \
-    then confirm out loud. CHANGE an existing task (mark done, edit, reschedule): first \
-    call get_tasks to get the matching task's id, then call update_action_item with that id.
-    - DOING something for the user in their OTHER apps (calendar, notes, emails, messages, \
-    files, browser) or any multi-step work — create/send/open/edit/search/schedule/automate/ \
-    "do X for me": you CANNOT do these yourself. You MUST actually EMIT the spawn_agent \
-    function call (with a clear, self-contained `brief` and a short `title`). That function \
-    call is the ONLY thing that starts the agent — merely SAYING "I'll have an agent do it" \
-    without emitting the call does NOTHING: the agent never starts and you have failed the \
-    user. So always emit the spawn_agent call. You may add one short natural sentence as you \
-    call it, but never instead of it. Do NOT ask clarifying questions before spawning — spawn \
-    with what you have. Do NOT wait for it, narrate its steps, refuse, or claim you can't.
-    - Everything else — general questions, facts, chit-chat, explanations, advice, jokes, \
-    and creative or long-form requests (stories, brainstorming, drafts): ANSWER YOURSELF. \
-    You are fully capable; do it directly, even when the ask is long or open-ended. Do \
-    NOT escalate just because a request seems long or hard.
-    - Call ask_higher_model when the answer needs real reasoning or synthesis, or precise \
-    up-to-date facts you don't reliably know, OR when the user pushes back on your previous \
-    answer (rephrases, says you're wrong, asks for a better/deeper answer). Pass a clear \
-    `query` AND any `context` you already have (relevant facts you fetched, what they're \
-    referring to); then speak a natural, spoken-length version of what comes back.
-    - When you need to see what's on screen, call screenshot first. Use point_click only \
-    when the user clearly asks you to click something.
-
-    Keep latency low: prefer answering directly when you can.
+# Role & Objective
+You are Omi — a fast, spoken-voice assistant living on the user's Mac, and the single hub for everything they ask by voice. You hear their microphone and you reply by SPEAKING, out loud, conversationally. Success = the user gets a direct, correct, genuinely useful answer in as few spoken words as the moment needs, and feels like they're talking to a sharp friend who happens to know their stuff.
+
+\(aboutUser)
+
+# Personality & Tone
+- Warm, quick, and a little witty — never fawning, never corporate.
+- Have opinions. When asked what you think, give YOUR take with real reasons.
+- CONCISE BY DEFAULT. You are speaking aloud, so a paragraph is a monologue. Say the useful part and stop.
+
+# Length (spoken)
+- Default: ONE or TWO sentences. Lead with the answer, then at most a quick reason.
+- Go long ONLY when the user asks for something long or creative — a story, a detailed explanation, brainstorming, a walkthrough. Then give the full thing yourself, out loud. Don't shorten it and don't hand it off.
+- NEVER add facts, caveats, or extras the user didn't ask for.
+- Do NOT reflexively end your turn with a question ("Anything else?", "Are you enjoying it?"). Just finish.
+- EXCEPTION: if YOU offered a choice and the user answers it ("sure", "yes", "the first one"), ACT on their answer — keep explaining if it was an explanation, emit the tool if it was an action. Do NOT re-ask the same question.
+
+# Answer what's asked — and only that
+- Answer ONLY the question asked, and MATCH the user's register. Casual chitchat gets a casual, brief reply in kind; "what's good with you?" gets a quick, human answer, not a status report.
+- Don't VOLUNTEER the screen, the current app, or their work on unrelated or casual asks.
+- BUT figure out when they're pointing at their screen even if they don't say so. A request with a "this / that / it / here" that has no referent in the conversation — "what is this?", "what do you think of this movie / site / app?", "is this any good?", "summarize this" — is almost always about what's ON THEIR SCREEN. In that case CALL screenshot FIRST to see it, then answer about what's actually there (don't guess, and don't ask "what are you looking at?").
+- Do NOT tack on offers, "anything I can help with?", or follow-up questions. Finish your point and stop.
+
+# Use what you know
+- Today is \(now). Treat that as the present: anything you remember as "upcoming" or scheduled for a date at or before today has most likely ALREADY happened — don't call it "future" or "not out yet," and don't assume few details exist just because you recall it as forthcoming.
+- DEFAULT to answering directly and confidently from your own knowledge. Movies, shows, anime, books, history, science, how-tos, general facts — all of this is within your training. Just answer it.
+- Never refuse on "spoiler" grounds. Never offer to "search for a summary" of something you already know. Never make the user ask twice for an answer you have.
+- The uncertainty caveat is the EXCEPTION, not the reflex: use it only for genuinely recent/post-cutoff topics or things you truly don't know. Even then, give your best answer FIRST, then one short caveat — don't lead with hedging.
+- If the user pushes back, don't double down on a shaky guess: reconsider, and for facts you can't reliably get right, escalate with ask_higher_model and speak its answer.
+
+# Language
+- Reply in the SAME language the user is speaking.
+- Switch languages only when the user actually speaks a different language to you. Do NOT infer language from accent alone.
+
+# Using tools (read this carefully)
+You can read the user's own Omi data and act on their Mac through the tools below. You CANNOT see their data, their tasks, or their screen without calling a tool — never pretend you can.
+
+Before any tool that takes a moment, speak ONE short, SPECIFIC, VARIED heads-up first, describing the action:
+- GOOD: "Pulling up yesterday's activity…" / "Scanning your task list…" / "Checking what we talked about…"
+- NEVER a robotic, repeated "let me check" — vary it every time.
+- Describe the ACTION, not your reasoning. Never say "let me think."
+HARD RULES:
+- NEVER go silent during a tool call.
+- NEVER speak an answer — real OR guessed — before the tool returns. Wait for it, then answer from what it returned.
+- NEVER skip a tool call that's needed, and NEVER read tool JSON, fields, or ids aloud.
+- For unclear audio: don't call a tool and don't preamble — just ask the user to repeat (see Unclear audio).
+
+# Routing — pick the right tool
+- WHO the user is / what you know about them / the rough shape of their day → answer DIRECTLY from the identity card above. NO tool.
+- "What are my tasks / what's due today" → get_tasks (fast local read). Speak only what it returns.
+- Completed tasks, a date range, or the full task list → get_action_items(completed?, due_start_date?, due_end_date?).
+- A specific fact about the user not in the card → search_memories(query). Their whole set of memories, when the card isn't enough → get_memories().
+- The most RECENT or LATEST conversations → get_conversations() (newest first). Do NOT use search by topic for "recent/latest."
+- What they DISCUSSED about a topic → search_conversations(query).
+- What they actually DID on their Mac (apps, time, screen, productivity) → get_daily_recap(days_ago?). Any productivity advice MUST be based on this real activity, not generic tips.
+- What they SAW or read on screen → search_screen_history(query, days?).
+- Add a task → create_action_item(description, due_at?). Change/complete a task → first get_tasks to get the id, then update_action_item(id, completed?, description?, due_at?).
+- A "this / that / it" with no referent ("what is this?", "what do you think of this movie/site?", "is this any good?"), or an explicit "look at my screen" → screenshot() FIRST, then answer about what's actually there. Click somewhere → point_click(x, y) ONLY when the user explicitly asks you to click.
+- A precise fact you don't reliably know, real reasoning/synthesis, or the user pushing back → ask_higher_model(query, context?), then speak its answer.
+- ACTING in the user's OTHER apps (calendar, notes, email, messages, files, reminders, browser) OR any genuine multi-step "do X for me" job → you MUST EMIT spawn_agent(brief, title?). Saying "I'll have an agent do it" without emitting the call does NOTHING. Don't ask clarifying questions first — spawn with what you have.
+- Everything else — general questions, facts, chit-chat, jokes, opinions, explanations, stories, creative or long-form → ANSWER YOURSELF, out loud. Do NOT use spawn_agent for these; spawn_agent is for DOING things in other apps, never for talking, answering, or storytelling.
+
+# Unclear audio
+- Only respond to audio you actually understood.
+- If the audio is unclear, garbled, or cut off, ask for a quick repeat in the user's language ("Sorry, didn't catch that — say it again?"). Don't guess the words, don't call a tool, don't preamble.
+
+# Bottom line
+Be fast. Answer directly from what you know. Speak briefly, only to what was asked. Use a tool the instant one is needed, with a varied heads-up, and never voice an answer before it returns.
+"""
+  }
+
+  /// Gemini Live (`gemini-*-flash-live`) prompt, structured per Google's Gemini / Live-API guidance
+  /// for a small model: single XML-tag delimiter style, persona + talk-rules first, positive
+  /// direction, few-shot routing examples, and the hardest constraints LAST (small Gemini models
+  /// drop negative constraints that appear too early). Interpolates `aboutUser` and `now`.
+  private static func geminiInstruction(aboutUser: String, now: String) -> String {
     """
+<role>
+You are Omi — a fast, spoken voice assistant living on the user's Mac. You hear their microphone and reply by SPEAKING, out loud, in a natural human voice. You are the single hub for their voice requests.
+Personality: warm, quick, a little witty — like a sharp friend who gives you the answer and gets out of the way. You are NOT a chatty, hedging, over-explaining assistant.
+</role>
+
+<how_you_talk>
+Follow these every single turn. They matter more than sounding thorough.
+- ANSWER THE EXACT THING ASKED, first, out loud, now — and ONLY that thing. Lead with the answer.
+- MATCH the user's register. Casual chitchat ("what's good with you?") gets a casual, brief reply in kind. Don't escalate small talk into an offer to help.
+- Notice when they're pointing at their SCREEN even if they don't say so. A "this / that / it" with no referent — "what is this?", "what do you think of this movie / site / app?", "is this any good?" — means what's on their screen. You're given the screen every turn, so just answer about what's actually there; don't ask "what are you looking at?".
+- Be SHORT. Make your point in about 2 to 3 spoken sentences, roughly under 30 words, and finish cleanly. (Long replies get cut off — a tight, complete answer always beats a long one.)
+- Give a fuller answer only when the user explicitly asks for something long, detailed, or creative (a story, a draft, a deep explanation). Then it's fine to go longer — but still finish your thought.
+- When asked what you THINK or for your opinion, give your OWN real take, with a reason. Pick a side. Speak naturally — plain spoken words, no markdown, no lists, no emoji, no reading out symbols.
+- Each reply is a NET NEW addition to the conversation. Don't recap the question, don't repeat the user back to them.
+- Reply in the SAME LANGUAGE the user is speaking. If they switch, you switch. Never default to English.
+- "Tell me more" / "go on" / "keep going" / "what happened next" = YOU keep talking, out loud, right now, picking up where you left off. That is you doing the thing — never a reason to call a tool.
+</how_you_talk>
+
+<keep_the_floor>
+- Land the answer and stop. Don't reflexively end with a question or an offer to help.
+- One short follow-up question is fine ONLY when you genuinely can't act without it. Never stack questions.
+- If you offered a choice and they answer ("sure", "yes", "the first one", "go ahead"), ACT on it — keep explaining if it was an explanation, or emit the tool now if it was an action. Don't re-ask what they just answered.
+</keep_the_floor>
+
+<answer_from_what_you_know>
+- Today is \(now). Treat it as the present: something you remember as "upcoming" or set for a date at or before today has most likely ALREADY happened — don't call it "future" or "not out yet."
+- Default: answer directly and confidently from your own knowledge. Movies, shows, anime, books, history, science, how-tos, general facts — these are within your training and fully fair game. Just give the answer. Never refuse on "spoiler" grounds, never offer to "search for a summary" of something you already know, never make the user ask twice.
+- Only add a caveat when a topic is genuinely recent / past your cutoff, or something you truly don't know — and even then, give your BEST answer FIRST, then a one-line "I'm not certain on that one." A confident wrong answer and a needless dodge are both failures.
+- If the user pushes back, re-check rather than dig in — correct yourself or escalate. For precise facts you really can't stand behind, or real multi-step reasoning, hand off with ask_higher_model.
+</answer_from_what_you_know>
+
+\(aboutUser)
+Use the card above to answer directly — no tool — when the user asks who they are, what you know about them, or the rough shape of their day. Only what's actually in the card; don't invent details.
+
+<your_tools>
+You CAN read the user's Omi data and act on their Mac, but ONLY through these tools — you cannot see their data, tasks, or memories without calling one. Before any tool, say a SHORT, SPECIFIC, VARIED heads-up out loud first (e.g. "Checking your tasks now" / "Let me pull that conversation up" — never the same robotic phrase twice). Then call the tool. Stay quiet until it returns; NEVER speak the answer before the result comes back; never skip a needed call; never read out JSON, ids, or raw fields. Speak only what the result actually says.
+
+Pick ONE tool that fits, call it once, then answer.
+
+PERSONAL DATA (read):
+- get_tasks() — "what are my tasks", "what's due today", overdue/today's tasks. Speak only what it returns.
+- get_action_items(completed?, due_start_date?, due_end_date?) — the fuller or filtered task list (completed ones, a date range, everything).
+- get_memories() — what Omi knows about the user overall ("who am I", "what do you know about me") when the card isn't enough.
+- search_memories(query) — one specific fact about the user that isn't on the card.
+- get_conversations() — the MOST RECENT / latest conversations. Use this for "recent" or "latest" — NOT search.
+- search_conversations(query) — find past conversations about a specific TOPIC.
+- get_daily_recap(days_ago?) — what the user actually DID on their Mac (their day, their time, productivity questions). Base any productivity advice on what this returns, not on guesses.
+- search_screen_history(query, days?) — find something the user SAW on their screen earlier.
+
+TASKS (write):
+- create_action_item(description, due_at?) — add a new task.
+- update_action_item(id, …) — change a task. Get the id with get_tasks first, silently — never say the id out loud.
+
+SCREEN:
+- screenshot() — capture the screen so you can see it.
+- point_click(x, y) — click somewhere, ONLY when the user explicitly asks you to click.
+
+ESCALATE:
+- ask_higher_model(query, context?) — hand off to a smarter model and speak its answer. Use it for precise facts you don't reliably know, real multi-step reasoning, or whenever the user pushes back on a fact. NOT for everyday chat, opinions, jokes, or creative or long-form answers — those are yours.
+
+ACT IN OTHER APPS:
+- spawn_agent(brief, title?) — hands a job to an autonomous agent that works across the user's OTHER apps and does multi-step actions for them. Give it a clear brief and a short title, and EMIT the call — don't interrogate the user first.
+</your_tools>
+
+<routing_examples>
+- "What do you think of this design?" → your own opinion, with a reason. No tool.
+- "What happens in that episode?" / "explain how X works" / "tell me a joke" / "tell me more" → you answer from your own knowledge, out loud. No tool, no hedging.
+- "What's good with you?" → a brief, casual reply in kind. No screen narration, no offer to help.
+- "What is this?" / "What do you think of this movie / site?" → they mean what's on screen (you already see it) — answer about what's actually there.
+- "What's due today?" → "Pulling your tasks." → get_tasks → speak them.
+- "What did I work on yesterday?" → "Let me see your day." → get_daily_recap → answer from it.
+- "What's my latest conversation about?" → get_conversations (NOT search).
+- "Find where we talked about the lease" → search_conversations("lease").
+- "Add 'call the dentist' for tomorrow" → create_action_item.
+- "Who won the game last night?" / a precise fact you're unsure of, or "no, that's wrong" → "Let me check with the smart model." → ask_higher_model.
+- "Reply to that email and book the room" → spawn_agent (other apps, multi-step). Emit it.
+- "Click the blue button" → point_click. Need a fresh capture of the screen → screenshot first.
+</routing_examples>
+
+<must_not>
+These are the lines you do not cross. Read them as the final word:
+- Don't VOLUNTEER the screen, the current app, or the user's work on unrelated or casual asks. (But a "this / that" with no referent IS about the screen — answer about what's there.)
+- Do NOT tack on "anything I can help with?", offers, or follow-up questions. Land the answer and stop.
+- Do NOT refuse, hedge, or offer to "search a summary" for something within your own knowledge (plots, facts, how-tos). Just answer; only flag genuinely recent or unknown topics.
+- Do NOT double down when pushed — re-check, correct, or escalate.
+- Do NOT call spawn_agent to answer a question, inform, tell a story, recap a plot, or continue an explanation. Those you do yourself, out loud. spawn_agent is ONLY for acting in the user's OTHER apps or genuine multi-step doing — and when it fits, you MUST emit it.
+- Do NOT call a tool when you can simply answer from your own knowledge or the user card. Reach for a tool only when you truly need the user's private data or to act for them.
+</must_not>
+"""
   }
 
   /// OpenAI Realtime GA `session.tools` entries. Static `let` — built once, not rebuilt on
diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/ScreenCaptureManager.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/ScreenCaptureManager.swift
index fe7f8310cc9..29df01caff6 100644
--- a/desktop/macos/Desktop/Sources/FloatingControlBar/ScreenCaptureManager.swift
+++ b/desktop/macos/Desktop/Sources/FloatingControlBar/ScreenCaptureManager.swift
@@ -21,17 +21,46 @@ class ScreenCaptureManager {
     /// Returns JPEG data for the screen under the mouse cursor. Gemini Live's realtime
     /// video channel reads JPEG/PNG frames; a WebP frame is delivered but not decoded
     /// (the model then answers blind), so the realtime-hub vision path uses this.
-    static func captureScreenJPEG(quality: CGFloat = 0.7) -> Data? {
+    ///
+    /// The capture is downscaled so its long edge is at most `maxDimension` before encoding at
+    /// JPEG `quality`; if downscaling fails, the full-size capture is encoded instead. Why: this
+    /// frame is sent INSIDE every Gemini speech turn, and a full-Retina capture (≈0.5–0.8 MB)
+    /// bloats the audio turn — degrading input transcription — and trips the server's `1007`
+    /// precondition close. The model downsamples anyway, so ~1280px stays legible at ~60–120 KB.
+    static func captureScreenJPEG(quality: CGFloat = 0.6, maxDimension: CGFloat = 1280) -> Data? {
         guard let image = captureScreenImage() else { return nil }
-        let rep = NSBitmapImageRep(cgImage: image)
+        let scaled = downscaledImage(image, maxDimension: maxDimension) ?? image
+        let rep = NSBitmapImageRep(cgImage: scaled)
         guard let data = rep.representation(using: .jpeg, properties: [.compressionFactor: quality]) else {
             log("ScreenCaptureManager: JPEG encoding failed")
             return nil
         }
-        log("ScreenCaptureManager: Screenshot captured \(image.width)x\(image.height), JPEG \(data.count / 1024) KB")
+        log(
+            "ScreenCaptureManager: Screenshot captured \(image.width)x\(image.height) "
+                + "→ \(scaled.width)x\(scaled.height), JPEG \(data.count / 1024) KB")
         return data
     }
 
+    /// Proportionally downscales `image` so its longest edge is ≤ `maxDimension` (each edge ≥ 1 px).
+    /// Returns `image` as-is if small enough or `maxDimension` < 1; nil on failure → caller falls back.
+    private static func downscaledImage(_ image: CGImage, maxDimension: CGFloat) -> CGImage? {
+        let w = CGFloat(image.width), h = CGFloat(image.height)
+        let longest = max(w, h)
+        guard maxDimension >= 1, longest > maxDimension else { return image }
+        let scale = maxDimension / longest
+        // Clamp to ≥ 1 px: an extreme aspect ratio would otherwise round the short edge to 0.
+        let nw = max(1, Int((w * scale).rounded())), nh = max(1, Int((h * scale).rounded()))
+        let cs = CGColorSpace(name: CGColorSpace.sRGB) ?? CGColorSpaceCreateDeviceRGB()
+        let info = CGImageAlphaInfo.premultipliedLast.rawValue | CGBitmapInfo.byteOrder32Big.rawValue
+        guard let ctx = CGContext(
+            data: nil, width: nw, height: nh, bitsPerComponent: 8, bytesPerRow: 0,
+            space: cs, bitmapInfo: info)
+        else { return nil }
+        ctx.interpolationQuality = .high
+        ctx.draw(image, in: CGRect(x: 0, y: 0, width: nw, height: nh))
+        return ctx.makeImage()
+    }
+
     /// Returns WebP data for the screen under the mouse cursor at full Retina
     /// resolution, compressed in memory via libwebp. No disk I/O.
     static func captureScreenData() -> Data? {