Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions desktop/macos/Desktop/Sources/AudioCaptureService.swift
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,15 @@ class AudioCaptureService: @unchecked Sendable {
if let override = overrideDeviceID {
inputDeviceID = override
log("AudioCapture: Using override device ID \(override)")
} else if Self.isDefaultOutputBluetooth(), let builtIn = Self.findBuiltInMicDeviceID() {
// Output is a Bluetooth device (e.g. AirPods). Opening the BT *microphone* forces the
// headset out of high-quality A2DP into 16 kHz HFP "call" mode — which degrades ALL
// playback (including the hub's spoken reply) and frequently makes macOS deliver only
// silence (the A2DP/HFP profile conflict). That silence trips the silent-mic watchdog,
Comment on lines +183 to +187
// which swaps devices and rebuilds the audio engine MID-REPLY, cutting the reply off.
// So capture from the built-in mic and leave Bluetooth in A2DP. Mirrors the PTT path.
inputDeviceID = builtIn

@cubic-dev-ai cubic-dev-ai Bot Jun 21, 2026

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: Auto-selecting built-in mic for Bluetooth output is not pinned, so later reconfiguration can revert capture back to default input and reintroduce the Bluetooth HFP/A2DP conflict.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At desktop/macos/Desktop/Sources/AudioCaptureService.swift, line 190:

<comment>Auto-selecting built-in mic for Bluetooth output is not pinned, so later reconfiguration can revert capture back to default input and reintroduce the Bluetooth HFP/A2DP conflict.</comment>

<file context>
@@ -180,6 +180,15 @@ class AudioCaptureService: @unchecked Sendable {
+            // silence (the A2DP/HFP profile conflict). That silence trips the silent-mic watchdog,
+            // which swaps devices and rebuilds the audio engine MID-REPLY, cutting the reply off.
+            // So capture from the built-in mic and leave Bluetooth in A2DP. Mirrors the PTT path.
+            inputDeviceID = builtIn
+            log("AudioCapture: output is Bluetooth — capturing from built-in mic id=\(builtIn) to keep A2DP")
         } else {
</file context>
Fix with cubic

log("AudioCapture: output is Bluetooth — capturing from built-in mic id=\(builtIn) to keep A2DP")
} else {
var size = UInt32(MemoryLayout<AudioDeviceID>.size)
var address = AudioObjectPropertyAddress(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,19 @@ class FloatingControlBarState: NSObject, ObservableObject {
@Published var isVoiceLocked: Bool = false
@Published var voiceTranscript: String = ""

/// What the bar shows for a realtime-hub voice turn after push-to-talk is released, so the
/// user is never left guessing whether a reply is on its way. Driven by RealtimeHubController.
///
/// Caution: the `none` case shares its spelling with `Optional.none`. When matching an
/// *optional* phase, write the case out in full (e.g. `.some(.thinking)`).
enum VoiceResponsePhase {
  /// Collapsed / idle — no voice status is being shown.
  case none
  /// The commit→first-audio gap (model latency / reconnect).
  case thinking
  /// The reply is playing.
  case speaking
  /// Shown briefly when no reply arrived (hub error, or the turn-completion watchdog fired
  /// with no audio).
  case failed
}
@Published var voiceResponsePhase: VoiceResponsePhase = .none

/// Whether a voice turn currently has control of the bar: either push-to-talk is held
/// (listening) or the post-release status pill is visible. Sizing and collapse guards should
/// consult this single flag instead of combining `isVoiceListening` and `voiceResponsePhase`
/// themselves, which is how such checks drift apart.
var voiceOwnsBar: Bool {
  if isVoiceListening { return true }
  return voiceResponsePhase != .none
}

// Voice follow-up state (PTT while AI conversation is active)
@Published var isVoiceFollowUp: Bool = false
@Published var voiceFollowUpTranscript: String = ""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ struct FloatingControlBarView: View {

/// Whether the bar chrome should stretch to fill the window width
private var barNeedsFullWidth: Bool {
isHovering || state.showingAIConversation || state.isVoiceListening
isHovering || state.showingAIConversation || state.voiceOwnsBar
}

private var barChrome: some View {
Expand Down Expand Up @@ -285,6 +285,12 @@ struct FloatingControlBarView: View {
.padding(.vertical, 3)
.frame(height: 50)
.transition(.opacity)
} else if state.voiceResponsePhase != .none && !state.isVoiceFollowUp {
voiceResponseView
.padding(.horizontal, 6)
.padding(.vertical, 3)
.frame(height: 50)
.transition(.opacity)
} else if isHovering || state.showingAIConversation {
VStack(spacing: 1) {
compactButton(title: "Ask omi / Collapse", keys: shortcutSettings.askOmiShortcut.displayTokens) {
Expand Down Expand Up @@ -395,6 +401,52 @@ struct FloatingControlBarView: View {
}
}

/// Status pill shown once push-to-talk is released on a realtime-hub voice turn, so a slow or
/// failed reply never collapses to a blank bar. It moves through thinking → speaking, or shows
/// a short "no response" hint on failure.
/// Tapping the pill while it is thinking or speaking stops the reply (a push-to-talk tap also
/// interrupts). The speaking state is deliberately subtle: waveform plus speaker glyph, with no
/// transcript text.
private var voiceResponseView: some View {
  // Only an in-progress reply (thinking or speaking) can be stopped.
  let replyIsStoppable: Bool
  switch state.voiceResponsePhase {
  case .thinking, .speaking:
    replyIsStoppable = true
  case .failed, .none:
    replyIsStoppable = false
  }

  return HStack(spacing: 8) {
    // Animated while a reply is pending or playing; still once it has failed.
    VoiceWaveformBars(isActive: replyIsStoppable)

    switch state.voiceResponsePhase {
    case .thinking:
      Image(systemName: "ellipsis.bubble.fill")
        .scaledFont(size: 14, weight: .semibold)
        .foregroundColor(.white)
      Text("Thinking…")
        .scaledFont(size: 13)
        .foregroundColor(.white.opacity(0.7))
    case .speaking:
      // The waveform already signals playback, so no transcript is rendered here.
      Image(systemName: "speaker.wave.2.fill")
        .scaledFont(size: 13, weight: .semibold)
        .foregroundColor(.white.opacity(0.85))
        .symbolEffect(.variableColor.iterative, options: .repeating)
      // Faint hint that tapping stops the reply.
      Image(systemName: "stop.circle.fill")
        .scaledFont(size: 13, weight: .regular)
        .foregroundColor(.white.opacity(0.45))
    case .failed:
      Image(systemName: "exclamationmark.triangle.fill")
        .scaledFont(size: 14, weight: .semibold)
        .foregroundColor(.orange)
      Text("No response — hold \(shortcutSettings.pttShortcut.displayLabel) to retry")
        .scaledFont(size: 13)
        .foregroundColor(.white.opacity(0.7))
        .lineLimit(1)
    case .none:
      EmptyView()
    }
  }
  // Make the whole pill hit-testable, not just its glyphs and text.
  .contentShape(Rectangle())
  .onTapGesture {
    guard replyIsStoppable else { return }
    RealtimeHubController.shared.stopSpeaking()
  }
}

private var aiInputView: some View {
AskAIInputView(
userInput: Binding(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -619,7 +619,11 @@ class FloatingControlBarWindow: NSPanel, NSWindowDelegate {

/// Resize for hover expand/collapse — anchored from center so the circle grows outward.
func resizeForHover(expanded: Bool) {
guard !state.showingAIConversation, !state.isVoiceListening, !state.isShowingNotification, !suppressHoverResize else { return }
// Don't hover-resize while a voice turn owns the bar — listening OR the post-release
// thinking/speaking/failed status pill. Otherwise hovering fights the voice-phase
// sizing (bar jumps on hover-in, shrinks mid-reply on hover-out).
guard !state.showingAIConversation, !state.voiceOwnsBar,
!state.isShowingNotification, !suppressHoverResize else { return }
resizeWorkItem?.cancel()
resizeWorkItem = nil

Expand All @@ -628,7 +632,7 @@ class FloatingControlBarWindow: NSPanel, NSWindowDelegate {
let doResize: () -> Void = { [weak self] in
guard let self = self else { return }
guard !self.state.showingAIConversation,
!self.state.isVoiceListening,
!self.state.voiceOwnsBar,
!self.state.isShowingNotification,
!self.suppressHoverResize
else { return }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1055,7 +1055,12 @@ class PushToTalkManager: ObservableObject {
guard !skipResize && !barState.isVoiceFollowUp && !barState.showingAIConversation && !isOnboarding else { return }
if barState.isVoiceListening && !wasListening {
FloatingControlBarManager.shared.resizeForPTT(expanded: true)
} else if !barState.isVoiceListening && wasListening {
} else if !barState.isVoiceListening && wasListening
&& barState.voiceResponsePhase == .none
{
// Don't collapse on PTT-up when the hub is about to answer — the bar stays expanded
// through the thinking/speaking status (RealtimeHubController collapses it when the
// reply resolves). Without this guard the bar would blink shut then reopen.
FloatingControlBarManager.shared.resizeForPTT(expanded: false)
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import AVFoundation
import CoreGraphics
import Foundation
import SwiftUI

// MARK: - Realtime Hub Controller (Phase 1)
//
Expand Down Expand Up @@ -163,7 +164,7 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate {
}

private func startSession(provider: RealtimeHubProvider, auth: HubAuth) {
let instructions = RealtimeHubTools.systemInstruction(aboutUser: aboutUserCard)
let instructions = RealtimeHubTools.systemInstruction(aboutUser: aboutUserCard, provider: provider)
let s = RealtimeHubSession(provider: provider, auth: auth, instructions: instructions, delegate: self)
session = s
sessionProvider = provider
Expand Down Expand Up @@ -199,6 +200,10 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate {
responding = false
turnTranscript = ""
assistantText = ""
// A new turn supersedes any prior reply's status indicator (the listening UI takes over);
// bump the token so a queued `.failed` auto-dismiss from the last turn can't fire now.
voicePhaseToken &+= 1
barState?.voiceResponsePhase = .none
speculativeWarmDone = false
speculativeScreenshot = nil
audioReceivedThisTurn = false
Expand Down Expand Up @@ -245,6 +250,9 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate {
/// PTT-up: end the turn; the model now responds (and may call tools).
func commitTurn() {
responding = true
// Keep the bar alive after release: show "thinking" until the first audio arrives, so the
// commit→reply gap (model latency / reconnect / a slow turn) never looks like a dead bar.
setVoicePhase(.thinking)

@cubic-dev-ai cubic-dev-ai Bot Jun 21, 2026

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: commitTurn shows "thinking" even when no live session exists. After a mid-turn disconnect this can leave the voice status stuck indefinitely.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift, line 255:

<comment>`commitTurn` shows "thinking" even when no live session exists. After a mid-turn disconnect this can leave the voice status stuck indefinitely.</comment>

<file context>
@@ -245,6 +250,9 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate {
     responding = true
+    // Keep the bar alive after release: show "thinking" until the first audio arrives, so the
+    // commit→reply gap (model latency / reconnect / a slow turn) never looks like a dead bar.
+    setVoicePhase(.thinking)
     // (The screen frame is sent at turn START — see beginTurn — so it has time to
     // upload/decode before the model answers. Nothing to attach here.)
</file context>
Suggested change
setVoicePhase(.thinking)
if session != nil { setVoicePhase(.thinking) }
Fix with cubic

// (The screen frame is sent at turn START — see beginTurn — so it has time to
// upload/decode before the model answers. Nothing to attach here.)
session?.commitInputTurn()
Expand Down Expand Up @@ -293,6 +301,8 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate {

/// Session delegate callback for a chunk of reply audio: marks the turn as having produced
/// audio, advances the status pill from "thinking" to "speaking" on the first chunk, and hands
/// the chunk to the PCM player.
func hubDidReceiveAudio(_ pcm24k: Data) {
  audioReceivedThisTurn = true
  // Only a pill that is still "thinking" advances; any other phase is left untouched.
  let pillIsThinking = barState?.voiceResponsePhase == .thinking
  if pillIsThinking {
    setVoicePhase(.speaking)
  }
  // Native spoken audio (OpenAI + Gemini).
  pcmPlayer?.enqueue(pcm24k)
}

Expand Down Expand Up @@ -539,18 +549,26 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate {
turnRecorded = true
FloatingControlBarManager.shared.recordVoiceTurn(userText: heard, assistantText: reply)
}
// Resolve the status: a turn that produced audio just ends (collapse); a turn that
// finished with NO audio — e.g. the Gemini turn-completion watchdog fired on a dropped/
// empty reply — briefly tells the user it got nothing, instead of silently collapsing.
setVoicePhase(audioReceivedThisTurn ? .none : .failed)

@cubic-dev-ai cubic-dev-ai Bot Jun 21, 2026

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: Failure state is keyed only on native audio, so valid text-fallback replies are mislabeled as "no response." This produces incorrect UX during fallback turns.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift, line 555:

<comment>Failure state is keyed only on native audio, so valid text-fallback replies are mislabeled as "no response." This produces incorrect UX during fallback turns.</comment>

<file context>
@@ -539,18 +549,26 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate {
+    // Resolve the status: a turn that produced audio just ends (collapse); a turn that
+    // finished with NO audio — e.g. the Gemini turn-completion watchdog fired on a dropped/
+    // empty reply — briefly tells the user it got nothing, instead of silently collapsing.
+    setVoicePhase(audioReceivedThisTurn ? .none : .failed)
     exitVoiceUI()
   }
</file context>
Suggested change
setVoicePhase(audioReceivedThisTurn ? .none : .failed)
setVoicePhase((audioReceivedThisTurn || !reply.isEmpty) ? .none : .failed)
Fix with cubic

exitVoiceUI()
}

func hubDidError(_ message: String) {
// A socket we intentionally dropped is detached in teardownSession() before it's
// released, so its death-rattle never reaches us — only the live session's errors
// land here.
let wasInFlight = replyInFlight
responding = false
logError("RealtimeHub: session error — \(message)")
// The reply is dead — stop any buffered audio before collapsing.
pcmPlayer?.stop()
if speech.isSpeaking { speech.stopSpeaking(at: .immediate) }
// If a turn was mid-flight when the socket died, tell the user it failed (then collapse)
// rather than vanishing mid-reply. A benign idle-close between turns shows nothing.
if wasInFlight { setVoicePhase(.failed) }
exitVoiceUI()
let aliveFor = lastWarmAt.map { Date().timeIntervalSince($0) } ?? 0
teardownSession()
Expand Down Expand Up @@ -585,14 +603,81 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate {
barState.isVoiceFollowUp = false
// Collapse the bar ourselves in that case — guarded so we never shrink the bar out
// from under an open conversation, response, notification, hover, or onboarding.
guard wasExpandedForVoice,
!barState.showingAIConversation, !barState.showingAIResponse,
barState.currentNotification == nil, !barState.isHoveringBar,
UserDefaults.standard.bool(forKey: "hasCompletedOnboarding")
else { return }
guard wasExpandedForVoice, barIsFreeToCollapse(barState) else { return }
FloatingControlBarManager.shared.resizeForPTT(expanded: false)
}

/// User-initiated "stop talking": halt the current spoken reply immediately, without starting
/// a new turn. Stops local playback and tells the session to cancel/gate the rest of the reply
/// (OpenAI: response.cancel; Gemini: drops the pending-reply gate so further audio is ignored —
/// the warm socket + conversation context survive). Tapping push-to-talk also interrupts (that
/// starts a fresh turn); this is the no-new-turn way to just make it stop.
///
/// Does nothing when no reply is in flight (see `replyInFlight`).
func stopSpeaking() {
// Nothing to stop unless a reply is pending or playing.
guard replyInFlight else { return }
log("RealtimeHub[\(providerTag)]: stop — user halted the reply")
// Clear the in-flight flag before touching playback or the session.
responding = false
// Silence local output: the PCM player first, then the speech synthesizer if it is talking.
pcmPlayer?.stop()
if speech.isSpeaking { speech.stopSpeaking(at: .immediate) }
// Tell the session (if any) to cancel the reply and abandon the input turn.
session?.cancelActiveResponse()
session?.abandonInputTurn()
// Hide the status pill; `.none` also collapses the bar when nothing else needs it open.
setVoicePhase(.none)
}

// MARK: - Voice response status (so a slow/failed reply never leaves the user guessing)

/// Invalidates a pending `.failed` auto-dismiss when the phase changes again.
/// Bumped (with wrapping `&+=`) on every `setVoicePhase` call; the delayed dismiss captures the
/// value at scheduling time and only fires if it still matches.
private var voicePhaseToken = 0

/// Updates the floating bar's post-release voice status.
///
/// The hub answers with audio and no inline UI, so without this the bar would collapse on
/// PTT-up and leave the user unsure whether a reply is coming during the commit→first-audio
/// gap (model latency, a reconnect, or a dropped/late turn). The bar therefore shows
/// thinking → speaking → (failed) until the turn resolves.
///
/// - `.none` collapses the bar right away (if nothing else needs it open).
/// - `.failed` shows its hint for 2.5 s, then resolves to `.none` unless the phase has changed
///   in the meantime.
/// - `.thinking` / `.speaking` only update the pill.
///
/// Does nothing when there is no bar state to drive.
private func setVoicePhase(_ phase: FloatingControlBarState.VoiceResponsePhase) {
  guard let barState else { return }

  // Every phase change gets a new token, which invalidates any auto-dismiss scheduled earlier.
  voicePhaseToken &+= 1
  let issuedToken = voicePhaseToken

  withAnimation(.easeInOut(duration: 0.18)) {
    barState.voiceResponsePhase = phase
  }

  switch phase {
  case .thinking, .speaking:
    // The bar is already expanded from the listening turn (kept open in updateBarState).
    break
  case .none:
    collapseVoiceBarIfIdle()
  case .failed:
    DispatchQueue.main.asyncAfter(deadline: .now() + 2.5) { [weak self] in
      guard let self else { return }
      // Skip the dismiss if another phase change superseded this one.
      guard issuedToken == self.voicePhaseToken,
        self.barState?.voiceResponsePhase == .failed
      else { return }
      self.setVoicePhase(.none)
    }
  }
}

/// Shrinks the bar back down once the voice turn has fully resolved — it no longer owns the
/// bar — and nothing else needs the bar open.
private func collapseVoiceBarIfIdle() {
  guard let barState else { return }
  // A voice turn that is still listening or showing its status pill keeps the bar open.
  if barState.voiceOwnsBar { return }
  guard barIsFreeToCollapse(barState) else { return }
  FloatingControlBarManager.shared.resizeForPTT(expanded: false)
}

/// Common collapse guard. Returns `true` only when nothing else is using the bar — no open
/// conversation, no response, no notification, no hover — and onboarding has been completed,
/// meaning it is safe to shrink the bar.
private func barIsFreeToCollapse(_ barState: FloatingControlBarState) -> Bool {
  let barIsInUse =
    barState.showingAIConversation || barState.showingAIResponse
    || barState.currentNotification != nil || barState.isHoveringBar
  guard !barIsInUse else { return false }
  // The onboarding flag is consulted last, only when the bar is otherwise unclaimed.
  return UserDefaults.standard.bool(forKey: "hasCompletedOnboarding")
}

/// Whether a hub reply is currently in flight: either between commit and turn-done
/// (`responding`), or while the status pill shows thinking/speaking. Consulted for barge-in,
/// stop, and failure decisions.
private var replyInFlight: Bool {
  if responding { return true }
  // The phase is optional here because `barState` may be absent; spell out `.some` so the
  // enum's own `none` case is never confused with `Optional.none`.
  switch barState?.voiceResponsePhase {
  case .some(.thinking), .some(.speaking):
    return true
  default:
    return false
  }
}

// MARK: - Tools

/// ask_higher_model — reuse the EXISTING prompt-cached /v2/chat/completions
Expand Down
Loading
Loading