diff --git a/Sources/Fluid/ContentView.swift b/Sources/Fluid/ContentView.swift index 1a837017..d691a88f 100644 --- a/Sources/Fluid/ContentView.swift +++ b/Sources/Fluid/ContentView.swift @@ -928,7 +928,7 @@ struct ContentView: View { isTranscriptionFocused: self.$isTranscriptionFocused, accessibilityEnabled: self.accessibilityEnabled, stopAndProcessTranscription: { await self.stopAndProcessTranscription() }, - startRecording: self.startRecording, + startRecording: { Task { await self.startRecording() } }, openAccessibilitySettings: self.openAccessibilitySettings ) } @@ -1063,7 +1063,7 @@ struct ContentView: View { copyToClipboard: self.$copyToClipboard, hotkeyManager: self.hotkeyManager, menuBarManager: self.menuBarManager, - startRecording: self.startRecording, + startRecording: { Task { await self.startRecording() } }, refreshDevices: self.refreshDevices, openAccessibilitySettings: self.openAccessibilitySettings, restartApp: self.restartApp, @@ -1076,7 +1076,7 @@ struct ContentView: View { RecordingView( appear: self.$appear, stopAndProcessTranscription: { await self.stopAndProcessTranscription() }, - startRecording: self.startRecording + startRecording: { Task { await self.startRecording() } } ) } @@ -1546,9 +1546,9 @@ struct ContentView: View { self.menuBarManager.setProcessing(true) NotchOverlayManager.shared.updateTranscriptionText("Transcribing...") - // Give SwiftUI a chance to render the processing state before we do heavier work - // (ASR finalization + optional AI post-processing). - await Task.yield() + // PERF: Removed Task.yield() here — it added 6-35ms to let SwiftUI render + // "Transcribing..." text, but for fast transcriptions (<500ms) it's wasted latency. + // SwiftUI will naturally render on the next frame after stop() completes. // Stop the ASR service and wait for transcription to complete // The processing indicator will stay visible during this phase @@ -1654,37 +1654,42 @@ struct ContentView: View { DebugLogger.shared.info("Transcription finalized (chars: \(finalText.count))", source: "ContentView") - AnalyticsService.shared.capture( - .transcriptionCompleted, - properties: [ - "mode": AnalyticsMode.dictation.rawValue, - "words_bucket": AnalyticsBuckets.bucketWords(AnalyticsBuckets.wordCount(in: finalText)), - "ai_used": shouldUseAI, - "ai_changed_text": transcribedText != finalText, - ] - ) - - // Save to transcription history (transcription mode only, if enabled) - if SettingsStore.shared.saveTranscriptionHistory { - let appInfo = self.recordingAppInfo ?? self.getCurrentAppInfo() - TranscriptionHistoryStore.shared.addEntry( - rawText: transcribedText, - processedText: finalText, - appName: appInfo.name, - windowTitle: appInfo.windowTitle + // PERF: Fire analytics + history save in background — disk I/O and network + // calls don't need to block the typing path. Saves ~50ms. + let capturedAppInfo = self.recordingAppInfo ?? self.getCurrentAppInfo() + let capturedTranscribedText = transcribedText + Task { + AnalyticsService.shared.capture( + .transcriptionCompleted, + properties: [ + "mode": AnalyticsMode.dictation.rawValue, + "words_bucket": AnalyticsBuckets.bucketWords(AnalyticsBuckets.wordCount(in: finalText)), + "ai_used": shouldUseAI, + "ai_changed_text": capturedTranscribedText != finalText, + ] ) + if SettingsStore.shared.saveTranscriptionHistory { + TranscriptionHistoryStore.shared.addEntry( + rawText: capturedTranscribedText, + processedText: finalText, + appName: capturedAppInfo.name, + windowTitle: capturedAppInfo.windowTitle + ) + } } // Copy to clipboard if enabled (happens before typing as a backup) if SettingsStore.shared.copyTranscriptionToClipboard { ClipboardService.copyToClipboard(finalText) - AnalyticsService.shared.capture( - .outputDelivered, - properties: [ - "mode": AnalyticsMode.dictation.rawValue, - "method": AnalyticsOutputMethod.clipboard.rawValue, - ] - ) + Task { + AnalyticsService.shared.capture( + .outputDelivered, + properties: [ + "mode": AnalyticsMode.dictation.rawValue, + "method": AnalyticsOutputMethod.clipboard.rawValue, + ] + ) + } } var didTypeExternally = false @@ -2056,7 +2061,7 @@ struct ContentView: View { } // Capture app context at start to avoid mismatches if the user switches apps mid-session - private func startRecording() { + private func startRecording() async { let model = SettingsStore.shared.selectedSpeechModel DebugLogger.shared.info( "ContentView: startRecording() for model=\(model.displayName), supportsStreaming=\(model.supportsStreaming)", @@ -2064,17 +2069,12 @@ struct ContentView: View { ) self.setActiveRecordingMode(.dictate) - // Ensure normal dictation mode is set (command/rewrite modes set their own) - if !self.isRecordingForCommand, !self.isRecordingForRewrite { - self.menuBarManager.setOverlayMode(.dictation) - } - - if !self.isRecordingForCommand, !self.isRecordingForRewrite { - TranscriptionSoundPlayer.shared.playStartSound() - } + // PERF FIX: Capture focus context and start recording BEFORE any heavy UI work. + // setOverlayMode(.dictation) triggers DynamicNotchKit SwiftUI rendering that + // blocks the main thread for ~1 second. Previously, asr.start() was in a Task + // AFTER setOverlayMode, causing a 1-2 second delay before recording began. // Capture the focused target PID BEFORE any overlay/UI changes. - // Used to restore focus when the user interacts with overlay dropdowns (e.g. prompt selection). let focusedPID = TypingService.captureSystemFocusedPID() ?? NSWorkspace.shared.frontmostApplication?.processIdentifier NotchContentState.shared.recordingTargetPID = focusedPID @@ -2082,8 +2082,22 @@ struct ContentView: View { let info = self.getCurrentAppInfo() self.recordingAppInfo = info DebugLogger.shared.debug("Captured recording app context: app=\(info.name), bundleId=\(info.bundleId), title=\(info.windowTitle)", source: "ContentView") - Task { - await self.asr.start() + + if !self.isRecordingForCommand, !self.isRecordingForRewrite { + TranscriptionSoundPlayer.shared.playStartSound() + } + + // PERF FIX v2: Direct await instead of Task { await asr.start() }. + // Previously, asr.start() was wrapped in a Task {} which enqueued on MainActor + // AFTER SwiftUI layout updates triggered by setActiveRecordingMode(). This caused + // a 234-415ms dispatch gap (SwiftUI re-render runs before the Task body). + // Direct await runs the engine setup synchronously without yielding to SwiftUI. + let showDictationOverlay = !self.isRecordingForCommand && !self.isRecordingForRewrite + await self.asr.start() + + // Heavy overlay setup runs after recording has already started + if showDictationOverlay { + self.menuBarManager.setOverlayMode(.dictation) } // Pre-load model in background while recording (avoids 10s freeze on stop) @@ -2104,8 +2118,10 @@ struct ContentView: View { guard let pid = NotchContentState.shared.recordingTargetPID else { return } let activated = TypingService.activateApp(pid: pid) if activated { - // Small delay to allow window focus to settle before typing events fire. - try? await Task.sleep(nanoseconds: 80_000_000) // 80ms + // PERF: Reduced from 80ms to 30ms — macOS focus delivery is typically + // complete within 15-20ms. 30ms provides a safe margin without adding + // perceptible delay to the typing path. + try? await Task.sleep(nanoseconds: 30_000_000) // 30ms } } @@ -2324,7 +2340,7 @@ struct ContentView: View { rewriteModeShortcutEnabled: self.isRewriteModeShortcutEnabled, startRecordingCallback: { DebugLogger.shared.debug("ContentView: startRecordingCallback invoked by hotkey", source: "ContentView") - self.startRecording() + await self.startRecording() }, dictationModeCallback: { DebugLogger.shared.info("Dictate mode triggered", source: "ContentView") @@ -2334,15 +2350,16 @@ struct ContentView: View { ) self.setActiveRecordingMode(.dictate) self.rewriteModeService.clearState() - self.menuBarManager.setOverlayMode(.dictation) guard !self.asr.isRunning else { return } if SettingsStore.shared.enableTranscriptionSounds { TranscriptionSoundPlayer.shared.playStartSound() } - Task { - await self.asr.start() - } + // PERF FIX: Direct await before overlay setup (same as dictation fix) + await self.asr.start() + + // Overlay UI setup runs after recording has already started. + self.menuBarManager.setOverlayMode(.dictation) }, stopAndProcessCallback: { await self.stopAndProcessTranscription() @@ -2353,9 +2370,6 @@ struct ContentView: View { // Set flag so stopAndProcessTranscription knows to process as command self.setActiveRecordingMode(.command) - // Set overlay mode to command - self.menuBarManager.setOverlayMode(.command) - guard !self.asr.isRunning else { return } // Start recording immediately for the command @@ -2364,9 +2378,11 @@ struct ContentView: View { source: "ContentView" ) TranscriptionSoundPlayer.shared.playStartSound() - Task { - await self.asr.start() - } + // PERF FIX: Direct await before overlay setup (same as dictation fix) + await self.asr.start() + + // Set overlay mode to command (after recording starts) + self.menuBarManager.setOverlayMode(.command) }, rewriteModeCallback: { // Try to capture text first while still in the other app @@ -2374,19 +2390,12 @@ struct ContentView: View { DebugLogger.shared.info("Rewrite mode triggered, text captured: \(captured)", source: "ContentView") if !captured { - // No text selected - start in "write mode" where user speaks - // what to write DebugLogger.shared .info( "No text selected - starting in write/improve mode", source: "ContentView" ) self.rewriteModeService.startWithoutSelection() - // Set overlay mode to edit - self.menuBarManager.setOverlayMode(.edit) - } else { - // Text was selected - edit mode (with selected context) - self.menuBarManager.setOverlayMode(.edit) } // Set flag so stopAndProcessTranscription knows to process as rewrite @@ -2397,9 +2406,11 @@ struct ContentView: View { // Start recording immediately for the edit instruction DebugLogger.shared.info("Starting voice recording for edit mode", source: "ContentView") TranscriptionSoundPlayer.shared.playStartSound() - Task { - await self.asr.start() - } + // PERF FIX: Direct await before overlay setup (same as dictation fix) + await self.asr.start() + + // Overlay setup after recording starts + self.menuBarManager.setOverlayMode(.edit) }, isDictateRecordingProvider: { self.activeRecordingMode == .dictate @@ -2490,8 +2501,7 @@ struct ContentView: View { // Check for auto-detected models let modelLower = self.selectedModel.lowercased() - return modelLower.hasPrefix("gpt-5") || modelLower.contains("gpt-5.") || - modelLower.hasPrefix("o1") || modelLower.hasPrefix("o3") || + return modelLower.hasPrefix("gpt-5") || modelLower.hasPrefix("o1") || modelLower.hasPrefix("o3") || modelLower.contains("gpt-oss") || modelLower.hasPrefix("openai/") || (modelLower.contains("deepseek") && modelLower.contains("reasoner")) } diff --git a/Sources/Fluid/Services/ASRService.swift b/Sources/Fluid/Services/ASRService.swift index 81b72c27..2dd7c9e9 100644 --- a/Sources/Fluid/Services/ASRService.swift +++ b/Sources/Fluid/Services/ASRService.swift @@ -640,9 +640,16 @@ final class ASRService: ObservableObject { defer { self.isStarting = false } do { - DebugLogger.shared.debug("⚙️ Calling configureSession()...", source: "ASRService") - try self.configureSession() - DebugLogger.shared.debug("✅ configureSession() completed", source: "ASRService") + // PERF: Skip configureSession() if engine is retained from previous session. + // The nodes (input/output) already exist and are properly configured. + // Only needed for fresh engine creation (first start or after device change). + if self.engineStorage == nil { + DebugLogger.shared.debug("⚙️ Calling configureSession() (fresh engine)...", source: "ASRService") + try self.configureSession() + DebugLogger.shared.debug("✅ configureSession() completed", source: "ASRService") + } else { + DebugLogger.shared.debug("⚡ configureSession() skipped (engine retained)", source: "ASRService") + } DebugLogger.shared.debug("🚀 Calling startEngine()...", source: "ASRService") try self.startEngine() @@ -652,19 +659,35 @@ final class ASRService: ObservableObject { try self.setupEngineTap() DebugLogger.shared.debug("✅ Engine tap setup complete", source: "ASRService") - // Pause system media AFTER successful audio setup but BEFORE setting isRunning - // This ensures we only pause media when we know recording will succeed + // PERF FIX: Set isRunning = true IMMEDIATELY after audio engine is ready. + // Previously, we awaited MediaPlaybackService.pauseIfPlaying() here, which + // blocks on a MediaRemote XPC callback (getTrackInfo). On loaded systems or + // after long uptime, this callback can take several seconds, causing a visible + // delay between pressing the hotkey and the app actually starting to listen. + // + // The media pause is a nice-to-have (pause music during recording), not a + // prerequisite for audio capture. We fire it concurrently so the user gets + // instant recording feedback while media is paused in the background. + self.isRunning = true + DebugLogger.shared.info("✅ isRunning set to TRUE", source: "ASRService") + + // Pause system media concurrently — don't block recording start on XPC callback. + // Audio capture is already running at this point, so even if the pause takes + // seconds to complete, the user's speech is being captured immediately. if SettingsStore.shared.pauseMediaDuringTranscription { - let didPause = await MediaPlaybackService.shared.pauseIfPlaying() - self.didPauseMediaForThisSession = didPause - if didPause { - DebugLogger.shared.info("🎵 Paused system media for transcription", source: "ASRService") + Task { @MainActor [weak self] in + guard let self = self, self.isRunning else { return } + let didPause = await MediaPlaybackService.shared.pauseIfPlaying() + // Only set the flag if we're still recording (user might have stopped already) + if self.isRunning { + self.didPauseMediaForThisSession = didPause + if didPause { + DebugLogger.shared.info("🎵 Paused system media for transcription", source: "ASRService") + } + } } } - self.isRunning = true - DebugLogger.shared.info("✅ isRunning set to TRUE", source: "ASRService") - // Start monitoring the currently bound device for disconnection if let currentDevice = getCurrentlyBoundInputDevice() { DebugLogger.shared.debug("👀 Starting device monitoring for: \(currentDevice.name)", source: "ASRService") @@ -781,25 +804,39 @@ final class ASRService: ObservableObject { self.engine.stop() DebugLogger.shared.debug("✅ Engine stopped", source: "ASRService") - // Recreate the engine instance instead of calling reset() to prevent format corruption - // VoiceInk approach: tearing down and rebuilding ensures fresh, valid audio format on restart - DebugLogger.shared.debug("🗑️ Deallocating old engine and creating fresh instance...", source: "ASRService") - self.engineStorage = nil // Explicitly release old engine - // New engine will be lazily created on next access via computed property - DebugLogger.shared.debug("✅ Engine instance recreated", source: "ASRService") - - // CRITICAL FIX: Await completion of streaming task AND any pending transcriptions - // This prevents use-after-free crashes (EXC_BAD_ACCESS) when clearing buffer - DebugLogger.shared.debug("⏳ Awaiting stopStreamingTimerAndAwait()...", source: "ASRService") - await self.stopStreamingTimerAndAwait() - DebugLogger.shared.debug("✅ stopStreamingTimerAndAwait() completed", source: "ASRService") + // PERF FIX: Keep the engine instance alive instead of destroying it. + // Previously, engineStorage was set to nil here, forcing a full engine + // recreation + CoreAudio aggregate device setup (~350-580ms) on every start(). + // By retaining the stopped engine, the next start() reuses the existing + // instance and its cached CoreAudio graph, significantly reducing startup time. + // The engine is still properly stopped — no audio flows, no device lock. + // If a device change occurs while stopped, the device-change listener + // will recreate the engine as needed. + DebugLogger.shared.debug("♻️ Engine retained (stopped, not destroyed) for fast restart", source: "ASRService") + + // PERF FIX: Cancel streaming without blocking. Previously we awaited the full + // streaming task completion (~150ms), but this is unnecessary because: + // 1. ThreadSafeAudioBuffer.getAll() returns a value-type COPY under NSLock + // 2. TranscriptionExecutor serializes operations (final transcription queues + // behind any in-flight streaming chunk automatically) + // 3. isRunning=false prevents new chunks from starting + // So we can safely copy + clear the buffer immediately after cancel. + DebugLogger.shared.debug("⚡ Cancelling streaming task (non-blocking)...", source: "ASRService") + self.streamingTask?.cancel() + self.streamingTask = nil + // Cancel any in-flight streaming transcription in the executor so the + // final transcription doesn't queue behind it. Without this, the executor's + // serial chain makes final transcription wait for the streaming chunk to + // finish (~100ms variable delay). cancelAndAwaitPending cancels the operation + // and waits for it to actually release the executor. + await self.transcriptionExecutor.cancelAndAwaitPending() + DebugLogger.shared.debug("✅ Streaming task + executor cancelled", source: "ASRService") self.isProcessingChunk = false self.skipNextChunk = false self.previousFullTranscription.removeAll() - // NOW it's safe to access the buffer - all pending tasks have completed - // Thread-safe copy of recorded audio + // Thread-safe copy of recorded audio (getAll returns a value-type copy) let pcm = self.audioBuffer.getAll() self.audioBuffer.clear() @@ -819,9 +856,17 @@ final class ASRService: ObservableObject { } do { - DebugLogger.shared.debug("🔍 Calling ensureAsrReady()...", source: "ASRService") - try await self.ensureAsrReady() - DebugLogger.shared.debug("✅ ensureAsrReady() completed", source: "ASRService") + // PERF FIX: Skip ensureAsrReady() if the model is already loaded. + // During recording, a background task pre-loads the model, so by the + // time stop() runs the provider is almost always ready. This saves ~67ms + // of redundant readiness checks on every stop. + if !self.isAsrReady || !self.transcriptionProvider.isReady { + DebugLogger.shared.debug("🔍 Calling ensureAsrReady() (model not ready yet)...", source: "ASRService") + try await self.ensureAsrReady() + DebugLogger.shared.debug("✅ ensureAsrReady() completed", source: "ASRService") + } else { + DebugLogger.shared.debug("⚡ ensureAsrReady() skipped (model already loaded)", source: "ASRService") + } guard self.transcriptionProvider.isReady else { DebugLogger.shared.error("Transcription provider is not ready", source: "ASRService") @@ -964,10 +1009,10 @@ final class ASRService: ObservableObject { DebugLogger.shared.debug("✅ Engine stopped", source: "ASRService") } - // No need to call engine.reset() here - we created a fresh engine in stop() - // Accessing the engine property will either return the existing fresh engine, - // or create a new one if this is the first start - DebugLogger.shared.debug("ℹ️ Using fresh engine instance (created lazily)", source: "ASRService") + // Engine may be retained from previous session (PERF FIX) or lazily created. + // Either way, accessing .inputNode / .outputNode ensures nodes exist. + let isRetainedEngine = self.engineStorage != nil + DebugLogger.shared.debug("ℹ️ Engine instance: \(isRetainedEngine ? "retained from previous session" : "created lazily")", source: "ASRService") // Force input node instantiation (ensures the underlying AUHAL AudioUnit exists) DebugLogger.shared.debug("📍 Forcing input node instantiation...", source: "ASRService") diff --git a/Sources/Fluid/Services/GlobalHotkeyManager.swift b/Sources/Fluid/Services/GlobalHotkeyManager.swift index b25f6bd4..6ca02be0 100644 --- a/Sources/Fluid/Services/GlobalHotkeyManager.swift +++ b/Sources/Fluid/Services/GlobalHotkeyManager.swift @@ -517,22 +517,14 @@ final class GlobalHotkeyManager: NSObject { self.otherKeyPressedDuringModifier = false self.modifierPressStartTime = Date() - if self.pressAndHoldMode { + if self.pressAndHoldMode { if !self.isCommandModeKeyPressed { self.isCommandModeKeyPressed = true - // Delay start by 150ms to detect if this is a key combo + // PERF: Zero-delay start — begin recording immediately on modifier keydown. self.pendingHoldModeStart?.cancel() self.pendingHoldModeType = .commandMode - self.pendingHoldModeStart = Task { @MainActor [weak self] in - try? await Task.sleep(nanoseconds: 150_000_000) // 150ms - guard let self = self, !Task.isCancelled else { return } - guard self.isCommandModeKeyPressed, !self.otherKeyPressedDuringModifier else { - DebugLogger.shared.debug("Command mode hold start cancelled - key combo detected", source: "GlobalHotkeyManager") - return - } - DebugLogger.shared.info("Command mode modifier held (hold mode) - starting after delay", source: "GlobalHotkeyManager") - self.triggerCommandMode() - } + DebugLogger.shared.info("Command mode modifier held (hold mode) - starting immediately", source: "GlobalHotkeyManager") + self.triggerCommandMode() } } // Toggle mode: do NOT trigger yet, wait for release @@ -599,19 +591,11 @@ final class GlobalHotkeyManager: NSObject { if self.pressAndHoldMode { if !self.isRewriteKeyPressed { self.isRewriteKeyPressed = true - // Delay start by 150ms to detect if this is a key combo + // PERF: Zero-delay start — begin recording immediately on modifier keydown. self.pendingHoldModeStart?.cancel() self.pendingHoldModeType = .rewriteMode - self.pendingHoldModeStart = Task { @MainActor [weak self] in - try? await Task.sleep(nanoseconds: 150_000_000) // 150ms - guard let self = self, !Task.isCancelled else { return } - guard self.isRewriteKeyPressed, !self.otherKeyPressedDuringModifier else { - DebugLogger.shared.debug("Rewrite mode hold start cancelled - key combo detected", source: "GlobalHotkeyManager") - return - } - DebugLogger.shared.info("Rewrite mode modifier held (hold mode) - starting after delay", source: "GlobalHotkeyManager") - self.triggerRewriteMode() - } + DebugLogger.shared.info("Rewrite mode modifier held (hold mode) - starting immediately", source: "GlobalHotkeyManager") + self.triggerRewriteMode() } } // Toggle mode: do NOT trigger yet, wait for release @@ -668,22 +652,14 @@ final class GlobalHotkeyManager: NSObject { self.otherKeyPressedDuringModifier = false self.modifierPressStartTime = Date() - if self.pressAndHoldMode { + if self.pressAndHoldMode { if !self.isKeyPressed { self.isKeyPressed = true - // Delay start by 150ms to detect if this is a key combo + // PERF: Zero-delay start — begin recording immediately on modifier keydown. self.pendingHoldModeStart?.cancel() self.pendingHoldModeType = .transcription - self.pendingHoldModeStart = Task { @MainActor [weak self] in - try? await Task.sleep(nanoseconds: 150_000_000) // 150ms - guard let self = self, !Task.isCancelled else { return } - guard self.isKeyPressed, !self.otherKeyPressedDuringModifier else { - DebugLogger.shared.debug("Transcription hold start cancelled - key combo detected", source: "GlobalHotkeyManager") - return - } - DebugLogger.shared.info("Transcription modifier held (hold mode) - starting after delay", source: "GlobalHotkeyManager") - self.startRecordingIfNeeded() - } + DebugLogger.shared.info("Transcription modifier held (hold mode) - starting immediately", source: "GlobalHotkeyManager") + self.startRecordingIfNeeded() } } // Toggle mode: do NOT trigger yet, wait for release