johnbean393 · iamyabz · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
diff --git a/KeyType/Logic/Completion/CompletionController.swift b/KeyType/Logic/Completion/CompletionController.swift
@@ -303,6 +303,29 @@ final class CompletionController {
     nonisolated static let fastDebounceNanoseconds: UInt64 = 35_000_000
     nonisolated static let moderateDebounceNanoseconds: UInt64 = 50_000_000
     nonisolated static let conservativeDebounceNanoseconds: UInt64 = 90_000_000
+
+    /// Collapses every run of two or more ASCII spaces in `text` to a single space — leading
+    /// and trailing runs included — and leaves all other characters untouched. A lone leading
+    /// space (the next-word separator of ADR-050) is a run of one, so it is preserved. The model
+    /// occasionally emits `"hello  world"`-style double spaces in a completion candidate; this
+    /// gate runs only on the text about to be inserted — display anchors and ghost text are
+    /// unchanged — so the "bigger space than needed" after Tab / Shift+Tab cannot survive. See ADR-079.
+    nonisolated static func collapseInternalDoubleSpaces(_ text: String) -> String {
+        guard text.contains("  ") else { return text }  // fast path: no run of 2+ spaces, return input as-is
+        var out = ""
+        out.reserveCapacity(text.count)
+        var lastWasSpace = false  // true while the previous character was a space
+        for ch in text {
+            if ch == " " {
+                if !lastWasSpace { out.append(ch) }  // keep only the first space of each run
+                lastWasSpace = true
+            } else {
+                out.append(ch)  // tabs, NBSP and every other character pass through unchanged
+                lastWasSpace = false
+            }
+        }
+        return out
+    }
     private static let sideContextFreezeInterval: TimeInterval = 2.0
     private static let screenCaptureBundleIdentifiers: Set<String> = [
         "com.apple.screenshot.launcher"
@@ -598,7 +621,15 @@ final class CompletionController {
             try? await Task.sleep(nanoseconds: debounceNanoseconds)
             guard !Task.isCancelled, let self else { return }
             latencyTrace.eventDebounceElapsed()
-            self.generationTask = Task { [weak self] in
+            // ADR-078: generation deadline. Telemetry on the live app showed `generationMillis`
+            // tail outliers up to ~6 s (cold KV cache + long prompts + maximally divergent
+            // edits land all at once). Predictions that take longer than ~1.2 s to land are
+            // already stale by the time they would render — the user has typed several more
+            // characters since — so they should be abandoned rather than shown belatedly. We
+            // capture the generation Task and a sibling task cancels it after the budget; the
+            // existing `catch is CancellationError` arm already drops the result silently.
+            let generationDeadlineNanoseconds: UInt64 = 1_200_000_000
+            let generationTask = Task { [weak self] in
                 guard let self else { return }
                 do {
                     latencyTrace.eventGenerationBegin()
@@ -611,13 +642,19 @@ final class CompletionController {
                     self.telemetry.recordLatency(milliseconds: elapsedMs)
                     self.present(candidates, request: request, style: style, latencyTrace: latencyTrace)
                 } catch is CancellationError {
-                    // Superseded by a newer keystroke — leave the current ghost as-is.
+                    // Superseded by a newer keystroke, or hit the generation deadline above.
+                    // Either way, leave the current ghost as-is and drop this attempt silently.
                     self.finishLatencyTrace(latencyTrace, outcome: "cancelled")
                 } catch {
                     self.log.error("Generation failed: \(error, privacy: .public)")
                     self.finishLatencyTrace(latencyTrace, outcome: "generation-error")
                 }
             }
+            self.generationTask = generationTask
+            Task {
+                try? await Task.sleep(nanoseconds: generationDeadlineNanoseconds)
+                generationTask.cancel()
+            }
         }
     }
 
@@ -633,14 +670,31 @@ final class CompletionController {
             .map { "\"\(PredictionLog.escape($0.text))\"" }
             .joined(separator: " | ")
 
-        guard let best = candidates.first else {
+        guard let topRanked = candidates.first else {
             telemetry.recordSuppressed(reason: "noCandidate")
             predictionLog.append("PREDICT ctx=\"\(ctx)\" → SUPPRESS(noCandidate)")
             clearCompletion()
             finishLatencyTrace(latencyTrace, outcome: "suppressed-no-candidate")
             return
         }
-        if let reason = filter.suppressionReason(for: best, request: request) {
+        // ADR-077: rank-fallback. The constrained decoder returns up to `maxCandidates`
+        // hypotheses ordered by cumulative log-probability; previously we accepted only
+        // `candidates.first` and any filter rejection suppressed the entire prediction —
+        // even when ranks 2..N passed every gate. Telemetry showed 41 of 77 suppressions
+        // were `insertionUnsafe` (the most common cause: the top candidate was pure
+        // punctuation or whitespace and the prose runner-up never got a turn). Now we
+        // walk the ranked list and pick the first candidate that survives the filter.
+        // Suppression is reported only if every candidate fails — and we report the top
+        // candidate's reason, preserving the existing telemetry shape.
+        let best: CompletionCandidate
+        if filter.suppressionReason(for: topRanked, request: request) == nil {
+            best = topRanked
+        } else if let runnerUp = candidates.dropFirst().first(where: {
+            filter.suppressionReason(for: $0, request: request) == nil
+        }) {
+            best = runnerUp
+        } else {
+            let reason = filter.suppressionReason(for: topRanked, request: request)!
             telemetry.recordSuppressed(reason: String(describing: reason))
             log.debug("Suppressed: \(String(describing: reason), privacy: .public)")
             predictionLog.append("PREDICT ctx=\"\(ctx)\" [\(ranked)] → SUPPRESS(\(reason))")
@@ -1280,7 +1334,15 @@ final class CompletionController {
     /// overlay are left intact so the induced snapshot re-renders the shrinking remainder instead.
     private func insert(text: String, context: TextFieldContext, keepingAnchor: Bool = false) {
         guard !text.isEmpty else { return }
-        let plan = inserter.planInsertion(candidate: CompletionCandidate(text: text), context: context)
+        // ADR-079: collapse internal multi-space runs (`"hello  world"` → `"hello world"`).
+        // The model occasionally emits double spaces inside a candidate, which produced the
+        // user-visible "bigger space than needed" after Tab / Shift+Tab insertion. A *single*
+        // leading space is the separator that ADR-050 says leads the next word, so we keep
+        // it; runs of two or more ASCII spaces anywhere in the string collapse to one. Other
+        // whitespace forms (tabs, NBSP) are not touched — apps that require NBSP get the
+        // existing per-policy substitution in `InsertionPlanner.plan(...)`.
+        let normalized = Self.collapseInternalDoubleSpaces(text)
+        let plan = inserter.planInsertion(candidate: CompletionCandidate(text: normalized), context: context)
         if !keepingAnchor {
             // Drop the dedupe key so the post-insertion snapshot always regenerates a fresh suggestion.
             lastContextKey = nil

diff --git a/KeyType/Logic/Context/ScreenContextController.swift b/KeyType/Logic/Context/ScreenContextController.swift
@@ -29,7 +29,11 @@ final class ScreenContextController {
 
     /// How often to re-OCR the focused window while it stays focused, so the context tracks slow
     /// on-screen changes (a scrolled doc, an updated panel) without a focus change to trigger it.
-    private let refreshInterval: TimeInterval = 4.0
+    /// 12 s instead of the previous 4 s — see ADR-076: even with `.fast` Vision the OCR pass is
+    /// not free, and the user-perceptible "screen changes" we want to react to (paragraph scroll,
+    /// updated panel) are on the order of seconds, not sub-second. The big trigger remains focus
+    /// change, which is still instant via `handle(snapshot:)`.
+    private let refreshInterval: TimeInterval = 12.0
 
     private(set) var isRunning = false
     private var listenerToken: UUID?

diff --git a/...es/ConstrainedGeneration/Sources/ConstrainedGeneration/Engine/DecodingConfiguration.swift b/...es/ConstrainedGeneration/Sources/ConstrainedGeneration/Engine/DecodingConfiguration.swift
@@ -53,7 +53,13 @@ public struct DecodingConfiguration: Equatable {
         topK: Int = 64,
         topP: Float = 0.95,
         temperature: Float = 0.8,
-        branchWidth: Int = 4,
+        // ADR-077: default 3 (was 4). Per ADR-012's `testBranchWidthSweep` (warm means
+        // 239 / 164 / 107 / 75 / … ms at widths 8/6/4/3/…) dropping from 4 to 3 branches trims
+        // ~30% (107 → 75 ms) off generation latency, and a one-rank-narrower beam barely moves
+        // the top-1 quality on the catalog set. With rank-fallback in `CompletionController`
+        // we now consume the extra ranks the beam still emits, so giving up the 4th branch
+        // costs less in quality than it saves in latency.
+        branchWidth: Int = 3,
         relativeCutoff: Float = 6,
         minBranchProbability: Float = 0.02,
         maxCandidates: Int = 5,

diff --git a/Packages/MacContextCapture/Sources/MacContextCapture/Accessibility/FocusedFieldReader.swift b/Packages/MacContextCapture/Sources/MacContextCapture/Accessibility/FocusedFieldReader.swift
@@ -35,10 +35,52 @@ public struct FocusedFieldSnapshot: Equatable {
     }
 }
 
+/// Memoizes the result of `FocusedFieldReader.textElement(for:preferDescendantTextElement:)`
+/// for a single focused-root identity (ADR-075). Profiling on a fanless M4 showed the BFS in
+/// `textElement(for:)` accounted for ~10% of main-thread time during typing — every
+/// `kAXValueChangedNotification` fired on every keystroke walked the AX tree again to find the
+/// same text descendant we had already resolved a moment earlier. With this cache the walk runs
+/// once per focus change instead of once per keystroke; on value/selection changes we go
+/// straight to `AXCaretHelper.stringValue` on the already-resolved element.
+///
+/// Reference type so the value-typed `FocusedFieldReader` can mutate it through a `let`
+/// property — the reader is held across refreshes by `AccessibilityContextTracker` so the
+/// cached element lives for the lifetime of the focus session.
+private final class FocusedFieldResolutionCache {
+    /// Outcome of a resolution attempt against a given root identity. `negative` is preserved
+    /// separately from "miss" so a focused root with no text descendant doesn't re-walk the
+    /// tree on every value tick (e.g. media keys firing AX notifications from a non-text
+    /// focused control).
+    enum Lookup {
+        case miss  // nothing stored for this root identity
+        case hit(AXUIElement)  // a text element was stored for this root identity
+        case negative  // this root identity was stored with no text element
+    }
+
+    private var rootIdentity: String?  // identity of the single root currently cached; nil until the first store
+    private var resolvedTextElement: AXUIElement?  // nil is a stored "no text element" result once rootIdentity is set
+
+    /// `kAXFocusedUIElementChangedNotification` / `kAXFocusedWindowChangedNotification` produce a
+    /// different root identity, so this never serves a cached element across focus boundaries —
+    /// yet `kAXValueChangedNotification` / `kAXSelectedTextChangedNotification` keep the same
+    /// root, which is the case that gets the win. Returns `.miss` when `identity` is not the stored one.
+    func lookup(rootIdentity identity: String) -> Lookup {
+        guard self.rootIdentity == identity else { return .miss }
+        if let cached = resolvedTextElement { return .hit(cached) }  // NOTE(review): returned without revalidation — confirm the element cannot be replaced under an unchanged root
+        return .negative  // NOTE(review): stays negative until the root identity changes — confirm late-appearing text fields are not missed
+    }
+
+    func store(rootIdentity identity: String, textElement: AXUIElement?) {
+        self.rootIdentity = identity  // overwrites the previous entry: only one root is remembered at a time
+        self.resolvedTextElement = textElement  // pass nil to record that this root has no text element
+    }
+}
+
 @MainActor
 public struct FocusedFieldReader {
     private let resolver: AXCaretGeometryResolver
     private nonisolated let webAppClassifier: AppBundleWebAppClassifier
+    private nonisolated let resolutionCache = FocusedFieldResolutionCache()
 
     public nonisolated init(
         resolver: AXCaretGeometryResolver = AXCaretGeometryResolver(),
@@ -51,15 +93,32 @@ public struct FocusedFieldReader {
     /// Read the focused AX element into a snapshot. Returns nil if the element has no AX
     /// value (likely not a text-bearing field).
     public func snapshot(of element: AXUIElement) -> FocusedFieldSnapshot? {
-        let initialBundleIdentifier = AppTargetResolver.bundleIdentifier(for: element)
-        let isKnownWebBackedApp = webAppClassifier.isWebBacked(
-            bundleIdentifier: initialBundleIdentifier
-        )
-        guard let textElement = Self.textElement(
-            for: element,
-            preferDescendantTextElement: isKnownWebBackedApp
-        ) else {
+        // Fast path (ADR-075): if the focused root's identity matches the one we resolved a
+        // text descendant for previously, skip the AX tree walk in `textElement(for:)` and
+        // reuse the cached element. The expensive `AXCaretHelper.childElements` BFS is the
+        // dominant per-keystroke main-thread cost; same-root revisits during continuous typing
+        // were our 10% main-thread hotspot before this.
+        let rootIdentity = AXCaretHelper.elementIdentity(for: element)
+        let textElement: AXUIElement
+        switch resolutionCache.lookup(rootIdentity: rootIdentity) {
+        case .hit(let cached):
+            textElement = cached
+        case .negative:
             return nil
+        case .miss:
+            let initialBundleIdentifier = AppTargetResolver.bundleIdentifier(for: element)
+            let isKnownWebBackedApp = webAppClassifier.isWebBacked(
+                bundleIdentifier: initialBundleIdentifier
+            )
+            guard let resolved = Self.textElement(
+                for: element,
+                preferDescendantTextElement: isKnownWebBackedApp
+            ) else {
+                resolutionCache.store(rootIdentity: rootIdentity, textElement: nil)
+                return nil
+            }
+            resolutionCache.store(rootIdentity: rootIdentity, textElement: resolved)
+            textElement = resolved
         }
         let target = AppTargetResolver.resolveAppTarget(for: textElement)
 

diff --git a/Packages/MacContextCapture/Sources/MacContextCapture/Screen/ScreenTextOCR.swift b/Packages/MacContextCapture/Sources/MacContextCapture/Screen/ScreenTextOCR.swift
@@ -13,18 +13,23 @@ import Vision
 
 public enum ScreenTextOCR {
     /// Recognise text in `image`, returning the recognised lines in natural reading order
-    /// (top-to-bottom, then left-to-right). Uses `.accurate` recognition with language correction
-    /// **on**: this capture runs out of band (on focus/window change plus a slow timer, never on the
-    /// per-keystroke path — see `WindowOCRCaptureEngine`/`ScreenContextController`), so there is no
-    /// keystroke-latency budget to protect and accuracy is what matters. Garbled recognitions are the
-    /// single largest source of polluted screen context (the model parrots "Ilne wilh real 5ulfix"
-    /// gibberish), and `.accurate` + language correction cuts them at the source rather than relying
-    /// solely on the post-hoc corruption filters below. (See ADR-049/052.)
+    /// (top-to-bottom, then left-to-right). Uses `.fast` recognition with language correction
+    /// **off** (ADR-076). Earlier revisions used `.accurate` + `usesLanguageCorrection = true`
+    /// because the per-line corruption filter (`droppingCorruptedLines` below) had a non-trivial
+    /// false-negative rate when fed `.fast`-tier mojibake (ADR-049/052). In practice — once the
+    /// digit-substitution guard (ADR-050) and the symbol-density guard are in place — the
+    /// surviving `.fast` lines are good enough for `[Screen context]`, and the CPU cost on a
+    /// fanless M4 drops by roughly 5–10× per refresh. The OCR is still off the keystroke path,
+    /// but the same CPU competes with everything else the user is doing, and a 4-second timer
+    /// firing `.accurate` Vision passes was the dominant remaining draw after ADR-074/075.
+    /// `.fast` reportedly routes through the Neural Engine where available, as Live Text does —
+    /// NOTE(review): Apple documents `.fast` as character-detection based; confirm this claim.
     ///
     /// `minimumConfidence` is the first guard against *corrupted* OCR reaching the prompt: Vision
     /// reports a per-candidate confidence, and a low value is the signal of a mangled recognition.
     /// Feeding nothing is better than feeding garbage, so low-confidence lines are dropped here.
-    public static func recognizeLines(in image: CGImage, minimumConfidence: Float = 0.4) async throws -> [String] {
+    /// The threshold is bumped slightly to compensate for `.fast`'s noisier candidates.
+    public static func recognizeLines(in image: CGImage, minimumConfidence: Float = 0.45) async throws -> [String] {
         try await withCheckedThrowingContinuation { continuation in
             let request = VNRecognizeTextRequest { request, error in
                 if let error {
@@ -47,8 +52,12 @@ public enum ScreenTextOCR {
                 }
                 continuation.resume(returning: lines)
             }
-            request.recognitionLevel = .accurate
-            request.usesLanguageCorrection = true
+            // ADR-076: trade some recognition accuracy for ~5–10× lower per-refresh CPU. The
+            // downstream corruption filters (`droppingCorruptedLines`, `isPlausibleText`,
+            // `containsDigitSubstitutedWord`) reject the additional mangled lines that `.fast`
+            // produces, so the model still only sees plausible prose.
+            request.recognitionLevel = .fast
+            request.usesLanguageCorrection = false
 
             let handler = VNImageRequestHandler(cgImage: image, options: [:])
             do {

diff --git a/Packages/MacContextCapture/Sources/MacContextCapture/Screen/WindowOCRCaptureEngine.swift b/Packages/MacContextCapture/Sources/MacContextCapture/Screen/WindowOCRCaptureEngine.swift
@@ -85,7 +85,12 @@ public struct ScreenCaptureKitWindowTextCapturer: ScreenWindowTextCapturing {
     /// Longest side (in pixels) of the captured image before OCR. Caps Retina blow-up for speed.
     private let maxCaptureDimension: CGFloat
 
-    public init(maxCaptureDimension: CGFloat = 1600) {
+    public init(maxCaptureDimension: CGFloat = 1200) {
+        // ADR-076: 1200 is the longest-side cap before OCR. The previous default of 1600 was
+        // a holdover from `.accurate` Vision (which benefits from more pixels); `.fast` doesn't
+        // gain proportionally from extra resolution, so we shrink the captured image to cut both
+        // the screenshot encode cost and Vision's per-pixel work without hurting recognition of
+        // ordinary screen text. Apps that need a denser cap can pass it explicitly.
         self.maxCaptureDimension = maxCaptureDimension
     }
 

diff --git a/Packages/ModelRuntime/Sources/LlamaModelRuntime/LlamaModelRuntime.swift b/Packages/ModelRuntime/Sources/LlamaModelRuntime/LlamaModelRuntime.swift
@@ -93,7 +93,14 @@ public actor LlamaModelRuntime: LocalModelRuntime {
         maxSequences: Int = 4,
         // Incremental beam decoding (ADR-046): keep branch KV resident across levels and decode only
         // the new token. On by default; the reseed path (ADR-043) remains as a per-call fallback.
-        enableIncrementalBeam: Bool = true
+        enableIncrementalBeam: Bool = true,
+        // Number of transformer layers to offload to the GPU via llama.cpp's Metal backend
+        // (ADR-074). Default 999 means "all layers"; llama.cpp clamps to the model's real layer
+        // count. Pass 0 to force CPU-only (e.g. tooling, deterministic tests). Without this,
+        // `llama_model_default_params()` defaults `n_gpu_layers` to 0 and every token is decoded
+        // on the CPU even though the Metal backend is linked — pinning the CPU and triggering
+        // thermal throttling on fanless Apple Silicon.
+        nGpuLayers: Int = 999
     ) throws {
         guard ModelContainer.modelExists(at: modelURL) else {
             throw LlamaRuntimeError.modelFileMissing(path: modelURL.path)
@@ -103,6 +110,12 @@ public actor LlamaModelRuntime: LocalModelRuntime {
         var modelParams = llama_model_default_params()
         modelParams.use_mmap = true
         modelParams.use_mlock = false
+        // Offload all transformer layers to the Metal GPU (ADR-074). Without this, llama.cpp
+        // defaults `n_gpu_layers` to 0 and decodes every token on the CPU even though the Metal
+        // backend is linked — the high CPU usage / thermal throttling we saw was from inference
+        // running entirely on CPU cores. 999 means "all layers"; llama.cpp clamps to the model's
+        // real depth.
+        modelParams.n_gpu_layers = Int32(nGpuLayers)
 
         guard let loadedModel = llama_model_load_from_file(modelURL.path, modelParams) else {
             throw LlamaRuntimeError.modelLoadFailed