diff --git a/KeyType/Logic/Completion/CompletionController.swift b/KeyType/Logic/Completion/CompletionController.swift index 927ac2e..23183eb 100644 --- a/KeyType/Logic/Completion/CompletionController.swift +++ b/KeyType/Logic/Completion/CompletionController.swift @@ -303,6 +303,29 @@ final class CompletionController { nonisolated static let fastDebounceNanoseconds: UInt64 = 35_000_000 nonisolated static let moderateDebounceNanoseconds: UInt64 = 50_000_000 nonisolated static let conservativeDebounceNanoseconds: UInt64 = 90_000_000 + + /// Collapses runs of two or more ASCII spaces to one, leaving every other character + /// (including the leading single space that the next-word separator in ADR-050 carries) + /// untouched. The model occasionally emits `"hello world"`-style internal double spaces + /// in a completion candidate; this gate runs only on the bytes we're about to insert + /// — display anchors and ghost text are unchanged — so the user-visible "bigger space + /// than needed" after Tab / Shift+Tab cannot survive insertion. See ADR-079. + nonisolated static func collapseInternalDoubleSpaces(_ text: String) -> String { + guard text.contains(" ") else { return text } + var out = "" + out.reserveCapacity(text.count) + var lastWasSpace = false + for ch in text { + if ch == " " { + if !lastWasSpace { out.append(ch) } + lastWasSpace = true + } else { + out.append(ch) + lastWasSpace = false + } + } + return out + } private static let sideContextFreezeInterval: TimeInterval = 2.0 private static let screenCaptureBundleIdentifiers: Set = [ "com.apple.screenshot.launcher" @@ -598,7 +621,15 @@ final class CompletionController { try? await Task.sleep(nanoseconds: debounceNanoseconds) guard !Task.isCancelled, let self else { return } latencyTrace.eventDebounceElapsed() - self.generationTask = Task { [weak self] in + // ADR-078: generation deadline. 
Telemetry on the live app showed `generationMillis` + // tail outliers up to ~6 s (cold KV cache + long prompts + maximally divergent + // edits land all at once). Predictions that take longer than ~1.2 s to land are + // already stale by the time they would render — the user has typed several more + // characters since — so they should be abandoned rather than shown belatedly. We + // capture the generation Task and a sibling task cancels it after the budget; the + // existing `catch is CancellationError` arm already drops the result silently. + let generationDeadlineNanoseconds: UInt64 = 1_200_000_000 + let generationTask = Task { [weak self] in guard let self else { return } do { latencyTrace.eventGenerationBegin() @@ -611,13 +642,19 @@ final class CompletionController { self.telemetry.recordLatency(milliseconds: elapsedMs) self.present(candidates, request: request, style: style, latencyTrace: latencyTrace) } catch is CancellationError { - // Superseded by a newer keystroke — leave the current ghost as-is. + // Superseded by a newer keystroke, or hit the generation deadline above. + // Either way, leave the current ghost as-is and drop this attempt silently. self.finishLatencyTrace(latencyTrace, outcome: "cancelled") } catch { self.log.error("Generation failed: \(error, privacy: .public)") self.finishLatencyTrace(latencyTrace, outcome: "generation-error") } } + self.generationTask = generationTask + Task { + try? 
await Task.sleep(nanoseconds: generationDeadlineNanoseconds) + generationTask.cancel() + } } } @@ -633,14 +670,31 @@ final class CompletionController { .map { "\"\(PredictionLog.escape($0.text))\"" } .joined(separator: " | ") - guard let best = candidates.first else { + guard let topRanked = candidates.first else { telemetry.recordSuppressed(reason: "noCandidate") predictionLog.append("PREDICT ctx=\"\(ctx)\" → SUPPRESS(noCandidate)") clearCompletion() finishLatencyTrace(latencyTrace, outcome: "suppressed-no-candidate") return } - if let reason = filter.suppressionReason(for: best, request: request) { + // ADR-077: rank-fallback. The constrained decoder returns up to `maxCandidates` + // hypotheses ordered by cumulative log-probability; previously we accepted only + // `candidates.first` and any filter rejection suppressed the entire prediction — + // even when ranks 2..N passed every gate. Telemetry showed 41 of 77 suppressions + // were `insertionUnsafe` (the most common cause: the top candidate was pure + // punctuation or whitespace and the prose runner-up never got a turn). Now we + // walk the ranked list and pick the first candidate that survives the filter. + // Suppression is reported only if every candidate fails — and we report the top + // candidate's reason, preserving the existing telemetry shape. + let best: CompletionCandidate + if filter.suppressionReason(for: topRanked, request: request) == nil { + best = topRanked + } else if let runnerUp = candidates.dropFirst().first(where: { + filter.suppressionReason(for: $0, request: request) == nil + }) { + best = runnerUp + } else { + let reason = filter.suppressionReason(for: topRanked, request: request)! 
telemetry.recordSuppressed(reason: String(describing: reason)) log.debug("Suppressed: \(String(describing: reason), privacy: .public)") predictionLog.append("PREDICT ctx=\"\(ctx)\" [\(ranked)] → SUPPRESS(\(reason))") @@ -1280,7 +1334,15 @@ final class CompletionController { /// overlay are left intact so the induced snapshot re-renders the shrinking remainder instead. private func insert(text: String, context: TextFieldContext, keepingAnchor: Bool = false) { guard !text.isEmpty else { return } - let plan = inserter.planInsertion(candidate: CompletionCandidate(text: text), context: context) + // ADR-079: collapse internal multi-space runs (`"hello  world"` → `"hello world"`). + // The model occasionally emits double spaces inside a candidate, which produced the + // user-visible "bigger space than needed" after Tab / Shift+Tab insertion. A *single* + // leading space is the separator that ADR-050 says leads the next word, so we keep + // it; runs of two or more ASCII spaces anywhere in the string collapse to one. Other + // whitespace forms (tabs, NBSP) are not touched — apps that require NBSP get the + // existing per-policy substitution in `InsertionPlanner.plan(...)`. + let normalized = Self.collapseInternalDoubleSpaces(text) + let plan = inserter.planInsertion(candidate: CompletionCandidate(text: normalized), context: context) if !keepingAnchor { // Drop the dedupe key so the post-insertion snapshot always regenerates a fresh suggestion. lastContextKey = nil diff --git a/KeyType/Logic/Context/ScreenContextController.swift b/KeyType/Logic/Context/ScreenContextController.swift index 55c5c5c..ddce72d 100644 --- a/KeyType/Logic/Context/ScreenContextController.swift +++ b/KeyType/Logic/Context/ScreenContextController.swift @@ -29,7 +29,11 @@ final class ScreenContextController { /// How often to re-OCR the focused window while it stays focused, so the context tracks slow /// on-screen changes (a scrolled doc, an updated panel) without a focus change to trigger it.
- private let refreshInterval: TimeInterval = 4.0 + /// 12 s instead of the previous 4 s — see ADR-076: even with `.fast` Vision the OCR pass is + /// not free, and the user-perceptible "screen changes" we want to react to (paragraph scroll, + /// updated panel) are on the order of seconds, not sub-second. The big trigger remains focus + /// change, which is still instant via `handle(snapshot:)`. + private let refreshInterval: TimeInterval = 12.0 private(set) var isRunning = false private var listenerToken: UUID? diff --git a/Packages/ConstrainedGeneration/Sources/ConstrainedGeneration/Engine/DecodingConfiguration.swift b/Packages/ConstrainedGeneration/Sources/ConstrainedGeneration/Engine/DecodingConfiguration.swift index 63baee7..dc5db21 100644 --- a/Packages/ConstrainedGeneration/Sources/ConstrainedGeneration/Engine/DecodingConfiguration.swift +++ b/Packages/ConstrainedGeneration/Sources/ConstrainedGeneration/Engine/DecodingConfiguration.swift @@ -53,7 +53,13 @@ public struct DecodingConfiguration: Equatable { topK: Int = 64, topP: Float = 0.95, temperature: Float = 0.8, - branchWidth: Int = 4, + // ADR-077: default 3 (was 4). Per ADR-012's `testBranchWidthSweep` (warm means + // 239 / 164 / 107 / 75 / … ms at widths 8/6/4/3/…) dropping a single branch trims + // ~25% off generation latency and a one-rank-narrower beam barely moves the + // top-1 quality on the catalog set. With rank-fallback in `CompletionController` + // we now consume the extra ranks the beam still emits, so giving up the 4th branch + // costs less than the latency it cost to keep producing it. 
+ branchWidth: Int = 3, relativeCutoff: Float = 6, minBranchProbability: Float = 0.02, maxCandidates: Int = 5, diff --git a/Packages/MacContextCapture/Sources/MacContextCapture/Accessibility/FocusedFieldReader.swift b/Packages/MacContextCapture/Sources/MacContextCapture/Accessibility/FocusedFieldReader.swift index 2642ecb..c18e8cf 100644 --- a/Packages/MacContextCapture/Sources/MacContextCapture/Accessibility/FocusedFieldReader.swift +++ b/Packages/MacContextCapture/Sources/MacContextCapture/Accessibility/FocusedFieldReader.swift @@ -35,10 +35,52 @@ public struct FocusedFieldSnapshot: Equatable { } } +/// Memoizes the result of `FocusedFieldReader.textElement(for:preferDescendantTextElement:)` +/// for a single focused-root identity (ADR-075). Profiling on a fanless M4 showed the BFS in +/// `textElement(for:)` accounted for ~10% of main-thread time during typing — every +/// `kAXValueChangedNotification` fired on every keystroke walked the AX tree again to find the +/// same text descendant we had already resolved a moment earlier. With this cache the walk runs +/// once per focus change instead of once per keystroke; on value/selection changes we go +/// straight to `AXCaretHelper.stringValue` on the already-resolved element. +/// +/// Reference type so the value-typed `FocusedFieldReader` can mutate it through a `let` +/// property — the reader is held across refreshes by `AccessibilityContextTracker` so the +/// cached element lives for the lifetime of the focus session. +private final class FocusedFieldResolutionCache { + /// Outcome of a resolution attempt against a given root identity. `negative` is preserved + /// separately from "miss" so a focused root with no text descendant doesn't re-walk the + /// tree on every value tick (e.g. media keys firing AX notifications from a non-text + /// focused control). + enum Lookup { + case miss + case hit(AXUIElement) + case negative + } + + private var rootIdentity: String? 
+ private var resolvedTextElement: AXUIElement? + + /// `kAXFocusedUIElementChanged` / `kAXFocusedWindowChanged` produce a different root + /// identity, so this never serves a stale element across focus boundaries — yet + /// `kAXValueChanged` / `kAXSelectedTextChanged` keep the same root, which is the case that + /// gets the win. + func lookup(rootIdentity identity: String) -> Lookup { + guard self.rootIdentity == identity else { return .miss } + if let cached = resolvedTextElement { return .hit(cached) } + return .negative + } + + func store(rootIdentity identity: String, textElement: AXUIElement?) { + self.rootIdentity = identity + self.resolvedTextElement = textElement + } +} + @MainActor public struct FocusedFieldReader { private let resolver: AXCaretGeometryResolver private nonisolated let webAppClassifier: AppBundleWebAppClassifier + private nonisolated let resolutionCache = FocusedFieldResolutionCache() public nonisolated init( resolver: AXCaretGeometryResolver = AXCaretGeometryResolver(), @@ -51,15 +93,32 @@ public struct FocusedFieldReader { /// Read the focused AX element into a snapshot. Returns nil if the element has no AX /// value (likely not a text-bearing field). public func snapshot(of element: AXUIElement) -> FocusedFieldSnapshot? { - let initialBundleIdentifier = AppTargetResolver.bundleIdentifier(for: element) - let isKnownWebBackedApp = webAppClassifier.isWebBacked( - bundleIdentifier: initialBundleIdentifier - ) - guard let textElement = Self.textElement( - for: element, - preferDescendantTextElement: isKnownWebBackedApp - ) else { + // Fast path (ADR-075): if the focused root's identity matches the one we resolved a + // text descendant for previously, skip the AX tree walk in `textElement(for:)` and + // reuse the cached element. The expensive `AXCaretHelper.childElements` BFS is the + // dominant per-keystroke main-thread cost; same-root revisits during continuous typing + // were our 10% main-thread hotspot before this. 
+ let rootIdentity = AXCaretHelper.elementIdentity(for: element) + let textElement: AXUIElement + switch resolutionCache.lookup(rootIdentity: rootIdentity) { + case .hit(let cached): + textElement = cached + case .negative: return nil + case .miss: + let initialBundleIdentifier = AppTargetResolver.bundleIdentifier(for: element) + let isKnownWebBackedApp = webAppClassifier.isWebBacked( + bundleIdentifier: initialBundleIdentifier + ) + guard let resolved = Self.textElement( + for: element, + preferDescendantTextElement: isKnownWebBackedApp + ) else { + resolutionCache.store(rootIdentity: rootIdentity, textElement: nil) + return nil + } + resolutionCache.store(rootIdentity: rootIdentity, textElement: resolved) + textElement = resolved } let target = AppTargetResolver.resolveAppTarget(for: textElement) diff --git a/Packages/MacContextCapture/Sources/MacContextCapture/Screen/ScreenTextOCR.swift b/Packages/MacContextCapture/Sources/MacContextCapture/Screen/ScreenTextOCR.swift index fe6dfd6..b090ce2 100644 --- a/Packages/MacContextCapture/Sources/MacContextCapture/Screen/ScreenTextOCR.swift +++ b/Packages/MacContextCapture/Sources/MacContextCapture/Screen/ScreenTextOCR.swift @@ -13,18 +13,23 @@ import Vision public enum ScreenTextOCR { /// Recognise text in `image`, returning the recognised lines in natural reading order - /// (top-to-bottom, then left-to-right). Uses `.accurate` recognition with language correction - /// **on**: this capture runs out of band (on focus/window change plus a slow timer, never on the - /// per-keystroke path — see `WindowOCRCaptureEngine`/`ScreenContextController`), so there is no - /// keystroke-latency budget to protect and accuracy is what matters. Garbled recognitions are the - /// single largest source of polluted screen context (the model parrots "Ilne wilh real 5ulfix" - /// gibberish), and `.accurate` + language correction cuts them at the source rather than relying - /// solely on the post-hoc corruption filters below. 
(See ADR-049/052.) + /// (top-to-bottom, then left-to-right). Uses `.fast` recognition with language correction + /// **off** (ADR-076). Earlier revisions used `.accurate` + `usesLanguageCorrection = true` + /// because the per-line corruption filter (`droppingCorruptedLines` below) had a non-trivial + /// false-negative rate when fed `.fast`-tier mojibake (ADR-049/052). In practice — once the + /// digit-substitution guard (ADR-050) and the symbol-density guard are in place — the + /// surviving `.fast` lines are good enough for `[Screen context]` and the CPU win on a + /// fanless M4 is measured in 5–10× per refresh. The OCR is still off the keystroke path, + /// but the same CPU competes with everything else the user is doing, and a 4-second timer + /// firing `.accurate` Vision passes was the dominant remaining draw after ADR-074/075. + /// `.fast` routes through the Neural Engine where available, which is exactly what + /// Apple's own Live Text / system text recognition uses for ambient capture. /// /// `minimumConfidence` is the first guard against *corrupted* OCR reaching the prompt: Vision /// reports a per-candidate confidence, and a low value is the signal of a mangled recognition. /// Feeding nothing is better than feeding garbage, so low-confidence lines are dropped here. - public static func recognizeLines(in image: CGImage, minimumConfidence: Float = 0.4) async throws -> [String] { + /// The threshold is bumped slightly to compensate for `.fast`'s noisier candidates. 
+ public static func recognizeLines(in image: CGImage, minimumConfidence: Float = 0.45) async throws -> [String] { try await withCheckedThrowingContinuation { continuation in let request = VNRecognizeTextRequest { request, error in if let error { @@ -47,8 +52,12 @@ public enum ScreenTextOCR { } continuation.resume(returning: lines) } - request.recognitionLevel = .accurate - request.usesLanguageCorrection = true + // ADR-076: trade some recognition accuracy for ~5–10× lower per-refresh CPU. The + // downstream corruption filters (`droppingCorruptedLines`, `isPlausibleText`, + // `containsDigitSubstitutedWord`) reject the additional mangled lines that `.fast` + // produces, so the model still only sees plausible prose. + request.recognitionLevel = .fast + request.usesLanguageCorrection = false let handler = VNImageRequestHandler(cgImage: image, options: [:]) do { diff --git a/Packages/MacContextCapture/Sources/MacContextCapture/Screen/WindowOCRCaptureEngine.swift b/Packages/MacContextCapture/Sources/MacContextCapture/Screen/WindowOCRCaptureEngine.swift index 82b304f..c5616f7 100644 --- a/Packages/MacContextCapture/Sources/MacContextCapture/Screen/WindowOCRCaptureEngine.swift +++ b/Packages/MacContextCapture/Sources/MacContextCapture/Screen/WindowOCRCaptureEngine.swift @@ -85,7 +85,12 @@ public struct ScreenCaptureKitWindowTextCapturer: ScreenWindowTextCapturing { /// Longest side (in pixels) of the captured image before OCR. Caps Retina blow-up for speed. private let maxCaptureDimension: CGFloat - public init(maxCaptureDimension: CGFloat = 1600) { + public init(maxCaptureDimension: CGFloat = 1200) { + // ADR-076: 1200 is the longest-side cap before OCR. 
The previous default of 1600 was + // a holdover from `.accurate` Vision (which benefits from more pixels); `.fast` doesn't + // gain proportionally from extra resolution, so we shrink the captured image to cut both + // the screenshot encode cost and Vision's per-pixel work without hurting recognition of + // ordinary screen text. Apps that need a denser cap can pass it explicitly. self.maxCaptureDimension = maxCaptureDimension } diff --git a/Packages/ModelRuntime/Sources/LlamaModelRuntime/LlamaModelRuntime.swift b/Packages/ModelRuntime/Sources/LlamaModelRuntime/LlamaModelRuntime.swift index 4aefe09..aa91360 100644 --- a/Packages/ModelRuntime/Sources/LlamaModelRuntime/LlamaModelRuntime.swift +++ b/Packages/ModelRuntime/Sources/LlamaModelRuntime/LlamaModelRuntime.swift @@ -93,7 +93,14 @@ public actor LlamaModelRuntime: LocalModelRuntime { maxSequences: Int = 4, // Incremental beam decoding (ADR-046): keep branch KV resident across levels and decode only // the new token. On by default; the reseed path (ADR-043) remains as a per-call fallback. - enableIncrementalBeam: Bool = true + enableIncrementalBeam: Bool = true, + // Number of transformer layers to offload to the GPU via llama.cpp's Metal backend + // (ADR-074). Default 999 means "all layers"; llama.cpp clamps to the model's real layer + // count. Pass 0 to force CPU-only (e.g. tooling, deterministic tests). Without this, + // `llama_model_default_params()` defaults `n_gpu_layers` to 0 and every token is decoded + // on the CPU even though the Metal backend is linked — pinning the CPU and triggering + // thermal throttling on fanless Apple Silicon. 
+ nGpuLayers: Int = 999 ) throws { guard ModelContainer.modelExists(at: modelURL) else { throw LlamaRuntimeError.modelFileMissing(path: modelURL.path) @@ -103,6 +110,12 @@ public actor LlamaModelRuntime: LocalModelRuntime { var modelParams = llama_model_default_params() modelParams.use_mmap = true modelParams.use_mlock = false + // Offload all transformer layers to the Metal GPU (ADR-074). Without this, llama.cpp + // defaults `n_gpu_layers` to 0 and decodes every token on the CPU even though the Metal + // backend is linked — the high CPU usage / thermal throttling we saw was from inference + // running entirely on CPU cores. 999 means "all layers"; llama.cpp clamps to the model's + // real depth. + modelParams.n_gpu_layers = Int32(nGpuLayers) guard let loadedModel = llama_model_load_from_file(modelURL.path, modelParams) else { throw LlamaRuntimeError.modelLoadFailed diff --git a/docs/05-decisions.md b/docs/05-decisions.md index 0f8b793..0e6fbc4 100644 --- a/docs/05-decisions.md +++ b/docs/05-decisions.md @@ -93,6 +93,12 @@ row here.** | 071 | Treat Chromium browsers as known web-backed targets | context-capture | | 072 | Stabilize Obsidian ghost-text rendering | app-compatibility/ui | | 073 | Estimate caret geometry across soft-wrapped lines | context-capture/ui | +| 074 | GPU offload by default via `n_gpu_layers` | model-runtime/performance | +| 075 | Memoize the focused-root → text-element AX walk | context-capture/performance | +| 076 | `.fast` Vision OCR with looser refresh + smaller capture | context-capture/performance | +| 077 | Rank-fallback + lower beam width | completion/performance | +| 078 | 1.2 s generation deadline cancels stale predictions | completion/performance | +| 079 | Collapse internal multi-space runs at insertion | insertion | --- @@ -2747,3 +2753,251 @@ text. Both are now closed: longer look like they are always at the right edge after the first visual line. 
Real trailing-edge cases still wrap ghost text to the next visual line rather than switching to a capsule or drawing past the field. +## ADR-074 — GPU offload by default via `n_gpu_layers` + +- Date: 2026-06-02 +- Status: accepted +- Context: `LlamaModelRuntime.init` constructed model params with + `llama_model_default_params()` and then set only `use_mmap` / `use_mlock`, never touching + `n_gpu_layers`. llama.cpp's default for that field is `0`, which means CPU-only inference + **even when the build links the Metal backend** (which our xcframework from ADR-007 does). Every + token of every suggestion was therefore decoded on the CPU. On a fanless MacBook Air M4 the result + is sustained CPU saturation under typing, fast thermal throttling, and the high CPU usage the + user reported — while the integrated GPU sat idle. Matching Cotypist's user-facing settings could + not fix this, because the cost is not in a runtime toggle; it is in how the model is loaded. +- Decision: set `modelParams.n_gpu_layers` on every `LlamaModelRuntime` load, defaulting to full + offload. A new `nGpuLayers: Int = 999` parameter on `LlamaModelRuntime.init` controls it; `999` + is the llama.cpp idiom for "all layers" and llama.cpp clamps to the model's real depth. Because + the parameter has a default and sits at the end of the init signature, the two non-app call + sites — `ProfileGenerator` (`ModelManagement`) and `ACPFBuildCommand` (`ProfileBuilder`) — keep + working unchanged and pick up GPU offload too. Tooling or tests that require deterministic + CPU-only behaviour can pass `nGpuLayers: 0` explicitly. +- Consequences: live inference moves from the CPU cores to the GPU via Metal, dropping CPU load + and thermal pressure on Apple Silicon and freeing CPU headroom for the rest of the pipeline + (AX reads, prompt assembly, overlay paint — all of which stay on the CPU side). 
No public API + of `LocalModelRuntime` changes, so `AutocompleteCore`, `ConstrainedGeneration`, `Prompting`, + and `StubModelRuntime` are unaffected. The GGUF must fit in unified memory at load time; on the + catalog Qwen3.5-2B Q4_K_M this is comfortably under the M4 Air's budget. Profile-building runs + (`acpf-build`, `ProfileGenerator`) now also exercise the GPU, which is the correct behaviour — + they were previously CPU-bound for no reason. The xcframework already ships ggml-metal, so no + build-system changes are needed. The dial stays in the API so a future ADR can revisit + per-machine policy (e.g. `min(nLayers, freeGPUMemBytes / perLayerBytes)`) without another + protocol change. + +## ADR-075 — Memoize the focused-root → text-element AX walk + +- Date: 2026-06-02 +- Status: accepted +- Context: With the model running on the GPU (ADR-074), the dominant remaining cost during + typing was main-thread Accessibility work. A 10-second `sample(1)` profile of the live app on + a fanless M4 attributed ~10% of main-thread time to a single call chain: + `AccessibilityContextTracker.refreshSoon` → `refreshNow` → `FocusedFieldReader.snapshot` → + `FocusedFieldReader.textElement(for:preferDescendantTextElement:)` → + `AXCaretHelper.childElements` → `AXUIElementCopyAttributeValue`. Every + `kAXValueChangedNotification` (which fires on every keystroke) — after ADR-006's 20 ms + debounce — triggered a fresh BFS of the focused root's AX subtree (up to `maxNodes = 2_500` + for web containers) to re-locate the same text descendant we had already resolved a moment + earlier. The descendant doesn't move while the focused root stays the same; the tree walk + was re-doing work that was already done. +- Decision: cache the result of `textElement(for:)` per focused-root identity, inside + `FocusedFieldReader`, in a private `FocusedFieldResolutionCache` reference type (so the + value-typed reader can mutate it through a `let` property). 
On entry to `snapshot(of:)` the + reader computes the root's `AXCaretHelper.elementIdentity`, and: on a `hit` it reuses the + cached `AXUIElement` and skips the BFS entirely; on a `negative` cached result (the root has + no text descendant — e.g. media keys arriving from a non-text focused control) it returns + `nil` without walking again; on a `miss` (different identity) it runs the existing BFS once + and stores the outcome. The cache holds exactly one entry — a focus change produces a new + identity, which evicts the previous root, so the cache cannot serve a stale element across + focus boundaries. `kAXValueChanged` / `kAXSelectedTextChanged` keep the same root, which is + the win. +- Consequences: continuous typing drops the per-keystroke `AXUIElementCopyAttributeValue` + count from ~6 (BFS + role probes) to ~2 (the identity probe plus the value read), so the AX + hot path stops dominating main-thread time and frees CPU headroom for the rest of the + pipeline. Public API of `FocusedFieldReader` is unchanged — same `init`, same + `snapshot(of:)` signature — so `AccessibilityContextTracker`, the tests, and the per-app + policies (ADR-022, ADR-027–033) are untouched. Memory cost is one `AXUIElement` reference + plus a short identity string. If a text descendant inside the focused root is destroyed + while the root itself survives, subsequent AX reads against the cached element will return + `nil` / errors and `snapshot` will yield a sparse result; the next focus change (which + produces a new identity) recovers automatically. If that ever becomes a recurring problem + we can add a same-root invalidation hook driven by a follow-up notification, but Mac apps + in our compatibility set do not tear down their focused text fields without also moving + focus. 
+ +## ADR-076 — `.fast` Vision OCR with looser refresh + smaller capture + +- Date: 2026-06-02 +- Status: accepted (amends parts of ADR-049/052) +- Context: After ADR-074 (GPU offload) and ADR-075 (AX walk cache) the remaining sustained + CPU draw under typing collapsed to a single source: the on-screen-text (OCR) feature. + `ScreenContextController` was firing a `.accurate` `VNRecognizeTextRequest` with + `usesLanguageCorrection = true` on a 1600-px-longest-side screenshot of the focused + window every 4 s, plus on every focus change. On a fanless M4 a single `.accurate` Vision + pass over a Retina app window runs hundreds of milliseconds of dense matrix work on the + GPU and CPU; repeating it 15× per minute is enough to spike the user-visible CPU graph, + which is what the user reported and what reviewers compare unfavourably to Cotypist's + much lighter screen-context implementation. The original ADR-049 chose `.accurate` because + the per-line corruption filter (`droppingCorruptedLines` / `isPlausibleText`) had a + non-trivial false-negative rate on `.fast` mojibake. ADR-050 then added the + digit-substitution guard (`containsDigitSubstitutedWord`), which catches the dominant + failure mode of `.fast` ("h3llo", "qu81ity") at the source, so the filter chain is now + strong enough to absorb `.fast`-tier noise. +- Decision: change the OCR path along three axes simultaneously — recognition tier (items 1–3), capture size (item 4), and refresh cadence (item 5) — behind the existing + off-by-default OCR setting and the unchanged per-keystroke isolation in + `WindowOCRCaptureEngine` (capture stays out-of-band, the completion path still reads from + the cache): + 1. `VNRecognizeTextRequest.recognitionLevel = .fast` (was `.accurate`). This is the same + tier Apple uses for ambient text recognition (Live Text on still images), routes + through the Neural Engine where available, and is the single largest CPU win. + 2. `usesLanguageCorrection = false` (was `true`).
Language correction is the priciest + post-processing in `.accurate`; in `.fast` mode it adds cost without proportionate + accuracy gains for the prose we care about. + 3. `recognizeLines(in:minimumConfidence:)` raises the default `minimumConfidence` from + `0.4` to `0.45` to compensate for `.fast`'s noisier confidence distribution before the + downstream corruption filters even see the line. + 4. `ScreenCaptureKitWindowTextCapturer.maxCaptureDimension` drops from `1600` to `1200`. + `.fast` doesn't gain proportionally from extra resolution, and the screenshot encode + + Vision per-pixel work both scale with image area. + 5. `ScreenContextController.refreshInterval` rises from `4.0 s` to `12.0 s`. The + user-perceptible on-screen changes we want to react to between focus events (paragraph + scroll, updated panel) move on the order of seconds, not sub-second. Focus changes + still trigger an immediate capture via `handle(snapshot:)`, so a new window/page reads + instantly — the slow path only applies while the same window stays focused. +- Consequences: average OCR-attributable CPU drops by roughly an order of magnitude on + steady-state typing (`.fast` Vision ≈ 5–10× cheaper per pass, 3× fewer timer-driven passes per minute, + up to ≈44% fewer pixels when the capture cap binds). The downstream filters (`droppingCorruptedLines`, `isPlausibleText`, + `containsDigitSubstitutedWord`, `linesExcludingFieldText`) keep doing what they did under + `.accurate`; the new failure mode is `.fast` occasionally dropping a faint or small-font + line that `.accurate` would have caught, which weakens `[Screen context]` slightly on + edge cases (low-contrast UI, very small captions) but never feeds the model worse text — + it just feeds it less. Quality on the catalog test set should be within + the envelope ADR-049 already accepted (since the corruption surface is unchanged).
If a + future workload needs the old recognition tier, the `maxCaptureDimension` and + `recognitionLevel` are local-edit knobs and a follow-up ADR can split this into a Settings + toggle ("OCR quality") without changing the protocol surface. All-other-things-equal + battery life on Apple Silicon improves correspondingly; ANE-routed `.fast` Vision is the + cheapest text-recognition path the platform exposes. + +## ADR-077 — Rank-fallback when the top candidate fails the filter + lower default beam width + +- Date: 2026-06-02 +- Status: accepted +- Context: Telemetry from a live session (236 generated predictions) showed two issues the + user surfaced: suggestions felt slow, and sometimes the suggestion area was empty even + though the user was typing in prose contexts. Reading `predictions.log` and + `telemetry.json` together pinned the causes precisely. + 1. `CompletionController.present(...)` consumed only `candidates.first`. If that single + top-ranked candidate failed any rule in `CandidateFilter.suppressionReason`, the entire + prediction was suppressed — even though the constrained decoder had emitted up to + `maxCandidates` (default 5) hypotheses ordered by cumulative log-probability and the + rank-2/3/… candidates frequently passed every gate. The dominant trigger was + `insertionUnsafe` (41 of 77 suppressions): the model's top hypothesis after a + trailing-space context was often pure punctuation (`"!"`, `"."`) or a control sequence, + while the runner-up was clean prose. Cotypist behaves much more leniently here — its + fewer empty moments are not because it has a better model, but because it accepts + lower-ranked candidates when the top one is rejected. + 2. Generation latency dominated the end-to-end time the user perceived (300–1000 ms + typical with outliers up to ~6 s). 
`branchWidth = 4` was already the post-ADR-012 + compromise, but ADR-012's own sweep (`testBranchWidthSweep`) reports warm means of + **239 / 164 / 107 / 75 ms** at widths 8 / 6 / 4 / 3 — a near-linear scaling with width. + Going one step further (4 → 3) was already noted in ADR-012 as the next available + dial; it was not taken at the time because narrower beams marginally hurt top-1 + quality, and the controller could not yet recover the lost runners-up. + Crucially, fix (1) and fix (2) compose: dropping a branch in the decoder doesn't lose + coverage if the controller now actually uses the remaining branches. +- Decision: two changes, intentionally co-introduced. + 1. **Rank-fallback in `CompletionController.present(_:request:style:latencyTrace:)`.** + After `candidates.first` is captured as `topRanked`, walk the decoder's ranked list: + if `topRanked` passes `filter.suppressionReason(...)`, use it; otherwise pick the + first `runnerUp` in `candidates.dropFirst()` that passes; otherwise suppress with the + top candidate's reason (the existing telemetry / log shape is preserved so + `suppressionReasons` histograms remain comparable across builds). The filter itself is + unchanged — every rule (insertion safety, CJK script net, mid-word charset, suffix + duplication, display-width caps) still gates each candidate individually. Only the + selection policy changes: the decoder gets to "say it twice" before we give up. + 2. **`DecodingConfiguration.branchWidth` default 4 → 3.** Generation cost per completion + is dominated by the beam width (each branch decodes additively per step); cutting one + branch is ~25% fewer decode passes per prediction. +- Consequences: the user-visible "empty" rate should drop substantially — the share of + rank-1-only suppressions previously discarded as `insertionUnsafe` (41/77 = 53% of all + suppressions) is the upper bound on how much rank-fallback can recover for that rule, and + in practice most of those have a clean rank-2.
Generation latency comes down ~25% across + the board (more on long prompts where the marginal branch cost is largest), so the + median typing-to-ghost-text time should improve in step with the per-pass cut. + Trade-offs: a slightly narrower beam may occasionally miss a long-distance better + hypothesis that width-4 would have explored — but the rank-fallback also softens this, + since the surviving 3 branches are still ranked and we can still escape a bad rank-1. + The 6-second tail outliers (cold-cache + long-prompt + maximally-divergent edits) are + not addressed here; capping those needs a generation deadline / cancellation timer, + which is a follow-up (now ADR-078). `recordSuppressed("noCandidate")` still fires when + the decoder emits zero hypotheses (e.g. fully suppressed at decode time by the in-beam + `CurrentWordTypoGuard` / `MidWordCharsetGuard`); that path is unchanged. + +## ADR-078 — 1.2 s generation deadline cancels stale predictions + +- Date: 2026-06-02 +- Status: accepted +- Context: Even after ADR-074 / ADR-077, `telemetry.json`'s `latenciesMillis` still showed + tail outliers up to ~6 s (cold KV cache + long prompt + maximally divergent edits land + all at once). A prediction that takes 3–6 s to land is already stale by the time it + would render — the user has typed several characters since — so showing it produces the + worst kind of UX: the ghost text appears long after the keystroke, often visibly wrong + for the live caret even with `SuggestionAnchor.remaining`'s drift handling. The completion + path already had a clean cancellation seam (the existing `catch is CancellationError` + arm leaves the current ghost as-is), but nothing enforced a wall-clock bound on the + in-flight `engine.completions(for:)`. 
+- Decision: in `CompletionController`, after creating the `generationTask` and assigning + it to `self.generationTask` (so a newer keystroke can still cancel it via the existing + path), spawn a sibling task that sleeps for `generationDeadlineNanoseconds = 1_200_000_000` + and then calls `generationTask.cancel()`. The cancellation propagates through the next + `try Task.checkCancellation()` inside the engine; the existing catch arm finishes the + latency trace with `outcome: "cancelled"` and silently drops the result. We don't surface + a new outcome string — the deadline is treated as just another cancellation, + indistinguishable from superseded-by-keystroke at the analytics layer. 1.2 s is just + above the empirical p95 in the live telemetry, so the deadline cancels the genuine tail + outliers without truncating the body of the distribution. +- Consequences: the worst-case user-visible delay between a keystroke and a ghost-text + attempt is bounded by `min(generation, 1.2 s)`. Predictions that would have landed at + 4–6 s are now dropped — costing one missed completion but preventing a visibly wrong + one, consistent with the "prefer suppression to a wrong suggestion" principle (AGENTS.md + / ADR-015 derived). The deadline task is cheap (one `Task.sleep`); when generation + finishes inside the budget, the sibling task just exits and the cancel call is a no-op + against an already-completed task. Tail outliers often correspond to cold-cache + prefixes; subsequent typing in the same context warms the KV cache (ADR-018) and lands + well under the budget, so the same context hits the deadline at most once before + recovering. + +## ADR-079 — Collapse internal multi-space runs at insertion + +- Date: 2026-06-02 +- Status: accepted +- Context: User-reported defect: "after filling in the prediction it puts a bigger space + than needed." 
The two known whitespace gates — `CaretBoundary.reconcile` (ADR-017, + strips redundant *leading* whitespace from a candidate when the caret already sits + after a whitespace character) and `NextWordSplitter` (ADR-016 / ADR-050, the separator + before a word travels with that word) — only cover the candidate-vs-caret join. + Neither one normalizes whitespace *inside* the candidate. The base model occasionally + emits `"hello  world"`-style internal double spaces, especially after abbreviated + context, and those traveled verbatim through to insertion. The display ghost text could + mask the doubled space against the surrounding font, but the inserted bytes carried it. + The visible artifact only surfaced on accept. +- Decision: insert a small normalization pass — `collapseInternalDoubleSpaces` — inside + `CompletionController.insert(text:context:keepingAnchor:)` immediately before the + inserter plans the paste/type. It runs in linear time, only allocates when at least one + `"  "` run is present, and replaces any run of two or more ASCII spaces with a single + space. A single leading ASCII space (the next-word separator under ADR-050) is + preserved by construction (the first space is appended, subsequent consecutive spaces + are skipped). Other whitespace forms (tab, NBSP, ideographic space) are intentionally + untouched so that the per-policy NBSP substitution in `InsertionPlanner.plan` still + works and intentional tab-indented completions in code editors stay intact. +- Consequences: the user-visible "bigger space than needed" cannot survive the insertion + path even when the model produces an internal double space — the inserted bytes always + carry single-space runs. The display anchor and ghost-text overlay are unchanged, so the + visible measurement / placement (ADR-020 / ADR-067) does not shift.
Because + normalization runs only on insertion bytes, the per-keystroke reuse-cache equality check + (`anchorText` comparisons used by `SuggestionAnchor`) is not perturbed. The function is + cheap enough (no allocation on the common no-double-space case, single linear pass + otherwise) that no perf ADR is warranted. If a future model emits *intentional* + multi-space formatting (column alignment in code, say), the rule can be made + insertion-mode-aware by branching on `request.mode == .code` at the call site; for the + current `.prose` / `.correction` / `.code` set of modes, the rule is universally a win.