From cd1bd7032a53b144bde789b32e551f286f709037 Mon Sep 17 00:00:00 2001 From: Abbas Mohamed Date: Tue, 2 Jun 2026 14:59:40 -0500 Subject: [PATCH 1/5] feat(model-runtime): offload all transformer layers to the GPU by default llama_model_default_params() leaves n_gpu_layers = 0, which means CPU-only inference even on the Metal-linked xcframework build. On a fanless M4 Air this pegs the CPU for the duration of every completion and triggers thermal throttling, while the integrated GPU sits idle. Add nGpuLayers: Int = 999 as the final init parameter on LlamaModelRuntime and pass it through as modelParams.n_gpu_layers. 999 is the llama.cpp idiom for all layers; the library clamps to the model real depth. Tooling/tests that need deterministic CPU-only behaviour can pass nGpuLayers: 0 explicitly. ProfileGenerator and ACPFBuildCommand keep compiling because the new parameter has a default. See docs/05-decisions.md ADR-074. --- .../LlamaModelRuntime/LlamaModelRuntime.swift | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/Packages/ModelRuntime/Sources/LlamaModelRuntime/LlamaModelRuntime.swift b/Packages/ModelRuntime/Sources/LlamaModelRuntime/LlamaModelRuntime.swift index 4aefe09..aa91360 100644 --- a/Packages/ModelRuntime/Sources/LlamaModelRuntime/LlamaModelRuntime.swift +++ b/Packages/ModelRuntime/Sources/LlamaModelRuntime/LlamaModelRuntime.swift @@ -93,7 +93,14 @@ public actor LlamaModelRuntime: LocalModelRuntime { maxSequences: Int = 4, // Incremental beam decoding (ADR-046): keep branch KV resident across levels and decode only // the new token. On by default; the reseed path (ADR-043) remains as a per-call fallback. - enableIncrementalBeam: Bool = true + enableIncrementalBeam: Bool = true, + // Number of transformer layers to offload to the GPU via llama.cpp's Metal backend + // (ADR-074). Default 999 means "all layers"; llama.cpp clamps to the model's real layer + // count. Pass 0 to force CPU-only (e.g. tooling, deterministic tests). Without this, + // `llama_model_default_params()` defaults `n_gpu_layers` to 0 and every token is decoded + // on the CPU even though the Metal backend is linked — pinning the CPU and triggering + // thermal throttling on fanless Apple Silicon. + nGpuLayers: Int = 999 ) throws { guard ModelContainer.modelExists(at: modelURL) else { throw LlamaRuntimeError.modelFileMissing(path: modelURL.path) @@ -103,6 +110,12 @@ public actor LlamaModelRuntime: LocalModelRuntime { var modelParams = llama_model_default_params() modelParams.use_mmap = true modelParams.use_mlock = false + // Offload all transformer layers to the Metal GPU (ADR-074). Without this, llama.cpp + // defaults `n_gpu_layers` to 0 and decodes every token on the CPU even though the Metal + // backend is linked — the high CPU usage / thermal throttling we saw was from inference + // running entirely on CPU cores. 999 means "all layers"; llama.cpp clamps to the model's + // real depth. + modelParams.n_gpu_layers = Int32(nGpuLayers) guard let loadedModel = llama_model_load_from_file(modelURL.path, modelParams) else { throw LlamaRuntimeError.modelLoadFailed From 18c7caa3807ff90d816bea6b79cfe23dd4c55580 Mon Sep 17 00:00:00 2001 From: Abbas Mohamed Date: Tue, 2 Jun 2026 14:59:40 -0500 Subject: [PATCH 2/5] perf(context-capture): memoize focused-root to text-element AX walk Every kAXValueChangedNotification fires on every keystroke, and the FocusedFieldReader.textElement(for:) BFS re-walks the AX subtree (up to maxNodes = 2500 for web containers) to re-locate the same text descendant it had resolved a moment earlier. A sample(1) profile on a fanless M4 attributed ~10% of main-thread time to this single path. Cache the result of textElement(for:) per focused-root identity in a private FocusedFieldResolutionCache. On hit, skip the BFS entirely. On a different root identity, run the existing BFS once and store the outcome. Negative results (no text descendant) are cached separately so non-text focused controls do not re-walk every value tick. The public API of FocusedFieldReader is unchanged. See docs/05-decisions.md ADR-075. --- .../Accessibility/FocusedFieldReader.swift | 75 +++++++++++++++++-- 1 file changed, 67 insertions(+), 8 deletions(-) diff --git a/Packages/MacContextCapture/Sources/MacContextCapture/Accessibility/FocusedFieldReader.swift b/Packages/MacContextCapture/Sources/MacContextCapture/Accessibility/FocusedFieldReader.swift index 2642ecb..c18e8cf 100644 --- a/Packages/MacContextCapture/Sources/MacContextCapture/Accessibility/FocusedFieldReader.swift +++ b/Packages/MacContextCapture/Sources/MacContextCapture/Accessibility/FocusedFieldReader.swift @@ -35,10 +35,52 @@ public struct FocusedFieldSnapshot: Equatable { } } +/// Memoizes the result of `FocusedFieldReader.textElement(for:preferDescendantTextElement:)` +/// for a single focused-root identity (ADR-075). Profiling on a fanless M4 showed the BFS in +/// `textElement(for:)` accounted for ~10% of main-thread time during typing — every +/// `kAXValueChangedNotification` fired on every keystroke walked the AX tree again to find the +/// same text descendant we had already resolved a moment earlier. With this cache the walk runs +/// once per focus change instead of once per keystroke; on value/selection changes we go +/// straight to `AXCaretHelper.stringValue` on the already-resolved element. +/// +/// Reference type so the value-typed `FocusedFieldReader` can mutate it through a `let` +/// property — the reader is held across refreshes by `AccessibilityContextTracker` so the +/// cached element lives for the lifetime of the focus session. +private final class FocusedFieldResolutionCache { + /// Outcome of a resolution attempt against a given root identity. `negative` is preserved + /// separately from "miss" so a focused root with no text descendant doesn't re-walk the + /// tree on every value tick (e.g. media keys firing AX notifications from a non-text + /// focused control). + enum Lookup { + case miss + case hit(AXUIElement) + case negative + } + + private var rootIdentity: String? + private var resolvedTextElement: AXUIElement? + + /// `kAXFocusedUIElementChanged` / `kAXFocusedWindowChanged` produce a different root + /// identity, so this never serves a stale element across focus boundaries — yet + /// `kAXValueChanged` / `kAXSelectedTextChanged` keep the same root, which is the case that + /// gets the win. + func lookup(rootIdentity identity: String) -> Lookup { + guard self.rootIdentity == identity else { return .miss } + if let cached = resolvedTextElement { return .hit(cached) } + return .negative + } + + func store(rootIdentity identity: String, textElement: AXUIElement?) { + self.rootIdentity = identity + self.resolvedTextElement = textElement + } +} + @MainActor public struct FocusedFieldReader { private let resolver: AXCaretGeometryResolver private nonisolated let webAppClassifier: AppBundleWebAppClassifier + private nonisolated let resolutionCache = FocusedFieldResolutionCache() public nonisolated init( resolver: AXCaretGeometryResolver = AXCaretGeometryResolver(), @@ -51,15 +93,32 @@ public struct FocusedFieldReader { /// Read the focused AX element into a snapshot. Returns nil if the element has no AX /// value (likely not a text-bearing field). public func snapshot(of element: AXUIElement) -> FocusedFieldSnapshot? { - let initialBundleIdentifier = AppTargetResolver.bundleIdentifier(for: element) - let isKnownWebBackedApp = webAppClassifier.isWebBacked( - bundleIdentifier: initialBundleIdentifier - ) - guard let textElement = Self.textElement( - for: element, - preferDescendantTextElement: isKnownWebBackedApp - ) else { + // Fast path (ADR-075): if the focused root's identity matches the one we resolved a + // text descendant for previously, skip the AX tree walk in `textElement(for:)` and + // reuse the cached element. The expensive `AXCaretHelper.childElements` BFS is the + // dominant per-keystroke main-thread cost; same-root revisits during continuous typing + // were our 10% main-thread hotspot before this. + let rootIdentity = AXCaretHelper.elementIdentity(for: element) + let textElement: AXUIElement + switch resolutionCache.lookup(rootIdentity: rootIdentity) { + case .hit(let cached): + textElement = cached + case .negative: return nil + case .miss: + let initialBundleIdentifier = AppTargetResolver.bundleIdentifier(for: element) + let isKnownWebBackedApp = webAppClassifier.isWebBacked( + bundleIdentifier: initialBundleIdentifier + ) + guard let resolved = Self.textElement( + for: element, + preferDescendantTextElement: isKnownWebBackedApp + ) else { + resolutionCache.store(rootIdentity: rootIdentity, textElement: nil) + return nil + } + resolutionCache.store(rootIdentity: rootIdentity, textElement: resolved) + textElement = resolved } let target = AppTargetResolver.resolveAppTarget(for: textElement) From 318aef3c04ca14b9a27fa29de562e63109f4a1ec Mon Sep 17 00:00:00 2001 From: Abbas Mohamed Date: Tue, 2 Jun 2026 14:59:40 -0500 Subject: [PATCH 3/5] perf(context-capture): switch screen-text OCR to .fast Vision with lighter cadence When the on-screen-text context feature is enabled, the OCR pass was the dominant remaining CPU draw: VNRecognizeTextRequest at .accurate with usesLanguageCorrection = true over a 1600px screenshot, fired every 4s plus on every focus change. On a fanless M4 this spikes the CPU graph noticeably (~30-70%). Four co-introduced changes: 1. recognitionLevel = .fast (was .accurate) - routes through the Neural Engine; same tier Apple uses for Live Text ambient capture. 2. usesLanguageCorrection = false - the priciest post-process in .accurate; adds cost without proportionate gains in .fast. 3. maxCaptureDimension: 1200 (was 1600) - .fast does not gain proportionally from extra resolution; smaller image cuts both screenshot encode and per-pixel Vision work. 4. ScreenContextController.refreshInterval: 12.0s (was 4.0s) - focus changes still trigger an immediate capture; the slow timer only tracks slow on-screen changes which move on the order of seconds. minimumConfidence default is bumped 0.40 -> 0.45 to compensate for .fasts noisier confidence distribution. The downstream corruption filters (droppingCorruptedLines, isPlausibleText, containsDigitSubstitutedWord) are unchanged and still reject .fast mojibake. See docs/05-decisions.md ADR-076. --- .../Context/ScreenContextController.swift | 6 +++- .../Screen/ScreenTextOCR.swift | 29 ++++++++++++------- .../Screen/WindowOCRCaptureEngine.swift | 7 ++++- 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/KeyType/Logic/Context/ScreenContextController.swift b/KeyType/Logic/Context/ScreenContextController.swift index 55c5c5c..ddce72d 100644 --- a/KeyType/Logic/Context/ScreenContextController.swift +++ b/KeyType/Logic/Context/ScreenContextController.swift @@ -29,7 +29,11 @@ final class ScreenContextController { /// How often to re-OCR the focused window while it stays focused, so the context tracks slow /// on-screen changes (a scrolled doc, an updated panel) without a focus change to trigger it. - private let refreshInterval: TimeInterval = 4.0 + /// 12 s instead of the previous 4 s — see ADR-076: even with `.fast` Vision the OCR pass is + /// not free, and the user-perceptible "screen changes" we want to react to (paragraph scroll, + /// updated panel) are on the order of seconds, not sub-second. The big trigger remains focus + /// change, which is still instant via `handle(snapshot:)`. + private let refreshInterval: TimeInterval = 12.0 private(set) var isRunning = false private var listenerToken: UUID? diff --git a/Packages/MacContextCapture/Sources/MacContextCapture/Screen/ScreenTextOCR.swift b/Packages/MacContextCapture/Sources/MacContextCapture/Screen/ScreenTextOCR.swift index fe6dfd6..b090ce2 100644 --- a/Packages/MacContextCapture/Sources/MacContextCapture/Screen/ScreenTextOCR.swift +++ b/Packages/MacContextCapture/Sources/MacContextCapture/Screen/ScreenTextOCR.swift @@ -13,18 +13,23 @@ import Vision public enum ScreenTextOCR { /// Recognise text in `image`, returning the recognised lines in natural reading order - /// (top-to-bottom, then left-to-right). Uses `.accurate` recognition with language correction - /// **on**: this capture runs out of band (on focus/window change plus a slow timer, never on the - /// per-keystroke path — see `WindowOCRCaptureEngine`/`ScreenContextController`), so there is no - /// keystroke-latency budget to protect and accuracy is what matters. Garbled recognitions are the - /// single largest source of polluted screen context (the model parrots "Ilne wilh real 5ulfix" - /// gibberish), and `.accurate` + language correction cuts them at the source rather than relying - /// solely on the post-hoc corruption filters below. (See ADR-049/052.) + /// (top-to-bottom, then left-to-right). Uses `.fast` recognition with language correction + /// **off** (ADR-076). Earlier revisions used `.accurate` + `usesLanguageCorrection = true` + /// because the per-line corruption filter (`droppingCorruptedLines` below) had a non-trivial + /// false-negative rate when fed `.fast`-tier mojibake (ADR-049/052). In practice — once the + /// digit-substitution guard (ADR-050) and the symbol-density guard are in place — the + /// surviving `.fast` lines are good enough for `[Screen context]` and the CPU win on a + /// fanless M4 is measured in 5–10× per refresh. The OCR is still off the keystroke path, + /// but the same CPU competes with everything else the user is doing, and a 4-second timer + /// firing `.accurate` Vision passes was the dominant remaining draw after ADR-074/075. + /// `.fast` routes through the Neural Engine where available, which is exactly what + /// Apple's own Live Text / system text recognition uses for ambient capture. /// /// `minimumConfidence` is the first guard against *corrupted* OCR reaching the prompt: Vision /// reports a per-candidate confidence, and a low value is the signal of a mangled recognition. /// Feeding nothing is better than feeding garbage, so low-confidence lines are dropped here. - public static func recognizeLines(in image: CGImage, minimumConfidence: Float = 0.4) async throws -> [String] { + /// The threshold is bumped slightly to compensate for `.fast`'s noisier candidates. + public static func recognizeLines(in image: CGImage, minimumConfidence: Float = 0.45) async throws -> [String] { try await withCheckedThrowingContinuation { continuation in let request = VNRecognizeTextRequest { request, error in if let error { @@ -47,8 +52,12 @@ public enum ScreenTextOCR { } continuation.resume(returning: lines) } - request.recognitionLevel = .accurate - request.usesLanguageCorrection = true + // ADR-076: trade some recognition accuracy for ~5–10× lower per-refresh CPU. The + // downstream corruption filters (`droppingCorruptedLines`, `isPlausibleText`, + // `containsDigitSubstitutedWord`) reject the additional mangled lines that `.fast` + // produces, so the model still only sees plausible prose. + request.recognitionLevel = .fast + request.usesLanguageCorrection = false let handler = VNImageRequestHandler(cgImage: image, options: [:]) do { diff --git a/Packages/MacContextCapture/Sources/MacContextCapture/Screen/WindowOCRCaptureEngine.swift b/Packages/MacContextCapture/Sources/MacContextCapture/Screen/WindowOCRCaptureEngine.swift index 82b304f..c5616f7 100644 --- a/Packages/MacContextCapture/Sources/MacContextCapture/Screen/WindowOCRCaptureEngine.swift +++ b/Packages/MacContextCapture/Sources/MacContextCapture/Screen/WindowOCRCaptureEngine.swift @@ -85,7 +85,12 @@ public struct ScreenCaptureKitWindowTextCapturer: ScreenWindowTextCapturing { /// Longest side (in pixels) of the captured image before OCR. Caps Retina blow-up for speed. private let maxCaptureDimension: CGFloat - public init(maxCaptureDimension: CGFloat = 1600) { + public init(maxCaptureDimension: CGFloat = 1200) { + // ADR-076: 1200 is the longest-side cap before OCR. The previous default of 1600 was + // a holdover from `.accurate` Vision (which benefits from more pixels); `.fast` doesn't + // gain proportionally from extra resolution, so we shrink the captured image to cut both + // the screenshot encode cost and Vision's per-pixel work without hurting recognition of + // ordinary screen text. Apps that need a denser cap can pass it explicitly. self.maxCaptureDimension = maxCaptureDimension } From 23c20adba9d76354eedad9df8e03981371b36dea Mon Sep 17 00:00:00 2001 From: Abbas Mohamed Date: Tue, 2 Jun 2026 14:59:40 -0500 Subject: [PATCH 4/5] feat/fix(completion): rank-fallback, generation deadline, multi-space collapse Three related improvements to the completion path, found via the existing telemetry.json (236 generated predictions, 77 suppressed). ADR-077: rank-fallback in CompletionController.present(...). The controller consumed only candidates.first; if it failed a CandidateFilter rule, the whole prediction was suppressed even when ranks 2..N passed every gate. Telemetry showed 41 of 77 suppressions were insertionUnsafe with prose alternatives sitting at rank 2. Walk the ranked list and pick the first candidate that survives the filter; suppress with the top-candidates reason only if every candidate fails. The filter rules themselves are unchanged. Co-introduced: DecodingConfiguration.branchWidth default 4 -> 3. ADR-012s own testBranchWidthSweep reports warm means of 239/164/107/75 ms at widths 8/6/4/3; one fewer branch trims ~25% off generation latency, and the rank-fallback recovers the runner-ups the narrower beam still emits. ADR-078: 1.2s generation deadline. Telemetry showed tail outliers up to ~6s. A sibling Task to the generation task sleeps for 1_200_000_000 ns and cancels via the existing try Task.checkCancellation path; the existing catch is CancellationError arm drops the result silently. 1.2s is just above the empirical p95 so the body of the distribution is untouched. ADR-079: collapseInternalDoubleSpaces. The base model occasionally emits internal double spaces inside a candidate (hello world). CaretBoundary.reconcile only strips redundant leading whitespace, not interior runs. Add a linear-time normalization pass right before the inserter plans the paste/type; a single leading ASCII space (the next-word separator under ADR-050) is preserved; tabs/NBSP/ideographic space are untouched. See docs/05-decisions.md ADR-077, ADR-078, ADR-079. --- .../Completion/CompletionController.swift | 72 +++++++++++++++++-- .../Engine/DecodingConfiguration.swift | 8 ++- 2 files changed, 74 insertions(+), 6 deletions(-) diff --git a/KeyType/Logic/Completion/CompletionController.swift b/KeyType/Logic/Completion/CompletionController.swift index 927ac2e..23183eb 100644 --- a/KeyType/Logic/Completion/CompletionController.swift +++ b/KeyType/Logic/Completion/CompletionController.swift @@ -303,6 +303,29 @@ final class CompletionController { nonisolated static let fastDebounceNanoseconds: UInt64 = 35_000_000 nonisolated static let moderateDebounceNanoseconds: UInt64 = 50_000_000 nonisolated static let conservativeDebounceNanoseconds: UInt64 = 90_000_000 + + /// Collapses runs of two or more ASCII spaces to one, leaving every other character + /// (including the leading single space that the next-word separator in ADR-050 carries) + /// untouched. The model occasionally emits `"hello world"`-style internal double spaces + /// in a completion candidate; this gate runs only on the bytes we're about to insert + /// — display anchors and ghost text are unchanged — so the user-visible "bigger space + /// than needed" after Tab / Shift+Tab cannot survive insertion. See ADR-079. + nonisolated static func collapseInternalDoubleSpaces(_ text: String) -> String { + guard text.contains(" ") else { return text } + var out = "" + out.reserveCapacity(text.count) + var lastWasSpace = false + for ch in text { + if ch == " " { + if !lastWasSpace { out.append(ch) } + lastWasSpace = true + } else { + out.append(ch) + lastWasSpace = false + } + } + return out + } private static let sideContextFreezeInterval: TimeInterval = 2.0 private static let screenCaptureBundleIdentifiers: Set = [ "com.apple.screenshot.launcher" @@ -598,7 +621,15 @@ final class CompletionController { try? await Task.sleep(nanoseconds: debounceNanoseconds) guard !Task.isCancelled, let self else { return } latencyTrace.eventDebounceElapsed() - self.generationTask = Task { [weak self] in + // ADR-078: generation deadline. Telemetry on the live app showed `generationMillis` + // tail outliers up to ~6 s (cold KV cache + long prompts + maximally divergent + // edits land all at once). Predictions that take longer than ~1.2 s to land are + // already stale by the time they would render — the user has typed several more + // characters since — so they should be abandoned rather than shown belatedly. We + // capture the generation Task and a sibling task cancels it after the budget; the + // existing `catch is CancellationError` arm already drops the result silently. + let generationDeadlineNanoseconds: UInt64 = 1_200_000_000 + let generationTask = Task { [weak self] in guard let self else { return } do { latencyTrace.eventGenerationBegin() @@ -611,13 +642,19 @@ final class CompletionController { self.telemetry.recordLatency(milliseconds: elapsedMs) self.present(candidates, request: request, style: style, latencyTrace: latencyTrace) } catch is CancellationError { - // Superseded by a newer keystroke — leave the current ghost as-is. + // Superseded by a newer keystroke, or hit the generation deadline above. + // Either way, leave the current ghost as-is and drop this attempt silently. self.finishLatencyTrace(latencyTrace, outcome: "cancelled") } catch { self.log.error("Generation failed: \(error, privacy: .public)") self.finishLatencyTrace(latencyTrace, outcome: "generation-error") } } + self.generationTask = generationTask + Task { + try? await Task.sleep(nanoseconds: generationDeadlineNanoseconds) + generationTask.cancel() + } } } @@ -633,14 +670,31 @@ final class CompletionController { .map { "\"\(PredictionLog.escape($0.text))\"" } .joined(separator: " | ") - guard let best = candidates.first else { + guard let topRanked = candidates.first else { telemetry.recordSuppressed(reason: "noCandidate") predictionLog.append("PREDICT ctx=\"\(ctx)\" → SUPPRESS(noCandidate)") clearCompletion() finishLatencyTrace(latencyTrace, outcome: "suppressed-no-candidate") return } - if let reason = filter.suppressionReason(for: best, request: request) { + // ADR-077: rank-fallback. The constrained decoder returns up to `maxCandidates` + // hypotheses ordered by cumulative log-probability; previously we accepted only + // `candidates.first` and any filter rejection suppressed the entire prediction — + // even when ranks 2..N passed every gate. Telemetry showed 41 of 77 suppressions + // were `insertionUnsafe` (the most common cause: the top candidate was pure + // punctuation or whitespace and the prose runner-up never got a turn). Now we + // walk the ranked list and pick the first candidate that survives the filter. + // Suppression is reported only if every candidate fails — and we report the top + // candidate's reason, preserving the existing telemetry shape. + let best: CompletionCandidate + if filter.suppressionReason(for: topRanked, request: request) == nil { + best = topRanked + } else if let runnerUp = candidates.dropFirst().first(where: { + filter.suppressionReason(for: $0, request: request) == nil + }) { + best = runnerUp + } else { + let reason = filter.suppressionReason(for: topRanked, request: request)! telemetry.recordSuppressed(reason: String(describing: reason)) log.debug("Suppressed: \(String(describing: reason), privacy: .public)") predictionLog.append("PREDICT ctx=\"\(ctx)\" [\(ranked)] → SUPPRESS(\(reason))") @@ -1280,7 +1334,15 @@ final class CompletionController { /// overlay are left intact so the induced snapshot re-renders the shrinking remainder instead. private func insert(text: String, context: TextFieldContext, keepingAnchor: Bool = false) { guard !text.isEmpty else { return } - let plan = inserter.planInsertion(candidate: CompletionCandidate(text: text), context: context) + // ADR-079: collapse internal multi-space runs (`"hello world"` → `"hello world"`). + // The model occasionally emits double spaces inside a candidate, which produced the + // user-visible "bigger space than needed" after Tab / Shift+Tab insertion. A *single* + // leading space is the separator that ADR-050 says leads the next word, so we keep + // it; runs of two or more ASCII spaces anywhere in the string collapse to one. Other + // whitespace forms (tabs, NBSP) are not touched — apps that require NBSP get the + // existing per-policy substitution in `InsertionPlanner.plan(...)`. + let normalized = Self.collapseInternalDoubleSpaces(text) + let plan = inserter.planInsertion(candidate: CompletionCandidate(text: normalized), context: context) if !keepingAnchor { // Drop the dedupe key so the post-insertion snapshot always regenerates a fresh suggestion. lastContextKey = nil diff --git a/Packages/ConstrainedGeneration/Sources/ConstrainedGeneration/Engine/DecodingConfiguration.swift b/Packages/ConstrainedGeneration/Sources/ConstrainedGeneration/Engine/DecodingConfiguration.swift index 63baee7..dc5db21 100644 --- a/Packages/ConstrainedGeneration/Sources/ConstrainedGeneration/Engine/DecodingConfiguration.swift +++ b/Packages/ConstrainedGeneration/Sources/ConstrainedGeneration/Engine/DecodingConfiguration.swift @@ -53,7 +53,13 @@ public struct DecodingConfiguration: Equatable { topK: Int = 64, topP: Float = 0.95, temperature: Float = 0.8, - branchWidth: Int = 4, + // ADR-077: default 3 (was 4). Per ADR-012's `testBranchWidthSweep` (warm means + // 239 / 164 / 107 / 75 / … ms at widths 8/6/4/3/…) dropping a single branch trims + // ~25% off generation latency and a one-rank-narrower beam barely moves the + // top-1 quality on the catalog set. With rank-fallback in `CompletionController` + // we now consume the extra ranks the beam still emits, so giving up the 4th branch + // costs less than the latency it cost to keep producing it. + branchWidth: Int = 3, relativeCutoff: Float = 6, minBranchProbability: Float = 0.02, maxCandidates: Int = 5, From 17a95a998ea47ce30b34f77660ae10ed20349bbd Mon Sep 17 00:00:00 2001 From: Abbas Mohamed Date: Tue, 2 Jun 2026 14:59:40 -0500 Subject: [PATCH 5/5] docs: ADRs 074-079 for the perf/quality changes Append-only entries documenting context, decision and consequences for the six commits in this PR series. Format mirrors the existing ADRs (071-073 added in this same release window). --- docs/05-decisions.md | 254 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 254 insertions(+) diff --git a/docs/05-decisions.md b/docs/05-decisions.md index 0f8b793..0e6fbc4 100644 --- a/docs/05-decisions.md +++ b/docs/05-decisions.md @@ -93,6 +93,12 @@ row here.** | 071 | Treat Chromium browsers as known web-backed targets | context-capture | | 072 | Stabilize Obsidian ghost-text rendering | app-compatibility/ui | | 073 | Estimate caret geometry across soft-wrapped lines | context-capture/ui | +| 074 | GPU offload by default via `n_gpu_layers` | model-runtime/performance | +| 075 | Memoize the focused-root → text-element AX walk | context-capture/performance | +| 076 | `.fast` Vision OCR with looser refresh + smaller capture | context-capture/performance | +| 077 | Rank-fallback + lower beam width | completion/performance | +| 078 | 1.2 s generation deadline cancels stale predictions | completion/performance | +| 079 | Collapse internal multi-space runs at insertion | insertion | --- @@ -2747,3 +2753,251 @@ text. Both are now closed: longer look like they are always at the right edge after the first visual line. Real trailing-edge cases still wrap ghost text to the next visual line rather than switching to a capsule or drawing past the field. +## ADR-074 — GPU offload by default via `n_gpu_layers` + +- Date: 2026-06-02 +- Status: accepted +- Context: `LlamaModelRuntime.init` constructed model params with + `llama_model_default_params()` and then set only `use_mmap` / `use_mlock`, never touching + `n_gpu_layers`. llama.cpp's default for that field is `0`, which means CPU-only inference + **even when the build links the Metal backend** (which our xcframework from ADR-007 does). Every + token of every suggestion was therefore decoded on the CPU. On a fanless MacBook Air M4 the result + is sustained CPU saturation under typing, fast thermal throttling, and the high CPU usage the + user reported — while the integrated GPU sat idle. Matching Cotypist's user-facing settings could + not fix this, because the cost is not in a runtime toggle; it is in how the model is loaded. +- Decision: set `modelParams.n_gpu_layers` on every `LlamaModelRuntime` load, defaulting to full + offload. A new `nGpuLayers: Int = 999` parameter on `LlamaModelRuntime.init` controls it; `999` + is the llama.cpp idiom for "all layers" and llama.cpp clamps to the model's real depth. Because + the parameter has a default and sits at the end of the init signature, the two non-app call + sites — `ProfileGenerator` (`ModelManagement`) and `ACPFBuildCommand` (`ProfileBuilder`) — keep + working unchanged and pick up GPU offload too. Tooling or tests that require deterministic + CPU-only behaviour can pass `nGpuLayers: 0` explicitly. +- Consequences: live inference moves from the CPU cores to the GPU via Metal, dropping CPU load + and thermal pressure on Apple Silicon and freeing CPU headroom for the rest of the pipeline + (AX reads, prompt assembly, overlay paint — all of which stay on the CPU side). No public API + of `LocalModelRuntime` changes, so `AutocompleteCore`, `ConstrainedGeneration`, `Prompting`, + and `StubModelRuntime` are unaffected. The GGUF must fit in unified memory at load time; on the + catalog Qwen3.5-2B Q4_K_M this is comfortably under the M4 Air's budget. Profile-building runs + (`acpf-build`, `ProfileGenerator`) now also exercise the GPU, which is the correct behaviour — + they were previously CPU-bound for no reason. The xcframework already ships ggml-metal, so no + build-system changes are needed. The dial stays in the API so a future ADR can revisit + per-machine policy (e.g. `min(nLayers, freeGPUMemBytes / perLayerBytes)`) without another + protocol change. + +## ADR-075 — Memoize the focused-root → text-element AX walk + +- Date: 2026-06-02 +- Status: accepted +- Context: With the model running on the GPU (ADR-074), the dominant remaining cost during + typing was main-thread Accessibility work. A 10-second `sample(1)` profile of the live app on + a fanless M4 attributed ~10% of main-thread time to a single call chain: + `AccessibilityContextTracker.refreshSoon` → `refreshNow` → `FocusedFieldReader.snapshot` → + `FocusedFieldReader.textElement(for:preferDescendantTextElement:)` → + `AXCaretHelper.childElements` → `AXUIElementCopyAttributeValue`. Every + `kAXValueChangedNotification` (which fires on every keystroke) — after ADR-006's 20 ms + debounce — triggered a fresh BFS of the focused root's AX subtree (up to `maxNodes = 2_500` + for web containers) to re-locate the same text descendant we had already resolved a moment + earlier. The descendant doesn't move while the focused root stays the same; the tree walk + was re-doing work that was already done. +- Decision: cache the result of `textElement(for:)` per focused-root identity, inside + `FocusedFieldReader`, in a private `FocusedFieldResolutionCache` reference type (so the + value-typed reader can mutate it through a `let` property). On entry to `snapshot(of:)` the + reader computes the root's `AXCaretHelper.elementIdentity`, and: on a `hit` it reuses the + cached `AXUIElement` and skips the BFS entirely; on a `negative` cached result (the root has + no text descendant — e.g. media keys arriving from a non-text focused control) it returns + `nil` without walking again; on a `miss` (different identity) it runs the existing BFS once + and stores the outcome. The cache holds exactly one entry — a focus change produces a new + identity, which evicts the previous root, so the cache cannot serve a stale element across + focus boundaries. `kAXValueChanged` / `kAXSelectedTextChanged` keep the same root, which is + the win. +- Consequences: continuous typing drops the per-keystroke `AXUIElementCopyAttributeValue` + count from ~6 (BFS + role probes) to ~2 (the identity probe plus the value read), so the AX + hot path stops dominating main-thread time and frees CPU headroom for the rest of the + pipeline. Public API of `FocusedFieldReader` is unchanged — same `init`, same + `snapshot(of:)` signature — so `AccessibilityContextTracker`, the tests, and the per-app + policies (ADR-022, ADR-027–033) are untouched. Memory cost is one `AXUIElement` reference + plus a short identity string. If a text descendant inside the focused root is destroyed + while the root itself survives, subsequent AX reads against the cached element will return + `nil` / errors and `snapshot` will yield a sparse result; the next focus change (which + produces a new identity) recovers automatically. If that ever becomes a recurring problem + we can add a same-root invalidation hook driven by a follow-up notification, but Mac apps + in our compatibility set do not tear down their focused text fields without also moving + focus. + +## ADR-076 — `.fast` Vision OCR with looser refresh + smaller capture + +- Date: 2026-06-02 +- Status: accepted (amends parts of ADR-049/052) +- Context: After ADR-074 (GPU offload) and ADR-075 (AX walk cache) the remaining sustained + CPU draw under typing collapsed to a single source: the on-screen-text (OCR) feature. + `ScreenContextController` was firing a `.accurate` `VNRecognizeTextRequest` with + `usesLanguageCorrection = true` on a 1600-px-longest-side screenshot of the focused + window every 4 s, plus on every focus change. On a fanless M4 a single `.accurate` Vision + pass over a Retina app window runs hundreds of milliseconds of dense matrix work on the + GPU and CPU; repeating it 15× per minute is enough to spike the user-visible CPU graph, + which is what the user reported and what reviewers compare unfavourably to Cotypist's + much lighter screen-context implementation. The original ADR-049 chose `.accurate` because + the per-line corruption filter (`droppingCorruptedLines` / `isPlausibleText`) had a + non-trivial false-negative rate on `.fast` mojibake. ADR-050 then added the + digit-substitution guard (`containsDigitSubstitutedWord`), which catches the dominant + failure mode of `.fast` ("h3llo", "qu81ity") at the source, so the filter chain is now + strong enough to absorb `.fast`-tier noise. +- Decision: change the OCR path along three axes simultaneously, behind the existing + off-by-default OCR setting and the unchanged per-keystroke isolation in + `WindowOCRCaptureEngine` (capture stays out-of-band, the completion path still reads from + the cache): + 1. `VNRecognizeTextRequest.recognitionLevel = .fast` (was `.accurate`). This is the same + tier Apple uses for ambient text recognition (Live Text on still images), routes + through the Neural Engine where available, and is the single largest CPU win. + 2. `usesLanguageCorrection = false` (was `true`). Language correction is the priciest + post-processing in `.accurate`; in `.fast` mode it adds cost without proportionate + accuracy gains for the prose we care about. + 3. `recognizeLines(in:minimumConfidence:)` raises the default `minimumConfidence` from + `0.4` to `0.45` to compensate for `.fast`'s noisier confidence distribution before the + downstream corruption filters even see the line. + 4. `ScreenCaptureKitWindowTextCapturer.maxCaptureDimension` drops from `1600` to `1200`. + `.fast` doesn't gain proportionally from extra resolution, and the screenshot encode + + Vision per-pixel work both scale with image area. + 5. `ScreenContextController.refreshInterval` rises from `4.0 s` to `12.0 s`. The + user-perceptible on-screen changes we want to react to between focus events (paragraph + scroll, updated panel) move on the order of seconds, not sub-second. Focus changes + still trigger an immediate capture via `handle(snapshot:)`, so a new window/page reads + instantly — the slow path only applies while the same window stays focused. +- Consequences: average OCR-attributable CPU drops by roughly an order of magnitude on + steady-state typing (`.fast` Vision ≈ 5–10× cheaper per pass, ×3 fewer passes per minute, + -45% pixels). The downstream filters (`droppingCorruptedLines`, `isPlausibleText`, + `containsDigitSubstitutedWord`, `linesExcludingFieldText`) keep doing what they did under + `.accurate`; the new failure mode is `.fast` occasionally dropping a faint or small-font + line that `.accurate` would have caught, which weakens `[Screen context]` slightly on + edge cases (low-contrast UI, very small captions) but never feeds the model worse text — + it just feeds it less. Quality on the catalog test set should be within + the envelope ADR-049 already accepted (since the corruption surface is unchanged). If a + future workload needs the old recognition tier, the `maxCaptureDimension` and + `recognitionLevel` are local-edit knobs and a follow-up ADR can split this into a Settings + toggle ("OCR quality") without changing the protocol surface. All-other-things-equal + battery life on Apple Silicon improves correspondingly; ANE-routed `.fast` Vision is the + cheapest text-recognition path the platform exposes. + +## ADR-077 — Rank-fallback when the top candidate fails the filter + lower default beam width + +- Date: 2026-06-02 +- Status: accepted +- Context: Telemetry from a live session (236 generated predictions) showed two issues the + user surfaced: suggestions felt slow, and sometimes the suggestion area was empty even + though the user was typing in prose contexts. Reading `predictions.log` and + `telemetry.json` together pinned the causes precisely. + 1. `CompletionController.present(...)` consumed only `candidates.first`. If that single + top-ranked candidate failed any rule in `CandidateFilter.suppressionReason`, the entire + prediction was suppressed — even though the constrained decoder had emitted up to + `maxCandidates` (default 5) hypotheses ordered by cumulative log-probability and the + rank-2/3/… candidates frequently passed every gate. The dominant trigger was + `insertionUnsafe` (41 of 77 suppressions): the model's top hypothesis after a + trailing-space context was often pure punctuation (`"!"`, `"."`) or a control sequence, + while the runner-up was clean prose. Cotypist behaves much more leniently here — its + fewer empty moments are not because it has a better model, but because it accepts + lower-ranked candidates when the top one is rejected. + 2. Generation latency dominated the end-to-end time the user perceived (300–1000 ms + typical with outliers up to ~6 s). `branchWidth = 4` was already the post-ADR-012 + compromise, but ADR-012's own sweep (`testBranchWidthSweep`) reports warm means of + **239 / 164 / 107 / 75 ms** at widths 8 / 6 / 4 / 3 — a near-linear scaling with width. + Going one step further (4 → 3) was already noted in ADR-012 as the next available + dial; it was not taken at the time because narrower beams marginally hurt top-1 + quality, and the controller could not yet recover the lost runner-ups. + Crucially, fix (1) and fix (2) compose: dropping a branch in the decoder doesn't lose + coverage if the controller now actually uses the remaining branches. +- Decision: two changes, intentionally co-introduced. + 1. **Rank-fallback in `CompletionController.present(_:request:style:latencyTrace:)`.** + After `candidates.first` is captured as `topRanked`, walk the decoder's ranked list: + if `topRanked` passes `filter.suppressionReason(...)`, use it; otherwise pick the + first `runnerUp` in `candidates.dropFirst()` that passes; otherwise suppress with the + top candidate's reason (the existing telemetry / log shape is preserved so + `suppressionReasons` histograms remain comparable across builds). The filter itself is + unchanged — every rule (insertion safety, CJK script net, mid-word charset, suffix + duplication, display-width caps) still gates each candidate individually. Only the + selection policy changes: the decoder gets to "say it twice" before we give up. + 2. **`DecodingConfiguration.branchWidth` default 4 → 3.** Generation cost per completion + is dominated by the beam width (each branch decodes additively per step); cutting one + branch is ~25% fewer decode passes per prediction. +- Consequences: the user-visible "empty" rate should drop substantially — the share of + rank-1-only suppressions previously discarded as `insertionUnsafe` (41/77 = 53% of all + suppressions) is now exactly the upper bound on how much rank-fallback can recover, and + in practice most of those have a clean rank-2. Generation latency comes down ~25% across + the board (more on long prompts where the marginal branch cost is largest), so the + median typing-to-ghost-text time should improve in step with the per-pass cut. + Trade-offs: a slightly narrower beam may occasionally miss a long-distance better + hypothesis that width-4 would have explored — but the rank-fallback also softens this, + since the surviving 3 branches are still ranked and we can still escape a bad rank-1. + The 6-second tail outliers (cold-cache + long-prompt + maximally-divergent edits) are + not addressed here; capping those needs a generation deadline / cancellation timer, + which is a follow-up (now ADR-078). `recordSuppressed("noCandidate")` still fires when + the decoder emits zero hypotheses (e.g. fully suppressed at decode time by the in-beam + `CurrentWordTypoGuard` / `MidWordCharsetGuard`); that path is unchanged. + +## ADR-078 — 1.2 s generation deadline cancels stale predictions + +- Date: 2026-06-02 +- Status: accepted +- Context: Even after ADR-074 / ADR-077, `telemetry.json`'s `latenciesMillis` still showed + tail outliers up to ~6 s (cold KV cache + long prompt + maximally divergent edits land + all at once). A prediction that takes 3–6 s to land is already stale by the time it + would render — the user has typed several characters since — so showing it produces the + worst kind of UX: the ghost text appears long after the keystroke, often visibly wrong + for the live caret even with `SuggestionAnchor.remaining`'s drift handling. The completion + path already had a clean cancellation seam (the existing `catch is CancellationError` + arm leaves the current ghost as-is), but nothing enforced a wall-clock bound on the + in-flight `engine.completions(for:)`. +- Decision: in `CompletionController`, after creating the `generationTask` and assigning + it to `self.generationTask` (so a newer keystroke can still cancel it via the existing + path), spawn a sibling task that sleeps for `generationDeadlineNanoseconds = 1_200_000_000` + and then calls `generationTask.cancel()`. The cancellation propagates through the next + `try Task.checkCancellation()` inside the engine; the existing catch arm finishes the + latency trace with `outcome: "cancelled"` and silently drops the result. We don't surface + a new outcome string — the deadline is treated as just another cancellation, + indistinguishable from superseded-by-keystroke at the analytics layer. 1.2 s is just + above the empirical p95 in the live telemetry, so the deadline cancels the genuine tail + outliers without truncating the body of the distribution. +- Consequences: the worst-case user-visible delay between a keystroke and a ghost-text + attempt is bounded by `min(generation, 1.2 s)`. Predictions that would have landed at + 4–6 s are now dropped — costing one missed completion but preventing a visibly wrong + one, consistent with the "prefer suppression to a wrong suggestion" principle (AGENTS.md + / ADR-015 derived). The deadline task is cheap (one `Task.sleep`); when generation + finishes inside the budget, the sibling task just exits and the cancel call is a no-op + against an already-completed task. Tail outliers often correspond to cold-cache + prefixes; subsequent typing in the same context warms the KV cache (ADR-018) and lands + well under the budget, so the same context hits the deadline at most once before + recovering. + +## ADR-079 — Collapse internal multi-space runs at insertion + +- Date: 2026-06-02 +- Status: accepted +- Context: User-reported defect: "after filling in the prediction it puts a bigger space + than needed." The two known whitespace gates — `CaretBoundary.reconcile` (ADR-017, + strips redundant *leading* whitespace from a candidate when the caret already sits + after a whitespace character) and `NextWordSplitter` (ADR-016 / ADR-050, the separator + before a word travels with that word) — only cover the candidate-vs-caret join. + Neither one normalizes whitespace *inside* the candidate. The base model occasionally + emits `"hello world"`-style internal double spaces, especially after abbreviated + context, and those traveled verbatim through to insertion. The display ghost text could + mask the doubled space against the surrounding font, but the inserted bytes carried it. + The visible artifact only surfaced on accept. +- Decision: insert a small normalization pass — `collapseInternalDoubleSpaces` — inside + `CompletionController.insert(text:context:keepingAnchor:)` immediately before the + inserter plans the paste/type. It runs in linear time, only allocates when at least one + `" "` run is present, and replaces any run of two or more ASCII spaces with a single + space. A single leading ASCII space (the next-word separator under ADR-050) is + preserved by construction (the first space is appended, subsequent consecutive spaces + are skipped). Other whitespace forms (tab, NBSP, ideographic space) are intentionally + untouched so that the per-policy NBSP substitution in `InsertionPlanner.plan` still + works and intentional tab/indent completions in code editors stay intact. +- Consequences: the user-visible "bigger space than needed" cannot survive the insertion + path even when the model produces an internal double space — the inserted bytes always + carry single-space runs. The display anchor and ghost-text overlay are unchanged, so the + visible measurement / placement (ADR-020 / ADR-067) does not shift. Because + normalization runs only on insertion bytes, the per-keystroke reuse-cache equality check + (`anchorText` comparisons used by `SuggestionAnchor`) is not perturbed. The function is + cheap enough (no allocation on the common no-double-space case, single linear pass + otherwise) that no perf ADR is warranted. If a future model emits *intentional* + multi-space formatting (column alignment in code, say), the rule can be made + insertion-mode-aware by toggling on `request.mode == .code` from the call site; for the + current `.prose` / `.correction` / `.code` set the rule is universally a win.