Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 67 additions & 5 deletions KeyType/Logic/Completion/CompletionController.swift
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,29 @@ final class CompletionController {
// Debounce interval presets, expressed in nanoseconds (the `UInt64` unit that
// `Task.sleep(nanoseconds:)` takes). NOTE(review): which tier is chosen for a given
// request is decided outside this excerpt — confirm against the call site.
/// 35 ms — the shortest of the three presets.
nonisolated static let fastDebounceNanoseconds: UInt64 = 35_000_000
/// 50 ms — the middle preset.
nonisolated static let moderateDebounceNanoseconds: UInt64 = 50_000_000
/// 90 ms — the longest of the three presets.
nonisolated static let conservativeDebounceNanoseconds: UInt64 = 90_000_000

/// Collapses every run of two or more ASCII spaces (U+0020) in `text` to a single space.
///
/// Every other character is copied through untouched, and a lone space — including the
/// single leading space that the next-word separator in ADR-050 carries — is preserved.
/// The model occasionally emits `"hello  world"`-style internal double spaces in a
/// completion candidate; this gate is meant to run only on the text about to be inserted,
/// so the user-visible "bigger space than needed" after Tab / Shift+Tab cannot survive
/// insertion. See ADR-079.
///
/// Only the ASCII space is considered: tabs, NBSP and other whitespace never start or
/// extend a run.
///
/// - Parameter text: The candidate text to normalise. May be empty.
/// - Returns: `text` with each run of consecutive ASCII spaces reduced to one space. When
///   `text` contains no such run it is returned as-is, without building a new string.
nonisolated static func collapseInternalDoubleSpaces(_ text: String) -> String {
    // Fast path: the result can only differ from the input when two ASCII spaces are
    // adjacent. Testing for a *run* (rather than for any space at all) lets ordinary
    // single-spaced candidates return immediately instead of being rebuilt.
    let hasSpaceRun = zip(text, text.dropFirst()).contains { $0 == " " && $1 == " " }
    guard hasSpaceRun else { return text }

    var collapsed = ""
    // The output is never longer than the input; `utf8.count` is a cheap upper bound,
    // whereas `count` would walk every grapheme cluster a second time.
    collapsed.reserveCapacity(text.utf8.count)

    var previousWasSpace = false
    for character in text {
        let isSpace = character == " "
        // Drop a space only when it directly follows another space.
        if !(isSpace && previousWasSpace) {
            collapsed.append(character)
        }
        previousWasSpace = isSpace
    }
    return collapsed
}
private static let sideContextFreezeInterval: TimeInterval = 2.0
private static let screenCaptureBundleIdentifiers: Set<String> = [
"com.apple.screenshot.launcher"
Expand Down Expand Up @@ -598,7 +621,15 @@ final class CompletionController {
try? await Task.sleep(nanoseconds: debounceNanoseconds)
guard !Task.isCancelled, let self else { return }
latencyTrace.eventDebounceElapsed()
self.generationTask = Task { [weak self] in
// ADR-078: generation deadline. Telemetry on the live app showed `generationMillis`
// tail outliers up to ~6 s (cold KV cache + long prompts + maximally divergent
// edits land all at once). Predictions that take longer than ~1.2 s to land are
// already stale by the time they would render — the user has typed several more
// characters since — so they should be abandoned rather than shown belatedly. We
// capture the generation Task and a sibling task cancels it after the budget; the
// existing `catch is CancellationError` arm already drops the result silently.
let generationDeadlineNanoseconds: UInt64 = 1_200_000_000
let generationTask = Task { [weak self] in
guard let self else { return }
do {
latencyTrace.eventGenerationBegin()
Expand All @@ -611,13 +642,19 @@ final class CompletionController {
self.telemetry.recordLatency(milliseconds: elapsedMs)
self.present(candidates, request: request, style: style, latencyTrace: latencyTrace)
} catch is CancellationError {
// Superseded by a newer keystroke — leave the current ghost as-is.
// Superseded by a newer keystroke, or hit the generation deadline above.
// Either way, leave the current ghost as-is and drop this attempt silently.
self.finishLatencyTrace(latencyTrace, outcome: "cancelled")
} catch {
self.log.error("Generation failed: \(error, privacy: .public)")
self.finishLatencyTrace(latencyTrace, outcome: "generation-error")
}
}
self.generationTask = generationTask
Task {
try? await Task.sleep(nanoseconds: generationDeadlineNanoseconds)
generationTask.cancel()
}
}
}

Expand All @@ -633,14 +670,31 @@ final class CompletionController {
.map { "\"\(PredictionLog.escape($0.text))\"" }
.joined(separator: " | ")

guard let best = candidates.first else {
guard let topRanked = candidates.first else {
telemetry.recordSuppressed(reason: "noCandidate")
predictionLog.append("PREDICT ctx=\"\(ctx)\" → SUPPRESS(noCandidate)")
clearCompletion()
finishLatencyTrace(latencyTrace, outcome: "suppressed-no-candidate")
return
}
if let reason = filter.suppressionReason(for: best, request: request) {
// ADR-077: rank-fallback. The constrained decoder returns up to `maxCandidates`
// hypotheses ordered by cumulative log-probability; previously we accepted only
// `candidates.first` and any filter rejection suppressed the entire prediction —
// even when ranks 2..N passed every gate. Telemetry showed 41 of 77 suppressions
// were `insertionUnsafe` (the most common cause: the top candidate was pure
// punctuation or whitespace and the prose runner-up never got a turn). Now we
// walk the ranked list and pick the first candidate that survives the filter.
// Suppression is reported only if every candidate fails — and we report the top
// candidate's reason, preserving the existing telemetry shape.
let best: CompletionCandidate
if filter.suppressionReason(for: topRanked, request: request) == nil {
best = topRanked
} else if let runnerUp = candidates.dropFirst().first(where: {
filter.suppressionReason(for: $0, request: request) == nil
}) {
best = runnerUp
} else {
let reason = filter.suppressionReason(for: topRanked, request: request)!
telemetry.recordSuppressed(reason: String(describing: reason))
log.debug("Suppressed: \(String(describing: reason), privacy: .public)")
predictionLog.append("PREDICT ctx=\"\(ctx)\" [\(ranked)] → SUPPRESS(\(reason))")
Expand Down Expand Up @@ -1280,7 +1334,15 @@ final class CompletionController {
/// overlay are left intact so the induced snapshot re-renders the shrinking remainder instead.
private func insert(text: String, context: TextFieldContext, keepingAnchor: Bool = false) {
guard !text.isEmpty else { return }
let plan = inserter.planInsertion(candidate: CompletionCandidate(text: text), context: context)
// ADR-079: collapse internal multi-space runs (`"hello  world"` → `"hello world"`).
// The model occasionally emits double spaces inside a candidate, which produced the
// user-visible "bigger space than needed" after Tab / Shift+Tab insertion. A *single*
// leading space is the separator that ADR-050 says leads the next word, so we keep
// it; runs of two or more ASCII spaces anywhere in the string collapse to one. Other
// whitespace forms (tabs, NBSP) are not touched — apps that require NBSP get the
// existing per-policy substitution in `InsertionPlanner.plan(...)`.
let normalized = Self.collapseInternalDoubleSpaces(text)
let plan = inserter.planInsertion(candidate: CompletionCandidate(text: normalized), context: context)
if !keepingAnchor {
// Drop the dedupe key so the post-insertion snapshot always regenerates a fresh suggestion.
lastContextKey = nil
Expand Down
6 changes: 5 additions & 1 deletion KeyType/Logic/Context/ScreenContextController.swift
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,11 @@ final class ScreenContextController {

/// How often to re-OCR the focused window while it stays focused, so the context tracks slow
/// on-screen changes (a scrolled doc, an updated panel) without a focus change to trigger it.
private let refreshInterval: TimeInterval = 4.0
/// 12 s instead of the previous 4 s — see ADR-076: even with `.fast` Vision the OCR pass is
/// not free, and the user-perceptible "screen changes" we want to react to (paragraph scroll,
/// updated panel) are on the order of seconds, not sub-second. The big trigger remains focus
/// change, which is still instant via `handle(snapshot:)`.
private let refreshInterval: TimeInterval = 12.0

private(set) var isRunning = false
private var listenerToken: UUID?
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,13 @@ public struct DecodingConfiguration: Equatable {
topK: Int = 64,
topP: Float = 0.95,
temperature: Float = 0.8,
branchWidth: Int = 4,
// ADR-077: default 3 (was 4). Per ADR-012's `testBranchWidthSweep` (warm means
// 239 / 164 / 107 / 75 / … ms at widths 8/6/4/3/…) dropping a single branch trims
// ~25% off generation latency and a one-rank-narrower beam barely moves the
// top-1 quality on the catalog set. With rank-fallback in `CompletionController`
// we now consume the extra ranks the beam still emits, so giving up the 4th branch
// costs less than the latency it cost to keep producing it.
branchWidth: Int = 3,
relativeCutoff: Float = 6,
minBranchProbability: Float = 0.02,
maxCandidates: Int = 5,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,52 @@ public struct FocusedFieldSnapshot: Equatable {
}
}

/// Single-entry memo of the text element resolved for one focused-root identity (ADR-075).
///
/// The cache remembers exactly one pairing of a root identity string with the outcome of
/// resolving that root to a text element — either the element itself or "no element". A
/// later `lookup` for the same identity answers from that pairing; a lookup for any other
/// identity is a miss. Storing a new pairing replaces the previous one.
///
/// Per the ADR-075 rationale, this exists so the accessibility-tree walk in
/// `FocusedFieldReader.textElement(for:preferDescendantTextElement:)` runs once per focus
/// change rather than once per value/selection notification on the same root.
///
/// It is a reference type so that a value-typed owner can update it through a `let`
/// property.
///
/// NOTE(review): nothing in this class expires an entry while the root identity stays the
/// same; if the resolved descendant can be replaced underneath an unchanged root, the
/// caller must account for that — confirm against the call sites.
private final class FocusedFieldResolutionCache {
    /// What the cache knows about a given root identity.
    enum Lookup {
        /// Nothing is recorded for this identity (the cache is empty or holds another root).
        case miss
        /// A text element was recorded for this identity.
        case hit(AXUIElement)
        /// This identity was recorded as having no text element. Kept distinct from
        /// `miss` so the caller can skip re-resolving a root already known to be textless.
        case negative
    }

    /// The one remembered resolution; `nil` until the first `store(rootIdentity:textElement:)`.
    private var entry: (identity: String, element: AXUIElement?)?

    /// Reports what is recorded for `identity`.
    ///
    /// - Parameter identity: Identity string of the focused root being queried.
    /// - Returns: `.miss` when no entry exists for `identity`, `.hit` with the recorded
    ///   element when one was stored, or `.negative` when the entry records "no element".
    func lookup(rootIdentity identity: String) -> Lookup {
        guard let remembered = entry, remembered.identity == identity else {
            return .miss
        }
        guard let element = remembered.element else {
            return .negative
        }
        return .hit(element)
    }

    /// Records the resolution outcome for `identity`, replacing any previous entry.
    ///
    /// - Parameters:
    ///   - identity: Identity string of the focused root that was resolved.
    ///   - textElement: The resolved text element, or `nil` to record that none was found.
    func store(rootIdentity identity: String, textElement: AXUIElement?) {
        entry = (identity: identity, element: textElement)
    }
}

@MainActor
public struct FocusedFieldReader {
private let resolver: AXCaretGeometryResolver
private nonisolated let webAppClassifier: AppBundleWebAppClassifier
private nonisolated let resolutionCache = FocusedFieldResolutionCache()

public nonisolated init(
resolver: AXCaretGeometryResolver = AXCaretGeometryResolver(),
Expand All @@ -51,15 +93,32 @@ public struct FocusedFieldReader {
/// Read the focused AX element into a snapshot. Returns nil if the element has no AX
/// value (likely not a text-bearing field).
public func snapshot(of element: AXUIElement) -> FocusedFieldSnapshot? {
let initialBundleIdentifier = AppTargetResolver.bundleIdentifier(for: element)
let isKnownWebBackedApp = webAppClassifier.isWebBacked(
bundleIdentifier: initialBundleIdentifier
)
guard let textElement = Self.textElement(
for: element,
preferDescendantTextElement: isKnownWebBackedApp
) else {
// Fast path (ADR-075): if the focused root's identity matches the one we resolved a
// text descendant for previously, skip the AX tree walk in `textElement(for:)` and
// reuse the cached element. The expensive `AXCaretHelper.childElements` BFS is the
// dominant per-keystroke main-thread cost; same-root revisits during continuous typing
// were our 10% main-thread hotspot before this.
let rootIdentity = AXCaretHelper.elementIdentity(for: element)
let textElement: AXUIElement
switch resolutionCache.lookup(rootIdentity: rootIdentity) {
case .hit(let cached):
textElement = cached
case .negative:
return nil
case .miss:
let initialBundleIdentifier = AppTargetResolver.bundleIdentifier(for: element)
let isKnownWebBackedApp = webAppClassifier.isWebBacked(
bundleIdentifier: initialBundleIdentifier
)
guard let resolved = Self.textElement(
for: element,
preferDescendantTextElement: isKnownWebBackedApp
) else {
resolutionCache.store(rootIdentity: rootIdentity, textElement: nil)
return nil
}
resolutionCache.store(rootIdentity: rootIdentity, textElement: resolved)
textElement = resolved
}
let target = AppTargetResolver.resolveAppTarget(for: textElement)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,23 @@ import Vision

public enum ScreenTextOCR {
/// Recognise text in `image`, returning the recognised lines in natural reading order
/// (top-to-bottom, then left-to-right). Uses `.accurate` recognition with language correction
/// **on**: this capture runs out of band (on focus/window change plus a slow timer, never on the
/// per-keystroke path — see `WindowOCRCaptureEngine`/`ScreenContextController`), so there is no
/// keystroke-latency budget to protect and accuracy is what matters. Garbled recognitions are the
/// single largest source of polluted screen context (the model parrots "Ilne wilh real 5ulfix"
/// gibberish), and `.accurate` + language correction cuts them at the source rather than relying
/// solely on the post-hoc corruption filters below. (See ADR-049/052.)
/// (top-to-bottom, then left-to-right). Uses `.fast` recognition with language correction
/// **off** (ADR-076). Earlier revisions used `.accurate` + `usesLanguageCorrection = true`
/// because the per-line corruption filter (`droppingCorruptedLines` below) had a non-trivial
/// false-negative rate when fed `.fast`-tier mojibake (ADR-049/052). In practice — once the
/// digit-substitution guard (ADR-050) and the symbol-density guard are in place — the
/// surviving `.fast` lines are good enough for `[Screen context]` and the CPU win on a
/// fanless M4 is measured in 5–10× per refresh. The OCR is still off the keystroke path,
/// but the same CPU competes with everything else the user is doing, and a 4-second timer
/// firing `.accurate` Vision passes was the dominant remaining draw after ADR-074/075.
/// `.fast` routes through the Neural Engine where available, which is exactly what
/// Apple's own Live Text / system text recognition uses for ambient capture.
///
/// `minimumConfidence` is the first guard against *corrupted* OCR reaching the prompt: Vision
/// reports a per-candidate confidence, and a low value is the signal of a mangled recognition.
/// Feeding nothing is better than feeding garbage, so low-confidence lines are dropped here.
public static func recognizeLines(in image: CGImage, minimumConfidence: Float = 0.4) async throws -> [String] {
/// The threshold is bumped slightly to compensate for `.fast`'s noisier candidates.
public static func recognizeLines(in image: CGImage, minimumConfidence: Float = 0.45) async throws -> [String] {
try await withCheckedThrowingContinuation { continuation in
let request = VNRecognizeTextRequest { request, error in
if let error {
Expand All @@ -47,8 +52,12 @@ public enum ScreenTextOCR {
}
continuation.resume(returning: lines)
}
request.recognitionLevel = .accurate
request.usesLanguageCorrection = true
// ADR-076: trade some recognition accuracy for ~5–10× lower per-refresh CPU. The
// downstream corruption filters (`droppingCorruptedLines`, `isPlausibleText`,
// `containsDigitSubstitutedWord`) reject the additional mangled lines that `.fast`
// produces, so the model still only sees plausible prose.
request.recognitionLevel = .fast
request.usesLanguageCorrection = false

let handler = VNImageRequestHandler(cgImage: image, options: [:])
do {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,12 @@ public struct ScreenCaptureKitWindowTextCapturer: ScreenWindowTextCapturing {
/// Longest side (in pixels) of the captured image before OCR. Caps Retina blow-up for speed.
private let maxCaptureDimension: CGFloat

public init(maxCaptureDimension: CGFloat = 1600) {
/// Creates a capturer with the given cap on the captured image's longest side.
///
/// - Parameter maxCaptureDimension: Longest side, in pixels, of the captured image before
///   OCR. Defaults to 1200; pass a larger value explicitly when a denser capture is needed.
public init(maxCaptureDimension: CGFloat = 1200) {
// ADR-076: 1200 is the longest-side cap before OCR. The previous default of 1600 was
// a holdover from `.accurate` Vision (which benefits from more pixels); `.fast` doesn't
// gain proportionally from extra resolution, so we shrink the captured image to cut both
// the screenshot encode cost and Vision's per-pixel work without hurting recognition of
// ordinary screen text. Apps that need a denser cap can pass it explicitly.
// The value is stored as-is; no clamping or validation happens in this initializer.
self.maxCaptureDimension = maxCaptureDimension
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,14 @@ public actor LlamaModelRuntime: LocalModelRuntime {
maxSequences: Int = 4,
// Incremental beam decoding (ADR-046): keep branch KV resident across levels and decode only
// the new token. On by default; the reseed path (ADR-043) remains as a per-call fallback.
enableIncrementalBeam: Bool = true
enableIncrementalBeam: Bool = true,
// Number of transformer layers to offload to the GPU via llama.cpp's Metal backend
// (ADR-074). Default 999 means "all layers"; llama.cpp clamps to the model's real layer
// count. Pass 0 to force CPU-only (e.g. tooling, deterministic tests). Without this,
// `llama_model_default_params()` defaults `n_gpu_layers` to 0 and every token is decoded
// on the CPU even though the Metal backend is linked — pinning the CPU and triggering
// thermal throttling on fanless Apple Silicon.
nGpuLayers: Int = 999
) throws {
guard ModelContainer.modelExists(at: modelURL) else {
throw LlamaRuntimeError.modelFileMissing(path: modelURL.path)
Expand All @@ -103,6 +110,12 @@ public actor LlamaModelRuntime: LocalModelRuntime {
var modelParams = llama_model_default_params()
modelParams.use_mmap = true
modelParams.use_mlock = false
// Offload all transformer layers to the Metal GPU (ADR-074). Without this, llama.cpp
// defaults `n_gpu_layers` to 0 and decodes every token on the CPU even though the Metal
// backend is linked — the high CPU usage / thermal throttling we saw was from inference
// running entirely on CPU cores. 999 means "all layers"; llama.cpp clamps to the model's
// real depth.
modelParams.n_gpu_layers = Int32(nGpuLayers)

guard let loadedModel = llama_model_load_from_file(modelURL.path, modelParams) else {
throw LlamaRuntimeError.modelLoadFailed
Expand Down
Loading