diff --git a/Cotabby/Models/LlamaRuntimeModels.swift b/Cotabby/Models/LlamaRuntimeModels.swift index 846529a..8a24d4a 100644 --- a/Cotabby/Models/LlamaRuntimeModels.swift +++ b/Cotabby/Models/LlamaRuntimeModels.swift @@ -193,9 +193,17 @@ struct LlamaGenerationOptions: Equatable, Sendable { /// Defaults to -infinity, which disables suppression entirely. var confidenceFloor: Double = -.infinity - /// Beam width for the constrained decoder. 1 is the single-path greedy decode; values > 1 run a + /// Routes generation through the deterministic constrained decoder (logit read + admissibility + /// mask + argmax + manual token commit) instead of the engine's built-in stochastic sampler. + /// Default off so the shipping sampleNext path is unaffected until the constrained decoder is + /// validated on device. Changing it does not affect KV reuse, so it is intentionally excluded + /// from `SamplingFingerprint`. + var useConstrainedDecoder: Bool = false + + /// Beam width for the constrained decoder. 1 keeps the single-path greedy decode; values > 1 run a /// multi-branch beam search that explores several short continuations and keeps the highest-scoring - /// one. Does not affect KV reuse, so it is excluded from `SamplingFingerprint`. + /// one. Only consulted when `useConstrainedDecoder` is true. Like `useConstrainedDecoder`, it does + /// not affect KV reuse, so it is excluded from `SamplingFingerprint`. var beamWidth: Int = 1 /// When set (and the model is FIM-capable), the runtime builds a fill-in-middle prompt from the diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift index a62bdc1..8aa4102 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift @@ -212,9 +212,12 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { autocompleteSamplingFingerprint = fingerprint } - // The KV-trim defer above runs after whichever decoder returns. Greedy and beam share the - // prepared sequence and the same confidence-suppression contract; they differ only in how far - // they explore before committing each token. + // The KV-trim defer above runs after whichever decoder returns. Both decoders share the + // prepared sequence and the same confidence-suppression contract; they differ only in how + // they pick each token (engine sampler vs. deterministic constrained selection). + guard options.useConstrainedDecoder else { + return runEngineSampledDecode(sequenceID: sequenceID, options: options) + } return options.beamWidth > 1 ? try runConstrainedBeamDecode( sequenceID: sequenceID, @@ -267,6 +270,57 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { return SentenceBoundaryClassifier.endsSentence(decoded) } + /// The shipping decoder: delegates token selection to the engine's built-in sampler + /// (`sampleNext`), which applies temperature / top-k / top-p / min-p and commits each token. + private func runEngineSampledDecode(sequenceID: Int32, options: LlamaGenerationOptions) -> String { + var generatedText = "" + var tokensGenerated = 0 + var sumLogprob = 0.0 + var stopReason = "budget_exhausted" + + for _ in 0 ..< options.maxPredictionTokens { + // Cooperative cancellation: when the wrapping Task is cancelled (caller hit a new + // keystroke, focus changed, Compose started), bail before the next sampleNext call so + // we release `autocompleteLock` instead of running the full prediction budget and + // making the next autocomplete wait behind us. + if Task.isCancelled { + stopReason = "cancelled" + break + } + + let result = engine.sampleNext(sequenceID) + + if result.was_cancelled { + stopReason = "engine_cancelled" + break + } + if result.is_eos { + stopReason = "eos" + break + } + + let piece = Self.extractPiece(result) + generatedText += piece + tokensGenerated += 1 + sumLogprob += Double(result.logprob) + } + + CotabbyLogger.runtime.debug( + "Decode end", + metadata: [ + "kind": .string("generate"), + "tokens_generated": .stringConvertible(tokensGenerated), + "chars_generated": .stringConvertible(generatedText.count), + "stop_reason": .string(stopReason) + ] + ) + + if Self.shouldSuppress(sumLogprob: sumLogprob, tokensGenerated: tokensGenerated, options: options) { + return "" + } + return generatedText + } + /// The constrained decoder: reads the raw next-token logits, masks structural / excluded tokens /// via the token profile, deterministically selects the highest-logit admissible token, and /// commits it manually with `acceptToken`. This trades the sampler's randomness for reproducible, @@ -850,6 +904,15 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { return [] } + private static func extractPiece(_ result: SampleResult) -> String { + guard let piece = result.piece, result.piece_length > 0 else { return "" } + let buffer = UnsafeBufferPointer( + start: UnsafeRawPointer(piece).assumingMemoryBound(to: UInt8.self), + count: Int(result.piece_length) + ) + return String(bytes: buffer, encoding: .utf8) ?? "" + } + private static func samplingConfig(from options: LlamaGenerationOptions) -> SamplingConfig { SamplingConfig( max_prediction_tokens: Int32(options.maxPredictionTokens), diff --git a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift index df2d8e1..be4d991 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift @@ -107,8 +107,8 @@ final class LlamaRuntimeManager: ObservableObject { do { // `Task.detached` does not inherit the caller's cancellation, so an outer cancel // would otherwise leave `core.generate` running to its full prediction budget while - // holding `autocompleteLock`. The handler forwards the cancel signal, and the decode - // loop inside `core.generate` polls `Task.isCancelled` between token steps. + // holding `autocompleteLock`. The handler forwards the cancel signal, and the loop + // inside `core.generate` polls `Task.isCancelled` between sampleNext calls. let task = Task.detached { try core.generate( prompt: prompt, diff --git a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift index d70ce7f..fdbb961 100644 --- a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift +++ b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift @@ -13,19 +13,39 @@ final class LlamaSuggestionEngine { private let runtimeManager: LlamaRuntimeGenerating private var promptCacheHintTracker = LlamaPromptCacheHintTracker() - /// UserDefaults key (no UI) for the constrained decoder's beam width. 1 is greedy (the shipping - /// default); a value > 1 runs a multi-branch beam search. Kept as a tuning knob, not a feature - /// gate: beam-by-default still needs the batched-beam decode work to avoid per-branch latency. + /// UserDefaults key (no UI) that routes llama generation through the deterministic constrained + /// decoder instead of the engine's stochastic sampler. Default-off: decode quality can only be + /// judged with a real model in a real field, so this stays a hidden developer/dogfood toggle + /// until it is validated on device and promoted to the default. + private static let constrainedDecoderDefaultsKey = "cotabbyConstrainedDecoderEnabled" + private static var isConstrainedDecoderEnabled: Bool { + UserDefaults.standard.bool(forKey: constrainedDecoderDefaultsKey) + } + + /// UserDefaults key (no UI) for the constrained decoder's beam width. Default 1 keeps the existing + /// single-path greedy decode; a value > 1 runs a multi-branch beam search. Paired with the + /// constrained-decoder flag as a hidden developer/dogfood knob until validated on device. private static let constrainedBeamWidthDefaultsKey = "cotabbyConstrainedBeamWidth" private static var constrainedBeamWidth: Int { let stored = UserDefaults.standard.integer(forKey: constrainedBeamWidthDefaultsKey) return stored > 0 ? stored : 1 } - /// The fill-in-middle request for a generation, or nil to use the forward base prompt. Built when - /// the caret is genuinely mid-line (real text follows it on the same line); the runtime still falls - /// back to the base prompt when the model lacks the FIM marker tokens. + /// UserDefaults key (no UI) for fill-in-middle prompting. Default off: FIM only helps models that + /// ship the FIM marker tokens, and it changes the prompt structure, so it stays a developer toggle + /// until validated on device. + private static let fillInMiddleDefaultsKey = "cotabbyFillInMiddleEnabled" + private static var isFillInMiddleEnabled: Bool { + UserDefaults.standard.bool(forKey: fillInMiddleDefaultsKey) + } + + /// The fill-in-middle request for a generation, or nil to use the forward base prompt. Built only + /// when the flag is on and the caret is genuinely mid-line (real text follows it on the same line); + /// the runtime still falls back to the base prompt when the model lacks FIM markers. private static func fillInMiddleRequest(for request: SuggestionRequest) -> FillInMiddleRequest? { + guard isFillInMiddleEnabled else { + return nil + } // A caret at end of line wants a forward continuation, not infilling — even if `trailingText` // is non-empty because a line break and later paragraphs follow it. Gating on end-of-line // (rather than `trailingText.isEmpty`) keeps FIM to the case it is actually meant for. @@ -76,6 +96,7 @@ final class LlamaSuggestionEngine { precedingText: request.context.precedingText, trailingText: request.context.trailingText ), + useConstrainedDecoder: Self.isConstrainedDecoderEnabled, beamWidth: Self.constrainedBeamWidth, fillInMiddle: Self.fillInMiddleRequest(for: request) )