diff --git a/Cotabby/Models/LlamaRuntimeModels.swift b/Cotabby/Models/LlamaRuntimeModels.swift index 8a24d4a..846529a 100644 --- a/Cotabby/Models/LlamaRuntimeModels.swift +++ b/Cotabby/Models/LlamaRuntimeModels.swift @@ -193,17 +193,9 @@ struct LlamaGenerationOptions: Equatable, Sendable { /// Defaults to -infinity, which disables suppression entirely. var confidenceFloor: Double = -.infinity - /// Routes generation through the deterministic constrained decoder (logit read + admissibility - /// mask + argmax + manual token commit) instead of the engine's built-in stochastic sampler. - /// Default off so the shipping sampleNext path is unaffected until the constrained decoder is - /// validated on device. Changing it does not affect KV reuse, so it is intentionally excluded - /// from `SamplingFingerprint`. - var useConstrainedDecoder: Bool = false - - /// Beam width for the constrained decoder. 1 keeps the single-path greedy decode; values > 1 run a + /// Beam width for the constrained decoder. 1 is the single-path greedy decode; values > 1 run a /// multi-branch beam search that explores several short continuations and keeps the highest-scoring - /// one. Only consulted when `useConstrainedDecoder` is true. Like `useConstrainedDecoder`, it does - /// not affect KV reuse, so it is excluded from `SamplingFingerprint`. + /// one. Does not affect KV reuse, so it is excluded from `SamplingFingerprint`. var beamWidth: Int = 1 /// When set (and the model is FIM-capable), the runtime builds a fill-in-middle prompt from the diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift index 831389c..92f02dd 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift @@ -212,12 +212,9 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { autocompleteSamplingFingerprint = fingerprint } - // The KV-trim defer above runs after whichever decoder returns. Both decoders share the - // prepared sequence and the same confidence-suppression contract; they differ only in how - // they pick each token (engine sampler vs. deterministic constrained selection). - guard options.useConstrainedDecoder else { - return runEngineSampledDecode(sequenceID: sequenceID, options: options) - } + // The KV-trim defer above runs after whichever decoder returns. Greedy and beam share the + // prepared sequence and the same confidence-suppression contract; they differ only in how far + // they explore before committing each token. return options.beamWidth > 1 ? try runConstrainedBeamDecode( sequenceID: sequenceID, @@ -270,57 +267,6 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { return SentenceBoundaryClassifier.endsSentence(decoded) } - /// The shipping decoder: delegates token selection to the engine's built-in sampler - /// (`sampleNext`), which applies temperature / top-k / top-p / min-p and commits each token. - private func runEngineSampledDecode(sequenceID: Int32, options: LlamaGenerationOptions) -> String { - var generatedText = "" - var tokensGenerated = 0 - var sumLogprob = 0.0 - var stopReason = "budget_exhausted" - - for _ in 0 ..< options.maxPredictionTokens { - // Cooperative cancellation: when the wrapping Task is cancelled (caller hit a new - // keystroke, focus changed, Compose started), bail before the next sampleNext call so - // we release `autocompleteLock` instead of running the full prediction budget and - // making the next autocomplete wait behind us. - if Task.isCancelled { - stopReason = "cancelled" - break - } - - let result = engine.sampleNext(sequenceID) - - if result.was_cancelled { - stopReason = "engine_cancelled" - break - } - if result.is_eos { - stopReason = "eos" - break - } - - let piece = Self.extractPiece(result) - generatedText += piece - tokensGenerated += 1 - sumLogprob += Double(result.logprob) - } - - CotabbyLogger.runtime.debug( - "Decode end", - metadata: [ - "kind": .string("generate"), - "tokens_generated": .stringConvertible(tokensGenerated), - "chars_generated": .stringConvertible(generatedText.count), - "stop_reason": .string(stopReason) - ] - ) - - if Self.shouldSuppress(sumLogprob: sumLogprob, tokensGenerated: tokensGenerated, options: options) { - return "" - } - return generatedText - } - /// The constrained decoder: reads the raw next-token logits, masks structural / excluded tokens /// via the token profile, deterministically selects the highest-logit admissible token, and /// commits it manually with `acceptToken`. This trades the sampler's randomness for reproducible, @@ -899,15 +845,6 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { return [] } - private static func extractPiece(_ result: SampleResult) -> String { - guard let piece = result.piece, result.piece_length > 0 else { return "" } - let buffer = UnsafeBufferPointer( - start: UnsafeRawPointer(piece).assumingMemoryBound(to: UInt8.self), - count: Int(result.piece_length) - ) - return String(bytes: buffer, encoding: .utf8) ?? "" - } - private static func samplingConfig(from options: LlamaGenerationOptions) -> SamplingConfig { SamplingConfig( max_prediction_tokens: Int32(options.maxPredictionTokens), diff --git a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift index be4d991..df2d8e1 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift @@ -107,8 +107,8 @@ final class LlamaRuntimeManager: ObservableObject { do { // `Task.detached` does not inherit the caller's cancellation, so an outer cancel // would otherwise leave `core.generate` running to its full prediction budget while - // holding `autocompleteLock`. The handler forwards the cancel signal, and the loop - // inside `core.generate` polls `Task.isCancelled` between sampleNext calls. + // holding `autocompleteLock`. The handler forwards the cancel signal, and the decode + // loop inside `core.generate` polls `Task.isCancelled` between token steps. let task = Task.detached { try core.generate( prompt: prompt, diff --git a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift index fdbb961..d70ce7f 100644 --- a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift +++ b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift @@ -13,39 +13,19 @@ final class LlamaSuggestionEngine { private let runtimeManager: LlamaRuntimeGenerating private var promptCacheHintTracker = LlamaPromptCacheHintTracker() - /// UserDefaults key (no UI) that routes llama generation through the deterministic constrained - /// decoder instead of the engine's stochastic sampler. Default-off: decode quality can only be - /// judged with a real model in a real field, so this stays a hidden developer/dogfood toggle - /// until it is validated on device and promoted to the default. - private static let constrainedDecoderDefaultsKey = "cotabbyConstrainedDecoderEnabled" - private static var isConstrainedDecoderEnabled: Bool { - UserDefaults.standard.bool(forKey: constrainedDecoderDefaultsKey) - } - - /// UserDefaults key (no UI) for the constrained decoder's beam width. Default 1 keeps the existing - /// single-path greedy decode; a value > 1 runs a multi-branch beam search. Paired with the - /// constrained-decoder flag as a hidden developer/dogfood knob until validated on device. + /// UserDefaults key (no UI) for the constrained decoder's beam width. 1 is greedy (the shipping + /// default); a value > 1 runs a multi-branch beam search. Kept as a tuning knob, not a feature + /// gate: beam-by-default still needs the batched-beam decode work to avoid per-branch latency. private static let constrainedBeamWidthDefaultsKey = "cotabbyConstrainedBeamWidth" private static var constrainedBeamWidth: Int { let stored = UserDefaults.standard.integer(forKey: constrainedBeamWidthDefaultsKey) return stored > 0 ? stored : 1 } - /// UserDefaults key (no UI) for fill-in-middle prompting. Default off: FIM only helps models that - /// ship the FIM marker tokens, and it changes the prompt structure, so it stays a developer toggle - /// until validated on device. - private static let fillInMiddleDefaultsKey = "cotabbyFillInMiddleEnabled" - private static var isFillInMiddleEnabled: Bool { - UserDefaults.standard.bool(forKey: fillInMiddleDefaultsKey) - } - - /// The fill-in-middle request for a generation, or nil to use the forward base prompt. Built only - /// when the flag is on and the caret is genuinely mid-line (real text follows it on the same line); - /// the runtime still falls back to the base prompt when the model lacks FIM markers. + /// The fill-in-middle request for a generation, or nil to use the forward base prompt. Built when + /// the caret is genuinely mid-line (real text follows it on the same line); the runtime still falls + /// back to the base prompt when the model lacks the FIM marker tokens. private static func fillInMiddleRequest(for request: SuggestionRequest) -> FillInMiddleRequest? { - guard isFillInMiddleEnabled else { - return nil - } // A caret at end of line wants a forward continuation, not infilling — even if `trailingText` // is non-empty because a line break and later paragraphs follow it. Gating on end-of-line // (rather than `trailingText.isEmpty`) keeps FIM to the case it is actually meant for. @@ -96,7 +76,6 @@ final class LlamaSuggestionEngine { precedingText: request.context.precedingText, trailingText: request.context.trailingText ), - useConstrainedDecoder: Self.isConstrainedDecoderEnabled, beamWidth: Self.constrainedBeamWidth, fillInMiddle: Self.fillInMiddleRequest(for: request) )