FuJacob · FuJacob · Jun 2, 2026 · Jun 2, 2026 · greptile-apps · Jun 2, 2026
diff --git a/Cotabby/Models/LlamaRuntimeModels.swift b/Cotabby/Models/LlamaRuntimeModels.swift
@@ -193,9 +193,17 @@ struct LlamaGenerationOptions: Equatable, Sendable {
     /// Defaults to -infinity, which disables suppression entirely.
     var confidenceFloor: Double = -.infinity
 
-    /// Beam width for the constrained decoder. 1 is the single-path greedy decode; values > 1 run a
+    /// Routes generation through the deterministic constrained decoder (logit read + admissibility
+    /// mask + argmax + manual token commit) instead of the engine's built-in stochastic sampler.
+    /// Default off so the shipping sampleNext path is unaffected until the constrained decoder is
+    /// validated on device. Changing it does not affect KV reuse, so it is intentionally excluded
+    /// from `SamplingFingerprint`.
+    var useConstrainedDecoder: Bool = false
+
+    /// Beam width for the constrained decoder. 1 keeps the single-path greedy decode; values > 1 run a
     /// multi-branch beam search that explores several short continuations and keeps the highest-scoring
-    /// one. Does not affect KV reuse, so it is excluded from `SamplingFingerprint`.
+    /// one. Only consulted when `useConstrainedDecoder` is true. Like `useConstrainedDecoder`, it does
+    /// not affect KV reuse, so it is excluded from `SamplingFingerprint`.
     var beamWidth: Int = 1
 
     /// When set (and the model is FIM-capable), the runtime builds a fill-in-middle prompt from the

diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
@@ -212,9 +212,12 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
             autocompleteSamplingFingerprint = fingerprint
         }
 
-        // The KV-trim defer above runs after whichever decoder returns. Greedy and beam share the
-        // prepared sequence and the same confidence-suppression contract; they differ only in how far
-        // they explore before committing each token.
+        // The KV-trim defer above runs after whichever decoder returns. Both decoders share the
+        // prepared sequence and the same confidence-suppression contract; they differ only in how
+        // they pick each token (engine sampler vs. deterministic constrained selection).
+        guard options.useConstrainedDecoder else {
+            return runEngineSampledDecode(sequenceID: sequenceID, options: options)
+        }
         return options.beamWidth > 1
             ? try runConstrainedBeamDecode(
                 sequenceID: sequenceID,
@@ -267,6 +270,57 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
         return SentenceBoundaryClassifier.endsSentence(decoded)
     }
 
+    /// The shipping decoder: delegates token selection to the engine's built-in sampler
+    /// (`sampleNext`), which applies temperature / top-k / top-p / min-p and commits each token.
+    /// Runs at most `options.maxPredictionTokens` steps and stops early on task cancellation, engine
+    /// cancellation, or EOS. Returns the concatenated pieces, or "" when `shouldSuppress` rejects them.
+    private func runEngineSampledDecode(sequenceID: Int32, options: LlamaGenerationOptions) -> String {
+        var generatedText = ""
+        var tokensGenerated = 0
+        var sumLogprob = 0.0
+        var stopReason = "budget_exhausted"
+
+        // Clamp the budget: forming `0 ..< n` traps at runtime when `n` is negative.
+        for _ in 0 ..< max(0, options.maxPredictionTokens) {
+            // Cooperative cancellation: when the wrapping Task is cancelled (caller hit a new keystroke,
+            // focus changed, Compose started), bail before the next sampleNext call so we release
+            // `autocompleteLock` instead of running the full budget and stalling the next autocomplete.
+            if Task.isCancelled {
+                stopReason = "cancelled"
+                break
+            }
+
+            let result = engine.sampleNext(sequenceID)
+            if result.was_cancelled {
+                stopReason = "engine_cancelled"
+                break
+            }
+            if result.is_eos {
+                stopReason = "eos"
+                break
+            }
+
+            generatedText += Self.extractPiece(result)
+            tokensGenerated += 1
+            sumLogprob += Double(result.logprob)
+        }
+
+        CotabbyLogger.runtime.debug(
+            "Decode end",
+            metadata: [
+                "kind": .string("generate"),
+                "tokens_generated": .stringConvertible(tokensGenerated),
+                "chars_generated": .stringConvertible(generatedText.count),
+                "stop_reason": .string(stopReason)
+            ]
+        )
+
+        if Self.shouldSuppress(sumLogprob: sumLogprob, tokensGenerated: tokensGenerated, options: options) {
+            return ""
+        }
+        return generatedText
+    }
+
     /// The constrained decoder: reads the raw next-token logits, masks structural / excluded tokens
     /// via the token profile, deterministically selects the highest-logit admissible token, and
     /// commits it manually with `acceptToken`. This trades the sampler's randomness for reproducible,
@@ -850,6 +904,15 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
         return []
     }
 
+    /// Decodes the bytes of a sampled piece as UTF-8; yields "" when the piece is absent or empty.
+    private static func extractPiece(_ result: SampleResult) -> String {
+        guard let piece = result.piece, result.piece_length > 0 else { return "" }
+        // Raw-buffer read: no `assumingMemoryBound`, which requires memory already bound to `UInt8`.
+        let bytes = UnsafeRawBufferPointer(start: UnsafeRawPointer(piece), count: Int(result.piece_length))
+        // NOTE(review): bytes that are not valid UTF-8 on their own also decode to "" and are dropped.
+        return String(bytes: bytes, encoding: .utf8) ?? ""
+    }
+
     private static func samplingConfig(from options: LlamaGenerationOptions) -> SamplingConfig {
         SamplingConfig(
             max_prediction_tokens: Int32(options.maxPredictionTokens),

diff --git a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
@@ -107,8 +107,8 @@ final class LlamaRuntimeManager: ObservableObject {
         do {
             // `Task.detached` does not inherit the caller's cancellation, so an outer cancel
             // would otherwise leave `core.generate` running to its full prediction budget while
-            // holding `autocompleteLock`. The handler forwards the cancel signal, and the decode
-            // loop inside `core.generate` polls `Task.isCancelled` between token steps.
+            // holding `autocompleteLock`. The handler forwards the cancel signal, and the decode loop in
+            // `core.generate` polls `Task.isCancelled` between token steps (`sampleNext` on the default path).
             let task = Task.detached {
                 try core.generate(
                     prompt: prompt,

diff --git a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift
@@ -13,19 +13,39 @@ final class LlamaSuggestionEngine {
     private let runtimeManager: LlamaRuntimeGenerating
     private var promptCacheHintTracker = LlamaPromptCacheHintTracker()
 
-    /// UserDefaults key (no UI) for the constrained decoder's beam width. 1 is greedy (the shipping
-    /// default); a value > 1 runs a multi-branch beam search. Kept as a tuning knob, not a feature
-    /// gate: beam-by-default still needs the batched-beam decode work to avoid per-branch latency.
+    /// UserDefaults key (no UI) that routes llama generation through the deterministic constrained
+    /// decoder instead of the engine's stochastic sampler. Default-off (`bool(forKey:)` is false for
+    /// an unset key): decode quality can only be judged with a real model in a real field, so this
+    /// stays a hidden developer/dogfood toggle until validated on device and promoted to the default.
+    private static let constrainedDecoderDefaultsKey = "cotabbyConstrainedDecoderEnabled"
+    private static var isConstrainedDecoderEnabled: Bool {
+        UserDefaults.standard.bool(forKey: constrainedDecoderDefaultsKey)
+    }
+
+    /// UserDefaults key (no UI) for the constrained decoder's beam width. Default 1 keeps the existing
+    /// single-path greedy decode; a value > 1 runs a multi-branch beam search. Paired with the
+    /// constrained-decoder flag as a hidden developer/dogfood knob until validated on device.
     private static let constrainedBeamWidthDefaultsKey = "cotabbyConstrainedBeamWidth"
     private static var constrainedBeamWidth: Int {
         let stored = UserDefaults.standard.integer(forKey: constrainedBeamWidthDefaultsKey)
         return stored > 0 ? stored : 1
     }
 
-    /// The fill-in-middle request for a generation, or nil to use the forward base prompt. Built when
-    /// the caret is genuinely mid-line (real text follows it on the same line); the runtime still falls
-    /// back to the base prompt when the model lacks the FIM marker tokens.
+    /// UserDefaults key (no UI) for fill-in-middle prompting. Default off (`bool(forKey:)` is false for
+    /// an unset key): FIM only helps models that ship the FIM marker tokens, and it changes the prompt
+    /// structure, so it stays a developer toggle until validated on device.
+    private static let fillInMiddleDefaultsKey = "cotabbyFillInMiddleEnabled"
+    private static var isFillInMiddleEnabled: Bool {
+        UserDefaults.standard.bool(forKey: fillInMiddleDefaultsKey)
+    }
+
+    /// The fill-in-middle request for a generation, or nil to use the forward base prompt. Built only
+    /// when the flag is on and the caret is genuinely mid-line (real text follows it on the same line);
+    /// the runtime still falls back to the base prompt when the model lacks FIM markers.
     private static func fillInMiddleRequest(for request: SuggestionRequest) -> FillInMiddleRequest? {
+        guard isFillInMiddleEnabled else {
+            return nil
+        }
         // A caret at end of line wants a forward continuation, not infilling — even if `trailingText`
         // is non-empty because a line break and later paragraphs follow it. Gating on end-of-line
         // (rather than `trailingText.isEmpty`) keeps FIM to the case it is actually meant for.
@@ -76,6 +96,7 @@ final class LlamaSuggestionEngine {
                         precedingText: request.context.precedingText,
                         trailingText: request.context.trailingText
                     ),
+                    useConstrainedDecoder: Self.isConstrainedDecoderEnabled,
                     beamWidth: Self.constrainedBeamWidth,
                     fillInMiddle: Self.fillInMiddleRequest(for: request)
                 )