From 30968938331b7eadac63f014952bba292b8a80fc Mon Sep 17 00:00:00 2001
From: Jacob Fu <141651335+FuJacob@users.noreply.github.com>
Date: Mon, 1 Jun 2026 21:37:39 -0700
Subject: [PATCH] Make the constrained decoder the only llama decode path

The constrained decoder has been validated on device, so it now ships to everyone
with no flag. This removes the cotabbyConstrainedDecoderEnabled and
cotabbyFillInMiddleEnabled feature gates (both behaviors are now always on), the
useConstrainedDecoder option, and the now-unreachable runEngineSampledDecode plus
its extractPiece helper. Generation routes straight to the greedy or beam
constrained decoder.

Beam width stays as a tuning knob (cotabbyConstrainedBeamWidth, greedy by default),
not a feature gate: it keeps the beam path reachable and is the basis for the
batched-beam work, which is what beam-by-default needs to avoid per-branch latency.

Fill-in-middle no longer has a flag, but it is still gated by its real
preconditions (a genuine mid-line caret and a model that ships the FIM markers).
---
 Cotabby/Models/LlamaRuntimeModels.swift       | 12 +---
 .../Services/Runtime/LlamaRuntimeCore.swift   | 69 +------------------
 .../Runtime/LlamaRuntimeManager.swift         |  4 +-
 .../Runtime/LlamaSuggestionEngine.swift       | 33 ++-------
 4 files changed, 13 insertions(+), 105 deletions(-)

diff --git a/Cotabby/Models/LlamaRuntimeModels.swift b/Cotabby/Models/LlamaRuntimeModels.swift
index 8a24d4a..846529a 100644
--- a/Cotabby/Models/LlamaRuntimeModels.swift
+++ b/Cotabby/Models/LlamaRuntimeModels.swift
@@ -193,17 +193,9 @@ struct LlamaGenerationOptions: Equatable, Sendable {
     /// Defaults to -infinity, which disables suppression entirely.
     var confidenceFloor: Double = -.infinity
 
-    /// Routes generation through the deterministic constrained decoder (logit read + admissibility
-    /// mask + argmax + manual token commit) instead of the engine's built-in stochastic sampler.
-    /// Default off so the shipping sampleNext path is unaffected until the constrained decoder is
-    /// validated on device. Changing it does not affect KV reuse, so it is intentionally excluded
-    /// from `SamplingFingerprint`.
-    var useConstrainedDecoder: Bool = false
-
-    /// Beam width for the constrained decoder. 1 keeps the single-path greedy decode; values > 1 run a
+    /// Beam width for the constrained decoder. 1 is the single-path greedy decode; values > 1 run a
     /// multi-branch beam search that explores several short continuations and keeps the highest-scoring
-    /// one. Only consulted when `useConstrainedDecoder` is true. Like `useConstrainedDecoder`, it does
-    /// not affect KV reuse, so it is excluded from `SamplingFingerprint`.
+    /// one. Does not affect KV reuse, so it is excluded from `SamplingFingerprint`.
     var beamWidth: Int = 1
 
     /// When set (and the model is FIM-capable), the runtime builds a fill-in-middle prompt from the
diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
index 831389c..92f02dd 100644
--- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
+++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
@@ -212,12 +212,9 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
             autocompleteSamplingFingerprint = fingerprint
         }
 
-        // The KV-trim defer above runs after whichever decoder returns. Both decoders share the
-        // prepared sequence and the same confidence-suppression contract; they differ only in how
-        // they pick each token (engine sampler vs. deterministic constrained selection).
-        guard options.useConstrainedDecoder else {
-            return runEngineSampledDecode(sequenceID: sequenceID, options: options)
-        }
+        // The KV-trim defer above runs after whichever decoder returns. Greedy and beam share the
+        // prepared sequence and the same confidence-suppression contract; they differ only in how far
+        // they explore before committing each token.
         return options.beamWidth > 1
             ? try runConstrainedBeamDecode(
                 sequenceID: sequenceID,
@@ -270,57 +267,6 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
         return SentenceBoundaryClassifier.endsSentence(decoded)
     }
 
-    /// The shipping decoder: delegates token selection to the engine's built-in sampler
-    /// (`sampleNext`), which applies temperature / top-k / top-p / min-p and commits each token.
-    private func runEngineSampledDecode(sequenceID: Int32, options: LlamaGenerationOptions) -> String {
-        var generatedText = ""
-        var tokensGenerated = 0
-        var sumLogprob = 0.0
-        var stopReason = "budget_exhausted"
-
-        for _ in 0 ..< options.maxPredictionTokens {
-            // Cooperative cancellation: when the wrapping Task is cancelled (caller hit a new
-            // keystroke, focus changed, Compose started), bail before the next sampleNext call so
-            // we release `autocompleteLock` instead of running the full prediction budget and
-            // making the next autocomplete wait behind us.
-            if Task.isCancelled {
-                stopReason = "cancelled"
-                break
-            }
-
-            let result = engine.sampleNext(sequenceID)
-
-            if result.was_cancelled {
-                stopReason = "engine_cancelled"
-                break
-            }
-            if result.is_eos {
-                stopReason = "eos"
-                break
-            }
-
-            let piece = Self.extractPiece(result)
-            generatedText += piece
-            tokensGenerated += 1
-            sumLogprob += Double(result.logprob)
-        }
-
-        CotabbyLogger.runtime.debug(
-            "Decode end",
-            metadata: [
-                "kind": .string("generate"),
-                "tokens_generated": .stringConvertible(tokensGenerated),
-                "chars_generated": .stringConvertible(generatedText.count),
-                "stop_reason": .string(stopReason)
-            ]
-        )
-
-        if Self.shouldSuppress(sumLogprob: sumLogprob, tokensGenerated: tokensGenerated, options: options) {
-            return ""
-        }
-        return generatedText
-    }
-
     /// The constrained decoder: reads the raw next-token logits, masks structural / excluded tokens
     /// via the token profile, deterministically selects the highest-logit admissible token, and
     /// commits it manually with `acceptToken`. This trades the sampler's randomness for reproducible,
@@ -899,15 +845,6 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
         return []
     }
 
-    private static func extractPiece(_ result: SampleResult) -> String {
-        guard let piece = result.piece, result.piece_length > 0 else { return "" }
-        let buffer = UnsafeBufferPointer(
-            start: UnsafeRawPointer(piece).assumingMemoryBound(to: UInt8.self),
-            count: Int(result.piece_length)
-        )
-        return String(bytes: buffer, encoding: .utf8) ?? ""
-    }
-
     private static func samplingConfig(from options: LlamaGenerationOptions) -> SamplingConfig {
         SamplingConfig(
             max_prediction_tokens: Int32(options.maxPredictionTokens),
diff --git a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
index be4d991..df2d8e1 100644
--- a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
+++ b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
@@ -107,8 +107,8 @@ final class LlamaRuntimeManager: ObservableObject {
         do {
             // `Task.detached` does not inherit the caller's cancellation, so an outer cancel
             // would otherwise leave `core.generate` running to its full prediction budget while
-            // holding `autocompleteLock`. The handler forwards the cancel signal, and the loop
-            // inside `core.generate` polls `Task.isCancelled` between sampleNext calls.
+            // holding `autocompleteLock`. The handler forwards the cancel signal, and the decode
+            // loop inside `core.generate` polls `Task.isCancelled` between token steps.
             let task = Task.detached {
                 try core.generate(
                     prompt: prompt,
diff --git a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift
index fdbb961..d70ce7f 100644
--- a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift
+++ b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift
@@ -13,39 +13,19 @@ final class LlamaSuggestionEngine {
     private let runtimeManager: LlamaRuntimeGenerating
     private var promptCacheHintTracker = LlamaPromptCacheHintTracker()
 
-    /// UserDefaults key (no UI) that routes llama generation through the deterministic constrained
-    /// decoder instead of the engine's stochastic sampler. Default-off: decode quality can only be
-    /// judged with a real model in a real field, so this stays a hidden developer/dogfood toggle
-    /// until it is validated on device and promoted to the default.
-    private static let constrainedDecoderDefaultsKey = "cotabbyConstrainedDecoderEnabled"
-    private static var isConstrainedDecoderEnabled: Bool {
-        UserDefaults.standard.bool(forKey: constrainedDecoderDefaultsKey)
-    }
-
-    /// UserDefaults key (no UI) for the constrained decoder's beam width. Default 1 keeps the existing
-    /// single-path greedy decode; a value > 1 runs a multi-branch beam search. Paired with the
-    /// constrained-decoder flag as a hidden developer/dogfood knob until validated on device.
+    /// UserDefaults key (no UI) for the constrained decoder's beam width. 1 is greedy (the shipping
+    /// default); a value > 1 runs a multi-branch beam search. Kept as a tuning knob, not a feature
+    /// gate: beam-by-default still needs the batched-beam decode work to avoid per-branch latency.
     private static let constrainedBeamWidthDefaultsKey = "cotabbyConstrainedBeamWidth"
     private static var constrainedBeamWidth: Int {
         let stored = UserDefaults.standard.integer(forKey: constrainedBeamWidthDefaultsKey)
         return stored > 0 ? stored : 1
     }
 
-    /// UserDefaults key (no UI) for fill-in-middle prompting. Default off: FIM only helps models that
-    /// ship the FIM marker tokens, and it changes the prompt structure, so it stays a developer toggle
-    /// until validated on device.
-    private static let fillInMiddleDefaultsKey = "cotabbyFillInMiddleEnabled"
-    private static var isFillInMiddleEnabled: Bool {
-        UserDefaults.standard.bool(forKey: fillInMiddleDefaultsKey)
-    }
-
-    /// The fill-in-middle request for a generation, or nil to use the forward base prompt. Built only
-    /// when the flag is on and the caret is genuinely mid-line (real text follows it on the same line);
-    /// the runtime still falls back to the base prompt when the model lacks FIM markers.
+    /// The fill-in-middle request for a generation, or nil to use the forward base prompt. Built when
+    /// the caret is genuinely mid-line (real text follows it on the same line); the runtime still falls
+    /// back to the base prompt when the model lacks the FIM marker tokens.
     private static func fillInMiddleRequest(for request: SuggestionRequest) -> FillInMiddleRequest? {
-        guard isFillInMiddleEnabled else {
-            return nil
-        }
         // A caret at end of line wants a forward continuation, not infilling — even if `trailingText`
         // is non-empty because a line break and later paragraphs follow it. Gating on end-of-line
         // (rather than `trailingText.isEmpty`) keeps FIM to the case it is actually meant for.
@@ -96,7 +76,6 @@ final class LlamaSuggestionEngine {
                         precedingText: request.context.precedingText,
                         trailingText: request.context.trailingText
                     ),
-                    useConstrainedDecoder: Self.isConstrainedDecoderEnabled,
                     beamWidth: Self.constrainedBeamWidth,
                     fillInMiddle: Self.fillInMiddleRequest(for: request)
                 )