diff --git a/Cotabby/Models/LlamaRuntimeModels.swift b/Cotabby/Models/LlamaRuntimeModels.swift
index 8a24d4a..846529a 100644
--- a/Cotabby/Models/LlamaRuntimeModels.swift
+++ b/Cotabby/Models/LlamaRuntimeModels.swift
@@ -193,17 +193,9 @@ struct LlamaGenerationOptions: Equatable, Sendable {
     /// Defaults to -infinity, which disables suppression entirely.
     var confidenceFloor: Double = -.infinity
 
-    /// Routes generation through the deterministic constrained decoder (logit read + admissibility
-    /// mask + argmax + manual token commit) instead of the engine's built-in stochastic sampler.
-    /// Default off so the shipping sampleNext path is unaffected until the constrained decoder is
-    /// validated on device. Changing it does not affect KV reuse, so it is intentionally excluded
-    /// from `SamplingFingerprint`.
-    var useConstrainedDecoder: Bool = false
-
-    /// Beam width for the constrained decoder. 1 keeps the single-path greedy decode; values > 1 run a
+    /// Beam width for the constrained decoder. 1 is the single-path greedy decode; values > 1 run a
     /// multi-branch beam search that explores several short continuations and keeps the highest-scoring
-    /// one. Only consulted when `useConstrainedDecoder` is true. Like `useConstrainedDecoder`, it does
-    /// not affect KV reuse, so it is excluded from `SamplingFingerprint`.
+    /// one. Beam width does not affect KV reuse, so it is excluded from `SamplingFingerprint`.
     var beamWidth: Int = 1
 
     /// When set (and the model is FIM-capable), the runtime builds a fill-in-middle prompt from the
diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
index 831389c..92f02dd 100644
--- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
+++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
@@ -212,12 +212,9 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
             autocompleteSamplingFingerprint = fingerprint
         }
 
-        // The KV-trim defer above runs after whichever decoder returns. Both decoders share the
-        // prepared sequence and the same confidence-suppression contract; they differ only in how
-        // they pick each token (engine sampler vs. deterministic constrained selection).
-        guard options.useConstrainedDecoder else {
-            return runEngineSampledDecode(sequenceID: sequenceID, options: options)
-        }
+        // The KV-trim defer above runs after whichever decoder returns. Greedy and beam share the
+        // prepared sequence and the same confidence-suppression contract; they differ only in how far
+        // they explore before committing each token.
         return options.beamWidth > 1
             ? try runConstrainedBeamDecode(
                 sequenceID: sequenceID,
@@ -270,57 +267,6 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
         return SentenceBoundaryClassifier.endsSentence(decoded)
     }
 
-    /// The shipping decoder: delegates token selection to the engine's built-in sampler
-    /// (`sampleNext`), which applies temperature / top-k / top-p / min-p and commits each token.
-    private func runEngineSampledDecode(sequenceID: Int32, options: LlamaGenerationOptions) -> String {
-        var generatedText = ""
-        var tokensGenerated = 0
-        var sumLogprob = 0.0
-        var stopReason = "budget_exhausted"
-
-        for _ in 0 ..< options.maxPredictionTokens {
-            // Cooperative cancellation: when the wrapping Task is cancelled (caller hit a new
-            // keystroke, focus changed, Compose started), bail before the next sampleNext call so
-            // we release `autocompleteLock` instead of running the full prediction budget and
-            // making the next autocomplete wait behind us.
-            if Task.isCancelled {
-                stopReason = "cancelled"
-                break
-            }
-
-            let result = engine.sampleNext(sequenceID)
-
-            if result.was_cancelled {
-                stopReason = "engine_cancelled"
-                break
-            }
-            if result.is_eos {
-                stopReason = "eos"
-                break
-            }
-
-            let piece = Self.extractPiece(result)
-            generatedText += piece
-            tokensGenerated += 1
-            sumLogprob += Double(result.logprob)
-        }
-
-        CotabbyLogger.runtime.debug(
-            "Decode end",
-            metadata: [
-                "kind": .string("generate"),
-                "tokens_generated": .stringConvertible(tokensGenerated),
-                "chars_generated": .stringConvertible(generatedText.count),
-                "stop_reason": .string(stopReason)
-            ]
-        )
-
-        if Self.shouldSuppress(sumLogprob: sumLogprob, tokensGenerated: tokensGenerated, options: options) {
-            return ""
-        }
-        return generatedText
-    }
-
     /// The constrained decoder: reads the raw next-token logits, masks structural / excluded tokens
     /// via the token profile, deterministically selects the highest-logit admissible token, and
     /// commits it manually with `acceptToken`. This trades the sampler's randomness for reproducible,
@@ -899,15 +845,6 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
         return []
     }
 
-    private static func extractPiece(_ result: SampleResult) -> String {
-        guard let piece = result.piece, result.piece_length > 0 else { return "" }
-        let buffer = UnsafeBufferPointer(
-            start: UnsafeRawPointer(piece).assumingMemoryBound(to: UInt8.self),
-            count: Int(result.piece_length)
-        )
-        return String(bytes: buffer, encoding: .utf8) ?? ""
-    }
-
     private static func samplingConfig(from options: LlamaGenerationOptions) -> SamplingConfig {
         SamplingConfig(
             max_prediction_tokens: Int32(options.maxPredictionTokens),
diff --git a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
index be4d991..df2d8e1 100644
--- a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
+++ b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
@@ -107,8 +107,8 @@ final class LlamaRuntimeManager: ObservableObject {
         do {
             // `Task.detached` does not inherit the caller's cancellation, so an outer cancel
             // would otherwise leave `core.generate` running to its full prediction budget while
-            // holding `autocompleteLock`. The handler forwards the cancel signal, and the loop
-            // inside `core.generate` polls `Task.isCancelled` between sampleNext calls.
+            // holding `autocompleteLock`. The handler forwards the cancel signal, and the decode
+            // loop inside `core.generate` polls `Task.isCancelled` between token steps.
             let task = Task.detached {
                 try core.generate(
                     prompt: prompt,
diff --git a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift
index fdbb961..d70ce7f 100644
--- a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift
+++ b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift
@@ -13,39 +13,19 @@ final class LlamaSuggestionEngine {
     private let runtimeManager: LlamaRuntimeGenerating
     private var promptCacheHintTracker = LlamaPromptCacheHintTracker()
 
-    /// UserDefaults key (no UI) that routes llama generation through the deterministic constrained
-    /// decoder instead of the engine's stochastic sampler. Default-off: decode quality can only be
-    /// judged with a real model in a real field, so this stays a hidden developer/dogfood toggle
-    /// until it is validated on device and promoted to the default.
-    private static let constrainedDecoderDefaultsKey = "cotabbyConstrainedDecoderEnabled"
-    private static var isConstrainedDecoderEnabled: Bool {
-        UserDefaults.standard.bool(forKey: constrainedDecoderDefaultsKey)
-    }
-
-    /// UserDefaults key (no UI) for the constrained decoder's beam width. Default 1 keeps the existing
-    /// single-path greedy decode; a value > 1 runs a multi-branch beam search. Paired with the
-    /// constrained-decoder flag as a hidden developer/dogfood knob until validated on device.
+    /// UserDefaults key (no UI) for the constrained decoder's beam width. 1 is greedy (the shipping
+    /// default); a value > 1 runs a multi-branch beam search. Kept as a tuning knob, not a feature
+    /// gate: beam-by-default still needs the batched-beam decode work to avoid per-branch latency.
     private static let constrainedBeamWidthDefaultsKey = "cotabbyConstrainedBeamWidth"
     private static var constrainedBeamWidth: Int {
         let stored = UserDefaults.standard.integer(forKey: constrainedBeamWidthDefaultsKey)
         return stored > 0 ? stored : 1
     }
 
-    /// UserDefaults key (no UI) for fill-in-middle prompting. Default off: FIM only helps models that
-    /// ship the FIM marker tokens, and it changes the prompt structure, so it stays a developer toggle
-    /// until validated on device.
-    private static let fillInMiddleDefaultsKey = "cotabbyFillInMiddleEnabled"
-    private static var isFillInMiddleEnabled: Bool {
-        UserDefaults.standard.bool(forKey: fillInMiddleDefaultsKey)
-    }
-
-    /// The fill-in-middle request for a generation, or nil to use the forward base prompt. Built only
-    /// when the flag is on and the caret is genuinely mid-line (real text follows it on the same line);
-    /// the runtime still falls back to the base prompt when the model lacks FIM markers.
+    /// The fill-in-middle request for a generation, or nil to use the forward base prompt. Built when
+    /// the caret is genuinely mid-line (real text follows it on the same line); the runtime still falls
+    /// back to the base prompt when the model lacks the FIM marker tokens.
     private static func fillInMiddleRequest(for request: SuggestionRequest) -> FillInMiddleRequest? {
-        guard isFillInMiddleEnabled else {
-            return nil
-        }
         // A caret at end of line wants a forward continuation, not infilling — even if `trailingText`
         // is non-empty because a line break and later paragraphs follow it. Gating on end-of-line
         // (rather than `trailingText.isEmpty`) keeps FIM to the case it is actually meant for.
@@ -96,7 +76,6 @@ final class LlamaSuggestionEngine {
                         precedingText: request.context.precedingText,
                         trailingText: request.context.trailingText
                     ),
-                    useConstrainedDecoder: Self.isConstrainedDecoderEnabled,
                     beamWidth: Self.constrainedBeamWidth,
                     fillInMiddle: Self.fillInMiddleRequest(for: request)
                 )