Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions Cotabby/Models/LlamaRuntimeModels.swift
Original file line number Diff line number Diff line change
Expand Up @@ -193,9 +193,17 @@ struct LlamaGenerationOptions: Equatable, Sendable {
/// Defaults to -infinity, which disables suppression entirely.
var confidenceFloor: Double = -.infinity

/// Beam width for the constrained decoder. 1 is the single-path greedy decode; values > 1 run a
/// Routes generation through the deterministic constrained decoder (logit read + admissibility
/// mask + argmax + manual token commit) instead of the engine's built-in stochastic sampler.
/// Default off so the shipping sampleNext path is unaffected until the constrained decoder is
/// validated on device. Changing it does not affect KV reuse, so it is intentionally excluded
/// from `SamplingFingerprint`.
var useConstrainedDecoder: Bool = false

/// Beam width for the constrained decoder. 1 keeps the single-path greedy decode; values > 1 run a
/// multi-branch beam search that explores several short continuations and keeps the highest-scoring
/// one. Does not affect KV reuse, so it is excluded from `SamplingFingerprint`.
/// one. Only consulted when `useConstrainedDecoder` is true. Like `useConstrainedDecoder`, it does
/// not affect KV reuse, so it is excluded from `SamplingFingerprint`.
var beamWidth: Int = 1

/// When set (and the model is FIM-capable), the runtime builds a fill-in-middle prompt from the
Expand Down
69 changes: 66 additions & 3 deletions Cotabby/Services/Runtime/LlamaRuntimeCore.swift
Original file line number Diff line number Diff line change
Expand Up @@ -212,9 +212,12 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
autocompleteSamplingFingerprint = fingerprint
}

// The KV-trim defer above runs after whichever decoder returns. Greedy and beam share the
// prepared sequence and the same confidence-suppression contract; they differ only in how far
// they explore before committing each token.
// The KV-trim defer above runs after whichever decoder returns. Both decoders share the
// prepared sequence and the same confidence-suppression contract; they differ only in how
// they pick each token (engine sampler vs. deterministic constrained selection).
guard options.useConstrainedDecoder else {
return runEngineSampledDecode(sequenceID: sequenceID, options: options)
}
return options.beamWidth > 1
? try runConstrainedBeamDecode(
sequenceID: sequenceID,
Expand Down Expand Up @@ -267,6 +270,57 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
return SentenceBoundaryClassifier.endsSentence(decoded)
}

/// The shipping decoder: delegates token selection to the engine's built-in sampler
/// (`sampleNext`), which applies temperature / top-k / top-p / min-p and commits each token.
///
/// The raw bytes of every sampled piece are accumulated and decoded as UTF-8 exactly once after
/// the loop. Decoding each piece on its own loses any multi-byte scalar (CJK, emoji) that the
/// tokenizer split across adjacent tokens, because neither fragment is valid UTF-8 in isolation.
///
/// - Parameters:
///   - sequenceID: The sequence handed to `engine.sampleNext` on every step.
///   - options: Supplies the token budget (`maxPredictionTokens`) and the inputs to the
///     confidence-suppression check.
/// - Returns: The generated text, or `""` when `shouldSuppress` rejects the result.
private func runEngineSampledDecode(sequenceID: Int32, options: LlamaGenerationOptions) -> String {
    var generatedBytes: [UInt8] = []
    var tokensGenerated = 0
    var sumLogprob = 0.0
    var stopReason = "budget_exhausted"

    for _ in 0 ..< options.maxPredictionTokens {
        // Cooperative cancellation: when the wrapping Task is cancelled (caller hit a new
        // keystroke, focus changed, Compose started), bail before the next sampleNext call so
        // we release `autocompleteLock` instead of running the full prediction budget and
        // making the next autocomplete wait behind us.
        if Task.isCancelled {
            stopReason = "cancelled"
            break
        }

        let result = engine.sampleNext(sequenceID)

        if result.was_cancelled {
            stopReason = "engine_cancelled"
            break
        }
        if result.is_eos {
            stopReason = "eos"
            break
        }

        // Copy the piece bytes immediately rather than holding on to the engine's pointer;
        // a token without a piece still counts toward the token and logprob totals.
        if let piece = result.piece, result.piece_length > 0 {
            generatedBytes.append(
                contentsOf: UnsafeRawBufferPointer(
                    start: UnsafeRawPointer(piece),
                    count: Int(result.piece_length)
                )
            )
        }
        tokensGenerated += 1
        sumLogprob += Double(result.logprob)
    }

    // Single decode over the whole byte stream so scalars split across tokens are reassembled.
    // A sequence that is still incomplete when the loop stops decodes to U+FFFD.
    let generatedText = String(decoding: generatedBytes, as: UTF8.self)

    CotabbyLogger.runtime.debug(
        "Decode end",
        metadata: [
            "kind": .string("generate"),
            "tokens_generated": .stringConvertible(tokensGenerated),
            "chars_generated": .stringConvertible(generatedText.count),
            "stop_reason": .string(stopReason)
        ]
    )

    if Self.shouldSuppress(sumLogprob: sumLogprob, tokensGenerated: tokensGenerated, options: options) {
        return ""
    }
    return generatedText
}

/// The constrained decoder: reads the raw next-token logits, masks structural / excluded tokens
/// via the token profile, deterministically selects the highest-logit admissible token, and
/// commits it manually with `acceptToken`. This trades the sampler's randomness for reproducible,
Expand Down Expand Up @@ -850,6 +904,15 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
return []
}

/// Decodes the text piece carried by a single `SampleResult` as UTF-8.
///
/// Returns `""` when the result has no piece pointer, when `piece_length` is not positive, or
/// when the piece's bytes are not valid UTF-8 on their own. That last case includes a fragment
/// of a multi-byte scalar, so a caller that needs such scalars intact has to combine the raw
/// bytes of adjacent pieces before decoding.
private static func extractPiece(_ result: SampleResult) -> String {
    guard result.piece_length > 0, let piece = result.piece else {
        return ""
    }
    let pieceBytes = UnsafeRawBufferPointer(
        start: UnsafeRawPointer(piece),
        count: Int(result.piece_length)
    )
    guard let decoded = String(bytes: pieceBytes, encoding: .utf8) else {
        return ""
    }
    return decoded
}
Comment on lines +907 to +914
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Silent drop of partial-UTF-8 token pieces

String(bytes: buffer, encoding: .utf8) ?? "" returns "" whenever a token's bytes are not a complete, valid UTF-8 sequence. The llama tokenizer can split a multi-byte scalar (CJK character, emoji) across adjacent tokens, so the first token carries only the leading byte(s) and the second carries the continuation byte(s) — neither fragment decodes independently as valid UTF-8. The consequence is that both tokens silently contribute nothing to generatedText, effectively erasing the character. runConstrainedDecode already documents this exact pitfall and works around it by accumulating raw bytes and decoding once at the end with String(decoding:as:). The native sampler path doesn't accumulate bytes, but the per-token fallback to "" means the same class of corruption applies.

Fix in Codex Fix in Claude Code


private static func samplingConfig(from options: LlamaGenerationOptions) -> SamplingConfig {
SamplingConfig(
max_prediction_tokens: Int32(options.maxPredictionTokens),
Expand Down
4 changes: 2 additions & 2 deletions Cotabby/Services/Runtime/LlamaRuntimeManager.swift
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,8 @@ final class LlamaRuntimeManager: ObservableObject {
do {
// `Task.detached` does not inherit the caller's cancellation, so an outer cancel
// would otherwise leave `core.generate` running to its full prediction budget while
// holding `autocompleteLock`. The handler forwards the cancel signal, and the decode
// loop inside `core.generate` polls `Task.isCancelled` between token steps.
// holding `autocompleteLock`. The handler forwards the cancel signal, and the loop
// inside `core.generate` polls `Task.isCancelled` between sampleNext calls.
let task = Task.detached {
try core.generate(
prompt: prompt,
Expand Down
33 changes: 27 additions & 6 deletions Cotabby/Services/Runtime/LlamaSuggestionEngine.swift
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,39 @@ final class LlamaSuggestionEngine {
private let runtimeManager: LlamaRuntimeGenerating
private var promptCacheHintTracker = LlamaPromptCacheHintTracker()

/// UserDefaults key (no UI) for the constrained decoder's beam width. 1 is greedy (the shipping
/// default); a value > 1 runs a multi-branch beam search. Kept as a tuning knob, not a feature
/// gate: beam-by-default still needs the batched-beam decode work to avoid per-branch latency.
/// UserDefaults key (no UI) selecting the deterministic constrained decoder over the engine's
/// stochastic sampler for llama generation. A missing key reads as `false`, so the toggle is
/// off by default: it stays a hidden developer/dogfood switch until the constrained decoder has
/// been validated on device with a real model and promoted to the default.
private static let constrainedDecoderDefaultsKey = "cotabbyConstrainedDecoderEnabled"

/// Whether the constrained-decoder toggle is currently set in the standard user defaults.
private static var isConstrainedDecoderEnabled: Bool {
    let defaults = UserDefaults.standard
    return defaults.bool(forKey: constrainedDecoderDefaultsKey)
}

/// UserDefaults key (no UI) holding the constrained decoder's beam width. The default of 1 keeps
/// the single-path greedy decode; a value > 1 runs a multi-branch beam search. Paired with the
/// constrained-decoder flag as a hidden developer/dogfood knob until validated on device.
private static let constrainedBeamWidthDefaultsKey = "cotabbyConstrainedBeamWidth"

/// The stored beam width, clamped to at least 1 so a missing, zero, or negative value falls back
/// to the greedy decode.
private static var constrainedBeamWidth: Int {
    max(1, UserDefaults.standard.integer(forKey: constrainedBeamWidthDefaultsKey))
}

/// The fill-in-middle request for a generation, or nil to use the forward base prompt. Built when
/// the caret is genuinely mid-line (real text follows it on the same line); the runtime still falls
/// back to the base prompt when the model lacks the FIM marker tokens.
/// UserDefaults key (no UI) gating fill-in-middle prompting. A missing key reads as `false`, so
/// FIM is off by default: it only helps models that ship the FIM marker tokens and it changes
/// the prompt structure, so it remains a developer toggle until validated on device.
private static let fillInMiddleDefaultsKey = "cotabbyFillInMiddleEnabled"

/// Whether the fill-in-middle toggle is currently set in the standard user defaults.
private static var isFillInMiddleEnabled: Bool {
    let defaults = UserDefaults.standard
    return defaults.bool(forKey: fillInMiddleDefaultsKey)
}

/// The fill-in-middle request for a generation, or nil to use the forward base prompt. Built only
/// when the flag is on and the caret is genuinely mid-line (real text follows it on the same line);
/// the runtime still falls back to the base prompt when the model lacks FIM markers.
private static func fillInMiddleRequest(for request: SuggestionRequest) -> FillInMiddleRequest? {
guard isFillInMiddleEnabled else {
return nil
}
// A caret at end of line wants a forward continuation, not infilling — even if `trailingText`
// is non-empty because a line break and later paragraphs follow it. Gating on end-of-line
// (rather than `trailingText.isEmpty`) keeps FIM to the case it is actually meant for.
Expand Down Expand Up @@ -76,6 +96,7 @@ final class LlamaSuggestionEngine {
precedingText: request.context.precedingText,
trailingText: request.context.trailingText
),
useConstrainedDecoder: Self.isConstrainedDecoderEnabled,
beamWidth: Self.constrainedBeamWidth,
fillInMiddle: Self.fillInMiddleRequest(for: request)
)
Expand Down