diff --git a/swift/Sources/CoreAILanguageModels/Bundle/LanguageConfig.swift b/swift/Sources/CoreAILanguageModels/Bundle/LanguageConfig.swift
index 9870d6b..83d9955 100644
--- a/swift/Sources/CoreAILanguageModels/Bundle/LanguageConfig.swift
+++ b/swift/Sources/CoreAILanguageModels/Bundle/LanguageConfig.swift
@@ -4,6 +4,8 @@
 // be found in the LICENSE file or at https://opensource.org/licenses/BSD-3-Clause
 
 import CoreAIShared
+import Foundation
+import Tokenizers
 
 /// `language` block of `metadata.json` schema 0.2 — LLM-specific config.
 public struct LanguageConfig: Codable, Sendable, Equatable {
@@ -42,7 +44,7 @@ public struct LanguageConfig: Codable, Sendable, Equatable {
         case functionMap = "function_map"
     }
 
-    public init(from decoder: Decoder) throws {
+    public init(from decoder: Swift.Decoder) throws {
         let c = try decoder.container(keyedBy: CodingKeys.self)
         self.tokenizer = try c.decode(String.self, forKey: .tokenizer)
         self.vocabSize = try c.decode(Int.self, forKey: .vocabSize)
@@ -50,4 +52,103 @@ public struct LanguageConfig: Codable, Sendable, Equatable {
         self.embeddedTokenizer = try c.decodeIfPresent(Bool.self, forKey: .embeddedTokenizer) ?? true
         self.functionMap = try c.decodeIfPresent(FunctionMap.self, forKey: .functionMap)
     }
+
+    // MARK: - Additional Stop Tokens
+
+    /// Collects stop-token IDs, beyond the tokenizer's primary EOS, that are
+    /// declared in `tokenizer_config.json`.
+    ///
+    /// Three parts of the config are consulted:
+    /// 1. `additional_special_tokens` — each entry (a plain string or an object
+    ///    with a `content` key) that the tokenizer can map to an ID.
+    /// 2. `eos_token`, when it is an array of strings (some models list multiple).
+    /// 3. `added_tokens_decoder` — entries flagged `special` whose content
+    ///    contains a known turn-ending marker (`end_of_turn`, `im_end`, `eot_id`).
+    ///
+    /// The tokenizer's own `eosTokenId` is never part of the result.
+    ///
+    /// Best-effort: returns an empty array if the file can't be read or isn't a
+    /// JSON object.
+    ///
+    /// TODO: Upstream this to swift-transformers as `Tokenizer.additionalEosTokenIds`
+    /// so we don't need to parse tokenizer_config.json ourselves.
+    ///
+    /// - Parameters:
+    ///   - tokenizerDir: Directory that holds `tokenizer_config.json`.
+    ///   - tokenizer: Tokenizer used to resolve token strings to IDs.
+    /// - Returns: The unique additional stop-token IDs, in ascending order.
+    public static func additionalStopTokenIds(
+        from tokenizerDir: URL,
+        tokenizer: any Tokenizer
+    ) -> [Int32] {
+        let configURL = tokenizerDir.appending(path: "tokenizer_config.json")
+        guard let data = try? Data(contentsOf: configURL),
+            let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any]
+        else {
+            return []
+        }
+
+        // `Int32(exactly:)` never traps: an ID that doesn't fit in Int32 is
+        // treated as absent instead of crashing the load.
+        let mainEos = tokenizer.eosTokenId.flatMap { Int32(exactly: $0) }
+        var result = Set<Int32>()
+
+        // Resolves a token string through the tokenizer and records its ID,
+        // unless the token is unknown, out of range, or the primary EOS.
+        func record(_ token: String) {
+            guard let id = tokenizer.convertTokenToId(token),
+                let id32 = Int32(exactly: id),
+                id32 != mainEos
+            else { return }
+            result.insert(id32)
+        }
+
+        // 1. Check additional_special_tokens array. Each item can be a string
+        //    or a dict with a "content" key; anything else is skipped.
+        // NOTE(review): every resolvable entry becomes a stop token — unlike
+        // step 3 there is no turn-ending filter. Confirm that is intended for
+        // models that list non-terminal markers in this array.
+        if let specials = json["additional_special_tokens"] as? [Any] {
+            for item in specials {
+                if let text = item as? String {
+                    record(text)
+                } else if let dict = item as? [String: Any],
+                    let content = dict["content"] as? String
+                {
+                    record(content)
+                }
+            }
+        }
+
+        // 2. Check if eos_token is an array (some models list multiple)
+        if let eosArray = json["eos_token"] as? [String] {
+            for token in eosArray {
+                record(token)
+            }
+        }
+
+        // 3. Check added_tokens_decoder for turn-ending special tokens
+        //    (e.g. Gemma's ID 106, Qwen's <|im_end|>). The dictionary key is
+        //    the token ID; only tokens whose content matches a known
+        //    turn-ending pattern are included.
+        let turnEndPatterns = ["end_of_turn", "im_end", "eot_id"]
+        if let addedTokens = json["added_tokens_decoder"] as? [String: Any] {
+            for (idString, value) in addedTokens {
+                guard let dict = value as? [String: Any],
+                    let isSpecial = dict["special"] as? Bool, isSpecial,
+                    let content = dict["content"] as? String,
+                    let id = Int32(idString),
+                    id != mainEos
+                else { continue }
+                let lower = content.lowercased()
+                if turnEndPatterns.contains(where: { lower.contains($0) }) {
+                    result.insert(id)
+                }
+            }
+        }
+
+        // Set iteration order is unspecified; sort so callers and logs see a
+        // stable order.
+        return result.sorted()
+    }
 }
diff --git a/swift/Sources/CoreAILanguageModels/DecodingStrategies/DecodingStrategy.swift b/swift/Sources/CoreAILanguageModels/DecodingStrategies/DecodingStrategy.swift
index 1f5ce2d..ed7af4c 100644
--- a/swift/Sources/CoreAILanguageModels/DecodingStrategies/DecodingStrategy.swift
+++ b/swift/Sources/CoreAILanguageModels/DecodingStrategies/DecodingStrategy.swift
@@ -76,7 +76,13 @@ public struct StopSequences: Sendable {
     /// Initialize with tokenizer, automatically including EOS tokens
     /// - Parameter tokenizer: Tokenizer to extract EOS token from
     /// - Parameter additionalSequences: Optional additional stop sequences to include
-    public init(for tokenizer: any Tokenizer, additionalSequences: [[Int32]] = []) {
+    /// - Parameter additionalEosTokenIds: Optional additional single-token EOS IDs
+    ///   (e.g. from tokenizer_config.json's `additional_special_tokens`)
+    public init(
+        for tokenizer: any Tokenizer,
+        additionalSequences: [[Int32]] = [],
+        additionalEosTokenIds: [Int32] = []
+    ) {
         var allSequences = additionalSequences
 
         // Collect existing single-token sequences to avoid duplicates
@@ -94,6 +100,14 @@ public struct StopSequences: Sendable {
             }
         }
 
+        // Add additional EOS token IDs (e.g. from tokenizer_config.json)
+        for token in additionalEosTokenIds {
+            if !existingTokens.contains(token) {
+                existingTokens.insert(token)
+                allSequences.append([token])
+            }
+        }
+
         self.sequences = allSequences
         self.maxLength = allSequences.map { $0.count }.max() ?? 0
     }
diff --git a/swift/Sources/CoreAILanguageModels/LanguageModel/CoreAILanguageModel.swift b/swift/Sources/CoreAILanguageModels/LanguageModel/CoreAILanguageModel.swift
index fcede4b..696da7b 100644
--- a/swift/Sources/CoreAILanguageModels/LanguageModel/CoreAILanguageModel.swift
+++ b/swift/Sources/CoreAILanguageModels/LanguageModel/CoreAILanguageModel.swift
@@ -37,6 +37,7 @@ public struct CoreAILanguageModel: LanguageModel {
     private let vocabSize: Int?
     private let supportsToolCalling: Bool
     private let supportsReasoning: Bool
+    private let additionalEosTokenIds: [Int32]
 
     // MARK: - Protocol Requirements
 
@@ -56,7 +57,8 @@ public struct CoreAILanguageModel: LanguageModel {
             tokenizer: tokenizer,
             modelIdentifier: modelIdentifier,
             samplingConfig: samplingConfig,
-            vocabSize: vocabSize
+            vocabSize: vocabSize,
+            additionalEosTokenIds: additionalEosTokenIds
         )
     }
 
@@ -97,13 +99,15 @@ public struct CoreAILanguageModel: LanguageModel {
         tokenizer: any Tokenizer,
         modelIdentifier: String = "coreai-model",
         samplingConfig: SamplingConfiguration = .greedy,
-        vocabSize: Int? = nil
+        vocabSize: Int? = nil,
+        additionalEosTokenIds: [Int32] = []
     ) {
         self.engine = engine
         self.tokenizer = tokenizer
         self.modelIdentifier = modelIdentifier
         self.samplingConfig = samplingConfig
         self.vocabSize = vocabSize
+        self.additionalEosTokenIds = additionalEosTokenIds
         self.supportsToolCalling = CoreAIExecutor.detectToolCallMarkers(using: tokenizer) != nil
         self.supportsReasoning = tokenizer.convertTokenToId("") != nil
 
@@ -121,6 +125,7 @@ public struct CoreAILanguageModel: LanguageModel {
         fileprivate let modelIdentifier: String
         fileprivate let samplingConfig: SamplingConfiguration
         fileprivate let vocabSize: Int?
+        fileprivate let additionalEosTokenIds: [Int32]
 
         public static func == (lhs: Configuration, rhs: Configuration) -> Bool {
             lhs.modelIdentifier == rhs.modelIdentifier
@@ -140,6 +145,9 @@ public struct CoreAILanguageModel: LanguageModel {
     private let modelIdentifier: String
     private let samplingConfig: SamplingConfiguration
     private let vocabSize: Int?
+    /// All EOS-like token IDs: the main `eosTokenId` plus any additional
+    /// stop tokens from tokenizer_config.json (e.g. Gemma's `<end_of_turn>`).
+    private let eosTokenIds: Set<Int32>
     /// Open / close marker pair the model uses for chain-of-thought
     /// blocks, discovered from the tokenizer's known token ids at init
     /// (see `detectThinkingMarkers`). For models that don't emit
@@ -162,6 +170,14 @@ public struct CoreAILanguageModel: LanguageModel {
         self.vocabSize = configuration.vocabSize
         self.thinkingMarkers = Self.detectThinkingMarkers(using: configuration.tokenizer)
         self.toolCallMarkers = Self.detectToolCallMarkers(using: configuration.tokenizer)
+
+        // Build the full set of EOS-like token IDs
+        var eos = Set<Int32>()
+        if let id = configuration.tokenizer.eosTokenId {
+            eos.insert(Int32(id))
+        }
+        eos.formUnion(configuration.additionalEosTokenIds)
+        self.eosTokenIds = eos
     }
 
     /// Probes the tokenizer for known reasoning marker pairs. Each
@@ -328,7 +344,8 @@ public struct CoreAILanguageModel: LanguageModel {
             inferenceOptions: InferenceOptions(maxTokens: maxTokens)
         )
 
-        let eosTokenId = tokenizer.eosTokenId
+        // Use pre-computed set of all EOS-like tokens (main + additional)
+        let eosTokens = eosTokenIds
         // Incremental-decode buffer. After a clean emit, one token is
         // retained as context for the next step (see below). During a
         // multi-byte sequence that hasn't decoded cleanly yet, multiple
@@ -359,7 +376,7 @@ public struct CoreAILanguageModel: LanguageModel {
             for try await output in tokenStream {
                 let token = output.tokenId
 
-                if let eos = eosTokenId, Int(token) == eos {
+                if eosTokens.contains(token) {
                     tokenStream.setStopReason(.eos)
                     break
                 }
@@ -523,7 +540,10 @@ public struct CoreAILanguageModel: LanguageModel {
         }
 
         let strategy = ConstrainedDecodingStrategy(jsonSchema: jsonSchema, vocabSize: vocabSize)
-        let stopSequences = StopSequences(for: tokenizer)
+        let stopSequences = StopSequences(
+            for: tokenizer,
+            additionalEosTokenIds: Array(eosTokenIds)
+        )
 
         let stream = try await strategy.decode(
             from: .tokens(promptTokens),
diff --git a/swift/Sources/CoreAILanguageModels/LanguageModel/CoreAIRunner.swift b/swift/Sources/CoreAILanguageModels/LanguageModel/CoreAIRunner.swift
index 892b69d..e9d8d76 100644
--- a/swift/Sources/CoreAILanguageModels/LanguageModel/CoreAIRunner.swift
+++ b/swift/Sources/CoreAILanguageModels/LanguageModel/CoreAIRunner.swift
@@ -80,12 +80,22 @@ public struct CoreAIRunner {
         let tokenizer = try await bundle.loadTokenizer()
         tokenizerLoadSpan.end()
 
+        // Read additional stop token IDs from tokenizer_config.json
+        let additionalEos: [Int32]
+        if let tokenizerDir = bundle.tokenizerPath {
+            additionalEos = LanguageConfig.additionalStopTokenIds(
+                from: tokenizerDir, tokenizer: tokenizer)
+        } else {
+            additionalEos = []
+        }
+
         return CoreAILanguageModel(
             engine: engine,
             tokenizer: tokenizer,
             modelIdentifier: bundle.name,
             samplingConfig: SamplingConfiguration.greedy,
-            vocabSize: bundle.vocabSize
+            vocabSize: bundle.vocabSize,
+            additionalEosTokenIds: additionalEos
         )
     }
 
diff --git a/swift/Sources/Tools/llm-runner/LLMRunnerMain.swift b/swift/Sources/Tools/llm-runner/LLMRunnerMain.swift
index 4947cf3..a2bda2a 100644
--- a/swift/Sources/Tools/llm-runner/LLMRunnerMain.swift
+++ b/swift/Sources/Tools/llm-runner/LLMRunnerMain.swift
@@ -361,6 +361,20 @@ struct LLMRunner: AsyncParsableCommand, Sendable {
                 "Tokenizer loaded from \(bundle.hasEmbeddedTokenizer ? "embedded bundle" : "HuggingFace")",
                 component: "Main")
 
+            // Read additional stop token IDs from tokenizer_config.json (e.g. for Gemma)
+            let additionalEosTokenIds: [Int32]
+            if let tokenizerDir = bundle.tokenizerPath {
+                additionalEosTokenIds = LanguageConfig.additionalStopTokenIds(
+                    from: tokenizerDir, tokenizer: tokenizer)
+                if !additionalEosTokenIds.isEmpty {
+                    CLILogger.log(
+                        "Found \(additionalEosTokenIds.count) additional stop token(s) from tokenizer config: \(additionalEosTokenIds)",
+                        component: "Main")
+                }
+            } else {
+                additionalEosTokenIds = []
+            }
+
             CLILogger.log("Model loaded successfully:", component: "Main")
             CLILogger.log(" Name: \(modelName)", component: "Main")
             CLILogger.log(" Source: model bundle", component: "Main")
@@ -494,7 +508,8 @@ struct LLMRunner: AsyncParsableCommand, Sendable {
                     samplingConfiguration: samplingConfiguration,
                     maxTokens: maxTokens,
                     actualInputTokens: actualInputTokens,
-                    modelVocabSize: modelVocabSize
+                    modelVocabSize: modelVocabSize,
+                    additionalEosTokenIds: additionalEosTokenIds
                 )
             } else {
                 // Generate text (timing handled by decoding strategies)
@@ -505,7 +520,8 @@ struct LLMRunner: AsyncParsableCommand, Sendable {
                 // Encode stop tokens to sequences
                 let stopSequences = try validateAndEncodeStopTokens(
                     stopTokens: stopTokens,
-                    tokenizer: tokenizer
+                    tokenizer: tokenizer,
+                    additionalEosTokenIds: additionalEosTokenIds
                 )
 
                 // Check if logits are requested
@@ -590,7 +606,8 @@ struct LLMRunner: AsyncParsableCommand, Sendable {
         samplingConfiguration: SamplingConfiguration,
         maxTokens: Int,
         actualInputTokens: Int,
-        modelVocabSize: Int?
+        modelVocabSize: Int?,
+        additionalEosTokenIds: [Int32] = []
     ) async throws {
         let schema: String
         if FileManager.default.fileExists(atPath: schemaInput) {
@@ -603,7 +620,8 @@ struct LLMRunner: AsyncParsableCommand, Sendable {
 
         let stopSequences = try validateAndEncodeStopTokens(
             stopTokens: stopTokens,
-            tokenizer: tokenizer
+            tokenizer: tokenizer,
+            additionalEosTokenIds: additionalEosTokenIds
         )
 
         guard let vocabSize = modelVocabSize else {
@@ -676,15 +694,19 @@ struct LLMRunner: AsyncParsableCommand, Sendable {
     /// - Parameters:
     ///   - stopTokens: Array of stop token strings from CLI
     ///   - tokenizer: Tokenizer to use for encoding
+    ///   - additionalEosTokenIds: Additional EOS token IDs from tokenizer config
     /// - Returns: StopSequences containing all valid sequences plus tokenizer EOS tokens
     func validateAndEncodeStopTokens(
         stopTokens: [String],
-        tokenizer: any Tokenizer
+        tokenizer: any Tokenizer,
+        additionalEosTokenIds: [Int32] = []
     ) throws -> StopSequences {
         var sequences: [[Int32]] = []
 
         for stopString in stopTokens {
-            let tokens = tokenizer.encode(text: stopString).map { Int32($0) }
+            // Encode without adding BOS/EOS so special token strings like
+            // "" resolve to their single token ID, not [BOS, id].
+            let tokens = tokenizer.encode(text: stopString, addSpecialTokens: false).map { Int32($0) }
 
             // Fatal error for empty encodings - user explicitly requested this stop token
             guard !tokens.isEmpty else {
@@ -710,7 +732,11 @@ struct LLMRunner: AsyncParsableCommand, Sendable {
         }
 
         // Use new initializer that automatically includes EOS tokens from tokenizer
-        return StopSequences(for: tokenizer, additionalSequences: sequences)
+        return StopSequences(
+            for: tokenizer,
+            additionalSequences: sequences,
+            additionalEosTokenIds: additionalEosTokenIds
+        )
     }
 
     // MARK: - Asset Type Label