codefiesta · codefiesta · Feb 28, 2025 · Feb 24, 2025 · Feb 28, 2025 · Feb 28, 2025
diff --git a/.github/workflows/swift.yml b/.github/workflows/swift.yml
@@ -28,3 +28,5 @@ jobs:
       run: swiftlint lint --strict --quiet
     - name: Build
       run: swift build -v
+    - name: Test
+      run: swift test
diff --git a/Package.swift b/Package.swift
@@ -17,14 +17,17 @@ let package = Package(
         ),
     ],
     dependencies: [
-        .package(url: "https://github.com/codefiesta/VimKit", from: .init(0, 4, 2))
+        .package(url: "https://github.com/codefiesta/VimKit", from: .init(0, 4, 3))
     ],
     targets: [
         .target(
             name: "VimAssistant",
             dependencies: ["VimKit"],
+            resources: [.process("Resources/")],
             linkerSettings: [
                 .linkedFramework("AVFoundation"),
+                .linkedFramework("CoreML"),
+                .linkedFramework("NaturalLanguage"),
                 .linkedFramework("Speech")
             ]
         ),

diff --git a/Sources/VimAssistant/Extensions/MLMultiArray+Extensions.swift b/Sources/VimAssistant/Extensions/MLMultiArray+Extensions.swift
@@ -0,0 +1,27 @@
+//
+//  MLMultiArray+Extensions.swift
+//  VimAssistant
+//
+//  Created by Kevin McKee
+//
+
+import CoreML
+
+extension MLMultiArray {
+
+    /// Builds an `UnsafeBufferPointer` over the multi-array's contents, viewed as the specified type.
+    ///
+    /// The buffer is created directly from `dataPointer`, so it aliases the multi-array's own
+    /// storage rather than copying it. Nothing here checks that `T` matches the multi-array's
+    /// `dataType`; the caller is responsible for requesting the element type the array holds.
+    /// - Returns: a read-only buffer pointer of the specified type spanning `count` elements.
+    func toUnsafeBufferPointer<T>() -> UnsafeBufferPointer<T> {
+        let pointer = dataPointer.bindMemory(to: T.self, capacity: count)
+        return UnsafeBufferPointer(start: pointer, count: count)
+    }
+
+    /// Returns a copy of the multi-array's contents as an array of the specified type.
+    ///
+    /// As with `toUnsafeBufferPointer()`, `T` is not validated against the multi-array's `dataType`.
+    /// - Returns: an array of the specified type holding `count` elements.
+    func toArray<T>() -> [T] {
+        let bufferPointer: UnsafeBufferPointer<T> = toUnsafeBufferPointer()
+        return Array(bufferPointer)
+    }
+}
diff --git a/Sources/VimAssistant/Model/TokenizedString.swift b/Sources/VimAssistant/Model/TokenizedString.swift
@@ -0,0 +1,138 @@
+//
+//  TokenizedString.swift
+//  VimAssistant
+//
+//  Created by Kevin McKee
+//
+
+import NaturalLanguage
+
+/// Marker that prefixes continuation wordpieces in the vocabulary, such as `##ing`.
+private let defaultPrefix = "##"
+
+/// Longest word, measured in characters, that is considered for wordpiece splitting.
+private let maxTokenCount = 128
+
+/// A string broken down into word / wordpiece tokens together with their vocabulary IDs.
+struct TokenizedString {
+
+    /// Natural Language tagger used to split the input into words.
+    private let tagger: NLTagger = .init(tagSchemes: [.tokenType])
+
+    /// The vocabulary used to resolve token IDs.
+    private let vocabulary: Vocabulary = .shared
+
+    /// The original, untokenized text.
+    private let rawValue: String
+
+    /// The word / wordpiece tokens extracted from the raw value.
+    public private(set) var tokens: [Substring] = .init()
+
+    /// The vocabulary IDs for `tokens`, index for index.
+    public private(set) var tokenIDs: [Int] = .init()
+
+    /// Tokenizes the given text immediately.
+    /// - Parameter value: the raw value to tokenize.
+    init(_ value: String) {
+        rawValue = value
+        tagger.string = rawValue
+
+        let (pieces, pieceIDs) = wordpieceTokens()
+        tokens = pieces
+        tokenIDs = pieceIDs
+    }
+
+    /// Enumerates the words of the raw text, skipping whitespace.
+    /// - Returns: the words as substrings of the raw value, in order of appearance.
+    private func wordTokens() -> [Substring] {
+        var words: [Substring] = []
+        let fullRange = rawValue.startIndex..<rawValue.endIndex
+
+        // Collect every word-level token the tagger reports.
+        tagger.enumerateTags(in: fullRange,
+                             unit: .word,
+                             scheme: .tokenType,
+                             options: [.omitWhitespace]) { _, wordRange in
+            words.append(rawValue[wordRange])
+            return true
+        }
+
+        return words
+    }
+
+    /// Breaks every word of the raw text into wordpiece tokens where the vocabulary allows it.
+    /// - Returns: A tuple of word/wordpiece tokens and their IDs.
+    private func wordpieceTokens() -> (tokens: [Substring], tokenIDs: [Int]) {
+        var pieces: [Substring] = []
+        var pieceIDs: [Int] = []
+
+        // Words longer than the limit are skipped entirely.
+        for word in wordTokens() where word.count <= maxTokenCount {
+            let split = wordpieces(of: word)
+            pieces.append(contentsOf: split.pieces)
+            pieceIDs.append(contentsOf: split.ids)
+        }
+
+        guard pieces.count == pieceIDs.count else {
+            fatalError("Tokens array and TokenIDs arrays must be the same size.")
+        }
+
+        return (pieces, pieceIDs)
+    }
+
+    /// Greedily splits a single word into the longest wordpieces found in the vocabulary.
+    ///
+    /// If some remainder of the word cannot be matched even after shrinking it to a single
+    /// character, the whole word is reported as one unknown token.
+    /// - Parameter word: the word to split.
+    /// - Returns: the wordpieces of the word and their IDs.
+    private func wordpieces(of word: Substring) -> (pieces: [Substring], ids: [Int]) {
+        var pieces: [Substring] = []
+        var ids: [Int] = []
+
+        // The candidate starts out as the whole word and shrinks from the end until it matches.
+        var candidate = word
+
+        // Becomes true once the leading piece of the word has been matched.
+        var isContinuation = false
+
+        while candidate.isNotEmpty {
+
+            // Continuation pieces are stored with a ## prefix in the vocabulary, such as `##ing`.
+            let prefix = isContinuation ? defaultPrefix : ""
+
+            // The vocabulary is lowercase, so the lookup key is lowercased as well.
+            let lookupKey = Substring(prefix + candidate).lowercased()
+            let candidateID = vocabulary.tokenID(for: lookupKey)
+
+            guard candidateID == vocabulary.unknown else {
+                // Match: record the piece, then continue with whatever follows it in the word.
+                isContinuation = true
+                pieces.append(candidate)
+                ids.append(candidateID)
+                candidate = word.suffix(from: candidate.endIndex)
+                continue
+            }
+
+            // No match: drop the last character and try again.
+            let shortened = candidate.dropLast()
+            if shortened.isEmpty {
+                // Neither the word nor its components are in the vocabulary.
+                return ([word], [vocabulary.unknown])
+            }
+            candidate = shortened
+        }
+
+        return (pieces, ids)
+    }
+}
diff --git a/Sources/VimAssistant/Model/Vocabulary.swift b/Sources/VimAssistant/Model/Vocabulary.swift
@@ -0,0 +1,55 @@
+//
+//  Vocabulary.swift
+//  VimAssistant
+//
+//  Created by Kevin McKee
+//
+
+import Foundation
+
+/// Name (without extension) of the bundled vocabulary resource.
+private let vocabularyFileName = "bert-base-uncased-vocab"
+
+/// Maps vocabulary tokens to their integer IDs.
+///
+/// The vocabulary is read once from the bundled text file; each line is one token and its ID is
+/// its position in the list of non-empty lines. The lookup table is never mutated after `init`.
+public class Vocabulary: @unchecked Sendable {
+
+    /// The shared vocabulary instance.
+    static let shared: Vocabulary = Vocabulary()
+
+    /// Token to ID lookup table.
+    private let values: [Substring: Int]
+
+    /// The ID of the `[UNK]` token, returned for tokens that are not in the vocabulary.
+    var unknown: Int {
+        specialTokenID("[UNK]")
+    }
+
+    /// The ID of the `[PAD]` token.
+    var padding: Int {
+        specialTokenID("[PAD]")
+    }
+
+    /// The ID of the `[SEP]` token.
+    var separator: Int {
+        specialTokenID("[SEP]")
+    }
+
+    /// The ID of the `[CLS]` token.
+    var classifyStart: Int {
+        specialTokenID("[CLS]")
+    }
+
+    /// Loads the vocabulary from the module bundle.
+    ///
+    /// Traps if the resource is missing or cannot be read as UTF-8, since the vocabulary is a
+    /// bundled resource that nothing else can substitute for.
+    init() {
+        guard let url = Bundle.module.url(forResource: vocabularyFileName, withExtension: "txt") else {
+            fatalError("Vocabulary file is missing")
+        }
+
+        let rawVocabulary: String
+        do {
+            rawVocabulary = try String(contentsOf: url, encoding: .utf8)
+        } catch {
+            // Surface the underlying error instead of discarding it.
+            fatalError("Vocabulary file could not be read: \(error)")
+        }
+
+        // "\r\n" is a single Character in Swift, so splitting on "\n" alone would never split a
+        // file saved with CRLF line endings. Accept both line terminators.
+        let words = rawVocabulary.split { $0 == "\n" || $0 == "\r\n" }
+        self.values = Dictionary(uniqueKeysWithValues: zip(words, words.indices))
+    }
+
+    /// Looks up the ID of a token.
+    /// - Parameter token: the token to look up.
+    /// - Returns: the token's ID, or `unknown` if the token is not in the vocabulary.
+    public func tokenID(for token: Substring) -> Int {
+        values[token] ?? unknown
+    }
+
+    /// Looks up the ID of a token given as a string.
+    /// - Parameter string: the token to look up.
+    /// - Returns: the token's ID, or `unknown` if the token is not in the vocabulary.
+    public func tokenID(for string: String) -> Int {
+        tokenID(for: Substring(string))
+    }
+
+    /// Resolves the ID of a token that the vocabulary is required to contain.
+    /// - Parameter token: the special token, such as `[UNK]`.
+    /// - Returns: the token's ID; traps with a descriptive message if the token is absent.
+    private func specialTokenID(_ token: Substring) -> Int {
+        guard let id = values[token] else {
+            fatalError("Vocabulary is missing the required \(token) token.")
+        }
+        return id
+    }
+}