Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/swift.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,5 @@ jobs:
run: swiftlint lint --strict --quiet
- name: Build
run: swift build -v
- name: Test
run: swift test
5 changes: 4 additions & 1 deletion Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,17 @@ let package = Package(
),
],
dependencies: [
.package(url: "https://github.com/codefiesta/VimKit", from: .init(0, 4, 2))
.package(url: "https://github.com/codefiesta/VimKit", from: .init(0, 4, 3))
],
targets: [
.target(
name: "VimAssistant",
dependencies: ["VimKit"],
resources: [.process("Resources/")],
linkerSettings: [
.linkedFramework("AVFoundation"),
.linkedFramework("CoreML"),
.linkedFramework("NaturalLanguage"),
.linkedFramework("Speech")
]
),
Expand Down
27 changes: 27 additions & 0 deletions Sources/VimAssistant/Extensions/MLMultiArray+Extensions.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
//
// MLMultiArray+Extensions.swift
// VimAssistant
//
// Created by Kevin McKee
//

import CoreML

extension MLMultiArray {

    /// Exposes the multi-array's backing storage as a read-only buffer of `T`.
    ///
    /// The memory behind `dataPointer` is bound to `T` and wrapped without copying;
    /// the resulting buffer reports `count` elements.
    ///
    /// NOTE(review): nothing here verifies that `T` matches the array's `dataType`
    /// or that the storage is contiguous — presumably callers guarantee both; confirm.
    /// NOTE(review): `dataPointer` appears to be deprecated in recent SDKs in favor of
    /// the `withUnsafeBytes` family — confirm against the package's deployment target.
    /// - Returns: an immutable buffer pointer of the specified type spanning `count` elements.
    func toUnsafeBufferPointer<T>() -> UnsafeBufferPointer<T> {
        let typedBase = dataPointer.bindMemory(to: T.self, capacity: count)
        return UnsafeBufferPointer(start: UnsafePointer(typedBase), count: count)
    }

    /// Copies the multi-array's contents into a new Swift array of the specified type.
    /// - Returns: an array of the specified type holding `count` elements.
    func toArray<T>() -> [T] {
        Array(toUnsafeBufferPointer() as UnsafeBufferPointer<T>)
    }
}
138 changes: 138 additions & 0 deletions Sources/VimAssistant/Model/TokenizedString.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
//
// TokenizedString.swift
// VimAssistant
//
// Created by Kevin McKee
//

import NaturalLanguage

/// Marker prepended to a wordpiece when it continues a word (for example `##ing`).
private let defaultPrefix = "##"
/// Longest word token, in characters, that is split into wordpieces; longer tokens are skipped.
private let maxTokenCount = 128

/// A string split into word/wordpiece tokens together with their vocabulary IDs.
///
/// Tokenization happens once, in `init(_:)`: the raw text is split into words with
/// `NLTagger`, then each word is greedily broken into the longest wordpieces found
/// in the shared `Vocabulary`.
struct TokenizedString {

    /// The vocabulary used to map tokens to IDs.
    private let vocabulary: Vocabulary = .shared

    /// The raw untokenized string value.
    private let rawValue: String

    /// The word and wordpiece tokens, in order of appearance.
    public private(set) var tokens: [Substring] = .init()

    /// The vocabulary IDs, index-aligned with `tokens`.
    public private(set) var tokenIDs: [Int] = .init()

    /// Common Initializer
    /// - Parameter value: the raw value to tokenize.
    init(_ value: String) {
        rawValue = value

        let results = wordpieceTokens()
        tokens = results.tokens
        tokenIDs = results.tokenIDs
    }

    /// Splits the raw text into an array of word tokens.
    ///
    /// The tagger is created locally because it is only needed while tokenizing;
    /// keeping it out of the stored properties avoids every value (and every copy)
    /// retaining a reference-type `NLTagger`.
    /// - Returns: An array of substrings of the raw value, whitespace omitted.
    private func wordTokens() -> [Substring] {
        // Use Natural Language's NLTagger to tokenize the input by word.
        let tagger = NLTagger(tagSchemes: [.tokenType])
        tagger.string = rawValue

        var results: [Substring] = []

        // Find all tokens in the string and append to the array.
        tagger.enumerateTags(in: rawValue.startIndex..<rawValue.endIndex,
                             unit: .word,
                             scheme: .tokenType,
                             options: [.omitWhitespace]) { _, range in
            results.append(rawValue[range])
            return true
        }

        return results
    }

    /// Splits word tokens into their component wordpiece tokens, if possible.
    ///
    /// Word tokens longer than `maxTokenCount` characters are skipped entirely.
    /// - Returns: A tuple of word/wordpiece tokens and their IDs (same length).
    private func wordpieceTokens() -> (tokens: [Substring], tokenIDs: [Int]) {

        // `unknown` is a computed dictionary lookup; resolve it once instead of per iteration.
        let unknownID = vocabulary.unknown

        var pieces: [Substring] = []
        var pieceIDs: [Int] = []

        // Skip tokens that are too long.
        for token in wordTokens() where token.count <= maxTokenCount {
            let result = wordpieces(for: token, unknownID: unknownID)

            // Append all of this token's sub-tokens and their IDs.
            pieces += result.tokens
            pieceIDs += result.tokenIDs
        }

        precondition(pieces.count == pieceIDs.count,
                     "Tokens array and TokenIDs arrays must be the same size.")

        return (pieces, pieceIDs)
    }

    /// Greedily splits a single word token into the longest wordpieces in the vocabulary.
    /// - Parameters:
    ///   - token: the word token to split.
    ///   - unknownID: the vocabulary ID that marks an unknown token.
    /// - Returns: The wordpieces and their IDs, or the whole token paired with
    ///   `unknownID` when some part of the token cannot be matched.
    private func wordpieces(for token: Substring,
                            unknownID: Int) -> (tokens: [Substring], tokenIDs: [Int]) {
        var pieces: [Substring] = []
        var pieceIDs: [Int] = []

        // Start with the whole token.
        var candidate = token

        // Note when we've found the root word.
        var foundFirstPiece = false

        while !candidate.isEmpty {

            // Word suffixes begin with ## in the vocabulary, such as `##ing`.
            let prefix = foundFirstPiece ? defaultPrefix : ""

            // Convert the string to lowercase to match the vocabulary.
            let searchTerm = (prefix + String(candidate)).lowercased()
            let candidateID = vocabulary.tokenID(for: searchTerm)

            if candidateID != unknownID {
                // Ok to set true again for additional wordpieces.
                foundFirstPiece = true

                // Save this wordpiece and its ID.
                pieces.append(candidate)
                pieceIDs.append(candidateID)

                // Repeat search with the token's remainder, if any.
                candidate = token.suffix(from: candidate.endIndex)
                continue
            }

            // Remove the last character and try again.
            let shortened = candidate.dropLast()
            if shortened.isEmpty {
                // This token and its components are not in the vocabulary.
                return ([token], [unknownID])
            }
            candidate = shortened
        }

        return (pieces, pieceIDs)
    }
}
55 changes: 55 additions & 0 deletions Sources/VimAssistant/Model/Vocabulary.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
//
// Vocabulary.swift
// VimAssistant
//
// Created by Kevin McKee
//

import Foundation

/// Base name (without extension) of the bundled vocabulary resource.
private let vocabularyFileName: String = "bert-base-uncased-vocab"

/// Maps vocabulary entries to integer token IDs.
///
/// The vocabulary is read once from the bundled text file named by `vocabularyFileName`,
/// one entry per line; an entry's ID is its zero-based position among the non-empty lines.
/// All stored state is immutable after `init()`.
public class Vocabulary: @unchecked Sendable {

    /// The shared vocabulary instance.
    static let shared: Vocabulary = Vocabulary()

    /// Vocabulary entries keyed to their IDs.
    private let values: [Substring: Int]

    /// The ID of the `[UNK]` token, returned for entries not in the vocabulary.
    var unknown: Int {
        specialTokenID("[UNK]")
    }

    /// The ID of the `[PAD]` token.
    var padding: Int {
        specialTokenID("[PAD]")
    }

    /// The ID of the `[SEP]` token.
    var separator: Int {
        specialTokenID("[SEP]")
    }

    /// The ID of the `[CLS]` token.
    var classifyStart: Int {
        specialTokenID("[CLS]")
    }

    /// Loads the vocabulary from the module bundle.
    ///
    /// Traps if the resource is missing, cannot be read as UTF-8, or contains duplicate entries.
    init() {
        guard let url = Bundle.module.url(forResource: vocabularyFileName, withExtension: "txt") else {
            fatalError("Vocabulary file is missing")
        }

        let rawVocabulary: String
        do {
            rawVocabulary = try String(contentsOf: url, encoding: .utf8)
        } catch {
            // Surface the underlying cause instead of discarding it.
            fatalError("Vocabulary file could not be read: \(error)")
        }

        // "\r\n" is a single Character in Swift, so splitting on "\n" alone would
        // leave a CRLF-terminated file as one giant entry.
        let words = rawVocabulary.split { $0 == "\n" || $0 == "\r\n" }
        values = Dictionary(uniqueKeysWithValues: zip(words, words.indices))
    }

    /// Looks up the ID for a token.
    /// - Parameter token: the token to look up.
    /// - Returns: the token's ID, or `unknown` if the token is not in the vocabulary.
    public func tokenID(for token: Substring) -> Int {
        values[token] ?? unknown
    }

    /// Looks up the ID for a token given as a `String`.
    /// - Parameter string: the token to look up.
    /// - Returns: the token's ID, or `unknown` if the token is not in the vocabulary.
    public func tokenID(for string: String) -> Int {
        tokenID(for: Substring(string))
    }

    /// Returns the ID of a special token that the vocabulary is required to contain.
    /// - Parameter token: the special token, such as `[UNK]`.
    /// - Returns: the token's ID; traps with a descriptive message if it is absent.
    private func specialTokenID(_ token: Substring) -> Int {
        guard let id = values[token] else {
            fatalError("Vocabulary is missing the required special token \(token)")
        }
        return id
    }
}
Loading