From ce90a9b96f4b9c57e268d4cad065aabad0ccaca2 Mon Sep 17 00:00:00 2001
From: Carina Peng <carinafpeng@gmail.com>
Date: Fri, 12 Jun 2026 18:14:22 -0700
Subject: [PATCH 1/2] Speech runner

---
 Package.swift                                 |  11 +
 .../speech-runner/SpeechRunnerMain.swift      | 256 ++++++++++++++++++
 2 files changed, 267 insertions(+)
 create mode 100644 swift/Sources/Tools/speech-runner/SpeechRunnerMain.swift
diff --git a/Package.swift b/Package.swift
index 3a23d4c..3ba2f25 100644
--- a/Package.swift
+++ b/Package.swift
@@ -155,6 +155,17 @@ let package = Package(
                 .enableUpcomingFeature("MemberImportVisibility")
             ]
         ),
+        .executableTarget(
+            name: "speech-runner",
+            dependencies: [
+                "CoreAIShared",
+                .product(name: "Transformers", package: "swift-transformers"),
+            ],
+            path: "swift/Sources/Tools/speech-runner",
+            swiftSettings: [
+                .enableUpcomingFeature("MemberImportVisibility")
+            ]
+        ),
 
         // Public LLM Benchmark CLI (based on mlx-lm benchmark)
         .executableTarget(
diff --git a/swift/Sources/Tools/speech-runner/SpeechRunnerMain.swift b/swift/Sources/Tools/speech-runner/SpeechRunnerMain.swift
new file mode 100644
index 0000000..cb3123a
--- /dev/null
+++ b/swift/Sources/Tools/speech-runner/SpeechRunnerMain.swift
@@ -0,0 +1,256 @@
+// Copyright 2026 Apple Inc.
+//
+// Use of this source code is governed by a BSD-3-clause license that can
+// be found in the LICENSE file or at https://opensource.org/licenses/BSD-3-Clause
+
+import CoreAI
+import CoreAIShared
+import Foundation
+import Tokenizers
+
+// Whisper forced prefix: <|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>
+private let forcedPrefix: [Int32] = [50258, 50259, 50360, 50364]
+private let eotToken: Int32 = 50257  // greedy decoding stops when this id is produced
+private let maxTargetPositions = 448  // size substituted for dynamic KV-cache dimensions
+private let maxDecodeSteps = 50  // upper bound on timed decode iterations
+private let melElements = 128 * 3000  // float count of one [1, 128, 3000] mel input
+
+// MARK: - Entry point
+
+// Usage: speech-runner <model-path> [audio-or-mel]
+//
+// model-path  A bundle dir with encoder.aimodel + decoder.aimodel (--mode coreai),
+//             or a single .aimodel file (--mode legacy).
+//
+// audio-or-mel  An audio file (wav, flac, m4a, …) or a precomputed mel .bin
+//               from tools/compute_mel.py. Omit for silence benchmarking.
+@main
+struct Main {
+    /// Parses the arguments, runs the split or the legacy runner, and exits with status 1 on any error.
+    static func main() async {
+        let args = CommandLine.arguments
+        if args.count < 2 {
+            print("Usage: speech-runner <model-path> [audio-or-mel]")
+            exit(1)
+        }
+        let modelPath = args[1], audioPath: String? = args.count > 2 ? args[2] : nil
+        let encoderPath = URL(fileURLWithPath: "\(modelPath)/encoder.aimodel").path
+        do {
+            switch FileManager.default.fileExists(atPath: encoderPath) {
+            case true: try await runSplit(bundleDir: modelPath, audioPath: audioPath)
+            case false: try await runLegacy(modelPath: modelPath, audioPath: audioPath)
+            }
+        } catch {
+            print("Fatal: \(error)")
+            exit(1)
+        }
+    }
+}
+
+// MARK: - Mel loading
+
+private let audioExtensions: Set<String> = ["wav", "flac", "m4a", "mp3", "aiff", "aif", "caf"]
+/// Loads the [1, 128, 3000] mel input from an audio file or a raw Float32 .bin; throws on bad input.
+private func loadMelArray(from path: String, descriptor: NDArrayDescriptor) throws -> NDArray {
+    let url = URL(fileURLWithPath: path)
+    let floats: [Float]
+    if audioExtensions.contains(url.pathExtension.lowercased()) {
+        print("Computing mel from audio file…")
+        floats = try WhisperMel.fromFile(url)
+    } else {
+        let data = try Data(contentsOf: url)
+        guard data.count == melElements * MemoryLayout<Float>.size else {  // exact size; no partial floats
+            throw NSError(domain: "speech-runner", code: 1, userInfo: [NSLocalizedDescriptionKey:
+                "mel bin has \(data.count) bytes, expected \(melElements * 4) (128×3000 Float32)"])
+        }
+        floats = data.withUnsafeBytes { Array($0.bindMemory(to: Float.self)) }
+    }
+    var array = NDArray(descriptor: descriptor.resolvingDynamicDimensions([1, 128, 3000]))
+    fillNDArray(&array, as: Float.self, with: floats)
+    return array
+}
+
+// MARK: - Results
+/// Prints decode timing statistics, then the decoded transcription (raw token ids if no tokenizer loads).
+private func printResults(tokens: [Int32], stepTimesMs: [Double]) async {
+    let avgMs = stepTimesMs.reduce(0, +) / Double(stepTimesMs.count)  // mean per-step latency in ms
+    print(String(format: "  steps:    %d", stepTimesMs.count))
+    print(String(format: "  latency:  %.1f ms/tok", avgMs))
+    print(String(format: "  speed:    %.1f tok/s", 1000.0 / avgMs))
+    if let lo = stepTimesMs.min(), let hi = stepTimesMs.max() {
+        print(String(format: "  min/max:  %.1f / %.1f ms", lo, hi))
+    }
+    print("\n── Transcription ──────────────────────────────────────────────────────")
+    if let tokenizer = try? await AutoTokenizer.from(pretrained: "openai/whisper-large-v3-turbo") {
+        let ids = tokens.filter { $0 < 50257 }.map { Int($0) }
+        print("  \(tokenizer.decode(tokens: ids))")
+    } else {
+        print("  (tokenizer unavailable — token ids: \(tokens))")
+    }
+}
+
+// MARK: - Split runner (encoder + decoder with KV cache)
+/// Runs the split encoder/decoder bundle: one timed encoder pass, then greedy KV-cache decoding.
+func runSplit(bundleDir: String, audioPath: String?) async throws {
+    print("Format: split (encoder + decoder, KV cache)")
+
+    let encModel = try await AIModel(contentsOf: URL(fileURLWithPath: "\(bundleDir)/encoder.aimodel"))
+    let decModel = try await AIModel(contentsOf: URL(fileURLWithPath: "\(bundleDir)/decoder.aimodel"))
+
+    guard let encFn = try encModel.loadFunction(named: "main"),
+          let decFn = try decModel.loadFunction(named: "main")
+    else { fatalError("No 'main' function") }
+
+    let encDesc = encModel.functionDescriptor(for: "main")!
+    let decDesc = decModel.functionDescriptor(for: "main")!
+
+    guard case .ndArray(let melNDDesc)    = encDesc.inputDescriptor(of: "input_features"),
+          case .ndArray(let encOutNDDesc) = encDesc.outputDescriptor(of: "encoder_hidden_states")
+    else { fatalError("Unexpected encoder descriptors") }
+
+    let encOutShape = encOutNDDesc.shape
+
+    var melArray: NDArray
+    if let path = audioPath {
+        melArray = try loadMelArray(from: path, descriptor: melNDDesc)
+    } else {
+        print("No audio — using silence for benchmarking")
+        melArray = NDArray(descriptor: melNDDesc.resolvingDynamicDimensions([1, 128, 3000]))
+        fillNDArray(&melArray, as: Float.self, count: melElements) { _ in 0.0 }
+    }
+    var encOutArray = NDArray(descriptor: encOutNDDesc.resolvingDynamicDimensions(encOutShape))
+
+    // Warmup
+    do {
+        var out = InferenceFunction.MutableViews()
+        out.insert(&encOutArray, for: "encoder_hidden_states")
+        _ = try await encFn.run(inputs: ["input_features": melArray],
+                                states: InferenceFunction.MutableViews(), outputViews: consume out)
+    }
+    print("\n── Encoder ────────────────────────────────────────────────────────────")
+    let encT0 = Date()
+    do {
+        var out = InferenceFunction.MutableViews()
+        out.insert(&encOutArray, for: "encoder_hidden_states")
+        _ = try await encFn.run(inputs: ["input_features": melArray],
+                                states: InferenceFunction.MutableViews(), outputViews: consume out)
+    }
+    print(String(format: "  latency: %.1f ms", Date().timeIntervalSince(encT0) * 1000))
+
+    guard case .ndArray(let inputIdsNDDesc) = decDesc.inputDescriptor(of: "input_ids"),
+          case .ndArray(let posIdsNDDesc)   = decDesc.inputDescriptor(of: "position_ids"),
+          case .ndArray(let encHSNDDesc)    = decDesc.inputDescriptor(of: "encoder_hidden_states"),
+          case .ndArray(let keyCacheNDDesc) = decDesc.stateDescriptor(of: "keyCache"),
+          case .ndArray(let valCacheNDDesc) = decDesc.stateDescriptor(of: "valueCache"),
+          case .ndArray(let logitsNDDesc)   = decDesc.outputDescriptor(of: "logits")
+    else { fatalError("Unexpected decoder descriptors") }
+    // Dynamic (negative) cache dimensions are sized to maxTargetPositions.
+    let vocabSize = logitsNDDesc.shape.last!
+    let kcShape = keyCacheNDDesc.shape.map { $0 < 0 ? maxTargetPositions : $0 }
+    let vcShape = valCacheNDDesc.shape.map { $0 < 0 ? maxTargetPositions : $0 }
+    var keyCache   = NDArray(descriptor: keyCacheNDDesc.resolvingDynamicDimensions(kcShape))
+    var valueCache = NDArray(descriptor: valCacheNDDesc.resolvingDynamicDimensions(vcShape))
+    // Copy the encoder output into an array built from the decoder's own input descriptor.
+    let encFlat = readNDArray(encOutArray, as: Float.self, count: encOutShape.reduce(1, *))
+    var encHSArray = NDArray(descriptor: encHSNDDesc.resolvingDynamicDimensions(encOutShape))
+    fillNDArray(&encHSArray, as: Float.self, with: encFlat)
+    var logitsArray = NDArray(descriptor: logitsNDDesc.resolvingDynamicDimensions([1, 1, vocabSize]))
+
+    print("\n── Decoder ────────────────────────────────────────────────────────────")
+    // Prefill the cache with all but the last prefix token; the timed loop feeds tokens.last! itself.
+    var tokens: [Int32] = forcedPrefix
+    var pos = 0
+    for tok in forcedPrefix.dropLast() {
+        var ids = NDArray(descriptor: inputIdsNDDesc.resolvingDynamicDimensions([1, 1]))
+        var posIds = NDArray(descriptor: posIdsNDDesc.resolvingDynamicDimensions([1, pos + 1]))
+        fillNDArray(&ids, as: Int32.self, with: [tok])
+        fillNDArray(&posIds, as: Int32.self, count: pos + 1) { Int32($0) }
+        var st = InferenceFunction.MutableViews()
+        st.insert(&keyCache, for: "keyCache"); st.insert(&valueCache, for: "valueCache")
+        var out = InferenceFunction.MutableViews(); out.insert(&logitsArray, for: "logits")
+        _ = try await decFn.run(
+            inputs: ["input_ids": ids, "position_ids": posIds, "encoder_hidden_states": encHSArray],
+            states: consume st, outputViews: consume out)
+        pos += 1
+    }
+    // Greedy decode: feed the newest token, time the step, then take the argmax of the logits.
+    var stepTimesMs: [Double] = []
+    while stepTimesMs.count < maxDecodeSteps {
+        var ids = NDArray(descriptor: inputIdsNDDesc.resolvingDynamicDimensions([1, 1]))
+        var posIds = NDArray(descriptor: posIdsNDDesc.resolvingDynamicDimensions([1, pos + 1]))
+        fillNDArray(&ids, as: Int32.self, with: [tokens.last!])
+        fillNDArray(&posIds, as: Int32.self, count: pos + 1) { Int32($0) }
+        var st = InferenceFunction.MutableViews()
+        st.insert(&keyCache, for: "keyCache"); st.insert(&valueCache, for: "valueCache")
+        var out = InferenceFunction.MutableViews(); out.insert(&logitsArray, for: "logits")
+        let t0 = Date()
+        _ = try await decFn.run(
+            inputs: ["input_ids": ids, "position_ids": posIds, "encoder_hidden_states": encHSArray],
+            states: consume st, outputViews: consume out)
+        stepTimesMs.append(Date().timeIntervalSince(t0) * 1000)
+        let logits = flattenAsFloat(logitsArray)
+        let next = Int32(logits.indices.max(by: { logits[$0] < logits[$1] })!)
+        tokens.append(next); pos += 1
+        if next == eotToken { break }
+    }
+
+    await printResults(tokens: tokens, stepTimesMs: stepTimesMs)
+}
+
+// MARK: - Legacy runner (monolithic model, no KV cache)
+/// Runs a monolithic model (no KV cache), re-feeding the mel input and token ids on every greedy step.
+func runLegacy(modelPath: String, audioPath: String?) async throws {
+    print("Format: legacy (monolithic, no KV cache)")
+
+    let legacyModel = try await AIModel(contentsOf: URL(fileURLWithPath: modelPath))
+    guard let mainFn = try legacyModel.loadFunction(named: "main") else { fatalError("No 'main' function") }
+    let mainDesc = legacyModel.functionDescriptor(for: "main")!
+
+    guard case .ndArray(let melNDDesc)  = mainDesc.inputDescriptor(of: "input_features"),
+          case .ndArray(let idsNDDesc)  = mainDesc.inputDescriptor(of: "decoder_input_ids"),
+          case .ndArray(let logitsDesc) = mainDesc.outputDescriptor(of: "logits")
+    else { fatalError("Unexpected model descriptors") }
+
+    let vocabSize = logitsDesc.shape.last!
+    let isStaticIds = idsNDDesc.shape.allSatisfy { $0 >= 0 }
+    if isStaticIds {
+        print("decoder_input_ids exported with static shape — no past context per step")
+        print("Re-export with --mode legacy to get dynamic shapes")
+    }
+
+    var melArray: NDArray
+    switch audioPath {
+    case let path?:
+        melArray = try loadMelArray(from: path, descriptor: melNDDesc)
+    case nil:
+        print("No audio — using silence for benchmarking")
+        melArray = NDArray(descriptor: melNDDesc.resolvingDynamicDimensions([1, 128, 3000]))
+        fillNDArray(&melArray, as: Float.self, count: melElements) { _ in 0.0 }
+    }
+
+    print("\n── Decode ─────────────────────────────────────────────────────────────")
+
+    var tokens: [Int32] = forcedPrefix
+    var stepTimesMs: [Double] = []
+    for _ in 0..<maxDecodeSteps {
+        // Static exports are fed only the newest token; dynamic exports get the full history.
+        let stepInput: [Int32] = isStaticIds ? [tokens.last!] : tokens
+        var ids = NDArray(descriptor: idsNDDesc.resolvingDynamicDimensions([1, stepInput.count]))
+        fillNDArray(&ids, as: Int32.self, with: stepInput)
+        var logitsArray = NDArray(descriptor: logitsDesc.resolvingDynamicDimensions([1, stepInput.count, vocabSize]))
+        var outputs = InferenceFunction.MutableViews()
+        outputs.insert(&logitsArray, for: "logits")
+        let started = Date()
+        _ = try await mainFn.run(
+            inputs: ["input_features": melArray, "decoder_input_ids": ids],
+            states: InferenceFunction.MutableViews(), outputViews: consume outputs)
+        stepTimesMs.append(Date().timeIntervalSince(started) * 1000)
+        let rowStart = (stepInput.count - 1) * vocabSize
+        let lastRow = Array(flattenAsFloat(logitsArray)[rowStart ..< rowStart + vocabSize])
+        let next = Int32(lastRow.enumerated().max(by: { $0.element < $1.element })!.offset)
+        tokens.append(next)
+        if next == eotToken { break }
+    }
+
+    await printResults(tokens: tokens, stepTimesMs: stepTimesMs)
+}

From b4f56cfa8973d44ae303c807e3f9c72fb6402ee6 Mon Sep 17 00:00:00 2001
From: Carina Peng <carinafpeng@gmail.com>
Date: Fri, 12 Jun 2026 23:06:35 -0700
Subject: [PATCH 2/2] Mel compute in Swift

---
 .../speech-runner/SpeechRunnerMain.swift      |   9 +-
 .../Tools/speech-runner/WhisperMel.swift      | 182 ++++++++++++++++++
 2 files changed, 190 insertions(+), 1 deletion(-)
 create mode 100644 swift/Sources/Tools/speech-runner/WhisperMel.swift

diff --git a/swift/Sources/Tools/speech-runner/SpeechRunnerMain.swift b/swift/Sources/Tools/speech-runner/SpeechRunnerMain.swift
index cb3123a..d5bcebe 100644
--- a/swift/Sources/Tools/speech-runner/SpeechRunnerMain.swift
+++ b/swift/Sources/Tools/speech-runner/SpeechRunnerMain.swift
@@ -81,7 +81,14 @@ private func printResults(tokens: [Int32], stepTimesMs: [Double]) async {
         print(String(format: "  min/max:  %.1f / %.1f ms", lo, hi))
     }
     print("\n── Transcription ──────────────────────────────────────────────────────")
-    if let tokenizer = try? await AutoTokenizer.from(pretrained: "openai/whisper-large-v3-turbo") {
+    // Load tokenizer from local HF cache (no network needed)
+    let cacheBase = FileManager.default.homeDirectoryForCurrentUser
+        .appending(path: ".cache/huggingface/hub/models--openai--whisper-large-v3-turbo/snapshots")
+    let snapshot = (try? FileManager.default.contentsOfDirectory(atPath: cacheBase.path))?.first
+    let tokenizerURL = snapshot.map { cacheBase.appending(path: $0) }
+
+    if let url = tokenizerURL,
+       let tokenizer = try? await AutoTokenizer.from(modelFolder: url) {
         let ids = tokens.filter { $0 < 50257 }.map { Int($0) }
         print("  \(tokenizer.decode(tokens: ids))")
     } else {
diff --git a/swift/Sources/Tools/speech-runner/WhisperMel.swift b/swift/Sources/Tools/speech-runner/WhisperMel.swift
new file mode 100644
index 0000000..a4c9ada
--- /dev/null
+++ b/swift/Sources/Tools/speech-runner/WhisperMel.swift
@@ -0,0 +1,182 @@
+// Copyright 2026 Apple Inc.
+//
+// Use of this source code is governed by a BSD-3-clause license that can
+// be found in the LICENSE file or at https://opensource.org/licenses/BSD-3-Clause
+
+import Accelerate
+import AVFoundation
+import Foundation
+
+// Whisper mel spectrogram: sr=16000, n_fft=400, hop=160, n_mels=128
+// Slaney-normalised filterbank, reflect-padded audio, matches WhisperFeatureExtractor.
+//
+// vDSP DFT only supports f×2^n sizes (f ∈ {1,3,5,15}); 400=5²×2⁴ doesn't qualify.
+// We precompute 201×400 DFT basis matrices and apply them with cblas_sgemv instead.
+
+enum WhisperMel {
+
+    static let sampleRate: Double = 16_000
+    static let nFFT       = 400   // analysis window (samples)
+    static let hopLength  = 160
+    static let nMelBins   = 128
+    static let nFrames    = 3_000
+    static let nSamples   = 480_000
+
+    private static let nFreqs = nFFT / 2 + 1  // 201
+
+    // MARK: - Public
+
+    /// Returns the log-mel features (see `fromPCM`) of the audio file at `url`; rethrows load errors.
+    static func fromFile(_ url: URL) throws -> [Float] {
+        return fromPCM(try loadAndResample(url))
+    }
+
+    // MARK: - Audio loading + resampling
+
+    /// Reads the whole file as 16 kHz mono Float32; throws on empty, unreadable or unconvertible input.
+    static func loadAndResample(_ url: URL) throws -> [Float] {
+        func failure(_ code: Int, _ text: String) -> NSError {
+            NSError(domain: "WhisperMel", code: code, userInfo: [NSLocalizedDescriptionKey: text])
+        }
+        let file = try AVAudioFile(forReading: url)
+        let srcFormat = file.processingFormat
+        guard file.length > 0 else { throw failure(2, "Audio file is empty: \(url.path)") }
+        guard let fmt = AVAudioFormat(commonFormat: .pcmFormatFloat32, sampleRate: sampleRate,
+                                      channels: 1, interleaved: false),
+              let conv = AVAudioConverter(from: srcFormat, to: fmt) else {
+            throw failure(1, "Cannot convert \(srcFormat) → 16 kHz mono")
+        }
+        let cap = AVAudioFrameCount(ceil(Double(file.length) * sampleRate / srcFormat.sampleRate) + 1)
+        guard let src = AVAudioPCMBuffer(pcmFormat: srcFormat, frameCapacity: AVAudioFrameCount(file.length)),
+              let out = AVAudioPCMBuffer(pcmFormat: fmt, frameCapacity: cap) else {
+            throw failure(3, "Cannot allocate audio buffers for \(url.path)")
+        }
+        try file.read(into: src)  // read up front so I/O errors propagate instead of being swallowed
+        var fed = false; var err: NSError?
+        let status = conv.convert(to: out, error: &err) { _, inputStatus in
+            guard !fed, src.frameLength > 0 else { inputStatus.pointee = .endOfStream; return nil }
+            fed = true
+            inputStatus.pointee = .haveData
+            return src
+        }
+        guard status != .error, err == nil, let samples = out.floatChannelData else {
+            throw err ?? failure(4, "Conversion of \(url.path) produced no samples")
+        }
+        return Array(UnsafeBufferPointer(start: samples[0], count: Int(out.frameLength)))
+    }
+
+    // MARK: - Precomputed DFT basis (201 × 400)
+    // cos_basis[k, n] =  cos(2π k n / 400)  →  Y[k].real = cos_basis @ x
+    // sin_basis[k, n] = -sin(2π k n / 400)  →  Y[k].imag = sin_basis @ x
+
+    static let cosBasis: [Float] = makeDFTBasis { cos($0) }
+    static let sinBasis: [Float] = makeDFTBasis { -sin($0) }
+
+    /// Builds a row-major nFreqs × nFFT matrix whose entry [k, n] is `wave(2π·k·n / nFFT)`.
+    private static func makeDFTBasis(_ wave: (Double) -> Double) -> [Float] {
+        var m = [Float](repeating: 0, count: nFreqs * nFFT)
+        for k in 0..<nFreqs {
+            for n in 0..<nFFT {
+                // Reduce k·n modulo nFFT so the phase stays in [0, 2π) and keeps full precision.
+                m[k * nFFT + n] = Float(wave(2 * Double.pi * Double((k * n) % nFFT) / Double(nFFT)))
+            }
+        }
+        return m
+    }
+
+    // MARK: - Mel filterbank (128 × 201, Slaney-normalised)
+
+    static let melFilterbank: [Float] = makeMelFilterbank()
+
+    // MARK: - Mel computation
+
+    /// Computes Whisper log-mel features from 16 kHz mono samples (trimmed / zero-padded to 30 s).
+    /// - Returns: `nMelBins * nFrames` values laid out as `mel[bin * nFrames + frame]`.
+    static func fromPCM(_ raw: [Float]) -> [Float] {
+        // 1. Trim / zero-pad to nSamples
+        var audio = raw
+        if audio.count > nSamples { audio = Array(audio.prefix(nSamples)) }
+        else if audio.count < nSamples {
+            audio += [Float](repeating: 0, count: nSamples - audio.count)
+        }
+
+        // 2. Reflect-pad by nFFT/2 (matches np.pad(..., mode='reflect'))
+        let pad = nFFT / 2  // 200
+        var padded = [Float](repeating: 0, count: nSamples + 2 * pad)
+        for i in 0..<pad      { padded[pad - 1 - i]       = audio[i + 1] }
+        for i in 0..<nSamples { padded[pad + i]            = audio[i] }
+        for i in 0..<pad      { padded[pad + nSamples + i] = audio[nSamples - 2 - i] }
+
+        // 3. Periodic Hann window (divide by nFFT, not nFFT − 1), i.e. torch.hann_window(400)
+        let window = (0..<nFFT).map { i in Float(0.5 * (1 - cos(2 * Double.pi * Double(i) / Double(nFFT)))) }
+
+        var frame     = [Float](repeating: 0, count: nFFT)
+        var yReal     = [Float](repeating: 0, count: nFreqs)
+        var yImag     = [Float](repeating: 0, count: nFreqs)
+        var powerSpec = [Float](repeating: 0, count: nFreqs)
+        var melFrame  = [Float](repeating: 0, count: nMelBins)
+        var mel       = [Float](repeating: 0, count: nMelBins * nFrames)
+
+        for t in 0..<nFrames {
+            let offset = t * hopLength
+
+            // Apply Hann window
+            vDSP_vmul(Array(padded[offset ..< offset + nFFT]), 1,
+                      window, 1, &frame, 1, vDSP_Length(nFFT))
+
+            // DFT via matrix multiply: Y[k] = cosBasis[k,:] @ frame - i × sinBasis[k,:] @ frame
+            cblas_sgemv(CblasRowMajor, CblasNoTrans, Int32(nFreqs), Int32(nFFT), 1.0, cosBasis,
+                        Int32(nFFT), frame, 1, 0.0, &yReal, 1)
+            cblas_sgemv(CblasRowMajor, CblasNoTrans, Int32(nFreqs), Int32(nFFT), 1.0, sinBasis,
+                        Int32(nFFT), frame, 1, 0.0, &yImag, 1)
+
+            // Power spectrum |Y[k]|² = yReal² + yImag²
+            vDSP_vmma(yReal, 1, yReal, 1, yImag, 1, yImag, 1, &powerSpec, 1, vDSP_Length(nFreqs))
+
+            // Apply mel filterbank: (128×201) × (201) → (128)
+            cblas_sgemv(CblasRowMajor, CblasNoTrans, Int32(nMelBins), Int32(nFreqs), 1.0, melFilterbank,
+                        Int32(nFreqs), powerSpec, 1, 0.0, &melFrame, 1)
+
+            for i in 0..<nMelBins {
+                mel[i * nFrames + t] = log10(max(melFrame[i], 1e-10))
+            }
+        }
+
+        // Normalise: clamp to max−8, then (x+4)/4
+        let maxVal = mel.max() ?? 0
+        for i in 0..<mel.count { mel[i] = (max(mel[i], maxVal - 8) + 4) / 4 }
+        return mel
+    }
+
+    // MARK: - Filterbank builder
+
+    /// Builds the row-major nMelBins × nFreqs triangular filterbank on the Slaney mel scale (linear below
+    /// 1 kHz, logarithmic above) with Slaney area normalisation, as WhisperFeatureExtractor configures it.
+    private static func makeMelFilterbank() -> [Float] {
+        let fMax = sampleRate / 2  // 8000 Hz
+        let logStep: Double = log(6.4) / 27  // Slaney scale: 27 mels per factor of 6.4 above 1 kHz
+
+        func hzToMel(_ f: Double) -> Double { f < 1000 ? 3 * f / 200 : 15 + log(f / 1000) / logStep }
+        func melToHz(_ m: Double) -> Double { m < 15 ? 200 * m / 3 : 1000 * exp(logStep * (m - 15)) }
+
+        let melMin = hzToMel(0), melMax = hzToMel(fMax)
+        let nPts = nMelBins + 2
+        let pts = (0..<nPts).map { i -> Double in
+            melToHz(melMin + Double(i) / Double(nPts - 1) * (melMax - melMin))
+        }
+        // FFT bin frequencies for n_fft = 400
+        let fftFreqs = (0..<nFreqs).map { Double($0) * sampleRate / Double(nFFT) }
+
+        var fb = [Float](repeating: 0, count: nMelBins * nFreqs)
+        for m in 0..<nMelBins {
+            let fL = pts[m], fC = pts[m + 1], fR = pts[m + 2]
+            let norm = 2 / (fR - fL)
+            for k in 0..<nFreqs {
+                let f = fftFreqs[k]
+                if      f >= fL && f <= fC { fb[m * nFreqs + k] = Float(norm * (f - fL) / (fC - fL)) }
+                else if f >  fC && f <= fR { fb[m * nFreqs + k] = Float(norm * (fR - f) / (fR - fC)) }
+            }
+        }
+        return fb
+    }
+}