Skip to content
This repository was archived by the owner on Jan 24, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 15 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,19 @@
<img src="https://github.com/user-attachments/assets/1be5e31c-de42-40ab-9b85-790cb911ed47" alt="WhisperKit" width="20%" />
</a>

# WhisperKit Android (Beta)
# WhisperKit Android

[![Tests](https://github.com/argmaxinc/whisperkitandroid/actions/workflows/pr-checks.yml/badge.svg)](https://github.com/argmaxinc/whisperkitandroid/actions/workflows/pr-checks.yml)
[![License](https://img.shields.io/github/license/argmaxinc/whisperkitandroid?logo=github&logoColor=969da4&label=License&labelColor=353a41&color=32d058)](LICENSE.md)
[![Maven Central](https://img.shields.io/maven-central/v/com.argmaxinc/whisperkit?logo=sonatype&logoColor=969da4&label=Maven%20Central&labelColor=353a41&color=32d058)](https://central.sonatype.com/artifact/com.argmaxinc/whisperkit)
[![Discord](https://img.shields.io/discord/1171912382512115722?style=flat&logo=discord&logoColor=969da4&label=Discord&labelColor=353a41&color=32d058&link=https%3A%2F%2Fdiscord.gg%2FG5F5GZGecC)](https://discord.gg/G5F5GZGecC)

[![Maven Central](https://img.shields.io/maven-central/v/com.argmaxinc/whisperkit?color=32d058)](https://central.sonatype.com/artifact/com.argmaxinc/whisperkit)
</div>

WhisperKit Android brings Foundation Models On Device for Automatic Speech Recognition. It extends the performance and feature set of [WhisperKit](https://github.com/argmaxinc/WhisperKit) from Apple platforms to Android and Linux. The current feature set is a subset of the iOS counterpart,
but we are continuing to invest in Android and now welcome contributions from the community.

[Example App (Coming Soon)] [[Blog Post]](https://takeargmax.com/blog/android) [[Python Tools Repo]](https://github.com/argmaxinc/whisperkittools)
[[Example App]](https://play.google.com/store/apps/details?id=com.argmaxinc.whisperax) [[Blog Post]](https://takeargmax.com/blog/android) [[Python Tools Repo]](https://github.com/argmaxinc/whisperkittools)

## Table of Contents

Expand All @@ -37,7 +41,7 @@ To use WhisperKit in your Android app, you need to:
```kotlin
dependencies {
// 1. WhisperKit SDK
implementation("com.argmaxinc:whisperkit:0.3.0") // Check badge above for latest version
implementation("com.argmaxinc:whisperkit:0.3.2") // Check badge above for latest version

// 2. QNN dependencies for hardware acceleration
implementation("com.qualcomm.qnn:qnn-runtime:2.34.0")
Expand Down Expand Up @@ -73,18 +77,22 @@ class YourActivity : AppCompatActivity() {
whisperKit = WhisperKit.Builder()
.setModel(WhisperKit.OPENAI_TINY_EN)
.setApplicationContext(applicationContext)
.setCallback { what, timestamp, msg ->
.setCallback { what, result ->
// Handle transcription output
when (what) {
WhisperKit.TextOutputCallback.MSG_INIT -> {
// Model initialized successfully
}
WhisperKit.TextOutputCallback.MSG_TEXT_OUT -> {
// New transcription available
val text = msg
val time = timestamp
val fullText = result.text
val segments = result.segments
// Process the transcribed text as it becomes available
// This callback will be called multiple times as more audio is processed
segments.forEach { segment ->
// Process each segment
val segmentText = segment.text
}
}
WhisperKit.TextOutputCallback.MSG_CLOSE -> {
// Cleanup complete
Expand Down
2 changes: 1 addition & 1 deletion android/config/detekt.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ complexity:
thresholdInObjects: 10
LongParameterList:
functionThreshold: 8
constructorThreshold: 7
constructorThreshold: 8
CyclomaticComplexMethod:
threshold: 20
NestedBlockDepth:
Expand Down
2 changes: 1 addition & 1 deletion android/examples/WhisperAX/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ android {
applicationId = "com.argmaxinc.whisperax"
minSdk = 26
targetSdk = 35
versionCode = 3
versionCode = 6
versionName = "0.1.0"

testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import androidx.compose.material3.Surface
import androidx.compose.material3.Text
import androidx.compose.runtime.Composable
import androidx.compose.runtime.collectAsState
import androidx.compose.runtime.derivedStateOf
import androidx.compose.runtime.getValue
import androidx.compose.runtime.mutableStateOf
import androidx.compose.runtime.remember
Expand All @@ -40,6 +41,7 @@ import androidx.compose.ui.Modifier
import androidx.compose.ui.draw.alpha
import androidx.compose.ui.draw.rotate
import androidx.compose.ui.unit.dp
import com.argmaxinc.whisperax.WhisperViewModel.Companion.MODELS_SUPPORTING_NPU
import com.argmaxinc.whisperkit.ExperimentalWhisperKit
import com.argmaxinc.whisperkit.WhisperKit

Expand All @@ -50,11 +52,18 @@ enum class ComputeUnits(val displayName: String, val backendValue: Int) {
CPU_AND_NPU("NPU", WhisperKit.Builder.CPU_AND_NPU),
}

@OptIn(ExperimentalWhisperKit::class)
@Composable
fun ComputeUnitsView(viewModel: WhisperViewModel) {
val modelState by viewModel.modelState.collectAsState()
val encoderState by viewModel.encoderState.collectAsState()
val decoderState by viewModel.decoderState.collectAsState()
val selectedModel by viewModel.selectedModel.collectAsState()
val shouldEnableNPUForEncoderDecoder by remember {
derivedStateOf {
selectedModel in MODELS_SUPPORTING_NPU
}
}
val isEnabled = modelState == ModelState.LOADED || modelState == ModelState.UNLOADED

var whisperKitExpanded by remember { mutableStateOf(true) }
Expand All @@ -75,6 +84,7 @@ fun ComputeUnitsView(viewModel: WhisperViewModel) {
currentState = encoderState,
currentUnit = viewModel.encoderComputeUnits.collectAsState().value,
onUnitSelected = { viewModel.setEncoderComputeUnits(it) },
shouldEnableNPU = shouldEnableNPUForEncoderDecoder,
enabled = isEnabled,
)

Expand All @@ -85,6 +95,7 @@ fun ComputeUnitsView(viewModel: WhisperViewModel) {
currentState = decoderState,
currentUnit = viewModel.decoderComputeUnits.collectAsState().value,
onUnitSelected = { viewModel.setDecoderComputeUnits(it) },
shouldEnableNPU = shouldEnableNPUForEncoderDecoder,
enabled = isEnabled,
)
}
Expand Down Expand Up @@ -185,6 +196,7 @@ fun ComputeUnitRow(
currentState: ModelState,
currentUnit: ComputeUnits,
onUnitSelected: (ComputeUnits) -> Unit,
shouldEnableNPU: Boolean = true,
enabled: Boolean = true,
) {
val infiniteTransition = rememberInfiniteTransition(label = "loading animation")
Expand Down Expand Up @@ -248,7 +260,11 @@ fun ComputeUnitRow(
expanded = expanded,
onDismissRequest = { expanded = false },
) {
ComputeUnits.values().forEach { unit ->
if (shouldEnableNPU) {
listOf(ComputeUnits.CPU_ONLY, ComputeUnits.CPU_AND_GPU, ComputeUnits.CPU_AND_NPU)
} else {
listOf(ComputeUnits.CPU_ONLY, ComputeUnits.CPU_AND_GPU)
}.forEach { unit ->
DropdownMenuItem(
text = { Text(unit.displayName) },
onClick = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import androidx.compose.material3.Icon
import androidx.compose.material3.IconButton
import androidx.compose.material3.LinearProgressIndicator
import androidx.compose.material3.MaterialTheme
import androidx.compose.material3.MenuAnchorType
import androidx.compose.material3.OutlinedTextField
import androidx.compose.material3.Surface
import androidx.compose.material3.Text
Expand Down Expand Up @@ -111,7 +112,8 @@ fun ModelSelectorView(viewModel: WhisperViewModel) {
},
modifier = Modifier
.fillMaxWidth()
.weight(1f),
.weight(1f)
.menuAnchor(MenuAnchorType.PrimaryNotEditable),
)

ExposedDropdownMenu(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ import androidx.compose.runtime.mutableStateListOf
import androidx.lifecycle.ViewModel
import androidx.lifecycle.viewModelScope
import com.argmaxinc.whisperkit.ExperimentalWhisperKit
import com.argmaxinc.whisperkit.TranscriptionResult
import com.argmaxinc.whisperkit.TranscriptionSegment
import com.argmaxinc.whisperkit.WhisperKit
import com.argmaxinc.whisperkit.WhisperKit.TextOutputCallback
import com.argmaxinc.whisperkit.WhisperKitException
Expand All @@ -33,22 +35,13 @@ import java.text.SimpleDateFormat
import java.util.Date
import java.util.Locale

/**
 * A single segment of a transcription.
 *
 * @property text The text carried by this segment.
 * @property start Start marker of the segment (presumably a time offset in seconds — TODO confirm unit).
 * @property end End marker of the segment (presumably a time offset in seconds — TODO confirm unit).
 * @property tokens Integer values associated with the segment (presumably model token ids — TODO confirm);
 * defaults to an empty list.
 */
data class TranscriptionSegment(
val text: String,
val start: Float,
val end: Float,
val tokens: List<Int> = emptyList(),
)

/**
 * Result of a transcription: an overall text plus the list of [TranscriptionSegment]s.
 *
 * Both properties have defaults, so `TranscriptionResult()` yields an empty result.
 *
 * @property text The result text; defaults to the empty string.
 * NOTE(review): whether this always equals the concatenation of the segment texts is not visible here — confirm.
 * @property segments The segments belonging to this result; defaults to an empty list.
 */
data class TranscriptionResult(
val text: String = "",
val segments: List<TranscriptionSegment> = emptyList(),
)

@OptIn(ExperimentalWhisperKit::class)
class WhisperViewModel : ViewModel() {
companion object {
const val TAG = "WhisperViewModel"

// Models that currently support the NPU backend; do not enable the NPU for any other model.
val MODELS_SUPPORTING_NPU = listOf(WhisperKit.Builder.QUALCOMM_TINY_EN, WhisperKit.Builder.QUALCOMM_BASE_EN)
}

private lateinit var appContext: Context
Expand Down Expand Up @@ -190,25 +183,25 @@ class WhisperViewModel : ViewModel() {
cacheDir = context.cacheDir.absolutePath
}

fun onTextOutput(what: Int, timestamp: Float, msg: String) {
fun onTextOutput(what: Int, result: TranscriptionResult) {
val segments = result.segments
when (what) {
TextOutputCallback.MSG_INIT -> {
Log.i(MainActivity.TAG, "TFLite initialized: $msg")
Log.i(MainActivity.TAG, "TFLite initialized: ${result.text}")
startTime = System.currentTimeMillis()
_pipelineStart.value = startTime.toDouble() / 1000.0
_isInitializing.value = false
}

TextOutputCallback.MSG_TEXT_OUT -> {
Log.i(MainActivity.TAG, "TEXT OUT THREAD")
if (msg.isNotEmpty()) {
if (segments.isNotEmpty()) {
if (!firstTokenReceived) {
firstTokenReceived = true
firstTokenTimestamp = System.currentTimeMillis()
_firstTokenTime.value = (firstTokenTimestamp - startTime).toDouble() / 1000.0
}

val newTokens = msg.length / 4
val newTokens = segments.joinToString("") { it.text }.length / 4
totalTokens += newTokens

val currentTime = System.currentTimeMillis()
Expand All @@ -220,14 +213,14 @@ class WhisperViewModel : ViewModel() {
}

lastTokenTimestamp = currentTime
updateTranscript(msg)
updateTranscript(segments)
}
}

TextOutputCallback.MSG_CLOSE -> {
Log.i(MainActivity.TAG, "Transcription completed.")
if (msg.isNotEmpty()) {
val newTokens = msg.length / 4
if (segments.isNotEmpty()) {
val newTokens = segments.joinToString("") { it.text }.length / 4
totalTokens += newTokens

val totalTime = (System.currentTimeMillis() - startTime).toDouble() / 1000.0
Expand All @@ -236,8 +229,7 @@ class WhisperViewModel : ViewModel() {

updateRealtimeMetrics(totalTime)
}

updateTranscript(msg)
updateTranscript(segments)
}
}

Expand All @@ -247,25 +239,8 @@ class WhisperViewModel : ViewModel() {
}
}

private fun updateTranscript(chunkText: String, withTimestamps: Boolean = false) {
var processedText = chunkText

val timestamps = if (withTimestamps) {
val timestampPattern = "<\\|(\\d+\\.\\d+)\\|>".toRegex()
val timestampMatches = timestampPattern.findAll(chunkText).toList()
timestampMatches.map { it.groupValues[1].toFloat() }
} else {
emptyList()
}

if (!withTimestamps) {
processedText = processedText
.replace("<\\|[^>]*\\|>".toRegex(), "")
.trim()
} else {
processedText = processedText.trim()
}

private fun updateTranscript(segments: List<TranscriptionSegment>) {
val processedText = segments.joinToString("") { it.text }
if (processedText.isNotEmpty()) {
if (allText.isNotEmpty()) {
allText.append("\n")
Expand All @@ -284,13 +259,12 @@ class WhisperViewModel : ViewModel() {
fun listModels() {
viewModelScope.launch {
val modelDirs = listOf(
// TODO: enable when models are ready
// WhisperKit.Builder.OPENAI_TINY_EN,
// WhisperKit.Builder.OPENAI_BASE_EN,
// WhisperKit.Builder.OPENAI_SMALL_EN,
WhisperKit.Builder.QUALCOMM_TINY_EN,
WhisperKit.Builder.QUALCOMM_BASE_EN,
// WhisperKit.Builder.QUALCOMM_SMALL_EN
WhisperKit.Builder.OPENAI_TINY_EN,
WhisperKit.Builder.OPENAI_BASE_EN,
WhisperKit.Builder.OPENAI_TINY,
WhisperKit.Builder.OPENAI_BASE,
)
availableModels.clear()
availableModels.addAll(modelDirs)
Expand Down Expand Up @@ -364,6 +338,21 @@ class WhisperViewModel : ViewModel() {

fun selectModel(model: String) {
_selectedModel.value = model
if (model in MODELS_SUPPORTING_NPU) {
_encoderComputeUnits.update {
ComputeUnits.CPU_AND_NPU
}
_decoderComputeUnits.update {
ComputeUnits.CPU_AND_NPU
}
} else {
_encoderComputeUnits.update {
ComputeUnits.CPU_ONLY
}
_decoderComputeUnits.update {
ComputeUnits.CPU_ONLY
}
}
_modelState.value = ModelState.UNLOADED
_encoderState.value = ModelState.UNLOADED
_decoderState.value = ModelState.UNLOADED
Expand Down
2 changes: 1 addition & 1 deletion android/whisperkit/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ dependencies {

mavenPublishing {

coordinates("com.argmaxinc", "whisperkit", "0.3.0")
coordinates("com.argmaxinc", "whisperkit", "0.3.2")
pom {
name.set("WhisperKit")
description.set("On-device Speech Recognition for Android")
Expand Down
2 changes: 2 additions & 0 deletions android/whisperkit/detekt-baseline.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
<SmellBaseline>
<ManuallySuppressedIssues/>
<CurrentIssues>
<ID>LargeClass:ArgmaxModelDownloaderImplTest.kt$ArgmaxModelDownloaderImplTest</ID>
<ID>ThrowsCount:WhisperKit.kt$WhisperKit.Builder$@Throws(WhisperKitException::class) fun build(): WhisperKit</ID>
<ID>TooGenericExceptionCaught:KtorHuggingFaceApiImpl.kt$KtorHuggingFaceApiImpl$e: Exception</ID>
<ID>TooGenericExceptionCaught:WhisperKitImpl.kt$WhisperKitImpl$e: Exception</ID>
<ID>UnusedParameter:WhisperKitImpl.kt$WhisperKitImpl$timestamp: Float</ID>
</CurrentIssues>
</SmellBaseline>
Loading