Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ voxtral-model/
pyenv*/
misc/
*.o
mic_test
20 changes: 18 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,10 @@ SRCS += voxtral_mic_macos.c
blas: CFLAGS = $(CFLAGS_BASE) -DUSE_BLAS -DACCELERATE_NEW_LAPACK
blas: LDFLAGS += -framework Accelerate -framework AudioToolbox -framework CoreFoundation
else
SRCS += voxtral_mic_linux.c
blas: CFLAGS = $(CFLAGS_BASE) -DUSE_BLAS -DUSE_OPENBLAS -I/usr/include/openblas
blas: LDFLAGS += -lopenblas
SRCS += voxtral_mic_macos.c
blas: LDFLAGS += -lopenblas -lasound -lpthread
#SRCS += voxtral_mic_macos.c
endif
blas: clean $(TARGET)
@echo ""
Expand Down Expand Up @@ -122,11 +123,25 @@ inspect: inspect_weights.o voxtral_safetensors.o
test:
@./runtest.sh

# =============================================================================
# Mic backends test (macOS CoreAudio / Linux ALSA)
# =============================================================================
# Builds the standalone mic_test binary from voxtral_mic_test.c.
# Both backend objects are listed as prerequisites, but only the one matching
# UNAME_S is linked: voxtral_mic_macos.o (with the AudioToolbox and
# CoreFoundation frameworks) on Darwin, voxtral_mic_linux.o (with libasound)
# everywhere else.
mic_test: voxtral_mic_test.c voxtral_mic_macos.o voxtral_mic_linux.o
ifeq ($(UNAME_S),Darwin)
$(CC) $(CFLAGS_BASE) -o mic_test voxtral_mic_test.c voxtral_mic_macos.o \
-framework AudioToolbox -framework CoreFoundation -lpthread -lm
else
$(CC) $(CFLAGS_BASE) -o mic_test voxtral_mic_test.c voxtral_mic_linux.o \
-lasound -lpthread -lm
endif
@echo "Built mic_test for $(UNAME_S)"

# =============================================================================
# Utilities
# =============================================================================
# clean: remove object files, the main binary, the weight inspector, both mic
# backend objects, the mic_test binary and voxtral_shaders_source.h.
clean:
rm -f $(OBJS) *.mps.o voxtral_metal.o main.o inspect_weights.o $(TARGET) inspect_weights
rm -rf voxtral_mic_macos.o voxtral_mic_linux.o mic_test
rm -f voxtral_shaders_source.h

info:
Expand Down Expand Up @@ -155,4 +170,5 @@ voxtral_tokenizer.o: voxtral_tokenizer.c voxtral_tokenizer.h
voxtral_safetensors.o: voxtral_safetensors.c voxtral_safetensors.h
main.o: main.c voxtral.h voxtral_kernels.h voxtral_mic.h
voxtral_mic_macos.o: voxtral_mic_macos.c voxtral_mic.h
voxtral_mic_linux.o: voxtral_mic_linux.c voxtral_mic.h
inspect_weights.o: inspect_weights.c voxtral_safetensors.h
17 changes: 13 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

This is a C implementation of the inference pipeline for the [Mistral AI's Voxtral Realtime 4B model](https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602). It has zero external dependencies beyond the C standard library. The MPS inference is decently fast, while the BLAS acceleration is usable but slow (it continuously converts the bf16 weights to fp32).

Audio processing uses a chunked encoder with overlapping windows, bounding memory usage regardless of input length. Audio can also be piped from stdin (`--stdin`), or captured live from the microphone (`--from-mic`, macOS), making it easy to transcode and transcribe any format via ffmpeg. A streaming C API (`vox_stream_t`) lets you feed audio incrementally and receive token strings as they become available.
Audio processing uses a chunked encoder with overlapping windows, bounding memory usage regardless of input length. Audio can also be piped from stdin (`--stdin`), or captured live from the microphone (`--from-mic`, macOS and GNU/Linux), making it easy to transcode and transcribe any format via ffmpeg. A streaming C API (`vox_stream_t`) lets you feed audio incrementally and receive token strings as they become available.

**More testing needed:** please note that this project was mostly tested against few samples, and likely requires some more work to be production quality. However the hard part, to understand the model inference and reproduce the inference pipeline, is here, so the rest likely can be done easily. Testing it against very long transcriptions, able to stress the KV cache circular buffer, will be a useful task.

Expand All @@ -25,7 +25,7 @@ make mps # Apple Silicon (fastest)
# Transcribe audio (tokens stream to stdout as generated)
./voxtral -d voxtral-model -i audio.wav

# Live microphone transcription (macOS, Ctrl+C to stop)
# Live microphone transcription (macOS and GNU/Linux, Ctrl+C to stop)
./voxtral -d voxtral-model --from-mic

# Pipe any format via ffmpeg
Expand Down Expand Up @@ -57,7 +57,7 @@ This requires just PyTorch and a few standard libraries.
- **Streaming output**: Tokens are printed to stdout as they are generated, word by word.
- **Streaming C API**: Feed audio incrementally, get token strings back as they become available.
- **Memory-mapped weights**: BF16 weights are mmap'd directly from safetensors, loading is near-instant.
- **Live microphone input**: `--from-mic` captures and transcribes from the default microphone (macOS) with automatic silence detection.
- **Live microphone input**: `--from-mic` captures and transcribes from the default microphone (macOS and GNU/Linux) with automatic silence detection.
- **WAV input**: Supports 16-bit PCM WAV files at any sample rate (auto-resampled to 16kHz).
- **Chunked encoder**: Processes audio in overlapping chunks, bounding memory regardless of length.
- **Rolling KV cache**: Decoder KV cache is automatically compacted when it exceeds the sliding window (8192 positions), capping memory usage and allowing unlimited-length audio.
Expand Down Expand Up @@ -151,13 +151,21 @@ curl -sL http://stream.live.vc.bbcmedia.co.uk/bbc_world_service | \

### Live Microphone Input

The **`--from-mic` flag** captures audio from the default microphone (macOS only, uses AudioQueue Services). Press Ctrl+C to stop. Silence is automatically detected and stripped to reduce encoder/decoder work when you pause speaking — only actual speech is processed.
The **`--from-mic` flag** captures audio from the default microphone (macOS AudioQueue Services, GNU/Linux ALSA). Press Ctrl+C to stop. Silence is automatically detected and stripped to reduce encoder/decoder work when you pause speaking — only actual speech is processed.

```bash
./voxtral -d voxtral-model --from-mic # default 2s processing interval
./voxtral -d voxtral-model --from-mic -I 1.0 # lower latency
./voxtral -d voxtral-model --from-mic --silent # no stderr status
```
On GNU/Linux, make sure the ALSA development libraries are installed; if they are not, install them with:

```bash
# Ubuntu/Debian
sudo apt install libasound2 libasound2-dev
# Fedora
sudo dnf install alsa-lib alsa-lib-devel
```

If the model falls behind real-time, a warning is printed and audio is skipped to catch up.

Expand Down Expand Up @@ -297,6 +305,7 @@ Other targets:
make clean # Clean build artifacts
make info # Show available backends for this platform
make inspect # Build safetensors weight inspector
make mic_test # Build microphone backends test (macOS and GNU/Linux)
```

## Model Download
Expand Down
227 changes: 227 additions & 0 deletions voxtral_mic_linux.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
/*
* voxtral_mic_linux.c - Microphone capture using ALSA (GNU/Linux)
* (requires the ALSA development package, e.g. libasound2-dev)
*
* Captures audio from the default microphone at 16 kHz, mono S16LE,
* converts samples to float [-1, 1], and writes them into a
* mutex‑protected ring buffer.
* The main thread polls vox_mic_read() to drain samples.
*/

#ifdef __linux__

#include "voxtral_mic.h"
#include <alsa/asoundlib.h>
#include <pthread.h>
#include <string.h>
#include <stdio.h>

/* Capture configuration. */
#define MIC_SAMPLE_RATE 16000
#define MIC_BUF_FRAMES 1600 // 1600 frames = 100 ms at 16 kHz (mono)
#define RING_CAPACITY 160000 // 10 seconds at 16kHz

/* Module state shared between the capture thread and the public API.
 * ring, ring_head and ring_count are only accessed with ring_mutex held.
 * The oldest valid sample is not stored explicitly; readers derive it from
 * ring_head and ring_count.
 * NOTE(review): `running` is read and written from two threads without the
 * mutex; `volatile` does not provide inter-thread synchronization in C11, so
 * an atomic flag may be more appropriate -- confirm. */
static snd_pcm_t *pcm_handle = NULL; // ALSA PCM capture handle
static pthread_t capture_thread; // background thread for audio capture
static pthread_mutex_t ring_mutex = PTHREAD_MUTEX_INITIALIZER; // protects ring buffer state
static float ring[RING_CAPACITY]; // circular buffer
static int ring_head = 0; // next write position
static int ring_count = 0; // n of valid samples in ring
static volatile int running = 0; // capture running/not running



/* The audio capture thread:
* reads from ALSA pcm_handle, converts s16 to float, writes into ring buffer */
static void *mic_capture_thread(void *arg) {
(void)arg;

int16_t buf[MIC_BUF_FRAMES];

while (running) {
snd_pcm_sframes_t n = snd_pcm_readi(pcm_handle, buf, MIC_BUF_FRAMES);
if (n == -EPIPE) {
// XRUN: buffer overrun/underrun, reset pcm_handle with prepare and try again
snd_pcm_prepare(pcm_handle);
continue;
} else if (n == -EAGAIN || n == -EINTR) {
// Alsa is occupied or call interrupted, try again
continue;
} else if (n < 0) {
// Unrecoverable ALSA error
fprintf(stderr, "ALSA read error: %s\n", snd_strerror((int)n));
continue;
}

int frames = (int)n;
pthread_mutex_lock(&ring_mutex);
for (int i = 0; i < frames; i++) {
float sample = buf[i] / 32768.0f;
ring[ring_head] = sample;
ring_head = (ring_head + 1) % RING_CAPACITY;
if (ring_count < RING_CAPACITY) {
ring_count++;
}
/* If ring buffer is full, new samples overwrite the oldest ones.
* ring_head advances, ring_tail advances implicitly,
* ring_count stays at RING_CAPACITY. */
}
pthread_mutex_unlock(&ring_mutex);
}

return NULL;
}



/* Errors checking helper func */
/*
 * Report an ALSA failure.
 *
 * err: return code of an ALSA call.
 * msg: prefix printed before the ALSA error description.
 *
 * Returns 0 when err is non-negative; otherwise prints "<msg>: <description>"
 * to stderr and returns -1.
 */
static int alsa_errcheck(int err, const char *msg) {
    if (err >= 0) {
        return 0;
    }

    fprintf(stderr, "%s: %s\n", msg, snd_strerror(err));
    return -1;
}



/*
 * Open the "default" ALSA capture device, configure it and launch the capture
 * thread.
 *
 * Requested configuration: interleaved read/write access, 1 channel, S16_LE
 * samples, a period of about MIC_BUF_FRAMES frames and a rate of exactly
 * MIC_SAMPLE_RATE Hz (any other negotiated rate is rejected).
 *
 * Returns 0 on success or when capture is already running, -1 on failure.
 * On failure the PCM handle, if it was opened, is closed and reset to NULL.
 */
int vox_mic_start(void) {
    snd_pcm_hw_params_t *params = NULL;
    snd_pcm_uframes_t frames_per_period = MIC_BUF_FRAMES;
    unsigned int hz = MIC_SAMPLE_RATE;
    int rc;

    /* Nothing to do when the capture thread is already up */
    if (running) {
        return 0;
    }

    /* Capture device */
    rc = snd_pcm_open(&pcm_handle, "default", SND_PCM_STREAM_CAPTURE, 0);
    if (alsa_errcheck(rc, "snd_pcm_open failed") != 0) {
        goto fail;
    }

    /* Hardware parameters: start from the full configuration space, then
     * narrow it down one setting at a time */
    snd_pcm_hw_params_alloca(&params);
    rc = snd_pcm_hw_params_any(pcm_handle, params);
    if (alsa_errcheck(rc, "snd_pcm_hw_params_any failed") != 0) {
        goto fail;
    }

    rc = snd_pcm_hw_params_set_access(pcm_handle, params, SND_PCM_ACCESS_RW_INTERLEAVED);
    if (alsa_errcheck(rc, "snd_pcm_hw_params_set_access failed") != 0) {
        goto fail;
    }

    rc = snd_pcm_hw_params_set_channels(pcm_handle, params, 1);
    if (alsa_errcheck(rc, "snd_pcm_hw_params_set_channels failed") != 0) {
        goto fail;
    }

    rc = snd_pcm_hw_params_set_format(pcm_handle, params, SND_PCM_FORMAT_S16_LE);
    if (alsa_errcheck(rc, "snd_pcm_hw_params_set_format failed") != 0) {
        goto fail;
    }

    rc = snd_pcm_hw_params_set_period_size_near(pcm_handle, params, &frames_per_period, NULL);
    if (alsa_errcheck(rc, "snd_pcm_hw_params_set_period_size_near failed") != 0) {
        goto fail;
    }

    /* The rate is not negotiable: anything other than 16 kHz is an error */
    rc = snd_pcm_hw_params_set_rate_near(pcm_handle, params, &hz, NULL);
    if (alsa_errcheck(rc, "snd_pcm_hw_params_set_rate_near failed") != 0) {
        goto fail;
    }
    if (hz != MIC_SAMPLE_RATE) {
        fprintf(stderr, "Your audio pcm_handle does not support 16000 Hz\n");
        goto fail;
    }

    /* Apply the collected parameters to the device */
    rc = snd_pcm_hw_params(pcm_handle, params);
    if (alsa_errcheck(rc, "snd_pcm_hw_params failed") != 0) {
        goto fail;
    }

    rc = snd_pcm_prepare(pcm_handle);
    if (alsa_errcheck(rc, "snd_pcm_prepare failed") != 0) {
        goto fail;
    }

    /* Empty the ring buffer before any sample is produced */
    pthread_mutex_lock(&ring_mutex);
    ring_count = 0;
    ring_head = 0;
    pthread_mutex_unlock(&ring_mutex);

    /* The flag must be set before the thread starts, since the thread's loop
     * tests it right away */
    running = 1;
    rc = pthread_create(&capture_thread, NULL, mic_capture_thread, NULL);
    if (rc == 0) {
        return 0;
    }
    fprintf(stderr, "pthread_create failed: %d\n", rc);
    running = 0;

fail:
    if (pcm_handle != NULL) {
        snd_pcm_close(pcm_handle);
        pcm_handle = NULL;
    }
    return -1;
}



/*
 * Drain captured samples from the ring buffer, oldest first.
 *
 * out:         destination for the samples.
 * max_samples: maximum number of samples to copy.
 *
 * Returns the number of samples written to out; 0 when out is NULL,
 * max_samples is not positive, or the ring is empty.
 */
int vox_mic_read(float *out, int max_samples) {
    int copied;

    if (out == NULL || max_samples <= 0) {
        return 0;
    }

    pthread_mutex_lock(&ring_mutex);
    copied = (max_samples > ring_count) ? ring_count : max_samples;
    if (copied > 0) {
        /* Position of the oldest sample still held in the ring */
        int src = (ring_head + RING_CAPACITY - ring_count) % RING_CAPACITY;
        for (int dst = 0; dst < copied; dst++) {
            out[dst] = ring[src];
            if (++src == RING_CAPACITY) {
                src = 0;
            }
        }
        ring_count -= copied;
    }
    pthread_mutex_unlock(&ring_mutex);

    return copied;
}



/*
 * Returns the number of samples currently held in the ring buffer, read while
 * holding ring_mutex.
 */
int vox_mic_read_available(void) {
    int pending;

    pthread_mutex_lock(&ring_mutex);
    pending = ring_count;
    pthread_mutex_unlock(&ring_mutex);

    return pending;
}



/*
 * Stop capturing: clear the run flag, drop the PCM stream so that a blocking
 * snd_pcm_readi() in the capture thread returns, join the thread, then close
 * the PCM handle. Does nothing when capture is not running.
 */
void vox_mic_stop(void) {
    if (running) {
        running = 0;

        /* Dropping the stream wakes up the capture thread's blocking read */
        if (pcm_handle != NULL) {
            snd_pcm_drop(pcm_handle);
        }

        pthread_join(capture_thread, NULL);

        if (pcm_handle != NULL) {
            snd_pcm_close(pcm_handle);
            pcm_handle = NULL;
        }
    }
}



#else /* !__linux__ */
#include "voxtral_mic.h"
#include <stdio.h>

/*
 * Stub for platforms other than Linux: prints a diagnostic to stderr and
 * always returns -1.
 */
int vox_mic_start(void) {
    fputs("Microphone capture with ALSA is not supported on this platform\n", stderr);
    return -1;
}

/*
 * Stub for platforms other than Linux: ignores both arguments, leaves out
 * untouched and always returns 0.
 */
int vox_mic_read(float *out, int max_samples) {
    (void)max_samples;
    (void)out;

    return 0;
}

/* Stub for platforms other than Linux: always returns 0. */
int vox_mic_read_available(void) {
    return 0;
}

/* Stub for platforms other than Linux: does nothing. */
void vox_mic_stop(void) {
}

#endif


51 changes: 51 additions & 0 deletions voxtral_mic_test.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
* voxtral_mic_test.c - Microphone backends test (macOS and GNU/Linux)
*
* This test is useful for verifying that the compiled backend is receiving microphone input.
* Since it uses the voxtral_mic.h interface, it is OS-agnostic
* and works with both AudioQueue (macOS) and ALSA (GNU/Linux) backends.
* */

#include "voxtral_mic.h"
#include <unistd.h>
#include <math.h>
#include <stdio.h>

static float rms(const float *buf, int n) {
if (n <= 0) return 0.0f;
double acc = 0.0;
for (int i = 0; i < n; i++) acc += buf[i] * buf[i];
return sqrt(acc / n);
}

/*
 * Microphone smoke test: starts the backend, then 100 times, at 100 ms
 * intervals, reads up to 1024 samples and prints how many were queued, how
 * many were read and their RMS level. Stops the backend before exiting.
 *
 * Returns 0 on success, 1 when the microphone cannot be started.
 */
int main() {
    enum { CHUNK = 1024, ITERATIONS = 100, POLL_US = 100000 };
    float samples[CHUNK];
    int pass = 0;

    printf("=== Mic backend test ===\n");

    printf("Starting microphone...\n");
    if (vox_mic_start() != 0) {
        fprintf(stderr, "ERROR: unable to start microphone\n");
        return 1;
    }

    while (pass < ITERATIONS) {
        /* Sampled before the read, so it reports what was queued on entry */
        int queued = vox_mic_read_available();
        int got = vox_mic_read(samples, CHUNK);
        float level = rms(samples, got);

        printf("iter=%d available=%d read=%d | RMS=%.6f\n", pass, queued, got, level);

        usleep(POLL_US);
        pass++;
    }

    printf("Stopping microphone...\n");
    vox_mic_stop();
    printf("Stopped.\n");

    printf("=== End test ===\n");
    return 0;
}