antirez · Lompo790 · Feb 12, 2026 · Feb 15, 2026
diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,4 @@ voxtral-model/
 pyenv*/
 misc/
 *.o
+mic_test
diff --git a/Makefile b/Makefile
@@ -50,9 +50,10 @@ SRCS += voxtral_mic_macos.c
 blas: CFLAGS = $(CFLAGS_BASE) -DUSE_BLAS -DACCELERATE_NEW_LAPACK
 blas: LDFLAGS += -framework Accelerate -framework AudioToolbox -framework CoreFoundation
 else
+SRCS += voxtral_mic_linux.c
 blas: CFLAGS = $(CFLAGS_BASE) -DUSE_BLAS -DUSE_OPENBLAS -I/usr/include/openblas
-blas: LDFLAGS += -lopenblas
-SRCS += voxtral_mic_macos.c
+blas: LDFLAGS += -lopenblas -lasound -lpthread
+#SRCS += voxtral_mic_macos.c
 endif
 blas: clean $(TARGET)
 	@echo ""
@@ -122,11 +123,25 @@ inspect: inspect_weights.o voxtral_safetensors.o
 test:
 	@./runtest.sh
 
+# =============================================================================
+# Mic backends test (macOS CoreAudio / Linux ALSA)
+# =============================================================================
+# Pick the single capture backend (and its link libraries) for this platform,
+# so mic_test neither compiles nor depends on the other platform's object.
+ifeq ($(UNAME_S),Darwin)
+MIC_TEST_BACKEND := voxtral_mic_macos.o
+MIC_TEST_LIBS := -framework AudioToolbox -framework CoreFoundation -lpthread -lm
+else
+MIC_TEST_BACKEND := voxtral_mic_linux.o
+MIC_TEST_LIBS := -lasound -lpthread -lm
+endif
+
+# mic_test is a real file target: it is relinked only when the test source,
+# the selected backend object, or the shared mic interface header changes.
+mic_test: voxtral_mic_test.c $(MIC_TEST_BACKEND) voxtral_mic.h
+	$(CC) $(CFLAGS_BASE) -o $@ voxtral_mic_test.c $(MIC_TEST_BACKEND) $(MIC_TEST_LIBS)
+	@echo "Built mic_test for $(UNAME_S)"
+
 # =============================================================================
 # Utilities
 # =============================================================================
 clean:
 	rm -f $(OBJS) *.mps.o voxtral_metal.o main.o inspect_weights.o $(TARGET) inspect_weights
+# Mic backend objects and the mic_test binary are plain files: -f is enough,
+# no recursive removal is needed.
+	rm -f voxtral_mic_macos.o voxtral_mic_linux.o mic_test
 	rm -f voxtral_shaders_source.h
 
 info:
@@ -155,4 +170,5 @@ voxtral_tokenizer.o: voxtral_tokenizer.c voxtral_tokenizer.h
 voxtral_safetensors.o: voxtral_safetensors.c voxtral_safetensors.h
 main.o: main.c voxtral.h voxtral_kernels.h voxtral_mic.h
 voxtral_mic_macos.o: voxtral_mic_macos.c voxtral_mic.h
+voxtral_mic_linux.o: voxtral_mic_linux.c voxtral_mic.h
 inspect_weights.o: inspect_weights.c voxtral_safetensors.h
diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 
 This is a C implementation of the inference pipeline for the [Mistral AI's Voxtral Realtime 4B model](https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602). It has zero external dependencies beyond the C standard library. The MPS inference is decently fast, while the BLAS acceleration is usable but slow (it continuously convert the bf16 weights to fp32).
 
-Audio processing uses a chunked encoder with overlapping windows, bounding memory usage regardless of input length. Audio can also be piped from stdin (`--stdin`), or captured live from the microphone (`--from-mic`, macOS), making it easy to transcode and transcribe any format via ffmpeg. A streaming C API (`vox_stream_t`) lets you feed audio incrementally and receive token strings as they become available.
+Audio processing uses a chunked encoder with overlapping windows, bounding memory usage regardless of input length. Audio can also be piped from stdin (`--stdin`), or captured live from the microphone (`--from-mic`, macOS and GNU/Linux), making it easy to transcode and transcribe any format via ffmpeg. A streaming C API (`vox_stream_t`) lets you feed audio incrementally and receive token strings as they become available.
 
 **More testing needed:** please note that this project was mostly tested against few samples, and likely requires some more work to be production quality. However the hard part, to understand the model inference and reproduce the inference pipeline, is here, so the rest likely can be done easily. Testing it against very long transcriptions, able to stress the KV cache circular buffer, will be a useful task.
 
@@ -25,7 +25,7 @@ make mps       # Apple Silicon (fastest)
 # Transcribe audio (tokens stream to stdout as generated)
 ./voxtral -d voxtral-model -i audio.wav
 
-# Live microphone transcription (macOS, Ctrl+C to stop)
+# Live microphone transcription (macOS and GNU/Linux, Ctrl+C to stop)
 ./voxtral -d voxtral-model --from-mic
 
 # Pipe any format via ffmpeg
@@ -57,7 +57,7 @@ This requires just PyTorch and a few standard libraries.
 - **Streaming output**: Tokens are printed to stdout as they are generated, word by word.
 - **Streaming C API**: Feed audio incrementally, get token strings back as they become available.
 - **Memory-mapped weights**: BF16 weights are mmap'd directly from safetensors, loading is near-instant.
-- **Live microphone input**: `--from-mic` captures and transcribes from the default microphone (macOS) with automatic silence detection.
+- **Live microphone input**: `--from-mic` captures and transcribes from the default microphone (macOS and GNU/Linux) with automatic silence detection.
 - **WAV input**: Supports 16-bit PCM WAV files at any sample rate (auto-resampled to 16kHz).
 - **Chunked encoder**: Processes audio in overlapping chunks, bounding memory regardless of length.
 - **Rolling KV cache**: Decoder KV cache is automatically compacted when it exceeds the sliding window (8192 positions), capping memory usage and allowing unlimited-length audio.
@@ -151,13 +151,21 @@ curl -sL http://stream.live.vc.bbcmedia.co.uk/bbc_world_service | \
 
 ### Live Microphone Input
 
-The **`--from-mic` flag** captures audio from the default microphone (macOS only, uses AudioQueue Services). Press Ctrl+C to stop. Silence is automatically detected and stripped to reduce encoder/decoder work when you pause speaking — only actual speech is processed.
+The **`--from-mic` flag** captures audio from the default microphone (macOS AudioQueue Services, GNU/Linux ALSA). Press Ctrl+C to stop. Silence is automatically detected and stripped to reduce encoder/decoder work when you pause speaking — only actual speech is processed.
 
 ```bash
 ./voxtral -d voxtral-model --from-mic                # default 2s processing interval
 ./voxtral -d voxtral-model --from-mic -I 1.0          # lower latency
 ./voxtral -d voxtral-model --from-mic --silent         # no stderr status
 ```
+On GNU/Linux, building requires the ALSA development libraries. If they are not already installed:
+
+```bash
+# Ubuntu/Debian
+sudo apt install libasound2 libasound2-dev
+# Fedora
+sudo dnf install alsa-lib alsa-lib-devel
+```
 
 If the model falls behind real-time, a warning is printed and audio is skipped to catch up.
 
@@ -297,6 +305,7 @@ Other targets:
 make clean      # Clean build artifacts
 make info       # Show available backends for this platform
 make inspect    # Build safetensors weight inspector
+make mic_test   # Build microphone backends test (macOS and GNU/Linux)
 ```
 
 ## Model Download

diff --git a/voxtral_mic_linux.c b/voxtral_mic_linux.c
@@ -0,0 +1,227 @@
+/*
+ * voxtral_mic_linux.c - Microphone capture using ALSA (GNU/Linux)
+ * (requires the ALSA development package, e.g. libasound2-dev)
+ *
+ * Captures audio from the default microphone at 16 kHz, mono S16LE,
+ * converts samples to float [-1, 1], and writes them into a
+ * mutex‑protected ring buffer.
+ * The main thread polls vox_mic_read() to drain samples.
+ */
+
+#ifdef __linux__
+
+#include "voxtral_mic.h"
+#include <alsa/asoundlib.h>
+#include <pthread.h>
+#include <string.h>
+#include <stdio.h>
+
+#define MIC_SAMPLE_RATE   16000   // capture rate in Hz requested from ALSA
+#define MIC_BUF_FRAMES    1600    // frames per read: 1600 frames = 100 ms at 16 kHz (mono)
+#define RING_CAPACITY     160000  // ring size in samples: 10 seconds at 16 kHz
+
+static snd_pcm_t          *pcm_handle = NULL;   // ALSA PCM capture handle; NULL while closed
+static pthread_t           capture_thread;   // background thread running mic_capture_thread()
+static pthread_mutex_t     ring_mutex = PTHREAD_MUTEX_INITIALIZER;  // guards ring, ring_head and ring_count
+static float               ring[RING_CAPACITY];   // circular buffer of float samples
+static int                 ring_head = 0;   // next write position; oldest sample is ring_count slots behind it
+static int                 ring_count = 0;  // number of unread samples in ring (at most RING_CAPACITY)
+static volatile int        running = 0;    // 1 while capture is active: set by vox_mic_start(), cleared by vox_mic_stop(), polled by the capture thread
+
+
+
+/* Capture thread entry point.
+ *
+ * Repeatedly reads up to MIC_BUF_FRAMES S16 frames from pcm_handle, scales
+ * each sample by 1/32768 to float, and appends it to the ring buffer while
+ * holding ring_mutex. When the ring is full, new samples overwrite the
+ * oldest ones.
+ *
+ * The loop ends when `running` is cleared (see vox_mic_stop) or when a read
+ * error cannot be recovered. On a fatal error the thread only exits: it
+ * deliberately leaves `running` untouched so that vox_mic_stop() still joins
+ * this thread and closes the PCM handle.
+ *
+ * arg is unused; the function always returns NULL. */
+static void *mic_capture_thread(void *arg) {
+    (void)arg;
+
+    int16_t buf[MIC_BUF_FRAMES];
+
+    while (running) {
+        snd_pcm_sframes_t n = snd_pcm_readi(pcm_handle, buf, MIC_BUF_FRAMES);
+        if (n < 0) {
+            /* A stop was requested while we were blocked in snd_pcm_readi():
+             * the error is the expected effect of snd_pcm_drop(), so leave
+             * quietly instead of reporting a failure. */
+            if (!running) break;
+
+            /* Nothing available right now: simply try again. */
+            if (n == -EAGAIN) continue;
+
+            /* snd_pcm_recover() deals with -EINTR, -EPIPE (overrun) and
+             * -ESTRPIPE (suspend); any other error is returned unchanged.
+             * The last argument (1) keeps ALSA from printing its own message. */
+            int rc = snd_pcm_recover(pcm_handle, (int)n, 1);
+            if (rc < 0) {
+                /* Unrecoverable: exit rather than spin on the same error. */
+                fprintf(stderr, "ALSA read error: %s\n", snd_strerror(rc));
+                break;
+            }
+            continue;
+        }
+
+        int frames = (int)n;
+        pthread_mutex_lock(&ring_mutex);
+        for (int i = 0; i < frames; i++) {
+            float sample = buf[i] / 32768.0f;
+            ring[ring_head] = sample;
+            ring_head = (ring_head + 1) % RING_CAPACITY;
+            if (ring_count < RING_CAPACITY) {
+                ring_count++;
+            }
+            /* If the ring is full, ring_count stays at RING_CAPACITY and the
+             * implicit tail (ring_head - ring_count) advances with the head,
+             * dropping the oldest sample. */
+        }
+        pthread_mutex_unlock(&ring_mutex);
+    }
+
+    return NULL;
+}
+
+
+
+/* Report an ALSA failure.
+ * err: return code of an ALSA call; msg: text printed before the ALSA
+ * error description. Returns 0 when err is non-negative, otherwise prints
+ * "<msg>: <alsa error>" to stderr and returns -1. */
+static int alsa_errcheck(int err, const char *msg) {
+    if (err >= 0) return 0;
+
+    const char *reason = snd_strerror(err);
+    fprintf(stderr, "%s: %s\n", msg, reason);
+    return -1;
+}
+
+
+
+/* Open the "default" ALSA capture device, configure it for interleaved
+ * mono S16LE at MIC_SAMPLE_RATE, reset the ring buffer and launch the
+ * capture thread.
+ *
+ * Returns 0 on success (or immediately if capture is already running) and
+ * -1 on any failure; on failure the PCM handle is closed again and
+ * pcm_handle is reset to NULL. */
+int vox_mic_start(void) {
+    /* pcm_handle already running, skip start process */
+    if (running) return 0;
+
+    int err;
+    snd_pcm_hw_params_t *hw_params = NULL;
+
+    /* Open ALSA pcm_handle for capture */
+    err = snd_pcm_open(&pcm_handle, "default", SND_PCM_STREAM_CAPTURE, 0);
+    if (alsa_errcheck(err, "snd_pcm_open failed")) goto fail;
+
+    /* Allocate hw_params and initialize it via snd_pcm_hw_params_any() */
+    snd_pcm_hw_params_alloca(&hw_params);
+    err = snd_pcm_hw_params_any(pcm_handle, hw_params);
+    if (alsa_errcheck(err, "snd_pcm_hw_params_any failed")) goto fail;
+
+    /* Set ACCESS param to INTERLEAVED */
+    err = snd_pcm_hw_params_set_access(pcm_handle, hw_params, SND_PCM_ACCESS_RW_INTERLEAVED);
+    if (alsa_errcheck(err, "snd_pcm_hw_params_set_access failed")) goto fail;
+
+    /* Set CHANNELS param to MONO */
+    err = snd_pcm_hw_params_set_channels(pcm_handle, hw_params, 1);
+    if (alsa_errcheck(err, "snd_pcm_hw_params_set_channels failed")) goto fail;
+
+    /* Set FORMAT param to S16LE */
+    err = snd_pcm_hw_params_set_format(pcm_handle, hw_params, SND_PCM_FORMAT_S16_LE);
+    if (alsa_errcheck(err, "snd_pcm_hw_params_set_format failed")) goto fail;
+
+    /* Request a PERIOD SIZE close to MIC_BUF_FRAMES, the chunk size the
+     * capture thread reads per call; the value ALSA actually chose is
+     * written back to period_size but not checked here. */
+    snd_pcm_uframes_t period_size = MIC_BUF_FRAMES;
+    err = snd_pcm_hw_params_set_period_size_near(pcm_handle, hw_params, &period_size, NULL);
+    if (alsa_errcheck(err, "snd_pcm_hw_params_set_period_size_near failed")) goto fail;
+
+    /* Set RATE param to 16kHz (mandatory for Voxtral). The "near" call may
+     * settle on a different rate, so the result is verified and anything
+     * other than exactly MIC_SAMPLE_RATE is treated as a failure. */
+    unsigned int rate = MIC_SAMPLE_RATE;
+    err = snd_pcm_hw_params_set_rate_near(pcm_handle, hw_params, &rate, NULL);
+    if (alsa_errcheck(err, "snd_pcm_hw_params_set_rate_near failed")) goto fail;
+    if (rate != MIC_SAMPLE_RATE) {
+        fprintf(stderr, "Your audio pcm_handle does not support 16000 Hz\n");
+        goto fail;
+    }
+
+    /* Finally commit all previous params settings to the ALSA pcm_handle */
+    err = snd_pcm_hw_params(pcm_handle, hw_params);
+    if (alsa_errcheck(err, "snd_pcm_hw_params failed")) goto fail;
+
+    /* Set the ALSA pcm_handle as ready */
+    err = snd_pcm_prepare(pcm_handle);
+    if (alsa_errcheck(err, "snd_pcm_prepare failed")) goto fail;
+
+    /* Empty the ring buffer so samples from a previous session are not
+     * returned by vox_mic_read() */
+    pthread_mutex_lock(&ring_mutex);
+    ring_head = 0;
+    ring_count = 0;
+    pthread_mutex_unlock(&ring_mutex);
+
+    /* Start capture thread. running must be set before pthread_create()
+     * because the thread's loop condition tests it on entry. */
+    running = 1;
+    err = pthread_create(&capture_thread, NULL, mic_capture_thread, NULL);
+    if (err != 0) {
+        fprintf(stderr, "pthread_create failed: %d\n", err);
+        running = 0;
+        goto fail;
+    }
+
+    return 0;
+
+/* Shared cleanup: close the device if it was opened and report failure */
+fail:
+     if (pcm_handle) {
+         snd_pcm_close(pcm_handle);
+         pcm_handle = NULL;
+     }
+     return -1;
+}
+
+
+
+/* Drain up to max_samples of the oldest buffered samples into out.
+ * Returns the number of samples copied; 0 when out is NULL, max_samples is
+ * not positive, or the ring buffer is empty. */
+int vox_mic_read(float *out, int max_samples) {
+    if (out == NULL || max_samples < 1) return 0;
+
+    pthread_mutex_lock(&ring_mutex);
+    int take = max_samples;
+    if (ring_count < take) take = ring_count;
+
+    if (take > 0) {
+        /* The oldest unread sample sits ring_count slots behind the write head. */
+        int tail = (ring_head - ring_count + RING_CAPACITY) % RING_CAPACITY;
+
+        /* Copy in at most two contiguous pieces: from tail up to the end of
+         * the array, then (if the data wraps) from the start of the array. */
+        int first = RING_CAPACITY - tail;
+        if (first > take) first = take;
+        memcpy(out, &ring[tail], (size_t)first * sizeof(float));
+        memcpy(out + first, ring, (size_t)(take - first) * sizeof(float));
+
+        ring_count -= take;
+    }
+    pthread_mutex_unlock(&ring_mutex);
+
+    return take;
+}
+
+
+
+/* Return how many unread samples are currently held in the ring buffer.
+ * The count is sampled under ring_mutex. */
+int vox_mic_read_available(void) {
+    int pending;
+
+    pthread_mutex_lock(&ring_mutex);
+    pending = ring_count;
+    pthread_mutex_unlock(&ring_mutex);
+
+    return pending;
+}
+
+
+
+/* Stop capture: clear the running flag, drop the PCM stream, join the
+ * capture thread and close the device. Does nothing if capture is not
+ * running. */
+void vox_mic_stop(void) {
+    if (running) {
+        running = 0;
+
+        /* Dropping the stream forces a blocked snd_pcm_readi() to return,
+         * so the capture thread can observe running == 0 and exit. */
+        if (pcm_handle != NULL) snd_pcm_drop(pcm_handle);
+        pthread_join(capture_thread, NULL);
+
+        if (pcm_handle != NULL) {
+            snd_pcm_close(pcm_handle);
+            pcm_handle = NULL;
+        }
+    }
+}
+
+
+
+#else  /* !__linux__ */
+#include "voxtral_mic.h"
+#include <stdio.h>
+
+/* Stub: ALSA capture is not available here, so report it and fail with -1. */
+int vox_mic_start(void) {
+    fputs("Microphone capture with ALSA is not supported on this platform\n", stderr);
+    return -1;
+}
+
+/* Stub: never produces audio; both arguments are ignored and 0 is returned. */
+int vox_mic_read(float *out, int max_samples) {
+    (void)max_samples;
+    (void)out;
+    return 0;
+}
+
+/* Stub: no samples are ever buffered. */
+int vox_mic_read_available(void) {
+    return 0;
+}
+
+/* Stub: nothing is ever started, so there is nothing to stop. */
+void vox_mic_stop(void) {
+}
+
+#endif
+
+
diff --git a/voxtral_mic_test.c b/voxtral_mic_test.c
@@ -0,0 +1,51 @@
+/*
+ * voxtral_mic_test.c - Microphone backends test (macOS and GNU/Linux)
+ *
+ * This test is useful to verify that the compiled backend is receiving microphone input.
+ * Since it uses the voxtral_mic.h interface, it is OS-agnostic
+ * and works with both AudioQueue (macOS) and ALSA (GNU/Linux) backends.
+ * */
+
+#include "voxtral_mic.h"
+#include <unistd.h>
+#include <math.h>
+#include <stdio.h>
+
+/* Root-mean-square level of the first n samples of buf.
+ * Squares are accumulated in double precision; returns 0.0f when n <= 0
+ * (buf is not read in that case). */
+static float rms(const float *buf, int n) {
+    if (n < 1) return 0.0f;
+
+    double sum_sq = 0.0;
+    const float *end = buf + n;
+    for (const float *p = buf; p != end; ++p) {
+        sum_sq += *p * *p;
+    }
+    return sqrt(sum_sq / n);
+}
+
+/* Manual smoke test for the microphone backend: start capture, poll it 100
+ * times at 100 ms intervals printing the samples available, the samples
+ * read and their RMS level, then stop capture.
+ * Returns 1 if the microphone cannot be started, 0 otherwise. */
+int main() {
+    enum { CHUNK_SAMPLES = 1024, ITERATIONS = 100, POLL_USEC = 100000 };
+    float samples[CHUNK_SAMPLES];
+    int iter = 0;
+
+    printf("=== Mic backend test ===\n");
+
+    printf("Starting microphone...\n");
+    if (vox_mic_start() != 0) {
+        fprintf(stderr, "ERROR: unable to start microphone\n");
+        return 1;
+    }
+
+    while (iter < ITERATIONS) {
+        /* Query the backlog first so the line shows what was pending
+         * before this read drained part of it. */
+        int pending = vox_mic_read_available();
+        int got = vox_mic_read(samples, CHUNK_SAMPLES);
+        float level = rms(samples, got);
+
+        printf("iter=%d  available=%d  read=%d | RMS=%.6f\n",
+               iter, pending, got, level);
+
+        usleep(POLL_USEC);
+        iter++;
+    }
+
+    printf("Stopping microphone...\n");
+    vox_mic_stop();
+    printf("Stopped.\n");
+
+    printf("=== End test ===\n");
+    return 0;
+}
+
+
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,3 +4,4 @@ voxtral-model/ @@
     pyenv*/
     misc/
     *.o
+    mic_test