Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions docs/TRANSCRIPTION-PROVIDER-MATRIX-2026-05-18.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,39 @@ adding more providers.
the file-based OpenAI diarization path, because both have first-class
streaming APIs and richer vocabulary/diarization controls.

## Implemented Router Behavior

`whisperforge_core.audio.build_transcription_plan()` now exposes the provider
router contract as fixture-friendly structured data. This is a planning layer,
not a runtime behavior change: `transcribe_audio()` still uses the existing
default OpenAI path, size chunker, and sequential chunk transcription unless
the caller explicitly selects another backend or chunker.

The implemented plan fields connect this matrix to code:

| Plan field | Implemented behavior |
| --- | --- |
| `capabilities` | Reports backend limits and feature flags for `openai`, `mlx`, `whisper_cpp`, and `whisperx`. |
| `media` | Summarizes ffprobe-style media fixtures, or stays unprobed when no fixture/inspection is requested. |
| `normalization` | Emits a planned-only FFmpeg command for video extraction or large probed audio that needs mono 16 kHz PCM normalization. |
| `output_contract` | Marks text-only backends versus WhisperX segment timestamps and diarization capability. |
| `privacy` | States whether audio leaves the device, which cloud provider receives it, and which local temp artifacts are expected. |
| `cost` | States whether provider API billing applies, estimated billable minutes when duration is known, and whether local/FFmpeg compute is expected. |

Current router fixture coverage pins the main lanes:

- `openai`: cloud, billable audio-minute receipt, chunked for files over
`CHUNK_THRESHOLD_BYTES`, and no default FFmpeg probe.
- `mlx`: local/private receipt, no provider API billing, and no normalization
for ordinary small audio.
- `whisperx`: local timestamp-capable plan, whole-file default for large inputs
unless `CHUNKER=vad`, and explicit diarization-capable output metadata.
- Video sources and large probed audio: planned FFmpeg extraction/resampling
before transcription, without requiring FFmpeg in the default unit suite.

Do not enable this normalization path as a runtime default until a product
decision accepts the `privacy` and `cost` receipts for the selected backend.

## Smallest Next Integration

Add an OpenAI diarized transcription mode without changing the default.
Expand Down
203 changes: 203 additions & 0 deletions tests/test_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
client boundary — no network traffic.
"""

import json
from pathlib import Path
from unittest.mock import MagicMock

Expand Down Expand Up @@ -42,6 +43,35 @@ def mock_openai(monkeypatch):
return client


def media_probe_fixture(
    *,
    duration="120.0",
    audio_codec="aac",
    sample_rate="48000",
    channels=2,
    video=False,
    container="wav",
):
    """Build an ffprobe-style probe dictionary for router plan tests.

    The result has a ``streams`` list and a ``format`` mapping, so tests can
    hand it to the planner without running a real probe.

    Args:
        duration: Duration value copied verbatim onto every stream and onto
            the ``format`` entry.
        audio_codec: ``codec_name`` reported for the audio stream.
        sample_rate: ``sample_rate`` reported for the audio stream.
        channels: Channel count reported for the audio stream.
        video: When true, an ``h264`` video stream is listed before the
            audio stream.
        container: Value stored under ``format["format_name"]``.

    Returns:
        A dict with ``streams`` (video first when requested, then audio) and
        ``format`` (``duration`` and ``format_name``).
    """
    audio_stream = {
        "codec_type": "audio",
        "codec_name": audio_codec,
        "sample_rate": sample_rate,
        "channels": channels,
        "duration": duration,
    }
    video_stream = {
        "codec_type": "video",
        "codec_name": "h264",
        "duration": duration,
    }
    # Stream order is part of the fixture: video (if any) precedes audio.
    streams = [video_stream, audio_stream] if video else [audio_stream]
    return {
        "streams": streams,
        "format": {"duration": duration, "format_name": container},
    }


class TestChunkAudio:
def test_small_file_yields_single_chunk(self, silent_wav):
chunks, tmp_dir = audio.chunk_audio(silent_wav, target_size_mb=25)
Expand Down Expand Up @@ -96,13 +126,122 @@ def boom(*a, **k):


class TestTranscriptionRouterPlan:
def test_probe_media_uses_ffprobe_json(self, silent_wav, monkeypatch):
    """probe_media should invoke ffprobe and return its decoded JSON stdout."""
    expected = media_probe_fixture(
        duration="42.5",
        audio_codec="pcm_s16le",
        sample_rate="16000",
        channels=1,
    )
    completed = MagicMock()
    completed.stdout = json.dumps(expected)
    recorded = {}

    def fake_run(argv, check, capture_output, text):
        # Remember every argument so the call can be inspected afterwards.
        recorded.update(
            argv=argv,
            check=check,
            capture_output=capture_output,
            text=text,
        )
        return completed

    monkeypatch.setattr(audio.subprocess, "run", fake_run)

    assert audio.probe_media(silent_wav) == expected
    assert recorded["argv"][0] == "ffprobe"
    assert str(silent_wav) in recorded["argv"]
    assert recorded["check"] is True
    assert recorded["capture_output"] is True
    assert recorded["text"] is True

def test_plan_does_not_probe_media_by_default(self, tmp_path, monkeypatch):
    """Building a plan without opting in must never call probe_media."""

    def fail_probe(_path):
        raise AssertionError("ffprobe should not run by default")

    # Any probe attempt now fails the test immediately.
    monkeypatch.setattr(audio, "probe_media", fail_probe)

    source = tmp_path / "small.wav"
    source.write_bytes(b"audio")

    plan = audio.build_transcription_plan(source, backend="openai")

    assert plan["strategy"] == "single_pass"
    assert plan["media"]["probe_available"] is False
    assert plan["normalization"]["commands"] == []

def test_plan_inspects_media_when_requested(self, tmp_path, monkeypatch):
    """inspect_media=True should probe the source once and use the result."""
    source = tmp_path / "clip.mp4"
    source.write_bytes(b"video")
    canned_probe = media_probe_fixture(
        video=True,
        container="mov,mp4,m4a,3gp,3g2,mj2",
    )
    probed_paths = []

    def fake_probe(source_path):
        # Track which paths were probed and hand back the canned payload.
        probed_paths.append(source_path)
        return canned_probe

    monkeypatch.setattr(audio, "probe_media", fake_probe)

    plan = audio.build_transcription_plan(
        source,
        backend="openai",
        inspect_media=True,
    )

    assert probed_paths == [source]
    media = plan["media"]
    assert media["probe_available"] is True
    assert media["has_video"] is True
    assert plan["normalization"]["required"] is True
    assert "ffprobe" in plan["privacy"]["local_processing_steps"]

def test_transcription_capabilities_reports_whisperx_supports_segments(self):
    """The whisperx capability record is local, segmented and diarization-capable."""
    whisperx_caps = audio.transcription_capabilities("whisperx")

    assert whisperx_caps["backend"] == "whisperx"
    # Both feature flags are expected to be switched on for this backend.
    assert whisperx_caps["supports_segments"] is True
    assert whisperx_caps["supports_diarization"] is True
    assert whisperx_caps["privacy_mode"] == "local"

def test_plan_cloud_backend_receipt_shows_upload_and_billable_minutes(self, tmp_path):
    """An openai plan reports cloud upload and billable minutes from the probe."""
    source = tmp_path / "meeting.wav"
    source.write_bytes(b"audio")

    plan = audio.build_transcription_plan(
        source,
        backend="openai",
        # 90 seconds of probed audio should surface as 1.5 billable minutes.
        media_probe=media_probe_fixture(
            duration="90.0",
            audio_codec="pcm_s16le",
            sample_rate="16000",
            channels=1,
        ),
    )

    privacy = plan["privacy"]
    assert privacy["mode"] == "cloud"
    assert privacy["audio_leaves_device"] is True
    assert privacy["cloud_provider"] == "openai"

    cost = plan["cost"]
    assert cost["provider_api_billable"] is True
    assert cost["estimated_billable_minutes"] == 1.5
    assert cost["pricing_review_required"] is True

def test_plan_local_private_backend_receipt_stays_offline(self, tmp_path):
    """An mlx plan for small audio stays local, unbilled and un-normalized."""
    source = tmp_path / "private.m4a"
    source.write_bytes(b"audio")
    m4a_probe = media_probe_fixture(
        duration="60.0",
        audio_codec="aac",
        sample_rate="44100",
        channels=2,
        container="mov,mp4,m4a,3gp,3g2,mj2",
    )

    plan = audio.build_transcription_plan(
        source,
        backend="mlx",
        media_probe=m4a_probe,
    )

    assert plan["strategy"] == "single_pass"

    privacy = plan["privacy"]
    assert privacy["mode"] == "local"
    assert privacy["audio_leaves_device"] is False
    assert privacy["cloud_provider"] is None

    assert plan["cost"]["provider_api_billable"] is False
    assert plan["normalization"]["required"] is False

def test_plan_large_openai_uses_size_chunking(self, tmp_path):
path = tmp_path / "large.wav"
path.write_bytes(b"0" * (audio.CHUNK_THRESHOLD_BYTES + 1024))
Expand All @@ -113,6 +252,40 @@ def test_plan_large_openai_uses_size_chunking(self, tmp_path):
assert "exceeds_chunk_threshold" in plan["reasons"]
assert plan["requires_ffmpeg"] is False

def test_plan_large_audio_fixture_adds_ffmpeg_normalization(self, tmp_path):
    """Large probed audio gets a planned FFmpeg normalization step plus chunking."""
    source = tmp_path / "large.wav"
    # Just over the chunk threshold so the size chunker is selected.
    source.write_bytes(b"0" * (audio.CHUNK_THRESHOLD_BYTES + 1024))
    target = tmp_path / "normalized.wav"

    plan = audio.build_transcription_plan(
        source,
        backend="openai",
        chunker="size",
        media_probe=media_probe_fixture(
            duration="125.0",
            audio_codec="aac",
            sample_rate="48000",
            channels=2,
        ),
        normalized_audio_path=target,
    )

    assert plan["strategy"] == "chunked_size"
    assert plan["requires_ffmpeg"] is True
    assert plan["media"]["duration_seconds"] == 125.0

    normalization = plan["normalization"]
    assert normalization["required"] is True
    assert normalization["target"]["sample_rate_hz"] == 16000
    assert normalization["target"]["channels"] == 1
    assert normalization["output_path"] == str(target)
    first_argv = normalization["commands"][0]["argv"]
    assert first_argv[0] == "ffmpeg"
    assert str(target) in first_argv

    privacy = plan["privacy"]
    for step in ("ffprobe", "ffmpeg_normalization", "chunking"):
        assert step in privacy["local_processing_steps"]
    assert privacy["temp_artifacts"] == ["normalized_audio", "chunks"]
    assert plan["cost"]["ffmpeg_compute_required"] is True

def test_plan_large_whisperx_prefers_whole_file_without_vad(self, tmp_path):
path = tmp_path / "large.wav"
path.write_bytes(b"0" * (audio.CHUNK_THRESHOLD_BYTES + 1024))
Expand All @@ -121,6 +294,7 @@ def test_plan_large_whisperx_prefers_whole_file_without_vad(self, tmp_path):

assert plan["strategy"] == "whole_file"
assert plan["capabilities"]["supports_segments"] is True
assert plan["output_contract"]["timestamps"] == "segments"

def test_plan_large_whisperx_uses_vad_when_requested(self, tmp_path):
path = tmp_path / "large.wav"
Expand All @@ -130,6 +304,33 @@ def test_plan_large_whisperx_uses_vad_when_requested(self, tmp_path):

assert plan["strategy"] == "chunked_vad"

def test_plan_whisperx_fixture_is_timestamped_and_diarization_capable(self, tmp_path):
    """A large whisperx plan keeps the whole file and advertises segment output."""
    source = tmp_path / "interview.wav"
    source.write_bytes(b"0" * (audio.CHUNK_THRESHOLD_BYTES + 1024))
    pcm_probe = media_probe_fixture(
        duration="600.0",
        audio_codec="pcm_s16le",
        sample_rate="16000",
        channels=1,
    )

    plan = audio.build_transcription_plan(
        source,
        backend="whisperx",
        chunker="size",
        media_probe=pcm_probe,
    )

    # The size chunker is requested, yet the expected strategy is whole_file.
    assert plan["strategy"] == "whole_file"

    contract = plan["output_contract"]
    assert contract["segments"] is True
    assert contract["timestamps"] == "segments"
    assert contract["diarization"]["capable"] is True
    assert contract["diarization"]["requires_hf_token"] is True

    privacy = plan["privacy"]
    assert privacy["mode"] == "local"
    assert privacy["audio_leaves_device"] is False

    assert plan["cost"]["provider_api_billable"] is False
    assert plan["normalization"]["required"] is False

def test_plan_video_source_flags_ffmpeg_requirement(self, tmp_path):
path = tmp_path / "clip.mp4"
path.write_bytes(b"video-bytes")
Expand All @@ -138,3 +339,5 @@ def test_plan_video_source_flags_ffmpeg_requirement(self, tmp_path):

assert plan["requires_ffmpeg"] is True
assert "video_source_requires_extraction" in plan["reasons"]
assert plan["normalization"]["required"] is True
assert plan["normalization"]["commands"][0]["argv"][0] == "ffmpeg"
Loading
Loading