From b28b6ccaa73736cd335de4f13c2d68843f6ae9a4 Mon Sep 17 00:00:00 2001 From: dberkin1 Date: Tue, 13 Jan 2026 21:34:25 +0300 Subject: [PATCH 01/11] Add elevenlabs custom vocab --- BENCHMARKS.md | 40 +++++++++---------- .../ElevenLabsTranscriptionPipeline.yaml | 5 +++ pyproject.toml | 1 + src/openbench/pipeline/pipeline_aliases.py | 11 +++++ .../pipeline/transcription/__init__.py | 3 ++ uv.lock | 22 +++++++++- 6 files changed, 61 insertions(+), 21 deletions(-) create mode 100644 config/pipeline_configs/ElevenLabsTranscriptionPipeline.yaml diff --git a/BENCHMARKS.md b/BENCHMARKS.md index 91801c0..59c726c 100644 --- a/BENCHMARKS.md +++ b/BENCHMARKS.md @@ -549,11 +549,11 @@ -| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| -|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|---------------------------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 15.34 | 20.69 | 12.58 | 15.4 | 14.69 | 16.89 | 28.42 | 17 | -| earnings22-keywords
(chunk-keywords) | 13.28 | 31.97 | 11.67 | 21.24 | 12.46 | 14.57 | 26.98 | - | -| earnings22-keywords
(file-keywords) | 13.85 | 28.37 | 11.80 | 14.69 | 12.57 | 14.73 | 27.26 | - | +| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| ElevenLabs | +|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|---------------------------------------------|----------------------------|----------------------------| +| earnings22-keywords
(no keywords) | 15.34 | 20.69 | 12.58 | 15.4 | 14.69 | 16.89 | 28.42 | 17 | | +| earnings22-keywords
(chunk-keywords) | 13.28 | 31.97 | 11.67 | 21.24 | 12.46 | 14.57 | 26.98 | - | | +| earnings22-keywords
(file-keywords) | 13.85 | 28.37 | 11.80 | 14.69 | 12.57 | 14.73 | 27.26 | - | 9.08 |

@@ -576,11 +576,11 @@ If the model predicts 20 keywords and 15 of them match the ground truth, precisi -| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| -|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 0.98 | 0.97 | 0.97 | 0.97 | 0.97 | 0.98 | 1 | 0.99 | -| earnings22-keywords
(chunk-keywords) | 0.99 | 0.98 | 0.99 | 0.96 | 0.98 | 0.98 | 0.99 | - | -| earnings22-keywords
(file-keywords) | 0.96 | 0.93 | 0.96 | 0.94 | 0.96 | 0.95 | 0.99 | - | +| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| ElevenLabs | +|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------|----------------------------| +| earnings22-keywords
(no keywords) | 0.98 | 0.97 | 0.97 | 0.97 | 0.97 | 0.98 | 1 | 0.99 | | +| earnings22-keywords
(chunk-keywords) | 0.99 | 0.98 | 0.99 | 0.96 | 0.98 | 0.98 | 0.99 | - | | +| earnings22-keywords
(file-keywords) | 0.96 | 0.93 | 0.96 | 0.94 | 0.96 | 0.95 | 0.99 | - | 0.96|

@@ -603,11 +603,11 @@ If the ground-truth transcript has 25 keywords and the model correctly finds 15, -| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| -|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 0.61 | 0.53 | 0.55 | 0.53 | 0.47 | 0.45 | 0.26 | 0.39 | -| earnings22-keywords
(chunk-keywords) | 0.89 | 0.7 | 0.69 | 0.77 | 0.85 | 0.82 | 0.45 | - | -| earnings22-keywords
(file-keywords) | 0.83 | 0.79 | 0.68 | 0.82 | 0.82 | 0.8 | 0.4 | - | +| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| ElevenLabs | +|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------|----------------------------| +| earnings22-keywords
(no keywords) | 0.61 | 0.53 | 0.55 | 0.53 | 0.47 | 0.45 | 0.26 | 0.39 | | +| earnings22-keywords
(chunk-keywords) | 0.89 | 0.7 | 0.69 | 0.77 | 0.85 | 0.82 | 0.45 | - | | +| earnings22-keywords
(file-keywords) | 0.83 | 0.79 | 0.68 | 0.82 | 0.82 | 0.8 | 0.4 | - | 0.94 |

@@ -632,11 +632,11 @@ F1 = 2 × (0.75 × 0.6) / (0.75 + 0.6) = **66.7%**, reflecting the model's overa -| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
SFSpeechRecognizer
(Old API) | Apple
(SpeechAnalyzer)| -|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 0.75 | 0.68 | 0.7 | 0.69 | 0.63 | 0.62 | 0.41 | 0.56 | -| earnings22-keywords
(chunk-keywords) | 0.94 | 0.82 | 0.81 | 0.86 | 0.91 | 0.89 | 0.62 | - | -| earnings22-keywords
(file-keywords) | 0.89 | 0.86 | 0.8 | 0.87 | 0.88 | 0.87 | 0.58 | - | +| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
SFSpeechRecognizer
(Old API) | Apple
(SpeechAnalyzer)| ElevenLabs | +|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------|----------------------------| +| earnings22-keywords
(no keywords) | 0.75 | 0.68 | 0.7 | 0.69 | 0.63 | 0.62 | 0.41 | 0.56 | | +| earnings22-keywords
(chunk-keywords) | 0.94 | 0.82 | 0.81 | 0.86 | 0.91 | 0.89 | 0.62 | - | | +| earnings22-keywords
(file-keywords) | 0.89 | 0.86 | 0.8 | 0.87 | 0.88 | 0.87 | 0.58 | - | 0.95 |

diff --git a/config/pipeline_configs/ElevenLabsTranscriptionPipeline.yaml b/config/pipeline_configs/ElevenLabsTranscriptionPipeline.yaml new file mode 100644 index 0000000..4bec5f2 --- /dev/null +++ b/config/pipeline_configs/ElevenLabsTranscriptionPipeline.yaml @@ -0,0 +1,5 @@ +ElevenLabsTranscriptionPipeline: + config: + model_id: "scribe_v2" + use_keywords: true + diff --git a/pyproject.toml b/pyproject.toml index 9213bc2..c8c1120 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,7 @@ dependencies = [ "openai>=2.7.1", "meeteval>=0.4.3", "nemo-toolkit[asr]>=2.6.0", + "elevenlabs>=2.30.0", ] [project.scripts] diff --git a/src/openbench/pipeline/pipeline_aliases.py b/src/openbench/pipeline/pipeline_aliases.py index a89f4b9..e36732e 100644 --- a/src/openbench/pipeline/pipeline_aliases.py +++ b/src/openbench/pipeline/pipeline_aliases.py @@ -32,6 +32,7 @@ from .transcription import ( AssemblyAITranscriptionPipeline, DeepgramTranscriptionPipeline, + ElevenLabsTranscriptionPipeline, GroqTranscriptionPipeline, NeMoTranscriptionPipeline, OpenAITranscriptionPipeline, @@ -585,6 +586,16 @@ def register_pipeline_aliases() -> None: description="AssemblyAI transcription pipeline with keyword boosting support. Requires API key from https://www.assemblyai.com/. Set `ASSEMBLYAI_API_KEY` env var.", ) + PipelineRegistry.register_alias( + "elevenlabs-transcription", + ElevenLabsTranscriptionPipeline, + default_config={ + "model_id": "scribe_v2", + "use_keywords": False, + }, + description="ElevenLabs transcription pipeline with keyterm prompting support. Requires API key from https://elevenlabs.io/. Set `ELEVENLABS_API_KEY` env var.", + ) + ################# STREAMING TRANSCRIPTION PIPELINES ################# PipelineRegistry.register_alias( diff --git a/src/openbench/pipeline/transcription/__init__.py b/src/openbench/pipeline/transcription/__init__.py index 75f3dc8..3fe5900 100644 --- a/src/openbench/pipeline/transcription/__init__.py +++ b/src/openbench/pipeline/transcription/__init__.py @@ -5,6 +5,7 @@ from .common import TranscriptionOutput from .transcription_assemblyai import AssemblyAITranscriptionPipeline, AssemblyAITranscriptionPipelineConfig from .transcription_deepgram import DeepgramTranscriptionPipeline, DeepgramTranscriptionPipelineConfig +from .transcription_elevenlabs import ElevenLabsTranscriptionPipeline, ElevenLabsTranscriptionPipelineConfig from .transcription_groq import GroqTranscriptionConfig, GroqTranscriptionPipeline from .transcription_nemo import NeMoTranscriptionPipeline, NeMoTranscriptionPipelineConfig from .transcription_openai import OpenAITranscriptionPipeline, OpenAITranscriptionPipelineConfig @@ -29,6 +30,8 @@ "WhisperOSSTranscriptionPipelineConfig", "DeepgramTranscriptionPipeline", "DeepgramTranscriptionPipelineConfig", + "ElevenLabsTranscriptionPipeline", + "ElevenLabsTranscriptionPipelineConfig", "NeMoTranscriptionPipeline", "NeMoTranscriptionPipelineConfig", ] diff --git a/uv.lock b/uv.lock index 3b9f95b..ba02fae 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.10, <3.13" resolution-markers = [ "python_full_version >= '3.12' and sys_platform == 'linux'", @@ -41,6 +41,7 @@ wheels = [ name = "aenum" version = "3.1.16" source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/09/7a/61ed58e8be9e30c3fe518899cc78c284896d246d51381bab59b5db11e1f3/aenum-3.1.16.tar.gz", hash = "sha256:bfaf9589bdb418ee3a986d85750c7318d9d2839c1b1a1d6fe8fc53ec201cf140", size = 137693, upload-time = "2026-01-12T22:34:38.819Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/e3/52/6ad8f63ec8da1bf40f96996d25d5b650fdd38f5975f8c813732c47388f18/aenum-3.1.16-py3-none-any.whl", hash = "sha256:9035092855a98e41b66e3d0998bd7b96280e85ceb3a04cc035636138a1943eaf", size = 165627, upload-time = "2025-04-25T03:17:58.89Z" }, ] @@ -1108,6 +1109,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/87/62/9773de14fe6c45c23649e98b83231fffd7b9892b6cf863251dc2afa73643/einops-0.8.1-py3-none-any.whl", hash = "sha256:919387eb55330f5757c6bea9165c5ff5cfe63a642682ea788a6d472576d81737", size = 64359, upload-time = "2025-02-09T03:17:01.998Z" }, ] +[[package]] +name = "elevenlabs" +version = "2.30.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "httpx" }, + { name = "pydantic" }, + { name = "pydantic-core" }, + { name = "requests" }, + { name = "typing-extensions" }, + { name = "websockets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/a0/e7761b7a5a73f3933480ff3bdf87fcc9228ab8bf776fc08d1af0e4ff2f89/elevenlabs-2.30.0.tar.gz", hash = "sha256:a6a0474e045b93475fcd5f5829b67438d5a6aef9698b6f8758e7148ac03c2b12", size = 438124, upload-time = "2026-01-13T09:25:21.352Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/eb/16/7854a5d114f23fe3e2778b3b7838f4e63973f4ab6f0d71247ce0bcb9038f/elevenlabs-2.30.0-py3-none-any.whl", hash = "sha256:eeee92703a27e7ecd0e8ba4d547f730bcef4ecb02485efa59aeecab3a904d024", size = 1171600, upload-time = "2026-01-13T09:25:19.515Z" }, +] + [[package]] name = "exceptiongroup" version = "1.3.0" @@ -3299,6 +3317,7 @@ dependencies = [ { name = "boto3" }, { name = "datasets" }, { name = "deepgram-sdk" }, + { name = "elevenlabs" }, { name = "groq" }, { name = "hdbscan" }, { name = "hydra-core" }, @@ -3351,6 +3370,7 @@ requires-dist = [ { name = "boto3", specifier = ">=1.36.20,<2" }, { name = "datasets", specifier = ">=3.1.0,<4" }, { name = "deepgram-sdk", specifier = ">=4.8.0,<5" }, + { name = "elevenlabs", specifier = ">=2.30.0" }, { name = "groq", specifier = ">=0.31.0" }, { name = "hdbscan", specifier = ">=0.8.40,<0.9" }, { name = "hydra-core", specifier = ">=1.3.2,<2" }, From 76c9d1c209c039241243c7a9815f77238856fdf4 Mon Sep 17 00:00:00 2001 From: dberkin1 Date: Tue, 13 Jan 2026 21:40:48 +0300 Subject: [PATCH 02/11] Refactor benchmarks.md --- BENCHMARKS.md | 40 ++++----- .../transcription/transcription_elevenlabs.py | 86 +++++++++++++++++++ 2 files changed, 106 insertions(+), 20 deletions(-) create mode 100644 src/openbench/pipeline/transcription/transcription_elevenlabs.py diff --git a/BENCHMARKS.md b/BENCHMARKS.md index 59c726c..de81da5 100644 --- a/BENCHMARKS.md +++ b/BENCHMARKS.md @@ -549,11 +549,11 @@ -| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| ElevenLabs | -|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|---------------------------------------------|----------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 15.34 | 20.69 | 12.58 | 15.4 | 14.69 | 16.89 | 28.42 | 17 | | -| earnings22-keywords
(chunk-keywords) | 13.28 | 31.97 | 11.67 | 21.24 | 12.46 | 14.57 | 26.98 | - | | -| earnings22-keywords
(file-keywords) | 13.85 | 28.37 | 11.80 | 14.69 | 12.57 | 14.73 | 27.26 | - | 9.08 | +| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | ElevenLabs | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| +|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|------------|---------------------------------------------|----------------------------| +| earnings22-keywords
(no keywords) | 15.34 | 20.69 | 12.58 | 15.4 | 14.69 | 16.89 | | 28.42 | 17 | +| earnings22-keywords
(chunk-keywords) | 13.28 | 31.97 | 11.67 | 21.24 | 12.46 | 14.57 | | 26.98 | - | +| earnings22-keywords
(file-keywords) | 13.85 | 28.37 | 11.80 | 14.69 | 12.57 | 14.73 | 9.08 | 27.26 | - |

@@ -576,11 +576,11 @@ If the model predicts 20 keywords and 15 of them match the ground truth, precisi -| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| ElevenLabs | -|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 0.98 | 0.97 | 0.97 | 0.97 | 0.97 | 0.98 | 1 | 0.99 | | -| earnings22-keywords
(chunk-keywords) | 0.99 | 0.98 | 0.99 | 0.96 | 0.98 | 0.98 | 0.99 | - | | -| earnings22-keywords
(file-keywords) | 0.96 | 0.93 | 0.96 | 0.94 | 0.96 | 0.95 | 0.99 | - | 0.96| +| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | ElevenLabs | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| +|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|------------|-------------------------------------------------|----------------------------| +| earnings22-keywords
(no keywords) | 0.98 | 0.97 | 0.97 | 0.97 | 0.97 | 0.98 | | 1 | 0.99 | +| earnings22-keywords
(chunk-keywords) | 0.99 | 0.98 | 0.99 | 0.96 | 0.98 | 0.98 | | 0.99 | - | +| earnings22-keywords
(file-keywords) | 0.96 | 0.93 | 0.96 | 0.94 | 0.96 | 0.95 | 0.96 | 0.99 | - |

@@ -603,11 +603,11 @@ If the ground-truth transcript has 25 keywords and the model correctly finds 15, -| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| ElevenLabs | -|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 0.61 | 0.53 | 0.55 | 0.53 | 0.47 | 0.45 | 0.26 | 0.39 | | -| earnings22-keywords
(chunk-keywords) | 0.89 | 0.7 | 0.69 | 0.77 | 0.85 | 0.82 | 0.45 | - | | -| earnings22-keywords
(file-keywords) | 0.83 | 0.79 | 0.68 | 0.82 | 0.82 | 0.8 | 0.4 | - | 0.94 | +| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | ElevenLabs | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| +|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|------------|-------------------------------------------------|----------------------------| +| earnings22-keywords
(no keywords) | 0.61 | 0.53 | 0.55 | 0.53 | 0.47 | 0.45 | | 0.26 | 0.39 | +| earnings22-keywords
(chunk-keywords) | 0.89 | 0.7 | 0.69 | 0.77 | 0.85 | 0.82 | | 0.45 | - | +| earnings22-keywords
(file-keywords) | 0.83 | 0.79 | 0.68 | 0.82 | 0.82 | 0.8 | 0.94 | 0.4 | - |

@@ -632,11 +632,11 @@ F1 = 2 × (0.75 × 0.6) / (0.75 + 0.6) = **66.7%**, reflecting the model's overa -| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | Apple
SFSpeechRecognizer
(Old API) | Apple
(SpeechAnalyzer)| ElevenLabs | -|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 0.75 | 0.68 | 0.7 | 0.69 | 0.63 | 0.62 | 0.41 | 0.56 | | -| earnings22-keywords
(chunk-keywords) | 0.94 | 0.82 | 0.81 | 0.86 | 0.91 | 0.89 | 0.62 | - | | -| earnings22-keywords
(file-keywords) | 0.89 | 0.86 | 0.8 | 0.87 | 0.88 | 0.87 | 0.58 | - | 0.95 | +| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | ElevenLabs | Apple
SFSpeechRecognizer
(Old API) | Apple
(SpeechAnalyzer)| +|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|------------|-------------------------------------------------|----------------------------| +| earnings22-keywords
(no keywords) | 0.75 | 0.68 | 0.7 | 0.69 | 0.63 | 0.62 | | 0.41 | 0.56 | +| earnings22-keywords
(chunk-keywords) | 0.94 | 0.82 | 0.81 | 0.86 | 0.91 | 0.89 | | 0.62 | - | +| earnings22-keywords
(file-keywords) | 0.89 | 0.86 | 0.8 | 0.87 | 0.88 | 0.87 | 0.95 | 0.58 | - |

diff --git a/src/openbench/pipeline/transcription/transcription_elevenlabs.py b/src/openbench/pipeline/transcription/transcription_elevenlabs.py new file mode 100644 index 0000000..dbb371f --- /dev/null +++ b/src/openbench/pipeline/transcription/transcription_elevenlabs.py @@ -0,0 +1,86 @@ +import os +from pathlib import Path +from typing import Callable + +from argmaxtools.utils import get_logger +from elevenlabs.client import ElevenLabs +from pydantic import Field + +from ...dataset import TranscriptionSample +from ...pipeline import Pipeline, register_pipeline +from ...pipeline_prediction import Transcript +from ...types import PipelineType +from .common import TranscriptionConfig, TranscriptionOutput + + +logger = get_logger(__name__) + +TEMP_AUDIO_DIR = Path("temp_audio_dir") + + +class ElevenLabsTranscriptionPipelineConfig(TranscriptionConfig): + model_id: str = Field( + default="scribe_v2", + description="The ElevenLabs speech-to-text model to use", + ) + + +@register_pipeline +class ElevenLabsTranscriptionPipeline(Pipeline): + _config_class = ElevenLabsTranscriptionPipelineConfig + pipeline_type = PipelineType.TRANSCRIPTION + + def build_pipeline(self) -> Callable[[Path], str]: + api_key = os.getenv("ELEVENLABS_API_KEY") + assert api_key is not None, "Please set ELEVENLABS_API_KEY in environment" + + client = ElevenLabs(api_key=api_key) + + def transcribe(audio_path: Path) -> str: + with open(audio_path, "rb") as f: + audio_data = f.read() + + kwargs = { + "file": audio_data, + "model_id": self.config.model_id, + } + + # Add keyterms if available (up to 100, max 50 chars each) + if self.current_keywords: + # Filter keywords to max 50 chars and limit to 100 + filtered_keywords = [kw[:50] for kw in self.current_keywords[:100]] + kwargs["keyterms"] = filtered_keywords + logger.debug(f"Using keyterms: {filtered_keywords}") + + transcription = client.speech_to_text.convert(**kwargs) + + # Remove temporary audio path + audio_path.unlink(missing_ok=True) + + return transcription.text + + return transcribe + + def parse_input(self, input_sample: TranscriptionSample) -> Path: + """Override to extract keywords from sample before processing.""" + self.current_keywords = None + if self.config.use_keywords: + keywords = input_sample.extra_info.get("dictionary", []) + if keywords: + self.current_keywords = keywords + + # Warn if force_language is enabled (not currently supported) + if self.config.force_language: + logger.warning( + f"{self.__class__.__name__} does not support language hinting. " + "The force_language flag will be ignored." + ) + + return input_sample.save_audio(TEMP_AUDIO_DIR) + + def parse_output(self, output: str) -> TranscriptionOutput: + # Split transcript into words + words = output.split() if output else [] + transcript = Transcript.from_words_info(words=words) + return TranscriptionOutput(prediction=transcript) + From 6258a4edc361ac953e06b070668a6ab73b4cdac5 Mon Sep 17 00:00:00 2001 From: dberkin1 Date: Tue, 13 Jan 2026 21:54:17 +0300 Subject: [PATCH 03/11] Add elevenlabs no keywords results --- BENCHMARKS.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/BENCHMARKS.md b/BENCHMARKS.md index de81da5..bc0eb56 100644 --- a/BENCHMARKS.md +++ b/BENCHMARKS.md @@ -551,7 +551,7 @@ | Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | ElevenLabs | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| |--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|------------|---------------------------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 15.34 | 20.69 | 12.58 | 15.4 | 14.69 | 16.89 | | 28.42 | 17 | +| earnings22-keywords
(no keywords) | 15.34 | 20.69 | 12.58 | 15.4 | 14.69 | 16.89 | 10.53 | 28.42 | 17 | | earnings22-keywords
(chunk-keywords) | 13.28 | 31.97 | 11.67 | 21.24 | 12.46 | 14.57 | | 26.98 | - | | earnings22-keywords
(file-keywords) | 13.85 | 28.37 | 11.80 | 14.69 | 12.57 | 14.73 | 9.08 | 27.26 | - | @@ -578,7 +578,7 @@ If the model predicts 20 keywords and 15 of them match the ground truth, precisi | Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | ElevenLabs | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| |--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|------------|-------------------------------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 0.98 | 0.97 | 0.97 | 0.97 | 0.97 | 0.98 | | 1 | 0.99 | +| earnings22-keywords
(no keywords) | 0.98 | 0.97 | 0.97 | 0.97 | 0.97 | 0.98 | 0.97 | 1 | 0.99 | | earnings22-keywords
(chunk-keywords) | 0.99 | 0.98 | 0.99 | 0.96 | 0.98 | 0.98 | | 0.99 | - | | earnings22-keywords
(file-keywords) | 0.96 | 0.93 | 0.96 | 0.94 | 0.96 | 0.95 | 0.96 | 0.99 | - | @@ -605,7 +605,7 @@ If the ground-truth transcript has 25 keywords and the model correctly finds 15, | Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | ElevenLabs | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| |--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|------------|-------------------------------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 0.61 | 0.53 | 0.55 | 0.53 | 0.47 | 0.45 | | 0.26 | 0.39 | +| earnings22-keywords
(no keywords) | 0.61 | 0.53 | 0.55 | 0.53 | 0.47 | 0.45 | 0.75 | 0.26 | 0.39 | | earnings22-keywords
(chunk-keywords) | 0.89 | 0.7 | 0.69 | 0.77 | 0.85 | 0.82 | | 0.45 | - | | earnings22-keywords
(file-keywords) | 0.83 | 0.79 | 0.68 | 0.82 | 0.82 | 0.8 | 0.94 | 0.4 | - | @@ -634,7 +634,7 @@ F1 = 2 × (0.75 × 0.6) / (0.75 + 0.6) = **66.7%**, reflecting the model's overa | Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | ElevenLabs | Apple
SFSpeechRecognizer
(Old API) | Apple
(SpeechAnalyzer)| |--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|------------|-------------------------------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 0.75 | 0.68 | 0.7 | 0.69 | 0.63 | 0.62 | | 0.41 | 0.56 | +| earnings22-keywords
(no keywords) | 0.75 | 0.68 | 0.7 | 0.69 | 0.63 | 0.62 | 0.84 | 0.41 | 0.56 | | earnings22-keywords
(chunk-keywords) | 0.94 | 0.82 | 0.81 | 0.86 | 0.91 | 0.89 | | 0.62 | - | | earnings22-keywords
(file-keywords) | 0.89 | 0.86 | 0.8 | 0.87 | 0.88 | 0.87 | 0.95 | 0.58 | - | From fb7f58ac0b6703cf45b83e4dcec40647363e0a75 Mon Sep 17 00:00:00 2001 From: dberkin1 Date: Tue, 13 Jan 2026 22:02:39 +0300 Subject: [PATCH 04/11] reformat --- src/openbench/pipeline/transcription/transcription_elevenlabs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/openbench/pipeline/transcription/transcription_elevenlabs.py b/src/openbench/pipeline/transcription/transcription_elevenlabs.py index dbb371f..e5ffbb5 100644 --- a/src/openbench/pipeline/transcription/transcription_elevenlabs.py +++ b/src/openbench/pipeline/transcription/transcription_elevenlabs.py @@ -83,4 +83,3 @@ def parse_output(self, output: str) -> TranscriptionOutput: words = output.split() if output else [] transcript = Transcript.from_words_info(words=words) return TranscriptionOutput(prediction=transcript) - From 401368bcce8b2ec40fd1dc53ecbc198a4829bdcc Mon Sep 17 00:00:00 2001 From: dberkin1 Date: Wed, 14 Jan 2026 01:27:57 +0300 Subject: [PATCH 05/11] update elevenlabs chunkwise results --- BENCHMARKS.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/BENCHMARKS.md b/BENCHMARKS.md index bc0eb56..f8922bb 100644 --- a/BENCHMARKS.md +++ b/BENCHMARKS.md @@ -552,7 +552,7 @@ | Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | ElevenLabs | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| |--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|------------|---------------------------------------------|----------------------------| | earnings22-keywords
(no keywords) | 15.34 | 20.69 | 12.58 | 15.4 | 14.69 | 16.89 | 10.53 | 28.42 | 17 | -| earnings22-keywords
(chunk-keywords) | 13.28 | 31.97 | 11.67 | 21.24 | 12.46 | 14.57 | | 26.98 | - | +| earnings22-keywords
(chunk-keywords) | 13.28 | 31.97 | 11.67 | 21.24 | 12.46 | 14.57 | 9.13 | 26.98 | - | | earnings22-keywords
(file-keywords) | 13.85 | 28.37 | 11.80 | 14.69 | 12.57 | 14.73 | 9.08 | 27.26 | - |

@@ -579,7 +579,7 @@ If the model predicts 20 keywords and 15 of them match the ground truth, precisi | Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | ElevenLabs | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| |--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|------------|-------------------------------------------------|----------------------------| | earnings22-keywords
(no keywords) | 0.98 | 0.97 | 0.97 | 0.97 | 0.97 | 0.98 | 0.97 | 1 | 0.99 | -| earnings22-keywords
(chunk-keywords) | 0.99 | 0.98 | 0.99 | 0.96 | 0.98 | 0.98 | | 0.99 | - | +| earnings22-keywords
(chunk-keywords) | 0.99 | 0.98 | 0.99 | 0.96 | 0.98 | 0.98 | 0.99 | 0.99 | - | | earnings22-keywords
(file-keywords) | 0.96 | 0.93 | 0.96 | 0.94 | 0.96 | 0.95 | 0.96 | 0.99 | - |

@@ -606,7 +606,7 @@ If the ground-truth transcript has 25 keywords and the model correctly finds 15, | Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | ElevenLabs | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| |--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|------------|-------------------------------------------------|----------------------------| | earnings22-keywords
(no keywords) | 0.61 | 0.53 | 0.55 | 0.53 | 0.47 | 0.45 | 0.75 | 0.26 | 0.39 | -| earnings22-keywords
(chunk-keywords) | 0.89 | 0.7 | 0.69 | 0.77 | 0.85 | 0.82 | | 0.45 | - | +| earnings22-keywords
(chunk-keywords) | 0.89 | 0.7 | 0.69 | 0.77 | 0.85 | 0.82 | 0.96 | 0.45 | - | | earnings22-keywords
(file-keywords) | 0.83 | 0.79 | 0.68 | 0.82 | 0.82 | 0.8 | 0.94 | 0.4 | - |

@@ -635,7 +635,7 @@ F1 = 2 × (0.75 × 0.6) / (0.75 + 0.6) = **66.7%**, reflecting the model's overa | Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | ElevenLabs | Apple
SFSpeechRecognizer
(Old API) | Apple
(SpeechAnalyzer)| |--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|------------|-------------------------------------------------|----------------------------| | earnings22-keywords
(no keywords) | 0.75 | 0.68 | 0.7 | 0.69 | 0.63 | 0.62 | 0.84 | 0.41 | 0.56 | -| earnings22-keywords
(chunk-keywords) | 0.94 | 0.82 | 0.81 | 0.86 | 0.91 | 0.89 | | 0.62 | - | +| earnings22-keywords
(chunk-keywords) | 0.94 | 0.82 | 0.81 | 0.86 | 0.91 | 0.89 | 0.97 | 0.62 | - | | earnings22-keywords
(file-keywords) | 0.89 | 0.86 | 0.8 | 0.87 | 0.88 | 0.87 | 0.95 | 0.58 | - |

From f145504e715d6752331b997f7b11f9fae4a081d5 Mon Sep 17 00:00:00 2001 From: dberkin1 Date: Wed, 14 Jan 2026 12:52:57 +0300 Subject: [PATCH 06/11] Add support for force_language input --- .../transcription/transcription_elevenlabs.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/openbench/pipeline/transcription/transcription_elevenlabs.py b/src/openbench/pipeline/transcription/transcription_elevenlabs.py index e5ffbb5..b4ccc42 100644 --- a/src/openbench/pipeline/transcription/transcription_elevenlabs.py +++ b/src/openbench/pipeline/transcription/transcription_elevenlabs.py @@ -48,10 +48,13 @@ def transcribe(audio_path: Path) -> str: # Add keyterms if available (up to 100, max 50 chars each) if self.current_keywords: # Filter keywords to max 50 chars and limit to 100 - filtered_keywords = [kw[:50] for kw in self.current_keywords[:100]] + filtered_keywords = [kw for kw in self.current_keywords] kwargs["keyterms"] = filtered_keywords logger.debug(f"Using keyterms: {filtered_keywords}") + if self.config.force_language: + kwargs["language_code"] = self.current_language + transcription = client.speech_to_text.convert(**kwargs) # Remove temporary audio path @@ -69,13 +72,6 @@ def parse_input(self, input_sample: TranscriptionSample) -> Path: if keywords: self.current_keywords = keywords - # Warn if force_language is enabled (not currently supported) - if self.config.force_language: - logger.warning( - f"{self.__class__.__name__} does not support language hinting. " - "The force_language flag will be ignored." - ) - return input_sample.save_audio(TEMP_AUDIO_DIR) def parse_output(self, output: str) -> TranscriptionOutput: From a2012d323e78149435ef4381b3e8a3e41fdbba64 Mon Sep 17 00:00:00 2001 From: dberkin1 Date: Wed, 14 Jan 2026 13:03:53 +0300 Subject: [PATCH 07/11] reformat --- .../pipeline/transcription/transcription_elevenlabs.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/openbench/pipeline/transcription/transcription_elevenlabs.py b/src/openbench/pipeline/transcription/transcription_elevenlabs.py index b4ccc42..bda2a75 100644 --- a/src/openbench/pipeline/transcription/transcription_elevenlabs.py +++ b/src/openbench/pipeline/transcription/transcription_elevenlabs.py @@ -47,10 +47,8 @@ def transcribe(audio_path: Path) -> str: # Add keyterms if available (up to 100, max 50 chars each) if self.current_keywords: - # Filter keywords to max 50 chars and limit to 100 - filtered_keywords = [kw for kw in self.current_keywords] - kwargs["keyterms"] = filtered_keywords - logger.debug(f"Using keyterms: {filtered_keywords}") + kwargs["keyterms"] = self.current_keywords + logger.debug(f"Using keyterms: {self.current_keywords}") if self.config.force_language: kwargs["language_code"] = self.current_language @@ -71,6 +69,9 @@ def parse_input(self, input_sample: TranscriptionSample) -> Path: keywords = input_sample.extra_info.get("dictionary", []) if keywords: self.current_keywords = keywords + self.current_language = None + if self.config.force_language: + self.current_language = input_sample.language return input_sample.save_audio(TEMP_AUDIO_DIR) From 8e0e1c41bc0241b5ef91dce3a9fa8133b4cf01f4 Mon Sep 17 00:00:00 2001 From: Berkin Durmus Date: Wed, 14 Jan 2026 19:06:14 +0300 Subject: [PATCH 08/11] Add Elevenlabs Orchestration Pipeline (#94) * Add elevenlabs orchestration, elevenlabs engine and elevenlabs diarization * Fix inference dummy Orchestrationsample and filter empty words in orchestration output for Elevenlabs --------- Co-authored-by: dberkin1 --- src/openbench/cli/commands/inference.py | 2 +- src/openbench/engine/__init__.py | 3 + src/openbench/engine/elevenlabs_engine.py | 110 ++++++++++++++++++ .../pipeline/diarization/__init__.py | 1 + .../pipeline/diarization/elevenlabs.py | 68 +++++++++++ .../pipeline/orchestration/__init__.py | 3 + .../orchestration/orchestration_elevenlabs.py | 71 +++++++++++ src/openbench/pipeline/pipeline_aliases.py | 22 ++++ .../transcription/transcription_elevenlabs.py | 56 ++++----- 9 files changed, 302 insertions(+), 34 deletions(-) create mode 100644 src/openbench/engine/elevenlabs_engine.py create mode 100644 src/openbench/pipeline/diarization/elevenlabs.py create mode 100644 src/openbench/pipeline/orchestration/orchestration_elevenlabs.py diff --git a/src/openbench/cli/commands/inference.py b/src/openbench/cli/commands/inference.py index d16a05d..5955925 100644 --- a/src/openbench/cli/commands/inference.py +++ b/src/openbench/cli/commands/inference.py @@ -88,7 +88,7 @@ def get_dummy_sample( waveform=waveform, sample_rate=sample_rate, extra_info={}, - reference=Transcript.from_words_info(words=["dummy"]), + reference=Transcript.from_words_info(words=["dummy"], speaker=["SPEAKER_0"]), ) elif pipeline_type == PipelineType.TRANSCRIPTION: return TranscriptionSample( diff --git a/src/openbench/engine/__init__.py b/src/openbench/engine/__init__.py index 9f30aa1..2dec8fe 100644 --- a/src/openbench/engine/__init__.py +++ b/src/openbench/engine/__init__.py @@ -1,4 +1,5 @@ from .deepgram_engine import DeepgramApi, DeepgramApiResponse +from .elevenlabs_engine import ElevenLabsApi, ElevenLabsApiResponse from .openai_engine import OpenAIApi from .whisperkitpro_engine import ( WhisperKitPro, @@ -11,6 +12,8 @@ __all__ = [ "DeepgramApi", "DeepgramApiResponse", + "ElevenLabsApi", + "ElevenLabsApiResponse", "OpenAIApi", "WhisperKitPro", "WhisperKitProInput", diff --git a/src/openbench/engine/elevenlabs_engine.py b/src/openbench/engine/elevenlabs_engine.py new file mode 100644 index 0000000..bce7a2e --- /dev/null +++ b/src/openbench/engine/elevenlabs_engine.py @@ -0,0 +1,110 @@ +import os +from pathlib import Path + +from argmaxtools.utils import get_logger +from elevenlabs.client import ElevenLabs +from pydantic import BaseModel, model_validator + + +logger = get_logger(__name__) + + +class ElevenLabsApiResponse(BaseModel): + """Response from ElevenLabs speech-to-text API.""" + + words: list[str] + speakers: list[str] + start: list[float] + end: list[float] + + @property + def transcript(self) -> str: + return " ".join(self.words) + + @model_validator(mode="after") + def validate_lengths(self) -> "ElevenLabsApiResponse": + if ( + len(self.words) != len(self.speakers) + or len(self.words) != len(self.start) + or len(self.words) != len(self.end) + ): + raise ValueError("All lists must be of the same length") + return self + + +class ElevenLabsApi: + """ElevenLabs Speech-to-Text API wrapper.""" + + def __init__( + self, + model_id: str = "scribe_v2", + timeout: float = 300, + ): + self.model_id = model_id + self.timeout = timeout + + api_key = os.getenv("ELEVENLABS_API_KEY") + if not api_key: + raise ValueError("`ELEVENLABS_API_KEY` is not set") + + self.client = ElevenLabs(api_key=api_key, timeout=timeout) + + def transcribe( + self, + audio_path: Path | str, + keyterms: list[str] | None = None, + language_code: str | None = None, + diarize: bool = False, + num_speakers: int | None = None, + ) -> ElevenLabsApiResponse: + """Transcribe an audio file using ElevenLabs API. + + Args: + audio_path: Path to the audio file + keyterms: List of keywords to boost recognition + language_code: Language code (e.g., 'eng') + diarize: Whether to enable speaker diarization + num_speakers: Maximum number of speakers + + Returns: + ElevenLabsApiResponse with words, speakers, and timestamps + """ + if isinstance(audio_path, str): + audio_path = Path(audio_path) + + with audio_path.open("rb") as f: + audio_data = f.read() + + kwargs = { + "model_id": self.model_id, + "file": audio_data, + } + + if keyterms: + kwargs["keyterms"] = keyterms + logger.debug(f"Using keyterms: {keyterms}") + + if language_code: + kwargs["language_code"] = language_code + logger.debug(f"Using language: {language_code}") + + if diarize: + kwargs["diarize"] = True + logger.debug("Diarization enabled") + + if num_speakers is not None: + kwargs["num_speakers"] = num_speakers + logger.debug(f"Max speakers: {num_speakers}") + + response = self.client.speech_to_text.convert(**kwargs) + + # ElevenLabs returns whitespace as separate "words" - filter them out + words = [w for w in response.words if w.text and w.text.strip()] + + return ElevenLabsApiResponse( + words=[w.text for w in words], + speakers=[str(w.speaker_id) for w in words], + start=[float(w.start) for w in words], + end=[float(w.end) for w in words], + ) + diff --git a/src/openbench/pipeline/diarization/__init__.py b/src/openbench/pipeline/diarization/__init__.py index c0842ba..3c3ac9f 100644 --- a/src/openbench/pipeline/diarization/__init__.py +++ b/src/openbench/pipeline/diarization/__init__.py @@ -4,6 +4,7 @@ from .aws import * from .common import * from .diarization_deepgram import * +from .elevenlabs import * from .nemo import * from .picovoice import * from .pyannote import * diff --git a/src/openbench/pipeline/diarization/elevenlabs.py b/src/openbench/pipeline/diarization/elevenlabs.py new file mode 100644 index 0000000..4b3a4b0 --- /dev/null +++ b/src/openbench/pipeline/diarization/elevenlabs.py @@ -0,0 +1,68 @@ +from pathlib import Path +from typing import Callable + +from argmaxtools.utils import get_logger +from pyannote.core import Segment +from pydantic import Field + +from ...dataset import DiarizationSample +from ...engine import ElevenLabsApi, ElevenLabsApiResponse +from ...pipeline_prediction import DiarizationAnnotation +from ..base import Pipeline, PipelineType, register_pipeline +from .common import DiarizationOutput, DiarizationPipelineConfig + + +__all__ = ["ElevenLabsDiarizationPipeline", "ElevenLabsDiarizationPipelineConfig"] + +TEMP_AUDIO_DIR = Path("audio_temp") + +logger = get_logger(__name__) + + +class ElevenLabsDiarizationPipelineConfig(DiarizationPipelineConfig): + model_id: str = Field( + default="scribe_v2", + description="The ElevenLabs speech-to-text model to use", + ) + num_speakers: int | None = Field( + default=None, + description="Maximum number of speakers (helps with diarization). Max 32.", + ) + + +@register_pipeline +class ElevenLabsDiarizationPipeline(Pipeline): + _config_class = ElevenLabsDiarizationPipelineConfig + pipeline_type = PipelineType.DIARIZATION + + def build_pipeline(self) -> Callable[[Path], ElevenLabsApiResponse]: + api = ElevenLabsApi(model_id=self.config.model_id) + + num_speakers = None + if self.config.use_exact_num_speakers: + num_speakers = self.config.num_speakers + + def transcribe(audio_path: Path) -> ElevenLabsApiResponse: + response = api.transcribe( + audio_path=audio_path, + diarize=True, + num_speakers=num_speakers, + ) + # Remove temporary audio path + audio_path.unlink(missing_ok=True) + return response + + return transcribe + + def parse_input(self, input_sample: DiarizationSample) -> Path: + return input_sample.save_audio(output_dir=TEMP_AUDIO_DIR) + + def parse_output(self, output: ElevenLabsApiResponse) -> DiarizationOutput: + annotation = DiarizationAnnotation() + for word, speaker, start, end in zip( + output.words, output.speakers, output.start, output.end + ): + annotation[Segment(start, end)] = f"SPEAKER_{speaker}" + + return DiarizationOutput(prediction=annotation) + diff --git a/src/openbench/pipeline/orchestration/__init__.py b/src/openbench/pipeline/orchestration/__init__.py index 2d7bd44..b7fee67 100644 --- a/src/openbench/pipeline/orchestration/__init__.py +++ b/src/openbench/pipeline/orchestration/__init__.py @@ -3,6 +3,7 @@ from .nemo import NeMoMTParakeetPipeline, NeMoMTParakeetPipelineConfig from .orchestration_deepgram import DeepgramOrchestrationPipeline, DeepgramOrchestrationPipelineConfig +from .orchestration_elevenlabs import ElevenLabsOrchestrationPipeline, ElevenLabsOrchestrationPipelineConfig from .orchestration_openai import OpenAIOrchestrationPipeline, OpenAIOrchestrationPipelineConfig from .orchestration_whisperkitpro import WhisperKitProOrchestrationConfig, WhisperKitProOrchestrationPipeline from .whisperx import WhisperXPipeline, WhisperXPipelineConfig @@ -11,6 +12,8 @@ __all__ = [ "DeepgramOrchestrationPipeline", "DeepgramOrchestrationPipelineConfig", + "ElevenLabsOrchestrationPipeline", + "ElevenLabsOrchestrationPipelineConfig", "WhisperXPipeline", "WhisperXPipelineConfig", "WhisperKitProOrchestrationPipeline", diff --git a/src/openbench/pipeline/orchestration/orchestration_elevenlabs.py b/src/openbench/pipeline/orchestration/orchestration_elevenlabs.py new file mode 100644 index 0000000..afceeb8 --- /dev/null +++ b/src/openbench/pipeline/orchestration/orchestration_elevenlabs.py @@ -0,0 +1,71 @@ +from pathlib import Path +from typing import Callable + +from argmaxtools.utils import get_logger +from pydantic import Field + +from ...dataset import OrchestrationSample +from ...engine import ElevenLabsApi, ElevenLabsApiResponse +from ...pipeline import Pipeline, register_pipeline +from ...pipeline_prediction import Transcript +from ...types import PipelineType +from .common import OrchestrationConfig, OrchestrationOutput + + +logger = get_logger(__name__) + +TEMP_AUDIO_DIR = Path("temp_audio_dir") + + +class ElevenLabsOrchestrationPipelineConfig(OrchestrationConfig): + model_id: str = Field( + default="scribe_v2", + description="The ElevenLabs speech-to-text model to use", + ) + num_speakers: int | None = Field( + default=None, + description="Maximum number of speakers (helps with diarization). Max 32.", + ) + + +@register_pipeline +class ElevenLabsOrchestrationPipeline(Pipeline): + _config_class = ElevenLabsOrchestrationPipelineConfig + pipeline_type = PipelineType.ORCHESTRATION + + def build_pipeline(self) -> Callable[[Path], ElevenLabsApiResponse]: + api = ElevenLabsApi(model_id=self.config.model_id) + + def orchestrate(audio_path: Path) -> ElevenLabsApiResponse: + response = api.transcribe( + audio_path=audio_path, + language_code=self.current_language, + diarize=True, + num_speakers=self.config.num_speakers, + ) + # Remove temporary audio path + audio_path.unlink(missing_ok=True) + return response + + return orchestrate + + def parse_input(self, input_sample: OrchestrationSample) -> Path: + """Override to extract language from sample before processing.""" + self.current_language = None + if self.config.force_language: + self.current_language = input_sample.language + + return input_sample.save_audio(TEMP_AUDIO_DIR) + + def parse_output(self, output: ElevenLabsApiResponse) -> OrchestrationOutput: + return OrchestrationOutput( + prediction=Transcript.from_words_info( + words=output.words, + speaker=output.speakers, + start=output.start, + end=output.end, + ), + diarization_output=None, + transcription_output=None, + ) + diff --git a/src/openbench/pipeline/pipeline_aliases.py b/src/openbench/pipeline/pipeline_aliases.py index e36732e..d653f6f 100644 --- a/src/openbench/pipeline/pipeline_aliases.py +++ b/src/openbench/pipeline/pipeline_aliases.py @@ -8,6 +8,7 @@ from .diarization import ( AWSTranscribePipeline, DeepgramDiarizationPipeline, + ElevenLabsDiarizationPipeline, NeMoSortformerPipeline, PicovoicePipeline, PyannoteApiPipeline, @@ -16,6 +17,7 @@ ) from .orchestration import ( DeepgramOrchestrationPipeline, + ElevenLabsOrchestrationPipeline, NeMoMTParakeetPipeline, OpenAIOrchestrationPipeline, WhisperKitProOrchestrationPipeline, @@ -132,6 +134,16 @@ def register_pipeline_aliases() -> None: description="Deepgram diarization pipeline. Requires API key from https://www.deepgram.com/. Set `DEEPGRAM_API_KEY` env var.", ) + PipelineRegistry.register_alias( + "elevenlabs-diarization", + ElevenLabsDiarizationPipeline, + default_config={ + "out_dir": "./elevenlabs_diarization_results", + "model_id": "scribe_v2", + }, + description="ElevenLabs diarization pipeline. Requires API key from https://elevenlabs.io/. Set `ELEVENLABS_API_KEY` env var.", + ) + ################# ORCHESTRATION PIPELINES ################# PipelineRegistry.register_alias( @@ -172,6 +184,16 @@ def register_pipeline_aliases() -> None: description="Deepgram orchestration pipeline. Requires API key from https://www.deepgram.com/. Set `DEEPGRAM_API_KEY` env var.", ) + PipelineRegistry.register_alias( + "elevenlabs-orchestration", + ElevenLabsOrchestrationPipeline, + default_config={ + "out_dir": "./elevenlabs_orchestration_results", + "model_id": "scribe_v2", + }, + description="ElevenLabs orchestration pipeline with diarization. Requires API key from https://elevenlabs.io/. Set `ELEVENLABS_API_KEY` env var.", + ) + PipelineRegistry.register_alias( "whisperkitpro-orchestration-tiny", WhisperKitProOrchestrationPipeline, diff --git a/src/openbench/pipeline/transcription/transcription_elevenlabs.py b/src/openbench/pipeline/transcription/transcription_elevenlabs.py index bda2a75..fb7b57a 100644 --- a/src/openbench/pipeline/transcription/transcription_elevenlabs.py +++ b/src/openbench/pipeline/transcription/transcription_elevenlabs.py @@ -1,12 +1,11 @@ -import os from pathlib import Path from typing import Callable from argmaxtools.utils import get_logger -from elevenlabs.client import ElevenLabs from pydantic import Field from ...dataset import TranscriptionSample +from ...engine import ElevenLabsApi, ElevenLabsApiResponse from ...pipeline import Pipeline, register_pipeline from ...pipeline_prediction import Transcript from ...types import PipelineType @@ -30,35 +29,19 @@ class ElevenLabsTranscriptionPipeline(Pipeline): _config_class = ElevenLabsTranscriptionPipelineConfig pipeline_type = PipelineType.TRANSCRIPTION - def build_pipeline(self) -> Callable[[Path], str]: - api_key = os.getenv("ELEVENLABS_API_KEY") - assert api_key is not None, "Please set ELEVENLABS_API_KEY in environment" - - client = ElevenLabs(api_key=api_key) - - def transcribe(audio_path: Path) -> str: - with open(audio_path, "rb") as f: - audio_data = f.read() - - kwargs = { - "file": audio_data, - "model_id": self.config.model_id, - } - - # Add keyterms if available (up to 100, max 50 chars each) - if self.current_keywords: - kwargs["keyterms"] = self.current_keywords - logger.debug(f"Using keyterms: {self.current_keywords}") - - if self.config.force_language: - kwargs["language_code"] = self.current_language - - transcription = client.speech_to_text.convert(**kwargs) + def build_pipeline(self) -> Callable[[Path], ElevenLabsApiResponse]: + api = ElevenLabsApi(model_id=self.config.model_id) + def transcribe(audio_path: Path) -> ElevenLabsApiResponse: + response = api.transcribe( + audio_path=audio_path, + keyterms=self.current_keywords, + language_code=self.current_language, + diarize=False, + ) # Remove temporary audio path audio_path.unlink(missing_ok=True) - - return transcription.text + return response return transcribe @@ -69,14 +52,21 @@ def parse_input(self, input_sample: TranscriptionSample) -> Path: keywords = input_sample.extra_info.get("dictionary", []) if keywords: self.current_keywords = keywords + self.current_language = None if self.config.force_language: self.current_language = input_sample.language return input_sample.save_audio(TEMP_AUDIO_DIR) - def parse_output(self, output: str) -> TranscriptionOutput: - # Split transcript into words - words = output.split() if output else [] - transcript = Transcript.from_words_info(words=words) - return TranscriptionOutput(prediction=transcript) + def parse_output( + self, output: ElevenLabsApiResponse + ) -> TranscriptionOutput: + return TranscriptionOutput( + prediction=Transcript.from_words_info( + words=output.words, + speaker=None, + start=output.start, + end=output.end, + ) + ) From 5a1864cc90f8ac56e878e3cd4d3d7afb25130a14 Mon Sep 17 00:00:00 2001 From: dberkin1 Date: Wed, 14 Jan 2026 19:08:08 +0300 Subject: [PATCH 09/11] reformat --- src/openbench/engine/elevenlabs_engine.py | 1 - src/openbench/pipeline/diarization/elevenlabs.py | 5 +---- .../pipeline/orchestration/orchestration_elevenlabs.py | 1 - .../pipeline/transcription/transcription_elevenlabs.py | 4 +--- 4 files changed, 2 insertions(+), 9 deletions(-) diff --git a/src/openbench/engine/elevenlabs_engine.py b/src/openbench/engine/elevenlabs_engine.py index bce7a2e..05f9dda 100644 --- a/src/openbench/engine/elevenlabs_engine.py +++ b/src/openbench/engine/elevenlabs_engine.py @@ -107,4 +107,3 @@ def transcribe( start=[float(w.start) for w in words], end=[float(w.end) for w in words], ) - diff --git a/src/openbench/pipeline/diarization/elevenlabs.py b/src/openbench/pipeline/diarization/elevenlabs.py index 4b3a4b0..6a56a6c 100644 --- a/src/openbench/pipeline/diarization/elevenlabs.py +++ b/src/openbench/pipeline/diarization/elevenlabs.py @@ -59,10 +59,7 @@ def parse_input(self, input_sample: DiarizationSample) -> Path: def parse_output(self, output: ElevenLabsApiResponse) -> DiarizationOutput: annotation = DiarizationAnnotation() - for word, speaker, start, end in zip( - output.words, output.speakers, output.start, output.end - ): + for word, speaker, start, end in zip(output.words, output.speakers, output.start, output.end): annotation[Segment(start, end)] = f"SPEAKER_{speaker}" return DiarizationOutput(prediction=annotation) - diff --git a/src/openbench/pipeline/orchestration/orchestration_elevenlabs.py b/src/openbench/pipeline/orchestration/orchestration_elevenlabs.py index afceeb8..3c101fe 100644 --- a/src/openbench/pipeline/orchestration/orchestration_elevenlabs.py +++ b/src/openbench/pipeline/orchestration/orchestration_elevenlabs.py @@ -68,4 +68,3 @@ def parse_output(self, output: ElevenLabsApiResponse) -> OrchestrationOutput: diarization_output=None, transcription_output=None, ) - diff --git a/src/openbench/pipeline/transcription/transcription_elevenlabs.py b/src/openbench/pipeline/transcription/transcription_elevenlabs.py index fb7b57a..8379af4 100644 --- a/src/openbench/pipeline/transcription/transcription_elevenlabs.py +++ b/src/openbench/pipeline/transcription/transcription_elevenlabs.py @@ -59,9 +59,7 @@ def parse_input(self, input_sample: TranscriptionSample) -> Path: return input_sample.save_audio(TEMP_AUDIO_DIR) - def parse_output( - self, output: ElevenLabsApiResponse - ) -> TranscriptionOutput: + def parse_output(self, output: ElevenLabsApiResponse) -> TranscriptionOutput: return TranscriptionOutput( prediction=Transcript.from_words_info( words=output.words, From e03ead688f559647adfe22504ff75662c9ca295f Mon Sep 17 00:00:00 2001 From: dberkin1 Date: Wed, 14 Jan 2026 20:24:41 +0300 Subject: [PATCH 10/11] Flip keyword recognition tables --- BENCHMARKS.md | 64 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 20 deletions(-) diff --git a/BENCHMARKS.md b/BENCHMARKS.md index f8922bb..ab9f8d2 100644 --- a/BENCHMARKS.md +++ b/BENCHMARKS.md @@ -549,11 +549,17 @@ -| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | ElevenLabs | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| -|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|------------|---------------------------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 15.34 | 20.69 | 12.58 | 15.4 | 14.69 | 16.89 | 10.53 | 28.42 | 17 | -| earnings22-keywords
(chunk-keywords) | 13.28 | 31.97 | 11.67 | 21.24 | 12.46 | 14.57 | 9.13 | 26.98 | - | -| earnings22-keywords
(file-keywords) | 13.85 | 28.37 | 11.80 | 14.69 | 12.57 | 14.73 | 9.08 | 27.26 | - | +| System | no keywords | chunk-keywords | file-keywords | +|--------------------------------------------|-------------|----------------|---------------| +| Deepgram (nova-3) | 15.34 | 13.28 | 13.85 | +| OpenAI (whisper-1) | 20.69 | 31.97 | 28.37 | +| AssemblyAI | 12.58 | 11.67 | 11.80 | +| Whisper OSS (large-v3-turbo) | 15.4 | 21.24 | 14.69 | +| Argmax (parakeet-v2) | 14.69 | 12.46 | 12.57 | +| Argmax (parakeet-v3) | 16.89 | 14.57 | 14.73 | +| ElevenLabs | 10.53 | 9.13 | 9.08 | +| Apple (SFSpeechRecognizer) | 28.42 | 26.98 | 27.26 | +| Apple (SpeechAnalyzer) | 17 | - | - |

@@ -576,11 +582,17 @@ If the model predicts 20 keywords and 15 of them match the ground truth, precisi -| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | ElevenLabs | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| -|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|------------|-------------------------------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 0.98 | 0.97 | 0.97 | 0.97 | 0.97 | 0.98 | 0.97 | 1 | 0.99 | -| earnings22-keywords
(chunk-keywords) | 0.99 | 0.98 | 0.99 | 0.96 | 0.98 | 0.98 | 0.99 | 0.99 | - | -| earnings22-keywords
(file-keywords) | 0.96 | 0.93 | 0.96 | 0.94 | 0.96 | 0.95 | 0.96 | 0.99 | - | +| System | no keywords | chunk-keywords | file-keywords | +|--------------------------------------------|-------------|----------------|---------------| +| Deepgram (nova-3) | 0.98 | 0.99 | 0.96 | +| OpenAI (whisper-1) | 0.97 | 0.98 | 0.93 | +| AssemblyAI | 0.97 | 0.99 | 0.96 | +| Whisper OSS (large-v3-turbo) | 0.97 | 0.96 | 0.94 | +| Argmax (parakeet-v2) | 0.97 | 0.98 | 0.96 | +| Argmax (parakeet-v3) | 0.98 | 0.98 | 0.95 | +| ElevenLabs | 0.97 | 0.99 | 0.96 | +| Apple (SFSpeechRecognizer) | 1 | 0.99 | 0.99 | +| Apple (SpeechAnalyzer) | 0.99 | - | - |

@@ -603,11 +615,17 @@ If the ground-truth transcript has 25 keywords and the model correctly finds 15, -| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | ElevenLabs | Apple
(SFSpeechRecognizer)
| Apple
(SpeechAnalyzer)| -|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|------------|-------------------------------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 0.61 | 0.53 | 0.55 | 0.53 | 0.47 | 0.45 | 0.75 | 0.26 | 0.39 | -| earnings22-keywords
(chunk-keywords) | 0.89 | 0.7 | 0.69 | 0.77 | 0.85 | 0.82 | 0.96 | 0.45 | - | -| earnings22-keywords
(file-keywords) | 0.83 | 0.79 | 0.68 | 0.82 | 0.82 | 0.8 | 0.94 | 0.4 | - | +| System | no keywords | chunk-keywords | file-keywords | +|--------------------------------------------|-------------|----------------|---------------| +| Deepgram (nova-3) | 0.61 | 0.89 | 0.83 | +| OpenAI (whisper-1) | 0.53 | 0.7 | 0.79 | +| AssemblyAI | 0.55 | 0.69 | 0.68 | +| Whisper OSS (large-v3-turbo) | 0.53 | 0.77 | 0.82 | +| Argmax (parakeet-v2) | 0.47 | 0.85 | 0.82 | +| Argmax (parakeet-v3) | 0.45 | 0.82 | 0.8 | +| ElevenLabs | 0.75 | 0.96 | 0.94 | +| Apple (SFSpeechRecognizer) | 0.26 | 0.45 | 0.4 | +| Apple (SpeechAnalyzer) | 0.39 | - | - |

@@ -632,11 +650,17 @@ F1 = 2 × (0.75 × 0.6) / (0.75 + 0.6) = **66.7%**, reflecting the model's overa -| Dataset | Deepgram
(nova-3) | OpenAI
(whisper-1) | AssemblyAI | Whisper OSS
(large-v3-turbo) | Argmax
(parakeet-v2) | Argmax
(parakeet-v3) | ElevenLabs | Apple
SFSpeechRecognizer
(Old API) | Apple
(SpeechAnalyzer)| -|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|------------|-------------------------------------------------|----------------------------| -| earnings22-keywords
(no keywords) | 0.75 | 0.68 | 0.7 | 0.69 | 0.63 | 0.62 | 0.84 | 0.41 | 0.56 | -| earnings22-keywords
(chunk-keywords) | 0.94 | 0.82 | 0.81 | 0.86 | 0.91 | 0.89 | 0.97 | 0.62 | - | -| earnings22-keywords
(file-keywords) | 0.89 | 0.86 | 0.8 | 0.87 | 0.88 | 0.87 | 0.95 | 0.58 | - | +| System | no keywords | chunk-keywords | file-keywords | +|--------------------------------------------|-------------|----------------|---------------| +| Deepgram (nova-3) | 0.75 | 0.94 | 0.89 | +| OpenAI (whisper-1) | 0.68 | 0.82 | 0.86 | +| AssemblyAI | 0.7 | 0.81 | 0.8 | +| Whisper OSS (large-v3-turbo) | 0.69 | 0.86 | 0.87 | +| Argmax (parakeet-v2) | 0.63 | 0.91 | 0.88 | +| Argmax (parakeet-v3) | 0.62 | 0.89 | 0.87 | +| ElevenLabs | 0.84 | 0.97 | 0.95 | +| Apple (SFSpeechRecognizer) | 0.41 | 0.62 | 0.58 | +| Apple (SpeechAnalyzer) | 0.56 | - | - |

From 4ebc4f28b91204ac7727e352496f9a79abcc77bb Mon Sep 17 00:00:00 2001 From: dberkin1 Date: Wed, 14 Jan 2026 20:26:30 +0300 Subject: [PATCH 11/11] Refactor table column names --- BENCHMARKS.md | 72 +++++++++++++++++++++++++-------------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/BENCHMARKS.md b/BENCHMARKS.md index ab9f8d2..339da7a 100644 --- a/BENCHMARKS.md +++ b/BENCHMARKS.md @@ -549,17 +549,17 @@ -| System | no keywords | chunk-keywords | file-keywords | -|--------------------------------------------|-------------|----------------|---------------| -| Deepgram (nova-3) | 15.34 | 13.28 | 13.85 | -| OpenAI (whisper-1) | 20.69 | 31.97 | 28.37 | +| System | earnings22-keywords
(no keywords) | earnings22-keywords
(chunk-keywords) | earnings22-keywords
(file-keywords) | +|--------------------------------------------|---------------------------------------|------------------------------------------|----------------------------------------| +| Deepgram
(nova-3) | 15.34 | 13.28 | 13.85 | +| OpenAI
(whisper-1) | 20.69 | 31.97 | 28.37 | | AssemblyAI | 12.58 | 11.67 | 11.80 | -| Whisper OSS (large-v3-turbo) | 15.4 | 21.24 | 14.69 | -| Argmax (parakeet-v2) | 14.69 | 12.46 | 12.57 | -| Argmax (parakeet-v3) | 16.89 | 14.57 | 14.73 | +| Whisper OSS
(large-v3-turbo) | 15.4 | 21.24 | 14.69 | +| Argmax
(parakeet-v2) | 14.69 | 12.46 | 12.57 | +| Argmax
(parakeet-v3) | 16.89 | 14.57 | 14.73 | | ElevenLabs | 10.53 | 9.13 | 9.08 | -| Apple (SFSpeechRecognizer) | 28.42 | 26.98 | 27.26 | -| Apple (SpeechAnalyzer) | 17 | - | - | +| Apple
(SFSpeechRecognizer) | 28.42 | 26.98 | 27.26 | +| Apple
(SpeechAnalyzer) | 17 | - | - |

@@ -582,17 +582,17 @@ If the model predicts 20 keywords and 15 of them match the ground truth, precisi -| System | no keywords | chunk-keywords | file-keywords | -|--------------------------------------------|-------------|----------------|---------------| -| Deepgram (nova-3) | 0.98 | 0.99 | 0.96 | -| OpenAI (whisper-1) | 0.97 | 0.98 | 0.93 | +| System | earnings22-keywords
(no keywords) | earnings22-keywords
(chunk-keywords) | earnings22-keywords
(file-keywords) | +|--------------------------------------------|---------------------------------------|------------------------------------------|----------------------------------------| +| Deepgram
(nova-3) | 0.98 | 0.99 | 0.96 | +| OpenAI
(whisper-1) | 0.97 | 0.98 | 0.93 | | AssemblyAI | 0.97 | 0.99 | 0.96 | -| Whisper OSS (large-v3-turbo) | 0.97 | 0.96 | 0.94 | -| Argmax (parakeet-v2) | 0.97 | 0.98 | 0.96 | -| Argmax (parakeet-v3) | 0.98 | 0.98 | 0.95 | +| Whisper OSS
(large-v3-turbo) | 0.97 | 0.96 | 0.94 | +| Argmax
(parakeet-v2) | 0.97 | 0.98 | 0.96 | +| Argmax
(parakeet-v3) | 0.98 | 0.98 | 0.95 | | ElevenLabs | 0.97 | 0.99 | 0.96 | -| Apple (SFSpeechRecognizer) | 1 | 0.99 | 0.99 | -| Apple (SpeechAnalyzer) | 0.99 | - | - | +| Apple
(SFSpeechRecognizer) | 1 | 0.99 | 0.99 | +| Apple
(SpeechAnalyzer) | 0.99 | - | - |

@@ -615,17 +615,17 @@ If the ground-truth transcript has 25 keywords and the model correctly finds 15, -| System | no keywords | chunk-keywords | file-keywords | -|--------------------------------------------|-------------|----------------|---------------| -| Deepgram (nova-3) | 0.61 | 0.89 | 0.83 | -| OpenAI (whisper-1) | 0.53 | 0.7 | 0.79 | +| System | earnings22-keywords
(no keywords) | earnings22-keywords
(chunk-keywords) | earnings22-keywords
(file-keywords) | +|--------------------------------------------|---------------------------------------|------------------------------------------|----------------------------------------| +| Deepgram
(nova-3) | 0.61 | 0.89 | 0.83 | +| OpenAI
(whisper-1) | 0.53 | 0.7 | 0.79 | | AssemblyAI | 0.55 | 0.69 | 0.68 | -| Whisper OSS (large-v3-turbo) | 0.53 | 0.77 | 0.82 | -| Argmax (parakeet-v2) | 0.47 | 0.85 | 0.82 | -| Argmax (parakeet-v3) | 0.45 | 0.82 | 0.8 | +| Whisper OSS
(large-v3-turbo) | 0.53 | 0.77 | 0.82 | +| Argmax
(parakeet-v2) | 0.47 | 0.85 | 0.82 | +| Argmax
(parakeet-v3) | 0.45 | 0.82 | 0.8 | | ElevenLabs | 0.75 | 0.96 | 0.94 | -| Apple (SFSpeechRecognizer) | 0.26 | 0.45 | 0.4 | -| Apple (SpeechAnalyzer) | 0.39 | - | - | +| Apple
(SFSpeechRecognizer) | 0.26 | 0.45 | 0.4 | +| Apple
(SpeechAnalyzer) | 0.39 | - | - |

@@ -650,17 +650,17 @@ F1 = 2 × (0.75 × 0.6) / (0.75 + 0.6) = **66.7%**, reflecting the model's overa -| System | no keywords | chunk-keywords | file-keywords | -|--------------------------------------------|-------------|----------------|---------------| -| Deepgram (nova-3) | 0.75 | 0.94 | 0.89 | -| OpenAI (whisper-1) | 0.68 | 0.82 | 0.86 | +| System | earnings22-keywords
(no keywords) | earnings22-keywords
(chunk-keywords) | earnings22-keywords
(file-keywords) | +|--------------------------------------------|---------------------------------------|------------------------------------------|----------------------------------------| +| Deepgram
(nova-3) | 0.75 | 0.94 | 0.89 | +| OpenAI
(whisper-1) | 0.68 | 0.82 | 0.86 | | AssemblyAI | 0.7 | 0.81 | 0.8 | -| Whisper OSS (large-v3-turbo) | 0.69 | 0.86 | 0.87 | -| Argmax (parakeet-v2) | 0.63 | 0.91 | 0.88 | -| Argmax (parakeet-v3) | 0.62 | 0.89 | 0.87 | +| Whisper OSS
(large-v3-turbo) | 0.69 | 0.86 | 0.87 | +| Argmax
(parakeet-v2) | 0.63 | 0.91 | 0.88 | +| Argmax
(parakeet-v3) | 0.62 | 0.89 | 0.87 | | ElevenLabs | 0.84 | 0.97 | 0.95 | -| Apple (SFSpeechRecognizer) | 0.41 | 0.62 | 0.58 | -| Apple (SpeechAnalyzer) | 0.56 | - | - | +| Apple
(SFSpeechRecognizer) | 0.41 | 0.62 | 0.58 | +| Apple
(SpeechAnalyzer) | 0.56 | - | - |