Merged
64 changes: 44 additions & 20 deletions BENCHMARKS.md
@@ -549,11 +549,17 @@

</details>

| Dataset | Deepgram<br/>(nova-3) | OpenAI<br/>(whisper-1) | AssemblyAI | Whisper OSS<br/>(large-v3-turbo) | Argmax<br/>(parakeet-v2) | Argmax<br/>(parakeet-v3) | Apple <br/> (SFSpeechRecognizer) <br/> | Apple <br/>(SpeechAnalyzer)|
|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|---------------------------------------------|----------------------------|
| earnings22-keywords<br/>(no keywords) | 15.34 | 20.69 | 12.58 | 15.4 | 14.69 | 16.89 | 28.42 | 17 |
| earnings22-keywords<br/>(chunk-keywords) | 13.28 | 31.97 | 11.67 | 21.24 | 12.46 | 14.57 | 26.98 | - |
| earnings22-keywords<br/>(file-keywords) | 13.85 | 28.37 | 11.80 | 14.69 | 12.57 | 14.73 | 27.26 | - |
| System | earnings22-keywords<br/>(no keywords) | earnings22-keywords<br/>(chunk-keywords) | earnings22-keywords<br/>(file-keywords) |
|--------------------------------------------|---------------------------------------|------------------------------------------|----------------------------------------|
| Deepgram<br/>(nova-3) | 15.34 | 13.28 | 13.85 |
| OpenAI<br/>(whisper-1) | 20.69 | 31.97 | 28.37 |
| AssemblyAI | 12.58 | 11.67 | 11.80 |
| Whisper OSS<br/>(large-v3-turbo) | 15.4 | 21.24 | 14.69 |
| Argmax<br/>(parakeet-v2) | 14.69 | 12.46 | 12.57 |
| Argmax<br/>(parakeet-v3) | 16.89 | 14.57 | 14.73 |
| ElevenLabs | 10.53 | 9.13 | 9.08 |
| Apple<br/>(SFSpeechRecognizer) | 28.42 | 26.98 | 27.26 |
| Apple<br/>(SpeechAnalyzer) | 17 | - | - |

<br/><br/>

@@ -576,11 +582,17 @@ If the model predicts 20 keywords and 15 of them match the ground truth, precision is 15/20 = **75%**.

</details>

| Dataset | Deepgram<br/>(nova-3) | OpenAI<br/>(whisper-1) | AssemblyAI | Whisper OSS<br/>(large-v3-turbo) | Argmax<br/>(parakeet-v2) | Argmax<br/>(parakeet-v3) | Apple <br/> (SFSpeechRecognizer) <br/> | Apple <br/>(SpeechAnalyzer)|
|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------|
| earnings22-keywords<br/>(no keywords) | 0.98 | 0.97 | 0.97 | 0.97 | 0.97 | 0.98 | 1 | 0.99 |
| earnings22-keywords<br/>(chunk-keywords) | 0.99 | 0.98 | 0.99 | 0.96 | 0.98 | 0.98 | 0.99 | - |
| earnings22-keywords<br/>(file-keywords) | 0.96 | 0.93 | 0.96 | 0.94 | 0.96 | 0.95 | 0.99 | - |
| System | earnings22-keywords<br/>(no keywords) | earnings22-keywords<br/>(chunk-keywords) | earnings22-keywords<br/>(file-keywords) |
|--------------------------------------------|---------------------------------------|------------------------------------------|----------------------------------------|
| Deepgram<br/>(nova-3) | 0.98 | 0.99 | 0.96 |
| OpenAI<br/>(whisper-1) | 0.97 | 0.98 | 0.93 |
| AssemblyAI | 0.97 | 0.99 | 0.96 |
| Whisper OSS<br/>(large-v3-turbo) | 0.97 | 0.96 | 0.94 |
| Argmax<br/>(parakeet-v2) | 0.97 | 0.98 | 0.96 |
| Argmax<br/>(parakeet-v3) | 0.98 | 0.98 | 0.95 |
| ElevenLabs | 0.97 | 0.99 | 0.96 |
| Apple<br/>(SFSpeechRecognizer) | 1 | 0.99 | 0.99 |
| Apple<br/>(SpeechAnalyzer) | 0.99 | - | - |

<br/><br/>

@@ -603,11 +615,17 @@ If the ground-truth transcript has 25 keywords and the model correctly finds 15, recall is 15/25 = **60%**.

</details>

| Dataset | Deepgram<br/>(nova-3) | OpenAI<br/>(whisper-1) | AssemblyAI | Whisper OSS<br/>(large-v3-turbo) | Argmax<br/>(parakeet-v2) | Argmax<br/>(parakeet-v3) | Apple <br/> (SFSpeechRecognizer) <br/> | Apple <br/>(SpeechAnalyzer)|
|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------|
| earnings22-keywords<br/>(no keywords) | 0.61 | 0.53 | 0.55 | 0.53 | 0.47 | 0.45 | 0.26 | 0.39 |
| earnings22-keywords<br/>(chunk-keywords) | 0.89 | 0.7 | 0.69 | 0.77 | 0.85 | 0.82 | 0.45 | - |
| earnings22-keywords<br/>(file-keywords) | 0.83 | 0.79 | 0.68 | 0.82 | 0.82 | 0.8 | 0.4 | - |
| System | earnings22-keywords<br/>(no keywords) | earnings22-keywords<br/>(chunk-keywords) | earnings22-keywords<br/>(file-keywords) |
|--------------------------------------------|---------------------------------------|------------------------------------------|----------------------------------------|
| Deepgram<br/>(nova-3) | 0.61 | 0.89 | 0.83 |
| OpenAI<br/>(whisper-1) | 0.53 | 0.7 | 0.79 |
| AssemblyAI | 0.55 | 0.69 | 0.68 |
| Whisper OSS<br/>(large-v3-turbo) | 0.53 | 0.77 | 0.82 |
| Argmax<br/>(parakeet-v2) | 0.47 | 0.85 | 0.82 |
| Argmax<br/>(parakeet-v3) | 0.45 | 0.82 | 0.8 |
| ElevenLabs | 0.75 | 0.96 | 0.94 |
| Apple<br/>(SFSpeechRecognizer) | 0.26 | 0.45 | 0.4 |
| Apple<br/>(SpeechAnalyzer) | 0.39 | - | - |

<br/><br/>

@@ -632,11 +650,17 @@ F1 = 2 × (0.75 × 0.6) / (0.75 + 0.6) = **66.7%**, reflecting the model's overall balance of precision and recall.

</details>

| Dataset | Deepgram<br/>(nova-3) | OpenAI<br/>(whisper-1) | AssemblyAI | Whisper OSS<br/>(large-v3-turbo) | Argmax<br/>(parakeet-v2) | Argmax<br/>(parakeet-v3) | Apple <br/> SFSpeechRecognizer <br/> (Old API) | Apple <br/>(SpeechAnalyzer)|
|--------------------------------------------------|-------------------------|--------------------------|--------------|------------------------------------|----------------------------|----------------------------|-------------------------------------------------|----------------------------|
| earnings22-keywords<br/>(no keywords) | 0.75 | 0.68 | 0.7 | 0.69 | 0.63 | 0.62 | 0.41 | 0.56 |
| earnings22-keywords<br/>(chunk-keywords) | 0.94 | 0.82 | 0.81 | 0.86 | 0.91 | 0.89 | 0.62 | - |
| earnings22-keywords<br/>(file-keywords) | 0.89 | 0.86 | 0.8 | 0.87 | 0.88 | 0.87 | 0.58 | - |
| System | earnings22-keywords<br/>(no keywords) | earnings22-keywords<br/>(chunk-keywords) | earnings22-keywords<br/>(file-keywords) |
|--------------------------------------------|---------------------------------------|------------------------------------------|----------------------------------------|
| Deepgram<br/>(nova-3) | 0.75 | 0.94 | 0.89 |
| OpenAI<br/>(whisper-1) | 0.68 | 0.82 | 0.86 |
| AssemblyAI | 0.7 | 0.81 | 0.8 |
| Whisper OSS<br/>(large-v3-turbo) | 0.69 | 0.86 | 0.87 |
| Argmax<br/>(parakeet-v2) | 0.63 | 0.91 | 0.88 |
| Argmax<br/>(parakeet-v3) | 0.62 | 0.89 | 0.87 |
| ElevenLabs | 0.84 | 0.97 | 0.95 |
| Apple<br/>(SFSpeechRecognizer) | 0.41 | 0.62 | 0.58 |
| Apple<br/>(SpeechAnalyzer) | 0.56 | - | - |

<br/><br/>
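The precision, recall, and F1 figures in the tables above combine exactly as in the worked example; a minimal sketch of the computation (a hypothetical helper, not part of this PR or the openbench codebase):

```python
def keyword_metrics(predicted: set[str], reference: set[str]) -> tuple[float, float, float]:
    """Keyword precision, recall, and F1 over predicted vs. reference keyword sets."""
    matched = len(predicted & reference)
    precision = matched / len(predicted) if predicted else 0.0
    recall = matched / len(reference) if reference else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1


# 20 predicted keywords, 15 of them correct, 25 keywords in the reference:
predicted = {f"kw{i}" for i in range(20)}
reference = {f"kw{i}" for i in range(15)} | {f"ref{i}" for i in range(10)}
p, r, f1 = keyword_metrics(predicted, reference)
# p = 0.75, r = 0.6, f1 ≈ 0.667 — matching the 75% / 60% / 66.7% worked example
```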

5 changes: 5 additions & 0 deletions config/pipeline_configs/ElevenLabsTranscriptionPipeline.yaml
@@ -0,0 +1,5 @@
ElevenLabsTranscriptionPipeline:
config:
model_id: "scribe_v2"
use_keywords: true

1 change: 1 addition & 0 deletions pyproject.toml
@@ -39,6 +39,7 @@ dependencies = [
"openai>=2.7.1",
"meeteval>=0.4.3",
"nemo-toolkit[asr]>=2.6.0",
"elevenlabs>=2.30.0",
]

[project.scripts]
2 changes: 1 addition & 1 deletion src/openbench/cli/commands/inference.py
@@ -88,7 +88,7 @@ def get_dummy_sample(
waveform=waveform,
sample_rate=sample_rate,
extra_info={},
reference=Transcript.from_words_info(words=["dummy"]),
reference=Transcript.from_words_info(words=["dummy"], speaker=["SPEAKER_0"]),
)
elif pipeline_type == PipelineType.TRANSCRIPTION:
return TranscriptionSample(
3 changes: 3 additions & 0 deletions src/openbench/engine/__init__.py
@@ -1,4 +1,5 @@
from .deepgram_engine import DeepgramApi, DeepgramApiResponse
from .elevenlabs_engine import ElevenLabsApi, ElevenLabsApiResponse
from .openai_engine import OpenAIApi
from .whisperkitpro_engine import (
WhisperKitPro,
@@ -11,6 +12,8 @@
__all__ = [
"DeepgramApi",
"DeepgramApiResponse",
"ElevenLabsApi",
"ElevenLabsApiResponse",
"OpenAIApi",
"WhisperKitPro",
"WhisperKitProInput",
109 changes: 109 additions & 0 deletions src/openbench/engine/elevenlabs_engine.py
@@ -0,0 +1,109 @@
import os
from pathlib import Path

from argmaxtools.utils import get_logger
from elevenlabs.client import ElevenLabs
from pydantic import BaseModel, model_validator


logger = get_logger(__name__)


class ElevenLabsApiResponse(BaseModel):
"""Response from ElevenLabs speech-to-text API."""

words: list[str]
speakers: list[str]
start: list[float]
end: list[float]

@property
def transcript(self) -> str:
return " ".join(self.words)

@model_validator(mode="after")
def validate_lengths(self) -> "ElevenLabsApiResponse":
if (
len(self.words) != len(self.speakers)
or len(self.words) != len(self.start)
or len(self.words) != len(self.end)
):
raise ValueError("All lists must be of the same length")
return self


class ElevenLabsApi:
"""ElevenLabs Speech-to-Text API wrapper."""

def __init__(
self,
model_id: str = "scribe_v2",
timeout: float = 300,
):
self.model_id = model_id
self.timeout = timeout

api_key = os.getenv("ELEVENLABS_API_KEY")
if not api_key:
raise ValueError("`ELEVENLABS_API_KEY` is not set")

self.client = ElevenLabs(api_key=api_key, timeout=timeout)

def transcribe(
self,
audio_path: Path | str,
keyterms: list[str] | None = None,
language_code: str | None = None,
diarize: bool = False,
num_speakers: int | None = None,
) -> ElevenLabsApiResponse:
"""Transcribe an audio file using ElevenLabs API.

Args:
audio_path: Path to the audio file
keyterms: List of keywords to boost recognition
language_code: Language code (e.g., 'eng')
diarize: Whether to enable speaker diarization
num_speakers: Maximum number of speakers

Returns:
ElevenLabsApiResponse with words, speakers, and timestamps
"""
if isinstance(audio_path, str):
audio_path = Path(audio_path)

with audio_path.open("rb") as f:
audio_data = f.read()

kwargs = {
"model_id": self.model_id,
"file": audio_data,
}

if keyterms:
kwargs["keyterms"] = keyterms
logger.debug(f"Using keyterms: {keyterms}")

if language_code:
kwargs["language_code"] = language_code
logger.debug(f"Using language: {language_code}")

if diarize:
kwargs["diarize"] = True
logger.debug("Diarization enabled")

if num_speakers is not None:
kwargs["num_speakers"] = num_speakers
logger.debug(f"Max speakers: {num_speakers}")

response = self.client.speech_to_text.convert(**kwargs)

# ElevenLabs returns whitespace as separate "words" - filter them out
words = [w for w in response.words if w.text and w.text.strip()]

return ElevenLabsApiResponse(
words=[w.text for w in words],
speakers=[str(w.speaker_id) for w in words],
start=[float(w.start) for w in words],
end=[float(w.end) for w in words],
)
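The engine's last step filters out the whitespace-only tokens that the ElevenLabs response interleaves between words. The same idea in isolation (the `Word` dataclass below is a stand-in for the SDK's word objects, not its real type):

```python
from dataclasses import dataclass


@dataclass
class Word:
    text: str
    speaker_id: str
    start: float
    end: float


def drop_spacing_tokens(words: list[Word]) -> list[Word]:
    """Keep only tokens whose text contains non-whitespace characters."""
    return [w for w in words if w.text and w.text.strip()]


tokens = [
    Word("Hello", "0", 0.0, 0.4),
    Word(" ", "0", 0.4, 0.5),
    Word("world", "0", 0.5, 0.9),
]
kept = drop_spacing_tokens(tokens)
# kept retains only the "Hello" and "world" tokens
```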
1 change: 1 addition & 0 deletions src/openbench/pipeline/diarization/__init__.py
@@ -4,6 +4,7 @@
from .aws import *
from .common import *
from .diarization_deepgram import *
from .elevenlabs import *
from .nemo import *
from .picovoice import *
from .pyannote import *
65 changes: 65 additions & 0 deletions src/openbench/pipeline/diarization/elevenlabs.py
@@ -0,0 +1,65 @@
from pathlib import Path
from typing import Callable

from argmaxtools.utils import get_logger
from pyannote.core import Segment
from pydantic import Field

from ...dataset import DiarizationSample
from ...engine import ElevenLabsApi, ElevenLabsApiResponse
from ...pipeline_prediction import DiarizationAnnotation
from ..base import Pipeline, PipelineType, register_pipeline
from .common import DiarizationOutput, DiarizationPipelineConfig


__all__ = ["ElevenLabsDiarizationPipeline", "ElevenLabsDiarizationPipelineConfig"]

TEMP_AUDIO_DIR = Path("audio_temp")

logger = get_logger(__name__)


class ElevenLabsDiarizationPipelineConfig(DiarizationPipelineConfig):
model_id: str = Field(
default="scribe_v2",
description="The ElevenLabs speech-to-text model to use",
)
num_speakers: int | None = Field(
default=None,
description="Maximum number of speakers (helps with diarization). Max 32.",
)


@register_pipeline
class ElevenLabsDiarizationPipeline(Pipeline):
_config_class = ElevenLabsDiarizationPipelineConfig
pipeline_type = PipelineType.DIARIZATION

def build_pipeline(self) -> Callable[[Path], ElevenLabsApiResponse]:
api = ElevenLabsApi(model_id=self.config.model_id)

num_speakers = None
if self.config.use_exact_num_speakers:
num_speakers = self.config.num_speakers

def transcribe(audio_path: Path) -> ElevenLabsApiResponse:
response = api.transcribe(
audio_path=audio_path,
diarize=True,
num_speakers=num_speakers,
)
# Remove temporary audio path
audio_path.unlink(missing_ok=True)
return response

return transcribe

def parse_input(self, input_sample: DiarizationSample) -> Path:
return input_sample.save_audio(output_dir=TEMP_AUDIO_DIR)

def parse_output(self, output: ElevenLabsApiResponse) -> DiarizationOutput:
annotation = DiarizationAnnotation()
for _word, speaker, start, end in zip(output.words, output.speakers, output.start, output.end):
annotation[Segment(start, end)] = f"SPEAKER_{speaker}"

return DiarizationOutput(prediction=annotation)
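`parse_output` above emits one annotation segment per word, so consecutive words from the same speaker appear as many short segments. If coarser speaker turns were wanted, a possible post-processing step (a sketch, not part of this PR) would merge adjacent same-speaker word segments:

```python
def merge_turns(words: list[tuple[float, float, str]]) -> list[tuple[float, float, str]]:
    """Merge consecutive (start, end, speaker) word segments that share a speaker.

    `words` must be ordered by start time; the result is one tuple per speaker turn.
    """
    turns: list[tuple[float, float, str]] = []
    for start, end, speaker in words:
        if turns and turns[-1][2] == speaker:
            # Same speaker as the previous turn: extend it to this word's end time.
            prev_start, _, _ = turns[-1]
            turns[-1] = (prev_start, end, speaker)
        else:
            turns.append((start, end, speaker))
    return turns


turns = merge_turns([(0.0, 0.4, "0"), (0.5, 0.9, "0"), (1.0, 1.5, "1")])
# turns == [(0.0, 0.9, "0"), (1.0, 1.5, "1")]
```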
3 changes: 3 additions & 0 deletions src/openbench/pipeline/orchestration/__init__.py
@@ -3,6 +3,7 @@

from .nemo import NeMoMTParakeetPipeline, NeMoMTParakeetPipelineConfig
from .orchestration_deepgram import DeepgramOrchestrationPipeline, DeepgramOrchestrationPipelineConfig
from .orchestration_elevenlabs import ElevenLabsOrchestrationPipeline, ElevenLabsOrchestrationPipelineConfig
from .orchestration_openai import OpenAIOrchestrationPipeline, OpenAIOrchestrationPipelineConfig
from .orchestration_whisperkitpro import WhisperKitProOrchestrationConfig, WhisperKitProOrchestrationPipeline
from .whisperx import WhisperXPipeline, WhisperXPipelineConfig
@@ -11,6 +12,8 @@
__all__ = [
"DeepgramOrchestrationPipeline",
"DeepgramOrchestrationPipelineConfig",
"ElevenLabsOrchestrationPipeline",
"ElevenLabsOrchestrationPipelineConfig",
"WhisperXPipeline",
"WhisperXPipelineConfig",
"WhisperKitProOrchestrationPipeline",