From fd0f2eb04c1fd4a4d9f79583cca93fae31528664 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Mon, 16 Feb 2026 12:22:15 -0300 Subject: [PATCH 1/8] feat: Argmax Sortformer diarization pipelines --- .../pipeline/diarization/speakerkit.py | 117 ++++++++++++++---- src/openbench/pipeline/pipeline_aliases.py | 17 ++- 2 files changed, 106 insertions(+), 28 deletions(-) diff --git a/src/openbench/pipeline/diarization/speakerkit.py b/src/openbench/pipeline/diarization/speakerkit.py index 3718cb3..24404c4 100644 --- a/src/openbench/pipeline/diarization/speakerkit.py +++ b/src/openbench/pipeline/diarization/speakerkit.py @@ -8,7 +8,7 @@ from typing import Callable, Literal, TypedDict from argmaxtools.utils import get_logger -from pydantic import Field +from pydantic import Field, model_validator from ...dataset import DiarizationSample from ...pipeline_prediction import DiarizationAnnotation @@ -23,48 +23,113 @@ TEMP_AUDIO_DIR = Path("audio_temp") -class SpeakerKitPipelineConfig(DiarizationPipelineConfig): - cli_path: str = Field(..., description="The absolute path to the SpeakerKit CLI") - clusterer_version: Literal["pyannote3", "pyannote4"] = Field( - "pyannote4", description="The version of the clusterer to use" - ) - model_path: str | None = Field(None, description="The absolute path to the SpeakerKit model") - - class SpeakerKitInput(TypedDict): audio_path: Path output_path: Path num_speakers: int | None -class SpeakerKitCli: - def __init__(self, config: SpeakerKitPipelineConfig): - self.cli_path = config.cli_path - self.model_path = config.model_path - self.clusterer_version = config.clusterer_version - - def __call__(self, speakerkit_input: SpeakerKitInput) -> tuple[Path, float]: +class SpeakerKitPipelineConfig(DiarizationPipelineConfig): + cli_path: str = Field(..., description="The absolute path to the SpeakerKit CLI") + model_path: str | None = Field(None, description="The absolute path to the SpeakerKit model directory") + clusterer_version: Literal["pyannote3", "pyannote4"] | None = Field( + None, description="The version of the clusterer to use" + ) + sortformer_model_name: str | None = Field(None, description="The name of the Sortformer model to use") + sortformer_model_variant: str | None = Field(None, description="The variant of the Sortformer model to use") + + @model_validator(mode="after") + def validate_sortformer_model(self) -> "SpeakerKitPipelineConfig": + if self.sortformer_model_name is not None and self.sortformer_model_variant is None: + raise ValueError( + "If `sortformer_model_name` is provided, `sortformer_model_variant` must also be provided" + ) + + if self.sortformer_model_name is None and self.sortformer_model_variant is not None: + raise ValueError( + "If `sortformer_model_variant` is provided, `sortformer_model_name` must also be provided" + ) + + return self + + @model_validator(mode="after") + def validate_model_options(self) -> "SpeakerKitPipelineConfig": + if ( + self.clusterer_version is None + and self.sortformer_model_name is None + and self.sortformer_model_variant is None + ): + raise ValueError( + "At least one of `clusterer_version`, `sortformer_model_name`, or `sortformer_model_variant` must be provided" + ) + + if ( + self.clusterer_version is not None + and self.sortformer_model_name is not None + and self.sortformer_model_variant is not None + ): + raise ValueError( + "Only one of `clusterer_version`, `sortformer_model_name`, or `sortformer_model_variant` can be provided" + ) + + return self + + @property + def is_sortformer(self) -> bool: + return self.clusterer_version is None + + def generate_cli_args(self, inputs: SpeakerKitInput) -> list[str]: cmd = [ self.cli_path, "diarize", "--audio-path", - str(speakerkit_input["audio_path"]), + str(inputs["audio_path"]), "--rttm-path", - str(speakerkit_input["output_path"]), - "--clusterer-version", - self.clusterer_version, + str(inputs["output_path"]), "--verbose", ] - if self.model_path: + if self.clusterer_version is not None: + cmd.extend(["--clusterer-version", self.clusterer_version]) + elif self.sortformer_model_name is not None and self.sortformer_model_variant is not None: + cmd.extend( + [ + "--sortformer-model-name", + self.sortformer_model_name, + "--sortformer-model-variant", + self.sortformer_model_variant, + ] + ) + + if self.model_path is not None: cmd.extend(["--model-path", self.model_path]) - if speakerkit_input["num_speakers"] is not None: - cmd.extend(["--num-speakers", str(speakerkit_input["num_speakers"])]) + if inputs["num_speakers"] is not None: + if self.is_sortformer: + logger.warning("`num_speakers` is not supported for Sortformer. Ignoring...") + else: + cmd.extend(["--num-speakers", str(inputs["num_speakers"])]) if "SPEAKERKIT_API_KEY" in os.environ: cmd.extend(["--api-key", os.environ["SPEAKERKIT_API_KEY"]]) + return cmd + + +def parse_stdout(stdout: str) -> float: + pattern = r"Model Load Time:\s+\d+\.\d+\s+ms\nTotal Time:\s+(\d+\.\d+)\s+ms" + matches = re.search(pattern, stdout) + total_time = float(matches.group(1)) + return total_time / 1000 + + +class SpeakerKitCli: + def __init__(self, config: SpeakerKitPipelineConfig): + self.config = config + + def __call__(self, speakerkit_input: SpeakerKitInput) -> tuple[Path, float]: + cmd = self.config.generate_cli_args(speakerkit_input) + try: result = subprocess.run(cmd, check=True, capture_output=True, text=True) logger.debug(f"Diarization CLI stdout:\n{result.stdout}") @@ -81,11 +146,9 @@ def __call__(self, speakerkit_input: SpeakerKitInput) -> tuple[Path, float]: speakerkit_input["audio_path"].unlink() # Parse stdout and take the total time it took to diarize - pattern = r"Model Load Time:\s+\d+\.\d+\s+ms\nTotal Time:\s+(\d+\.\d+)\s+ms" - matches = re.search(pattern, result.stdout) - total_time = float(matches.group(1)) + total_time = parse_stdout(result.stdout) - return speakerkit_input["output_path"], total_time / 1000 + return speakerkit_input["output_path"], total_time @register_pipeline diff --git a/src/openbench/pipeline/pipeline_aliases.py b/src/openbench/pipeline/pipeline_aliases.py index 97287b0..83a77e8 100644 --- a/src/openbench/pipeline/pipeline_aliases.py +++ b/src/openbench/pipeline/pipeline_aliases.py @@ -114,7 +114,22 @@ def register_pipeline_aliases() -> None: "cli_path": os.getenv("SPEAKERKIT_CLI_PATH"), "clusterer_version": "pyannote4", }, - description="SpeakerKit speaker diarization pipeline. Requires CLI installation and API key. Set `SPEAKERKIT_CLI_PATH` and `SPEAKERKIT_API_KEY` env vars. For access to the CLI binary contact speakerkitpro@argmaxinc.com", + description="SpeakerKit speaker diarization pipeline using community-1 model from pyannote. Requires CLI installation and API key. Set `SPEAKERKIT_CLI_PATH` and `SPEAKERKIT_API_KEY` env vars. For access to the CLI binary contact speakerkitpro@argmaxinc.com", + ) + + PipelineRegistry.register_alias( + "speakerkit-sortformer-compressed", + SpeakerKitPipeline, + default_config={ + "out_dir": "./speakerkit-sortformer-report", + "cli_path": os.getenv("SPEAKERKIT_CLI_PATH"), + "sortformer_model_name": "v2-1", + "sortformer_model_variant": "384_94MB", + }, + description=( + "SpeakerKit speaker diarization pipeline using Sortformer model compressed to 94MB. Requires CLI installation and API key. " + "Set `SPEAKERKIT_CLI_PATH` and `SPEAKERKIT_API_KEY` env vars. For access to the CLI binary contact speakerkitpro@argmaxinc.com." + ), ) PipelineRegistry.register_alias( From f7b86345fa8ca98f585fb0cc4a99a852779adeb6 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Mon, 16 Feb 2026 13:43:26 -0300 Subject: [PATCH 2/8] refactor: sortformer orchestration --- src/openbench/engine/whisperkitpro_engine.py | 11 +++- .../orchestration_whisperkitpro.py | 17 +++-- src/openbench/pipeline/pipeline_aliases.py | 66 ++++++++++++++----- 3 files changed, 69 insertions(+), 25 deletions(-) diff --git a/src/openbench/engine/whisperkitpro_engine.py b/src/openbench/engine/whisperkitpro_engine.py index 0103320..b8bb0d7 100644 --- a/src/openbench/engine/whisperkitpro_engine.py +++ b/src/openbench/engine/whisperkitpro_engine.py @@ -90,7 +90,11 @@ class WhisperKitProConfig(BaseModel): False, description="Whether to perform diarization", ) - orchestration_strategy: Literal["word", "segment"] = Field( + diarization_mode: Literal["realtime", "prerecorded"] = Field( + "prerecorded", + description="Sortformer streaming mode: `realtime` (1.04s latency) or `prerecorded` (9.84s latency). This is only applicable when `clusterer_version` is `sortformer`.", + ) + orchestration_strategy: Literal["word", "segment", "subsegment"] = Field( "segment", description="The orchestration strategy to use either `word` or `segment`", ) @@ -98,9 +102,9 @@ class WhisperKitProConfig(BaseModel): None, description="The path to the speaker models directory", ) - clusterer_version: Literal["pyannote3", "pyannote4"] = Field( + clusterer_version: Literal["pyannote3", "pyannote4", "sortformer"] = Field( "pyannote4", - description="The version of the clusterer to use", + description="The version of the clusterer to use. If `sortformer` is the diarization model used is Sortformer, otherwise it is pyannote.", ) use_exclusive_reconciliation: bool = Field( False, @@ -174,6 +178,7 @@ def generate_cli_args(self, model_path: Path | None = None) -> list[str]: # Add rttm path args.extend(["--rttm-path", self.rttm_path]) args.extend(["--clusterer-version", self.clusterer_version]) + args.extend(["--diarization-mode", self.diarization_mode]) # If speaker models path is provided use it if self.speaker_models_path: args.extend(["--speaker-models-path", self.speaker_models_path]) diff --git a/src/openbench/pipeline/orchestration/orchestration_whisperkitpro.py b/src/openbench/pipeline/orchestration/orchestration_whisperkitpro.py index 5e88083..6680438 100644 --- a/src/openbench/pipeline/orchestration/orchestration_whisperkitpro.py +++ b/src/openbench/pipeline/orchestration/orchestration_whisperkitpro.py @@ -69,13 +69,17 @@ class WhisperKitProOrchestrationConfig(OrchestrationConfig): ComputeUnit.CPU_AND_NE, description="The compute units to use for the text decoder. Default is CPU_AND_NE.", ) - orchestration_strategy: Literal["word", "segment"] = Field( - "segment", - description="The orchestration strategy to use either `word` or `segment`", + orchestration_strategy: Literal["word", "segment", "subsegment"] = Field( + "subsegment", + description="The orchestration strategy to use either `word`, `segment` or `subsegment`", ) - clusterer_version: Literal["pyannote3", "pyannote4"] = Field( + clusterer_version: Literal["pyannote3", "pyannote4", "sortformer"] = Field( "pyannote4", - description="The version of the clusterer to use", + description="The version of the clusterer to use. If `sortformer` is the diarization model used is Sortformer, otherwise it is pyannote.", + ) + diarization_mode: Literal["realtime", "prerecorded"] = Field( + "prerecorded", + description="Sortformer streaming mode: `realtime` (1.04s latency) or `prerecorded` (9.84s latency). This is only applicable when `clusterer_version` is `sortformer`.", ) use_exclusive_reconciliation: bool = Field( False, @@ -107,7 +111,8 @@ def build_pipeline(self) -> WhisperKitPro: chunking_strategy="vad", diarization=True, orchestration_strategy=self.config.orchestration_strategy, - clusterer_version_string=self.config.clusterer_version, + clusterer_version=self.config.clusterer_version, + diarization_mode=self.config.diarization_mode, use_exclusive_reconciliation=self.config.use_exclusive_reconciliation, fast_load=self.config.fast_load, ) diff --git a/src/openbench/pipeline/pipeline_aliases.py b/src/openbench/pipeline/pipeline_aliases.py index 83a77e8..171aee2 100644 --- a/src/openbench/pipeline/pipeline_aliases.py +++ b/src/openbench/pipeline/pipeline_aliases.py @@ -218,8 +218,8 @@ def register_pipeline_aliases() -> None: "repo_id": "argmaxinc/whisperkit-pro", "model_variant": "openai_whisper-tiny", "cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"), - "orchestration_strategy": "segment", - "clusterer_version_string": "pyannote4", + "orchestration_strategy": "subsegment", + "clusterer_version": "pyannote4", "use_exclusive_reconciliation": True, }, description="WhisperKitPro orchestration pipeline using the tiny version of the model. Requires `WHISPERKITPRO_CLI_PATH` env var and depending on your permissions also `WHISPERKITPRO_API_KEY` env var.", @@ -232,8 +232,8 @@ def register_pipeline_aliases() -> None: "repo_id": "argmaxinc/whisperkit-pro", "model_variant": "openai_whisper-large-v3", "cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"), - "orchestration_strategy": "segment", - "clusterer_version_string": "pyannote4", + "orchestration_strategy": "subsegment", + "clusterer_version": "pyannote4", "use_exclusive_reconciliation": True, }, description="WhisperKitPro orchestration pipeline using the large-v3 version of the model. Requires `WHISPERKITPRO_CLI_PATH` env var and depending on your permissions also `WHISPERKITPRO_API_KEY` env var.", @@ -246,8 +246,8 @@ def register_pipeline_aliases() -> None: "repo_id": "argmaxinc/whisperkit-pro", "model_variant": "openai_whisper-large-v3-v20240930", "cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"), - "orchestration_strategy": "segment", - "clusterer_version_string": "pyannote4", + "orchestration_strategy": "subsegment", + "clusterer_version": "pyannote4", "use_exclusive_reconciliation": True, }, description="WhisperKitPro orchestration pipeline using the large-v3-v20240930 version of the model (which is the same as large-v3-turbo from OpenAI). Requires `WHISPERKITPRO_CLI_PATH` env var and depending on your permissions also `WHISPERKITPRO_API_KEY` env var.", @@ -260,8 +260,8 @@ def register_pipeline_aliases() -> None: "repo_id": "argmaxinc/whisperkit-pro", "model_variant": "openai_whisper-large-v3-v20240930_626MB", "cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"), - "orchestration_strategy": "segment", - "clusterer_version_string": "pyannote4", + "orchestration_strategy": "subsegment", + "clusterer_version": "pyannote4", "use_exclusive_reconciliation": True, }, description="WhisperKitPro orchestration pipeline using the large-v3-v20240930 version of the model compressed to 626MB (which is the same as large-v3-turbo from OpenAI). Requires `WHISPERKITPRO_CLI_PATH` env var and depending on your permissions also `WHISPERKITPRO_API_KEY` env var.", @@ -274,8 +274,8 @@ def register_pipeline_aliases() -> None: "repo_id": "argmaxinc/parakeetkit-pro", "model_variant": "nvidia_parakeet-v2", "cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"), - "orchestration_strategy": "segment", - "clusterer_version_string": "pyannote4", + "orchestration_strategy": "subsegment", + "clusterer_version": "pyannote4", "use_exclusive_reconciliation": True, }, description="WhisperKitPro orchestration pipeline using the parakeet-v2 version of the model. Requires `WHISPERKITPRO_CLI_PATH` env var and depending on your permissions also `WHISPERKITPRO_API_KEY` env var.", @@ -288,13 +288,30 @@ def register_pipeline_aliases() -> None: "repo_id": "argmaxinc/parakeetkit-pro", "model_variant": "nvidia_parakeet-v2_476MB", "cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"), - "orchestration_strategy": "segment", - "clusterer_version_string": "pyannote4", + "orchestration_strategy": "subsegment", + "clusterer_version": "pyannote4", "use_exclusive_reconciliation": True, }, description="WhisperKitPro orchestration pipeline using the parakeet-v2 version of the model compressed to 476MB. Requires `WHISPERKITPRO_CLI_PATH` env var and depending on your permissions also `WHISPERKITPRO_API_KEY` env var.", ) + PipelineRegistry.register_alias( + "whisperkitpro-orchestration-parakeet-v2-compressed-sortformer-compressed", + WhisperKitProOrchestrationPipeline, + default_config={ + "repo_id": "argmaxinc/parakeetkit-pro", + "model_variant": "nvidia_parakeet-v2_476MB", + "cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"), + "orchestration_strategy": "subsegment", + "clusterer_version": "sortformer", + "diarization_mode": "prerecorded", + }, + description=( + "WhisperKitPro orchestration pipeline using the parakeet-v2 version of the model compressed to 476MB and using Sortformer for diarization. " + "Requires `WHISPERKITPRO_CLI_PATH` env var and depending on your permissions also `WHISPERKITPRO_API_KEY` env var." + ), + ) + PipelineRegistry.register_alias( "whisperkitpro-orchestration-parakeet-v3", WhisperKitProOrchestrationPipeline, @@ -302,8 +319,8 @@ def register_pipeline_aliases() -> None: "repo_id": "argmaxinc/parakeetkit-pro", "model_variant": "nvidia_parakeet-v3", "cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"), - "orchestration_strategy": "segment", - "clusterer_version_string": "pyannote4", + "orchestration_strategy": "subsegment", + "clusterer_version": "pyannote4", "use_exclusive_reconciliation": True, }, description="WhisperKitPro orchestration pipeline using the parakeet-v3 version of the model. Requires `WHISPERKITPRO_CLI_PATH` env var and depending on your permissions also `WHISPERKITPRO_API_KEY` env var.", @@ -316,13 +333,30 @@ def register_pipeline_aliases() -> None: "repo_id": "argmaxinc/parakeetkit-pro", "model_variant": "nvidia_parakeet-v3_494MB", "cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"), - "orchestration_strategy": "segment", - "clusterer_version_string": "pyannote4", + "orchestration_strategy": "subsegment", + "clusterer_version": "pyannote4", "use_exclusive_reconciliation": True, }, description="WhisperKitPro orchestration pipeline using the parakeet-v3 version of the model compressed to 494MB. Requires `WHISPERKITPRO_CLI_PATH` env var and depending on your permissions also `WHISPERKITPRO_API_KEY` env var.", ) + PipelineRegistry.register_alias( + "whisperkitpro-orchestration-parakeet-v3-compressed-sortformer-compressed", + WhisperKitProOrchestrationPipeline, + default_config={ + "repo_id": "argmaxinc/parakeetkit-pro", + "model_variant": "nvidia_parakeet-v3_494MB", + "cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"), + "orchestration_strategy": "subsegment", + "clusterer_version": "sortformer", + "diarization_mode": "prerecorded", + }, + description=( + "WhisperKitPro orchestration pipeline using the parakeet-v3 version of the model compressed to 494MB and using Sortformer for diarization. " + "Requires `WHISPERKITPRO_CLI_PATH` env var and depending on your permissions also `WHISPERKITPRO_API_KEY` env var." + ), + ) + PipelineRegistry.register_alias( "openai-orchestration", OpenAIOrchestrationPipeline, From a6dc656f6120f77e87efc4b6de834b5eb69c1ae4 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Mon, 16 Feb 2026 16:02:34 -0300 Subject: [PATCH 3/8] fix: sortformer usage with speakerkit --- .../pipeline/diarization/speakerkit.py | 55 +++++++------------ src/openbench/pipeline/pipeline_aliases.py | 3 +- 2 files changed, 23 insertions(+), 35 deletions(-) diff --git a/src/openbench/pipeline/diarization/speakerkit.py b/src/openbench/pipeline/diarization/speakerkit.py index 24404c4..805e9c4 100644 --- a/src/openbench/pipeline/diarization/speakerkit.py +++ b/src/openbench/pipeline/diarization/speakerkit.py @@ -32,8 +32,8 @@ class SpeakerKitInput(TypedDict): class SpeakerKitPipelineConfig(DiarizationPipelineConfig): cli_path: str = Field(..., description="The absolute path to the SpeakerKit CLI") model_path: str | None = Field(None, description="The absolute path to the SpeakerKit model directory") - clusterer_version: Literal["pyannote3", "pyannote4"] | None = Field( - None, description="The version of the clusterer to use" + clusterer_version: Literal["pyannote3", "pyannote4", "sortformer"] = Field( + "pyannote4", description="The version of the clusterer to use" ) sortformer_model_name: str | None = Field(None, description="The name of the Sortformer model to use") sortformer_model_variant: str | None = Field(None, description="The variant of the Sortformer model to use") @@ -52,31 +52,9 @@ def validate_sortformer_model(self) -> "SpeakerKitPipelineConfig": return self - @model_validator(mode="after") - def validate_model_options(self) -> "SpeakerKitPipelineConfig": - if ( - self.clusterer_version is None - and self.sortformer_model_name is None - and self.sortformer_model_variant is None - ): - raise ValueError( - "At least one of `clusterer_version`, `sortformer_model_name`, or `sortformer_model_variant` must be provided" - ) - - if ( - self.clusterer_version is not None - and self.sortformer_model_name is not None - and self.sortformer_model_variant is not None - ): - raise ValueError( - "Only one of `clusterer_version`, `sortformer_model_name`, or `sortformer_model_variant` can be provided" - ) - - return self - @property def is_sortformer(self) -> bool: - return self.clusterer_version is None + return self.clusterer_version == "sortformer" def generate_cli_args(self, inputs: SpeakerKitInput) -> list[str]: cmd = [ @@ -86,12 +64,13 @@ def generate_cli_args(self, inputs: SpeakerKitInput) -> list[str]: str(inputs["audio_path"]), "--rttm-path", str(inputs["output_path"]), + "--clusterer-version", + self.clusterer_version, "--verbose", ] - if self.clusterer_version is not None: - cmd.extend(["--clusterer-version", self.clusterer_version]) - elif self.sortformer_model_name is not None and self.sortformer_model_variant is not None: + # Only check variant as we already checked both should be provided + if self.sortformer_model_variant is not None: cmd.extend( [ "--sortformer-model-name", @@ -115,12 +94,20 @@ def generate_cli_args(self, inputs: SpeakerKitInput) -> list[str]: return cmd + def parse_stdout(self, stdout: str) -> float: + # Default pattern for pyannote models + pattern = r"Model Load Time:\s+\d+\.\d+\s+ms\nTotal Time:\s+(\d+\.\d+)\s+ms" + divisor = 1000.0 + + # if model is sortfomer we override the pattern and divisor + if self.is_sortformer: + pattern = r"Prediction time:\s+(\d+\.\d+)\s+seconds" + divisor = 1.0 -def parse_stdout(stdout: str) -> float: - pattern = r"Model Load Time:\s+\d+\.\d+\s+ms\nTotal Time:\s+(\d+\.\d+)\s+ms" - matches = re.search(pattern, stdout) - total_time = float(matches.group(1)) - return total_time / 1000 + matches = re.search(pattern, stdout) + if matches is None: + raise ValueError(f"Could not parse prediction time from stdout: {stdout!r}") + return float(matches.group(1)) / divisor class SpeakerKitCli: @@ -146,7 +133,7 @@ def __call__(self, speakerkit_input: SpeakerKitInput) -> tuple[Path, float]: speakerkit_input["audio_path"].unlink() # Parse stdout and take the total time it took to diarize - total_time = parse_stdout(result.stdout) + total_time = self.config.parse_stdout(result.stdout) return speakerkit_input["output_path"], total_time diff --git a/src/openbench/pipeline/pipeline_aliases.py b/src/openbench/pipeline/pipeline_aliases.py index 171aee2..a8193aa 100644 --- a/src/openbench/pipeline/pipeline_aliases.py +++ b/src/openbench/pipeline/pipeline_aliases.py @@ -123,8 +123,9 @@ def register_pipeline_aliases() -> None: default_config={ "out_dir": "./speakerkit-sortformer-report", "cli_path": os.getenv("SPEAKERKIT_CLI_PATH"), - "sortformer_model_name": "v2-1", "sortformer_model_variant": "384_94MB", + "sortformer_model_name": "sortformer", + "clusterer_version": "sortformer", }, description=( "SpeakerKit speaker diarization pipeline using Sortformer model compressed to 94MB. Requires CLI installation and API key. " From abe70cb9610feb3fcad6469066a136cf5ca9223a Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Mon, 16 Feb 2026 16:10:19 -0300 Subject: [PATCH 4/8] refactor: add verbose to whisperkitpro --- src/openbench/engine/whisperkitpro_engine.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/openbench/engine/whisperkitpro_engine.py b/src/openbench/engine/whisperkitpro_engine.py index b8bb0d7..e1b689e 100644 --- a/src/openbench/engine/whisperkitpro_engine.py +++ b/src/openbench/engine/whisperkitpro_engine.py @@ -162,6 +162,7 @@ def generate_cli_args(self, model_path: Path | None = None) -> list[str]: COMPUTE_UNITS_MAPPER[self.text_decoder_compute_units], "--fast-load", str(self.fast_load).lower(), + "--verbose", ] ) From 2685d017308f1d001b7dcd53df1c7ad44f548648 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Fri, 20 Feb 2026 15:09:06 -0300 Subject: [PATCH 5/8] refactor: only add --diarization-mode for sortformer --- src/openbench/engine/whisperkitpro_engine.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/openbench/engine/whisperkitpro_engine.py b/src/openbench/engine/whisperkitpro_engine.py index e1b689e..0b7a238 100644 --- a/src/openbench/engine/whisperkitpro_engine.py +++ b/src/openbench/engine/whisperkitpro_engine.py @@ -176,10 +176,15 @@ def generate_cli_args(self, model_path: Path | None = None) -> list[str]: if self.diarization: args.extend(["--diarization"]) args.extend(["--orchestration-strategy", self.orchestration_strategy]) + # Add rttm path args.extend(["--rttm-path", self.rttm_path]) args.extend(["--clusterer-version", self.clusterer_version]) - args.extend(["--diarization-mode", self.diarization_mode]) + + # Only add diarization mode if using Sortformer + if self.clusterer_version == "sortformer": + args.extend(["--diarization-mode", self.diarization_mode]) + # If speaker models path is provided use it if self.speaker_models_path: args.extend(["--speaker-models-path", self.speaker_models_path]) From edd30ad206c12c6aa45ba7ea6e949bb2aa542cb4 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Tue, 10 Mar 2026 21:38:37 -0300 Subject: [PATCH 6/8] refactor: CLI args --- src/openbench/engine/whisperkitpro_engine.py | 20 ++++----- .../pipeline/diarization/speakerkit.py | 41 ++----------------- .../orchestration_whisperkitpro.py | 14 +++---- src/openbench/pipeline/pipeline_aliases.py | 26 ++++++------ 4 files changed, 33 insertions(+), 68 deletions(-) diff --git a/src/openbench/engine/whisperkitpro_engine.py b/src/openbench/engine/whisperkitpro_engine.py index 0b7a238..69a1d00 100644 --- a/src/openbench/engine/whisperkitpro_engine.py +++ b/src/openbench/engine/whisperkitpro_engine.py @@ -83,7 +83,7 @@ class WhisperKitProConfig(BaseModel): description="The compute units to use for the audio encoder. Default is CPU_AND_NE.", ) text_decoder_compute_units: ct.ComputeUnit = Field( - ct.ComputeUnit.CPU_AND_GPU, + ct.ComputeUnit.CPU_AND_NE, description="The compute units to use for the text decoder. Default is CPU_AND_GPU.", ) diarization: bool = Field( @@ -92,19 +92,19 @@ class WhisperKitProConfig(BaseModel): ) diarization_mode: Literal["realtime", "prerecorded"] = Field( "prerecorded", - description="Sortformer streaming mode: `realtime` (1.04s latency) or `prerecorded` (9.84s latency). This is only applicable when `clusterer_version` is `sortformer`.", + description="Sortformer streaming mode: `realtime` (1.04s latency) or `prerecorded` (9.84s latency). This is only applicable when `engine` is `sortformer`.", ) - orchestration_strategy: Literal["word", "segment", "subsegment"] = Field( - "segment", - description="The orchestration strategy to use either `word` or `segment`", + orchestration_strategy: Literal["segment", "subsegment"] = Field( + "subsegment", + description="The orchestration strategy to use either `segment` or `subsegment`", ) speaker_models_path: str | None = Field( None, description="The path to the speaker models directory", ) - clusterer_version: Literal["pyannote3", "pyannote4", "sortformer"] = Field( - "pyannote4", - description="The version of the clusterer to use. If `sortformer` is the diarization model used is Sortformer, otherwise it is pyannote.", + engine: Literal["pyannote", "sortformer"] = Field( + "pyannote", + description="The engine to use. If `sortformer` the diarization model used is Sortformer, otherwise it is pyannote.", ) use_exclusive_reconciliation: bool = Field( False, @@ -179,10 +179,10 @@ def generate_cli_args(self, model_path: Path | None = None) -> list[str]: # Add rttm path args.extend(["--rttm-path", self.rttm_path]) - args.extend(["--clusterer-version", self.clusterer_version]) + args.extend(["--engine", self.engine]) # Only add diarization mode if using Sortformer - if self.clusterer_version == "sortformer": + if self.engine == "sortformer": args.extend(["--diarization-mode", self.diarization_mode]) # If speaker models path is provided use it diff --git a/src/openbench/pipeline/diarization/speakerkit.py b/src/openbench/pipeline/diarization/speakerkit.py index 805e9c4..7c7bc8a 100644 --- a/src/openbench/pipeline/diarization/speakerkit.py +++ b/src/openbench/pipeline/diarization/speakerkit.py @@ -8,7 +8,7 @@ from typing import Callable, Literal, TypedDict from argmaxtools.utils import get_logger -from pydantic import Field, model_validator +from pydantic import Field from ...dataset import DiarizationSample from ...pipeline_prediction import DiarizationAnnotation @@ -32,29 +32,7 @@ class SpeakerKitInput(TypedDict): class SpeakerKitPipelineConfig(DiarizationPipelineConfig): cli_path: str = Field(..., description="The absolute path to the SpeakerKit CLI") model_path: str | None = Field(None, description="The absolute path to the SpeakerKit model directory") - clusterer_version: Literal["pyannote3", "pyannote4", "sortformer"] = Field( - "pyannote4", description="The version of the clusterer to use" - ) - sortformer_model_name: str | None = Field(None, description="The name of the Sortformer model to use") - sortformer_model_variant: str | None = Field(None, description="The variant of the Sortformer model to use") - - @model_validator(mode="after") - def validate_sortformer_model(self) -> "SpeakerKitPipelineConfig": - if self.sortformer_model_name is not None and self.sortformer_model_variant is None: - raise ValueError( - "If `sortformer_model_name` is provided, `sortformer_model_variant` must also be provided" - ) - - if self.sortformer_model_name is None and self.sortformer_model_variant is not None: - raise ValueError( - "If `sortformer_model_variant` is provided, `sortformer_model_name` must also be provided" - ) - - return self - - @property - def is_sortformer(self) -> bool: - return self.clusterer_version == "sortformer" + engine: Literal["pyannote", "sortformer"] = Field("pyannote", description="The engine to use") def generate_cli_args(self, inputs: SpeakerKitInput) -> list[str]: cmd = [ @@ -64,22 +42,11 @@ def generate_cli_args(self, inputs: SpeakerKitInput) -> list[str]: str(inputs["audio_path"]), "--rttm-path", str(inputs["output_path"]), - "--clusterer-version", - self.clusterer_version, + "--engine", + self.engine, "--verbose", ] - # Only check variant as we already checked both should be provided - if self.sortformer_model_variant is not None: - cmd.extend( - [ - "--sortformer-model-name", - self.sortformer_model_name, - "--sortformer-model-variant", - self.sortformer_model_variant, - ] - ) - if self.model_path is not None: cmd.extend(["--model-path", self.model_path]) diff --git a/src/openbench/pipeline/orchestration/orchestration_whisperkitpro.py b/src/openbench/pipeline/orchestration/orchestration_whisperkitpro.py index 6680438..9609e73 100644 --- a/src/openbench/pipeline/orchestration/orchestration_whisperkitpro.py +++ b/src/openbench/pipeline/orchestration/orchestration_whisperkitpro.py @@ -69,17 +69,17 @@ class WhisperKitProOrchestrationConfig(OrchestrationConfig): ComputeUnit.CPU_AND_NE, description="The compute units to use for the text decoder. Default is CPU_AND_NE.", ) - orchestration_strategy: Literal["word", "segment", "subsegment"] = Field( + orchestration_strategy: Literal["segment", "subsegment"] = Field( "subsegment", - description="The orchestration strategy to use either `word`, `segment` or `subsegment`", + description="The orchestration strategy to use either `segment` or `subsegment`", ) - clusterer_version: Literal["pyannote3", "pyannote4", "sortformer"] = Field( - "pyannote4", - description="The version of the clusterer to use. If `sortformer` is the diarization model used is Sortformer, otherwise it is pyannote.", + engine: Literal["pyannote", "sortformer"] = Field( + "pyannote", + description="The engine to use. If `sortformer` the diarization model used is Sortformer, otherwise it is pyannote.", ) diarization_mode: Literal["realtime", "prerecorded"] = Field( "prerecorded", - description="Sortformer streaming mode: `realtime` (1.04s latency) or `prerecorded` (9.84s latency). This is only applicable when `clusterer_version` is `sortformer`.", + description="Sortformer streaming mode: `realtime` (1.04s latency) or `prerecorded` (9.84s latency). This is only applicable when `engine` is `sortformer`.", ) use_exclusive_reconciliation: bool = Field( False, @@ -111,7 +111,7 @@ def build_pipeline(self) -> WhisperKitPro: chunking_strategy="vad", diarization=True, orchestration_strategy=self.config.orchestration_strategy, - clusterer_version=self.config.clusterer_version, + engine=self.config.engine, diarization_mode=self.config.diarization_mode, use_exclusive_reconciliation=self.config.use_exclusive_reconciliation, fast_load=self.config.fast_load, diff --git a/src/openbench/pipeline/pipeline_aliases.py b/src/openbench/pipeline/pipeline_aliases.py index a8193aa..ce9792b 100644 --- a/src/openbench/pipeline/pipeline_aliases.py +++ b/src/openbench/pipeline/pipeline_aliases.py @@ -112,7 +112,7 @@ def register_pipeline_aliases() -> None: default_config={ "out_dir": "./speakerkit-report", "cli_path": os.getenv("SPEAKERKIT_CLI_PATH"), - "clusterer_version": "pyannote4", + "engine": "pyannote", }, description="SpeakerKit speaker diarization pipeline using community-1 model from pyannote. Requires CLI installation and API key. Set `SPEAKERKIT_CLI_PATH` and `SPEAKERKIT_API_KEY` env vars. For access to the CLI binary contact speakerkitpro@argmaxinc.com", ) @@ -123,9 +123,7 @@ def register_pipeline_aliases() -> None: default_config={ "out_dir": "./speakerkit-sortformer-report", "cli_path": os.getenv("SPEAKERKIT_CLI_PATH"), - "sortformer_model_variant": "384_94MB", - "sortformer_model_name": "sortformer", - "clusterer_version": "sortformer", + "engine": "sortformer", }, description=( "SpeakerKit speaker diarization pipeline using Sortformer model compressed to 94MB. Requires CLI installation and API key. " @@ -220,7 +218,7 @@ def register_pipeline_aliases() -> None: "model_variant": "openai_whisper-tiny", "cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"), "orchestration_strategy": "subsegment", - "clusterer_version": "pyannote4", + "engine": "pyannote", "use_exclusive_reconciliation": True, }, description="WhisperKitPro orchestration pipeline using the tiny version of the model. Requires `WHISPERKITPRO_CLI_PATH` env var and depending on your permissions also `WHISPERKITPRO_API_KEY` env var.", @@ -234,7 +232,7 @@ def register_pipeline_aliases() -> None: "model_variant": "openai_whisper-large-v3", "cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"), "orchestration_strategy": "subsegment", - "clusterer_version": "pyannote4", + "engine": "pyannote", "use_exclusive_reconciliation": True, }, description="WhisperKitPro orchestration pipeline using the large-v3 version of the model. Requires `WHISPERKITPRO_CLI_PATH` env var and depending on your permissions also `WHISPERKITPRO_API_KEY` env var.", @@ -248,7 +246,7 @@ def register_pipeline_aliases() -> None: "model_variant": "openai_whisper-large-v3-v20240930", "cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"), "orchestration_strategy": "subsegment", - "clusterer_version": "pyannote4", + "engine": "pyannote", "use_exclusive_reconciliation": True, }, description="WhisperKitPro orchestration pipeline using the large-v3-v20240930 version of the model (which is the same as large-v3-turbo from OpenAI). Requires `WHISPERKITPRO_CLI_PATH` env var and depending on your permissions also `WHISPERKITPRO_API_KEY` env var.", @@ -262,7 +260,7 @@ def register_pipeline_aliases() -> None: "model_variant": "openai_whisper-large-v3-v20240930_626MB", "cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"), "orchestration_strategy": "subsegment", - "clusterer_version": "pyannote4", + "engine": "pyannote", "use_exclusive_reconciliation": True, }, description="WhisperKitPro orchestration pipeline using the large-v3-v20240930 version of the model compressed to 626MB (which is the same as large-v3-turbo from OpenAI). Requires `WHISPERKITPRO_CLI_PATH` env var and depending on your permissions also `WHISPERKITPRO_API_KEY` env var.", @@ -276,7 +274,7 @@ def register_pipeline_aliases() -> None: "model_variant": "nvidia_parakeet-v2", "cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"), "orchestration_strategy": "subsegment", - "clusterer_version": "pyannote4", + "engine": "pyannote", "use_exclusive_reconciliation": True, }, description="WhisperKitPro orchestration pipeline using the parakeet-v2 version of the model. Requires `WHISPERKITPRO_CLI_PATH` env var and depending on your permissions also `WHISPERKITPRO_API_KEY` env var.", @@ -290,7 +288,7 @@ def register_pipeline_aliases() -> None: "model_variant": "nvidia_parakeet-v2_476MB", "cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"), "orchestration_strategy": "subsegment", - "clusterer_version": "pyannote4", + "engine": "pyannote", "use_exclusive_reconciliation": True, }, description="WhisperKitPro orchestration pipeline using the parakeet-v2 version of the model compressed to 476MB. Requires `WHISPERKITPRO_CLI_PATH` env var and depending on your permissions also `WHISPERKITPRO_API_KEY` env var.", @@ -304,7 +302,7 @@ def register_pipeline_aliases() -> None: "model_variant": "nvidia_parakeet-v2_476MB", "cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"), "orchestration_strategy": "subsegment", - "clusterer_version": "sortformer", + "engine": "sortformer", "diarization_mode": "prerecorded", }, description=( @@ -321,7 +319,7 @@ def register_pipeline_aliases() -> None: "model_variant": "nvidia_parakeet-v3", "cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"), "orchestration_strategy": "subsegment", - "clusterer_version": "pyannote4", + "engine": "pyannote", "use_exclusive_reconciliation": True, }, description="WhisperKitPro orchestration pipeline using the parakeet-v3 version of the model. Requires `WHISPERKITPRO_CLI_PATH` env var and depending on your permissions also `WHISPERKITPRO_API_KEY` env var.", @@ -335,7 +333,7 @@ def register_pipeline_aliases() -> None: "model_variant": "nvidia_parakeet-v3_494MB", "cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"), "orchestration_strategy": "subsegment", - "clusterer_version": "pyannote4", + "engine": "pyannote", "use_exclusive_reconciliation": True, }, description="WhisperKitPro orchestration pipeline using the parakeet-v3 version of the model compressed to 494MB. Requires `WHISPERKITPRO_CLI_PATH` env var and depending on your permissions also `WHISPERKITPRO_API_KEY` env var.", @@ -349,7 +347,7 @@ def register_pipeline_aliases() -> None: "model_variant": "nvidia_parakeet-v3_494MB", "cli_path": os.getenv("WHISPERKITPRO_CLI_PATH"), "orchestration_strategy": "subsegment", - "clusterer_version": "sortformer", + "engine": "sortformer", "diarization_mode": "prerecorded", }, description=( From 20344b809c0ae6c05497e86f1b8eb334f9522b54 Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Wed, 11 Mar 2026 15:52:59 -0300 Subject: [PATCH 7/8] fix: missing is_sortformer --- src/openbench/pipeline/diarization/speakerkit.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/openbench/pipeline/diarization/speakerkit.py b/src/openbench/pipeline/diarization/speakerkit.py index 7c7bc8a..6dfa908 100644 --- a/src/openbench/pipeline/diarization/speakerkit.py +++ b/src/openbench/pipeline/diarization/speakerkit.py @@ -34,6 +34,10 @@ class SpeakerKitPipelineConfig(DiarizationPipelineConfig): model_path: str | None = Field(None, description="The absolute path to the SpeakerKit model directory") engine: Literal["pyannote", "sortformer"] = Field("pyannote", description="The engine to use") + @property + def is_sortformer(self) -> bool: + return self.engine == "sortformer" + def generate_cli_args(self, inputs: SpeakerKitInput) -> list[str]: cmd = [ self.cli_path, @@ -51,10 +55,7 @@ def generate_cli_args(self, inputs: SpeakerKitInput) -> list[str]: cmd.extend(["--model-path", self.model_path]) if inputs["num_speakers"] is not None: - if self.is_sortformer: - logger.warning("`num_speakers` is not supported for Sortformer. Ignoring...") - else: - cmd.extend(["--num-speakers", str(inputs["num_speakers"])]) + cmd.extend(["--num-speakers", str(inputs["num_speakers"])]) if "SPEAKERKIT_API_KEY" in os.environ: cmd.extend(["--api-key", os.environ["SPEAKERKIT_API_KEY"]]) From fd8483d0c3a2aba04cf1a206d75b4780fc54056b Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco Date: Thu, 12 Mar 2026 11:41:15 -0300 Subject: [PATCH 8/8] chore: ignore .sh --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index ddfc86c..10c049c 100644 --- a/.gitignore +++ b/.gitignore @@ -45,4 +45,6 @@ inference_outputs/ miscellaneous/ # Default openbench-cli output directory -downloaded_datasets/ \ No newline at end of file +downloaded_datasets/ + +*.sh \ No newline at end of file