From 9d687e015e41e001900a5871c9db3bf93bafea14 Mon Sep 17 00:00:00 2001 From: MrPrayer Date: Sat, 14 Feb 2026 16:11:52 +0300 Subject: [PATCH 1/3] fix: propagate --model_dir and --model_cache_only to all model loading paths (#1285) - Add `model_cache_only` param to `load_align_model()`, pass as `local_files_only` to HuggingFace `from_pretrained` calls - Forward `model_dir` and `model_cache_only` to both `load_align_model` call sites (initial load and language-change reload) - Add `cache_dir` param to `DiarizationPipeline.__init__`, forward to pyannote `Pipeline.from_pretrained` - Pass `--model_dir` as `cache_dir` when constructing `DiarizationPipeline` in CLI Previously only the ASR model respected these flags. Alignment and diarization models would always download from HuggingFace to the default cache, breaking offline and custom-cache workflows. Note: `--model_cache_only` is forwarded to the ASR and alignment loaders only; the diarization pipeline receives `--model_dir` (as `cache_dir`) but no offline-only flag in this change. --------- Co-authored-by: Barabazs <31799121+Barabazs@users.noreply.github.com> --- whisperx/alignment.py | 6 +++--- whisperx/diarize.py | 3 ++- whisperx/transcribe.py | 6 +++--- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/whisperx/alignment.py b/whisperx/alignment.py index 81c475668..ce92d7a4f 100644 --- a/whisperx/alignment.py +++ b/whisperx/alignment.py @@ -77,7 +77,7 @@ } -def load_align_model(language_code: str, device: str, model_name: Optional[str] = None, model_dir=None): +def load_align_model(language_code: str, device: str, model_name: Optional[str] = None, model_dir=None, model_cache_only: bool = False): if model_name is None: # use default model if language_code in DEFAULT_ALIGN_MODELS_TORCH: @@ -98,8 +98,8 @@ def load_align_model(language_code: str, device: str, model_name: Optional[str] align_dictionary = {c.lower(): i for i, c in enumerate(labels)} else: try: - processor = Wav2Vec2Processor.from_pretrained(model_name, cache_dir=model_dir) - align_model = Wav2Vec2ForCTC.from_pretrained(model_name, cache_dir=model_dir) + processor = Wav2Vec2Processor.from_pretrained(model_name, 
cache_dir=model_dir, local_files_only=model_cache_only) + align_model = Wav2Vec2ForCTC.from_pretrained(model_name, cache_dir=model_dir, local_files_only=model_cache_only) except Exception as e: print(e) print(f"Error loading model from huggingface, check https://huggingface.co/models for finetuned wav2vec2.0 models") diff --git a/whisperx/diarize.py b/whisperx/diarize.py index 59b0f2f1b..041fb129d 100644 --- a/whisperx/diarize.py +++ b/whisperx/diarize.py @@ -94,12 +94,13 @@ def __init__( model_name=None, token=None, device: Optional[Union[str, torch.device]] = "cpu", + cache_dir=None, ): if isinstance(device, str): device = torch.device(device) model_config = model_name or "pyannote/speaker-diarization-community-1" logger.info(f"Loading diarization model: {model_config}") - self.model = Pipeline.from_pretrained(model_config, token=token).to(device) + self.model = Pipeline.from_pretrained(model_config, token=token, cache_dir=cache_dir).to(device) def __call__( self, diff --git a/whisperx/transcribe.py b/whisperx/transcribe.py index 0aae410c8..b1ab2e01e 100644 --- a/whisperx/transcribe.py +++ b/whisperx/transcribe.py @@ -166,7 +166,7 @@ def transcribe_task(args: dict, parser: argparse.ArgumentParser): tmp_results = results results = [] align_model, align_metadata = load_align_model( - align_language, device, model_name=align_model + align_language, device, model_name=align_model, model_dir=model_dir, model_cache_only=model_cache_only ) for result, audio_path in tmp_results: # >> Align @@ -183,7 +183,7 @@ def transcribe_task(args: dict, parser: argparse.ArgumentParser): f"New language found ({result['language']})! Previous was ({align_metadata['language']}), loading new alignment model for new language..." 
) align_model, align_metadata = load_align_model( - result["language"], device + result["language"], device, model_dir=model_dir, model_cache_only=model_cache_only ) logger.info("Performing alignment...") result: AlignedTranscriptionResult = align( @@ -214,7 +214,7 @@ def transcribe_task(args: dict, parser: argparse.ArgumentParser): logger.info("Performing diarization...") logger.info(f"Using model: {diarize_model_name}") results = [] - diarize_model = DiarizationPipeline(model_name=diarize_model_name, token=hf_token, device=device) + diarize_model = DiarizationPipeline(model_name=diarize_model_name, token=hf_token, device=device, cache_dir=model_dir) for result, input_audio_path in tmp_results: diarize_result = diarize_model( input_audio_path, From 1baf8d2314b9622636346ae59c344e531f5fb4ba Mon Sep 17 00:00:00 2001 From: Barabazs <31799121+Barabazs@users.noreply.github.com> Date: Sat, 14 Feb 2026 14:34:31 +0100 Subject: [PATCH 2/3] feat: pass --hf_token to WhisperModel for gated model support Forward the existing --hf_token CLI argument to faster-whisper's WhisperModel via a new use_auth_token parameter on load_model(), enabling downloads of gated/private HuggingFace models. --- whisperx/asr.py | 4 +++- whisperx/transcribe.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/whisperx/asr.py b/whisperx/asr.py index f9456be8a..7540770f4 100644 --- a/whisperx/asr.py +++ b/whisperx/asr.py @@ -314,6 +314,7 @@ def load_model( download_root: Optional[str] = None, local_files_only=False, threads=4, + use_auth_token: Optional[Union[str, bool]] = None, ) -> FasterWhisperPipeline: """Load a Whisper model for inference. 
Args: @@ -341,7 +342,8 @@ def load_model( compute_type=compute_type, download_root=download_root, local_files_only=local_files_only, - cpu_threads=threads) + cpu_threads=threads, + use_auth_token=use_auth_token) if language is not None: tokenizer = Tokenizer(model.hf_tokenizer, model.model.is_multilingual, task=task, language=language) else: diff --git a/whisperx/transcribe.py b/whisperx/transcribe.py index b1ab2e01e..7c8be6794 100644 --- a/whisperx/transcribe.py +++ b/whisperx/transcribe.py @@ -141,6 +141,7 @@ def transcribe_task(args: dict, parser: argparse.ArgumentParser): task=task, local_files_only=model_cache_only, threads=faster_whisper_threads, + use_auth_token=hf_token, ) for audio_path in args.pop("audio"): From 42beab1f9f2ecdc29c4e9d6c399506feca373b66 Mon Sep 17 00:00:00 2001 From: Barabazs <31799121+Barabazs@users.noreply.github.com> Date: Sat, 14 Feb 2026 15:00:56 +0100 Subject: [PATCH 3/3] chore: bump version to 3.8.1 --- pyproject.toml | 2 +- uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 61dc5df44..15d392da6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ urls = { repository = "https://github.com/m-bain/whisperx" } authors = [{ name = "Max Bain" }] name = "whisperx" -version = "3.8.0" +version = "3.8.1" description = "Time-Accurate Automatic Speech Recognition using Whisper." readme = "README.md" requires-python = ">=3.10, <3.14" diff --git a/uv.lock b/uv.lock index 66992af56..421c50445 100644 --- a/uv.lock +++ b/uv.lock @@ -3026,7 +3026,7 @@ wheels = [ [[package]] name = "whisperx" -version = "3.8.0" +version = "3.8.1" source = { editable = "." } dependencies = [ { name = "ctranslate2" },