From 9d687e015e41e001900a5871c9db3bf93bafea14 Mon Sep 17 00:00:00 2001
From: MrPrayer <github.com@9033334.ru>
Date: Sat, 14 Feb 2026 16:11:52 +0300
Subject: [PATCH 1/3] fix: propagate --model_dir and --model_cache_only to all
 model loading paths (#1285)

- Add `model_cache_only` param to `load_align_model()`, pass as `local_files_only` to HuggingFace `from_pretrained` calls
- Forward `model_dir` and `model_cache_only` to both `load_align_model` call sites (initial load and language-change reload)
- Add `cache_dir` param to `DiarizationPipeline.__init__`, forward to pyannote `Pipeline.from_pretrained`
- Pass `--model_dir` as `cache_dir` when constructing `DiarizationPipeline` in CLI

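Previously only the ASR model respected these flags. Alignment and diarization models would always download from HuggingFace to the default cache, breaking offline and custom-cache workflows. Note that diarization only gains `cache_dir` in this change: `model_cache_only` is not forwarded to `DiarizationPipeline`, so that flag still applies only to the ASR and alignment models.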


---------

Co-authored-by: Barabazs <31799121+Barabazs@users.noreply.github.com>
---
 whisperx/alignment.py  | 6 +++---
 whisperx/diarize.py    | 3 ++-
 whisperx/transcribe.py | 6 +++---
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/whisperx/alignment.py b/whisperx/alignment.py
index 81c475668..ce92d7a4f 100644
--- a/whisperx/alignment.py
+++ b/whisperx/alignment.py
@@ -77,7 +77,7 @@
 }
 
 
-def load_align_model(language_code: str, device: str, model_name: Optional[str] = None, model_dir=None):
+def load_align_model(language_code: str, device: str, model_name: Optional[str] = None, model_dir=None, model_cache_only: bool = False):
     if model_name is None:
         # use default model
         if language_code in DEFAULT_ALIGN_MODELS_TORCH:
@@ -98,8 +98,8 @@ def load_align_model(language_code: str, device: str, model_name: Optional[str]
         align_dictionary = {c.lower(): i for i, c in enumerate(labels)}
     else:
         try:
-            processor = Wav2Vec2Processor.from_pretrained(model_name, cache_dir=model_dir)
-            align_model = Wav2Vec2ForCTC.from_pretrained(model_name, cache_dir=model_dir)
+            processor = Wav2Vec2Processor.from_pretrained(model_name, cache_dir=model_dir, local_files_only=model_cache_only)
+            align_model = Wav2Vec2ForCTC.from_pretrained(model_name, cache_dir=model_dir, local_files_only=model_cache_only)
         except Exception as e:
             print(e)
             print(f"Error loading model from huggingface, check https://huggingface.co/models for finetuned wav2vec2.0 models")
diff --git a/whisperx/diarize.py b/whisperx/diarize.py
index 59b0f2f1b..041fb129d 100644
--- a/whisperx/diarize.py
+++ b/whisperx/diarize.py
@@ -94,12 +94,13 @@ def __init__(
         model_name=None,
         token=None,
         device: Optional[Union[str, torch.device]] = "cpu",
+        cache_dir=None,
     ):
         if isinstance(device, str):
             device = torch.device(device)
         model_config = model_name or "pyannote/speaker-diarization-community-1"
         logger.info(f"Loading diarization model: {model_config}")
-        self.model = Pipeline.from_pretrained(model_config, token=token).to(device)
+        self.model = Pipeline.from_pretrained(model_config, token=token, cache_dir=cache_dir).to(device)
 
     def __call__(
         self,
diff --git a/whisperx/transcribe.py b/whisperx/transcribe.py
index 0aae410c8..b1ab2e01e 100644
--- a/whisperx/transcribe.py
+++ b/whisperx/transcribe.py
@@ -166,7 +166,7 @@ def transcribe_task(args: dict, parser: argparse.ArgumentParser):
         tmp_results = results
         results = []
         align_model, align_metadata = load_align_model(
-            align_language, device, model_name=align_model
+            align_language, device, model_name=align_model, model_dir=model_dir, model_cache_only=model_cache_only
         )
         for result, audio_path in tmp_results:
             # >> Align
@@ -183,7 +183,7 @@ def transcribe_task(args: dict, parser: argparse.ArgumentParser):
                         f"New language found ({result['language']})! Previous was ({align_metadata['language']}), loading new alignment model for new language..."
                     )
                     align_model, align_metadata = load_align_model(
-                        result["language"], device
+                        result["language"], device, model_dir=model_dir, model_cache_only=model_cache_only
                     )
                 logger.info("Performing alignment...")
                 result: AlignedTranscriptionResult = align(
@@ -214,7 +214,7 @@ def transcribe_task(args: dict, parser: argparse.ArgumentParser):
         logger.info("Performing diarization...")
         logger.info(f"Using model: {diarize_model_name}")
         results = []
-        diarize_model = DiarizationPipeline(model_name=diarize_model_name, token=hf_token, device=device)
+        diarize_model = DiarizationPipeline(model_name=diarize_model_name, token=hf_token, device=device, cache_dir=model_dir)
         for result, input_audio_path in tmp_results:
             diarize_result = diarize_model(
                 input_audio_path, 

From 1baf8d2314b9622636346ae59c344e531f5fb4ba Mon Sep 17 00:00:00 2001
From: Barabazs <31799121+Barabazs@users.noreply.github.com>
Date: Sat, 14 Feb 2026 14:34:31 +0100
Subject: [PATCH 2/3] feat: pass --hf_token to WhisperModel for gated model
 support

Forward the existing --hf_token CLI argument to faster-whisper's
WhisperModel via a new use_auth_token parameter on load_model(),
enabling downloads of gated/private HuggingFace models.
---
 whisperx/asr.py        | 4 +++-
 whisperx/transcribe.py | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/whisperx/asr.py b/whisperx/asr.py
index f9456be8a..7540770f4 100644
--- a/whisperx/asr.py
+++ b/whisperx/asr.py
@@ -314,6 +314,7 @@ def load_model(
     download_root: Optional[str] = None,
     local_files_only=False,
     threads=4,
+    use_auth_token: Optional[Union[str, bool]] = None,
 ) -> FasterWhisperPipeline:
     """Load a Whisper model for inference.
     Args:
@@ -341,7 +342,8 @@ def load_model(
                          compute_type=compute_type,
                          download_root=download_root,
                          local_files_only=local_files_only,
-                         cpu_threads=threads)
+                         cpu_threads=threads,
+                         use_auth_token=use_auth_token)
     if language is not None:
         tokenizer = Tokenizer(model.hf_tokenizer, model.model.is_multilingual, task=task, language=language)
     else:
diff --git a/whisperx/transcribe.py b/whisperx/transcribe.py
index b1ab2e01e..7c8be6794 100644
--- a/whisperx/transcribe.py
+++ b/whisperx/transcribe.py
@@ -141,6 +141,7 @@ def transcribe_task(args: dict, parser: argparse.ArgumentParser):
         task=task,
         local_files_only=model_cache_only,
         threads=faster_whisper_threads,
+        use_auth_token=hf_token,
     )
 
     for audio_path in args.pop("audio"):

From 42beab1f9f2ecdc29c4e9d6c399506feca373b66 Mon Sep 17 00:00:00 2001
From: Barabazs <31799121+Barabazs@users.noreply.github.com>
Date: Sat, 14 Feb 2026 15:00:56 +0100
Subject: [PATCH 3/3] chore: bump version to 3.8.1

---
 pyproject.toml | 2 +-
 uv.lock        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 61dc5df44..15d392da6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,7 +2,7 @@
 urls = { repository = "https://github.com/m-bain/whisperx" }
 authors = [{ name = "Max Bain" }]
 name = "whisperx"
-version = "3.8.0"
+version = "3.8.1"
 description = "Time-Accurate Automatic Speech Recognition using Whisper."
 readme = "README.md"
 requires-python = ">=3.10, <3.14"
diff --git a/uv.lock b/uv.lock
index 66992af56..421c50445 100644
--- a/uv.lock
+++ b/uv.lock
@@ -3026,7 +3026,7 @@ wheels = [
 
 [[package]]
 name = "whisperx"
-version = "3.8.0"
+version = "3.8.1"
 source = { editable = "." }
 dependencies = [
     { name = "ctranslate2" },