From 83afb81ac714bd98cb0f0681dbb32cdb01e6b22f Mon Sep 17 00:00:00 2001
From: Barabazs <31799121+Barabazs@users.noreply.github.com>
Date: Wed, 1 Oct 2025 08:37:00 +0200
Subject: [PATCH 1/2] fix: restrict pyannote-audio version to avoid
 compatibility issues (#1242)

* fix: restrict pyannote-audio to >=3.3.2,<4.0.0 to avoid compatibility issues
* chore: bump whisperX version to 3.4.3
---
 pyproject.toml | 4 ++--
 uv.lock        | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 9b849abae..b1663c5a4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,7 +2,7 @@
 urls = { repository = "https://github.com/m-bain/whisperx" }
 authors = [{ name = "Max Bain" }]
 name = "whisperx"
-version = "3.4.2"
+version = "3.4.3"
 description = "Time-Accurate Automatic Speech Recognition using Whisper."
 readme = "README.md"
 requires-python = ">=3.9, <3.13"
@@ -15,7 +15,7 @@ dependencies = [
     "numpy>=2.0.2",
     "onnxruntime>=1.19",
     "pandas>=2.2.3",
-    "pyannote-audio>=3.3.2",
+    "pyannote-audio>=3.3.2,<4.0.0",
     "torch>=2.5.1",
     "torchaudio>=2.5.1",
     "transformers>=4.48.0",
diff --git a/uv.lock b/uv.lock
index 7ed518b2c..4c073e56a 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,5 +1,5 @@
 version = 1
-revision = 2
+revision = 3
 requires-python = ">=3.9, <3.13"
 resolution-markers = [
     "python_full_version >= '3.12'",
@@ -2788,7 +2788,7 @@ wheels = [
 
 [[package]]
 name = "whisperx"
-version = "3.4.2"
+version = "3.4.3"
 source = { editable = "." }
 dependencies = [
     { name = "ctranslate2" },
@@ -2812,7 +2812,7 @@ requires-dist = [
     { name = "numpy", specifier = ">=2.0.2" },
     { name = "onnxruntime", specifier = ">=1.19" },
     { name = "pandas", specifier = ">=2.2.3" },
-    { name = "pyannote-audio", specifier = ">=3.3.2" },
+    { name = "pyannote-audio", specifier = ">=3.3.2,<4.0.0" },
     { name = "torch", specifier = ">=2.5.1" },
     { name = "torchaudio", specifier = ">=2.5.1" },
     { name = "transformers", specifier = ">=4.48.0" },

From c7d31883bcce818ed78264a05cdc666ef7d022d2 Mon Sep 17 00:00:00 2001
From: Alex Cannan <alexfcannan@gmail.com>
Date: Tue, 18 Feb 2025 12:25:57 -0500
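Subject: [PATCH 2/2] Add jr, sr, and ph.d to punkt abbreviations

The only functional change is the extended PUNKT_ABBREVIATIONS list, which
keeps the sentence splitter from breaking after "Jr.", "Sr." and "Ph.D.".
The remaining hunks in whisperx/alignment.py only strip trailing whitespace.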
---
 whisperx/alignment.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/whisperx/alignment.py b/whisperx/alignment.py
index 34fbbbba3..3e19292ab 100644
--- a/whisperx/alignment.py
+++ b/whisperx/alignment.py
@@ -24,7 +24,7 @@
 )
 from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
 
-PUNKT_ABBREVIATIONS = ['dr', 'vs', 'mr', 'mrs', 'prof']
+PUNKT_ABBREVIATIONS = ['dr', 'vs', 'mr', 'mrs', 'prof', 'jr', 'sr', 'ph.d']
 
 LANGUAGES_WITHOUT_SPACES = ["ja", "zh"]
 
@@ -124,14 +124,14 @@ def align(
     """
     Align phoneme recognition predictions to known transcription.
     """
-    
+
     if not torch.is_tensor(audio):
         if isinstance(audio, str):
             audio = load_audio(audio)
         audio = torch.from_numpy(audio)
     if len(audio.shape) == 1:
         audio = audio.unsqueeze(0)
-    
+
     MAX_DURATION = audio.shape[1] / SAMPLE_RATE
 
     model_dictionary = align_model_metadata["dictionary"]
@@ -148,7 +148,7 @@ def align(
             base_progress = ((sdx + 1) / total_segments) * 100
             percent_complete = (50 + base_progress / 2) if combined_progress else base_progress
             print(f"Progress: {percent_complete:.2f}%...")
-            
+
         num_leading = len(segment["text"]) - len(segment["text"].lstrip())
         num_trailing = len(segment["text"]) - len(segment["text"].rstrip())
         text = segment["text"]
@@ -165,7 +165,7 @@ def align(
             # wav2vec2 models use "|" character to represent spaces
             if model_lang not in LANGUAGES_WITHOUT_SPACES:
                 char_ = char_.replace(" ", "|")
-            
+
             # ignore whitespace at beginning and end of transcript
             if cdx < num_leading:
                 pass
@@ -187,7 +187,7 @@ def align(
                 # index for placeholder
                 clean_wdx.append(wdx)
 
-                
+
         punkt_param = PunktParameters()
         punkt_param.abbrev_types = set(PUNKT_ABBREVIATIONS)
         sentence_splitter = PunktSentenceTokenizer(punkt_param)
@@ -199,12 +199,12 @@ def align(
             "clean_wdx": clean_wdx,
             "sentence_spans": sentence_spans
         }
-            
+
     aligned_segments: List[SingleAlignedSegment] = []
-    
+
     # 2. Get prediction matrix from alignment model & align
     for sdx, segment in enumerate(transcript):
-        
+
         t1 = segment["start"]
         t2 = segment["end"]
         text = segment["text"]
@@ -247,7 +247,7 @@ def align(
             )
         else:
             lengths = None
-            
+
         with torch.inference_mode():
             if model_type == "torchaudio":
                 emissions, _ = model(waveform_segment.to(device), lengths=lengths)
@@ -304,7 +304,7 @@ def align(
                 word_idx += 1
             elif cdx == len(text) - 1 or text[cdx+1] == " ":
                 word_idx += 1
-            
+
         char_segments_arr = pd.DataFrame(char_segments_arr)
 
         aligned_subsegments = []
@@ -333,7 +333,7 @@ def align(
                 word_end = word_chars["end"].max()
                 word_score = round(word_chars["score"].mean(), 3)
 
-                # -1 indicates unalignable 
+                # -1 indicates unalignable
                 word_segment = {"word": word_text}
 
                 if not np.isnan(word_start):
@@ -344,7 +344,7 @@ def align(
                     word_segment["score"] = word_score
 
                 sentence_words.append(word_segment)
-            
+
             aligned_subsegments.append({
                 "text": sentence_text,
                 "start": sentence_start,