Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 9 additions & 44 deletions backend/parakeet/stream_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@
from langdetect import detect as langdetect_detect
from langdetect.lang_detect_exception import LangDetectException
from scipy.spatial.distance import cdist
from transcribe import transcribe_file, _stream_model as _asr_model, INFERENCE_MODE as _INFERENCE_MODE

try:
from pyannote.audio import Model as _PyannoteModel, Inference as _PyannoteInference
except ImportError:
_PyannoteModel = None
_PyannoteInference = None
from transcribe import (
transcribe_file,
_stream_model as _asr_model,
INFERENCE_MODE as _INFERENCE_MODE,
get_builtin_embedding_model,
wav_bytes_to_waveform,
)

logger = logging.getLogger(__name__)

Expand All @@ -45,35 +45,6 @@
SPEAKER_EMBEDDING_URL = os.getenv("HOSTED_SPEAKER_EMBEDDING_API_URL", "")
MIN_EMBEDDING_AUDIO_S = 0.5

_embedding_model = None
_embedding_lock = threading.Lock()
# How long to wait after a failed load before attempting another one.
_EMBEDDING_RETRY_INTERVAL_S = 300.0
# time.monotonic() value before which no load is attempted; 0.0 means "no failure recorded".
_embedding_retry_at = 0.0


def _get_builtin_embedding_model():
    """Return the shared built-in speaker-embedding inference object, loading it on first use.

    The loaded object is kept in the module-level ``_embedding_model`` and reused by
    every later call. Loading happens under ``_embedding_lock`` and the cached value
    is re-checked after the lock is acquired, so concurrent first calls load once.

    Failures are remembered so callers on a per-segment path do not pay for a new
    ``from_pretrained`` attempt (and a new warning) on every call:

    * pyannote.audio missing: never retried, since it cannot appear at runtime.
    * any exception while loading: retried only after
      ``_EMBEDDING_RETRY_INTERVAL_S`` seconds, so a transient error can recover.

    Returns:
        The cached inference object, or ``None`` when the built-in model is
        unavailable (callers are expected to fall back to another source).
    """
    # Function-scope import so this definition does not depend on the file's import block.
    import time

    global _embedding_model, _embedding_retry_at

    # Fast paths that avoid taking the lock.
    if _embedding_model is not None:
        return _embedding_model
    if time.monotonic() < _embedding_retry_at:
        return None

    with _embedding_lock:
        # Another thread may have finished (or failed) while we waited for the lock.
        if _embedding_model is not None:
            return _embedding_model
        if time.monotonic() < _embedding_retry_at:
            return None

        if _PyannoteModel is None or _PyannoteInference is None:
            _embedding_retry_at = float("inf")
            logger.warning("pyannote.audio not installed, built-in embedding unavailable")
            return None

        try:
            model = _PyannoteModel.from_pretrained(
                "pyannote/wespeaker-voxceleb-resnet34-LM", token=os.getenv("HUGGINGFACE_TOKEN")
            )
            inference = _PyannoteInference(model, window="whole")
            if _torch is not None:
                device = _torch.device("cuda" if _torch.cuda.is_available() else "cpu")
                inference.to(device)
        except Exception as e:
            # Best-effort by design: report, start the cooldown, and let callers fall back.
            _embedding_retry_at = time.monotonic() + _EMBEDDING_RETRY_INTERVAL_S
            logger.warning(f"Could not load built-in embedding model: {e}")
            return None

        _embedding_model = inference
        logger.info("Built-in speaker embedding model loaded (wespeaker-voxceleb-resnet34-LM)")
        return _embedding_model


_vad_model = None
_vad_lock = threading.Lock()
Expand All @@ -88,11 +59,6 @@ def _get_builtin_embedding_model():
except ImportError:
_torch = None

try:
import torchaudio
except ImportError:
torchaudio = None


def _make_divisible_by(num, factor: int) -> int:
    """Round ``num`` down to a multiple of ``factor`` using floor division."""
    whole_multiples = num // factor
    return whole_multiples * factor
Expand Down Expand Up @@ -728,7 +694,7 @@ def _assign_speaker(self, pcm: bytes, start: float, end: float) -> str:
return f"SPEAKER_{self._last_speaker}"

def _get_embedding(self, wav_bytes: bytes):
model = _get_builtin_embedding_model()
model = get_builtin_embedding_model()
if model is not None:
return self._get_embedding_builtin(wav_bytes, model)
if SPEAKER_EMBEDDING_URL:
Expand All @@ -737,8 +703,7 @@ def _get_embedding(self, wav_bytes: bytes):

def _get_embedding_builtin(self, wav_bytes: bytes, model):
try:
buf = io.BytesIO(wav_bytes)
waveform, sample_rate = torchaudio.load(buf)
waveform, sample_rate = wav_bytes_to_waveform(wav_bytes)
dur = waveform.shape[1] / sample_rate
if dur < MIN_EMBEDDING_AUDIO_S:
return None
Expand Down
92 changes: 90 additions & 2 deletions backend/parakeet/transcribe.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import io
import os
import logging
import threading
import wave as _wave

import httpx
Expand All @@ -24,6 +25,67 @@
except ImportError:
nemo_asr = None

try:
import torch as _torch
except ImportError:
_torch = None

try:
from pyannote.audio import Model as _PyannoteModel, Inference as _PyannoteInference
except ImportError:
_PyannoteModel = None
_PyannoteInference = None

_embedding_model = None
_embedding_lock = threading.Lock()


def get_builtin_embedding_model():
global _embedding_model
if _embedding_model is not None:
return _embedding_model
with _embedding_lock:
if _embedding_model is not None:
return _embedding_model
try:
if _PyannoteModel is None or _PyannoteInference is None:
logger.warning("pyannote.audio not installed, built-in embedding unavailable")
return None
model = _PyannoteModel.from_pretrained(

@cubic-dev-ai cubic-dev-ai Bot Jun 21, 2026

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: Built-in model load failures are not cached, causing repeated from_pretrained attempts per segment. This can add large latency and log noise before HTTP fallback.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At backend/parakeet/transcribe.py, line 54:

<comment>Built-in model load failures are not cached, causing repeated `from_pretrained` attempts per segment. This can add large latency and log noise before HTTP fallback.</comment>

<file context>
@@ -24,6 +25,70 @@
+            if _PyannoteModel is None or _PyannoteInference is None:
+                logger.warning("pyannote.audio not installed, built-in embedding unavailable")
+                return None
+            model = _PyannoteModel.from_pretrained(
+                "pyannote/wespeaker-voxceleb-resnet34-LM", token=os.getenv("HUGGINGFACE_TOKEN")
+            )
</file context>
Fix with cubic

"pyannote/wespeaker-voxceleb-resnet34-LM", token=os.getenv("HUGGINGFACE_TOKEN")
)
inference = _PyannoteInference(model, window="whole")
if _torch is not None and _torch.cuda.is_available():
inference.to(_torch.device("cuda"))
_embedding_model = inference
logger.info("Built-in speaker embedding model loaded (wespeaker-voxceleb-resnet34-LM)")
return _embedding_model
except Exception as e:
logger.warning(f"Could not load built-in embedding model: {e}")
return None


def wav_bytes_to_waveform(wav_bytes: bytes):
    """Decode an in-memory PCM WAV file into a mono float32 torch tensor.

    Args:
        wav_bytes: Complete WAV file contents (header followed by PCM frames).

    Returns:
        A ``(waveform, sample_rate)`` tuple. ``waveform`` is a float32 tensor of
        shape ``(1, num_frames)`` with samples divided by the full-scale value of
        their integer width; ``sample_rate`` is the frame rate from the WAV header.
        Multi-channel audio is down-mixed by averaging the channels of each frame.

    Raises:
        RuntimeError: If torch is not installed.
        ValueError: If the sample width is not 1, 2, 3 or 4 bytes.

    Errors raised by the ``wave`` module for malformed input are not caught here.
    """
    if _torch is None:
        raise RuntimeError("torch is required to convert WAV bytes to a waveform tensor")

    with _wave.open(io.BytesIO(wav_bytes), "rb") as wf:
        sr = wf.getframerate()
        nch = wf.getnchannels()
        sw = wf.getsampwidth()
        pcm = wf.readframes(wf.getnframes())

    if sw not in (1, 2, 3, 4):
        raise ValueError(f"Unsupported WAV sample width: {sw} bytes")

    # A truncated file can end in the middle of a frame; drop the partial frame
    # so neither the per-width decoding nor the channel reshape can fail on it.
    frame_bytes = sw * max(nch, 1)
    pcm = pcm[: len(pcm) - len(pcm) % frame_bytes]

    if sw == 1:
        # 8-bit WAV is unsigned with its midpoint at 128.
        samples = np.frombuffer(pcm, dtype=np.uint8).astype(np.float32) / 128.0 - 1.0
    elif sw == 2:
        samples = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0
    elif sw == 3:
        # numpy has no 24-bit integer type: assemble each little-endian byte
        # triplet into an int32, then sign-extend from bit 23.
        triplets = np.frombuffer(pcm, dtype=np.uint8).reshape(-1, 3).astype(np.int32)
        values = triplets[:, 0] | (triplets[:, 1] << 8) | (triplets[:, 2] << 16)
        values = np.where(values >= (1 << 23), values - (1 << 24), values)
        samples = values.astype(np.float32) / 8388608.0
    else:
        samples = np.frombuffer(pcm, dtype=np.int32).astype(np.float32) / 2147483648.0

    if nch > 1:
        # Samples are interleaved per frame; average across channels for mono.
        samples = samples.reshape(-1, nch).mean(axis=1)

    waveform = _torch.from_numpy(samples).unsqueeze(0)
    return waveform, sr


def set_gpu_worker(worker) -> None:
global _gpu_worker
Expand Down Expand Up @@ -197,7 +259,7 @@ def _transcribe_nim(file_path: str):


def _diarize_segments(file_path: str, base: dict) -> dict:
if not SPEAKER_EMBEDDING_URL:
if not SPEAKER_EMBEDDING_URL and get_builtin_embedding_model() is None:
for seg in base["segments"]:
seg["speaker"] = "SPEAKER_0"
return base
Expand Down Expand Up @@ -270,7 +332,33 @@ def _extract_segment_wav(wav_bytes: bytes, start: float, end: float) -> bytes:


def _get_embedding(wav_bytes: bytes):
    """Return a speaker embedding for ``wav_bytes``, or ``None`` if no backend yields one.

    The built-in model is tried first when it is available. The HTTP backend is
    used only if the built-in path produced nothing and ``SPEAKER_EMBEDDING_URL``
    is set.
    """
    builtin = get_builtin_embedding_model()
    local_emb = _get_embedding_builtin(wav_bytes, builtin) if builtin is not None else None
    if local_emb is not None:
        return local_emb
    return _get_embedding_http(wav_bytes) if SPEAKER_EMBEDDING_URL else None


def _get_embedding_builtin(wav_bytes: bytes, model):
    """Compute an embedding for ``wav_bytes`` with the built-in ``model``.

    Args:
        wav_bytes: WAV data, decoded with ``wav_bytes_to_waveform``.
        model: Callable invoked with a dict holding ``"waveform"`` and
            ``"sample_rate"``.

    Returns:
        A float32 numpy array (a 1-D result is reshaped to ``(1, -1)``), or
        ``None`` when the audio is shorter than ``MIN_SEGMENT_DURATION`` or any
        exception occurs (the exception is logged as a warning).
    """
    try:
        audio, rate = wav_bytes_to_waveform(wav_bytes)
        if audio.shape[1] / rate < MIN_SEGMENT_DURATION:
            return None
        vector = np.array(model({"waveform": audio, "sample_rate": rate}), dtype=np.float32)
        return vector.reshape(1, -1) if vector.ndim == 1 else vector
    except Exception as e:
        logger.warning(f"Built-in embedding failed: {e}")
        return None


def _get_embedding_http(wav_bytes: bytes):
try:
with httpx.Client(timeout=httpx.Timeout(connect=5.0, read=30.0, write=10.0, pool=5.0)) as client:
resp = client.post(
Expand All @@ -289,5 +377,5 @@ def _get_embedding(wav_bytes: bytes):
emb = emb.reshape(1, -1)
return emb
except Exception as e:
logger.warning(f"Embedding extraction failed: {e}")
logger.warning(f"HTTP embedding failed: {e}")
return None
1 change: 1 addition & 0 deletions backend/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ pytest tests/unit/test_parakeet_stream_session.py -v
pytest tests/unit/test_parakeet_gpu_worker.py -v
pytest tests/unit/test_parakeet_batch_engine.py -v
pytest tests/unit/test_parakeet_batch_routing.py -v
pytest tests/unit/test_parakeet_builtin_embedding.py -v
pytest tests/unit/test_parakeet_endpoints.py -v
pytest tests/unit/test_audiobuffer_guard.py -v
pytest tests/unit/test_memory_leak_buffers.py -v
Expand Down
Loading
Loading