From 544842b15a6c53211b7a8e73a4eede312d3155cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maple=EF=BC=81?= Date: Mon, 4 May 2026 10:26:51 +0800 Subject: [PATCH] fix: auto-fallback to visual description for videos without audio stream Videos from X/Twitter sometimes have no audio stream, causing ffmpeg to crash with "Output file does not contain any stream" during audio extraction. Detect audio presence in metadata and auto-switch to visual description mode when no audio is found. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/bilingualsub/api/pipeline.py | 10 ++++- src/bilingualsub/core/downloader.py | 15 +++++++ src/bilingualsub/utils/ffmpeg.py | 5 ++- tests/unit/api/test_pipeline.py | 48 +++++++++++++++++++- tests/unit/utils/test_ffmpeg.py | 68 +++++++++++++++++++++++++++++ 5 files changed, 143 insertions(+), 3 deletions(-) diff --git a/src/bilingualsub/api/pipeline.py b/src/bilingualsub/api/pipeline.py index e833c37..10b5cba 100644 --- a/src/bilingualsub/api/pipeline.py +++ b/src/bilingualsub/api/pipeline.py @@ -223,6 +223,7 @@ async def _acquire_video( width=int(meta_dict["width"]), height=int(meta_dict["height"]), fps=float(meta_dict["fps"]), + has_audio=bool(meta_dict.get("has_audio", True)), ) log.info("step_done", step="upload", source=str(video_path)) return video_path, metadata @@ -288,7 +289,14 @@ async def run_download(job: Job) -> None: try: video_path, metadata = await _acquire_video(job, work_dir, log) if job.processing_mode != ProcessingMode.VISUAL_DESCRIPTION: - await _extract_audio_step(job, video_path, work_dir, log) + if not metadata.has_audio: + log.info( + "no_audio_stream_detected", + msg="Auto-switching to visual description mode", + ) + job.processing_mode = ProcessingMode.VISUAL_DESCRIPTION + else: + await _extract_audio_step(job, video_path, work_dir, log) # Save metadata for subtitle phase job.video_width = metadata.width diff --git a/src/bilingualsub/core/downloader.py b/src/bilingualsub/core/downloader.py index 28ea255..b81ce3b 100644 --- a/src/bilingualsub/core/downloader.py +++ b/src/bilingualsub/core/downloader.py @@ -29,6 +29,7 @@ class VideoMetadata: height: int fps: float description: str = "" + has_audio: bool = True def __post_init__(self) -> None: """Validate metadata constraints.""" @@ -279,6 +280,16 @@ def _extract_metadata_from_info_dict( if fps is None or fps <= 0: fps = 30.0 + # Detect audio: check acodec field and requested_formats + acodec = info_dict.get("acodec", "none") + has_audio = acodec not in ("none", None) + if not has_audio: + # Also check requested_formats for separate audio streams + requested_formats = info_dict.get("requested_formats") or [] + has_audio = any( + fmt.get("acodec", "none") not in ("none", None) for fmt in requested_formats + ) + return VideoMetadata( title=title, duration=float(duration), @@ -286,6 +297,7 @@ def _extract_metadata_from_info_dict( height=int(height), fps=float(fps), description=_sanitize_description(info_dict.get("description", "")), + has_audio=has_audio, ) @@ -320,6 +332,8 @@ def _extract_metadata_with_ffprobe(video_path: Path) -> VideoMetadata: if not video_stream: raise DownloadError("No video stream found in file") + has_audio = any(s.get("codec_type") == "audio" for s in data.get("streams", [])) + # Extract metadata try: title = data.get("format", {}).get("tags", {}).get("title", video_path.stem) @@ -341,4 +355,5 @@ def _extract_metadata_with_ffprobe(video_path: Path) -> VideoMetadata: width=width, height=height, fps=fps, + has_audio=has_audio, ) diff --git a/src/bilingualsub/utils/ffmpeg.py b/src/bilingualsub/utils/ffmpeg.py index 71e1e0b..74e4b27 100644 --- a/src/bilingualsub/utils/ffmpeg.py +++ b/src/bilingualsub/utils/ffmpeg.py @@ -260,7 +260,7 @@ def extract_video_metadata(video_path: Path) -> dict[str, str | float | int]: video_path: Path to the video file Returns: - Dict with keys: title, duration, width, height, fps + Dict with keys: title, duration, width, height, fps, has_audio Raises: FFmpegError: If ffprobe fails or no video stream found @@ -298,6 +298,8 @@ def extract_video_metadata(video_path: Path) -> dict[str, str | float | int]: if not video_stream: raise FFmpegError(f"No video stream found in {video_path}") + has_audio = any(s.get("codec_type") == "audio" for s in data.get("streams", [])) + try: title = data.get("format", {}).get("tags", {}).get("title", video_path.stem) duration = float(data.get("format", {}).get("duration", 0)) @@ -317,6 +319,7 @@ def extract_video_metadata(video_path: Path) -> dict[str, str | float | int]: "width": width, "height": height, "fps": fps, + "has_audio": has_audio, } diff --git a/tests/unit/api/test_pipeline.py b/tests/unit/api/test_pipeline.py index f87f099..ab620d0 100644 --- a/tests/unit/api/test_pipeline.py +++ b/tests/unit/api/test_pipeline.py @@ -6,7 +6,7 @@ import pytest -from bilingualsub.api.constants import FileType, JobStatus, SSEEvent +from bilingualsub.api.constants import FileType, JobStatus, ProcessingMode, SSEEvent from bilingualsub.api.jobs import Job from bilingualsub.api.pipeline import run_download, run_subtitle from bilingualsub.core.downloader import DownloadError, VideoMetadata @@ -216,6 +216,52 @@ async def test_run_download_extract_audio_failure_sends_error( assert "ffmpeg segfault" in error_events[0]["data"]["detail"] assert job.status == JobStatus.FAILED + @patch("bilingualsub.api.pipeline.download_video") + async def test_run_download_no_audio_switches_to_visual_description( + self, mock_download + ) -> None: + """When video has no audio stream, auto-switch to visual description mode.""" + metadata = VideoMetadata( + title="Silent Video", + duration=60.0, + width=1920, + height=1080, + fps=30.0, + has_audio=False, + ) + mock_download.return_value = metadata + + job = _make_job() + assert job.processing_mode == ProcessingMode.SUBTITLE + + await run_download(job) + + assert job.processing_mode == ProcessingMode.VISUAL_DESCRIPTION + assert job.status == JobStatus.DOWNLOAD_COMPLETE + + @patch("bilingualsub.api.pipeline.extract_audio") + @patch("bilingualsub.api.pipeline.download_video") + async def test_run_download_with_audio_keeps_subtitle_mode( + self, mock_download, mock_extract_audio + ) -> None: + """When video has audio stream, processing mode stays as SUBTITLE.""" + metadata = VideoMetadata( + title="Normal Video", + duration=60.0, + width=1920, + height=1080, + fps=30.0, + has_audio=True, + ) + mock_download.return_value = metadata + + job = _make_job() + await run_download(job) + + assert job.processing_mode == ProcessingMode.SUBTITLE + assert job.status == JobStatus.DOWNLOAD_COMPLETE + mock_extract_audio.assert_called_once() + @pytest.mark.unit @pytest.mark.asyncio diff --git a/tests/unit/utils/test_ffmpeg.py b/tests/unit/utils/test_ffmpeg.py index e341d20..faa0e3e 100644 --- a/tests/unit/utils/test_ffmpeg.py +++ b/tests/unit/utils/test_ffmpeg.py @@ -9,6 +9,7 @@ FFmpegError, burn_subtitles, extract_audio, + extract_video_metadata, get_audio_duration, split_audio, trim_video, @@ -48,6 +49,7 @@ def mock_ffmpeg(self): "height": 1080, "fps": 30.0, "title": "test video", + "has_audio": True, } yield { @@ -804,3 +806,69 @@ def test_non_existent_file_raises_error(self, tmp_path): with pytest.raises(ValueError, match="Audio file does not exist"): split_audio(audio_path, output_dir=tmp_path) + + +def _ffprobe_json( + streams: list[dict], duration: float = 120.0, title: str = "test" +) -> str: + """Build a minimal ffprobe JSON output.""" + return json.dumps( + { + "streams": streams, + "format": {"duration": str(duration), "tags": {"title": title}}, + } + ) + + +_VIDEO_STREAM = { + "codec_type": "video", + "width": 1920, + "height": 1080, + "r_frame_rate": "30/1", +} +_AUDIO_STREAM = {"codec_type": "audio", "codec_name": "aac"} + + +@pytest.mark.unit +class TestExtractVideoMetadata: + """Test cases for extract_video_metadata has_audio detection.""" + + @patch("bilingualsub.utils.ffmpeg.subprocess.run") + def test_has_audio_true_when_audio_stream_present(self, mock_run, tmp_path): + """Given video with audio+video streams, has_audio is True.""" + mock_run.return_value = MagicMock( + stdout=_ffprobe_json([_VIDEO_STREAM, _AUDIO_STREAM]), + ) + + result = extract_video_metadata(tmp_path / "video.mp4") + + assert result["has_audio"] is True + + @patch("bilingualsub.utils.ffmpeg.subprocess.run") + def test_has_audio_false_when_no_audio_stream(self, mock_run, tmp_path): + """Given video with only video stream, has_audio is False.""" + mock_run.return_value = MagicMock( + stdout=_ffprobe_json([_VIDEO_STREAM]), + ) + + result = extract_video_metadata(tmp_path / "video.mp4") + + assert result["has_audio"] is False + + @patch("bilingualsub.utils.ffmpeg.subprocess.run") + def test_returns_standard_metadata_fields(self, mock_run, tmp_path): + """Given a normal video, all standard metadata fields are returned.""" + mock_run.return_value = MagicMock( + stdout=_ffprobe_json( + [_VIDEO_STREAM, _AUDIO_STREAM], duration=60.0, title="My Video" + ), + ) + + result = extract_video_metadata(tmp_path / "video.mp4") + + assert result["title"] == "My Video" + assert result["duration"] == 60.0 + assert result["width"] == 1920 + assert result["height"] == 1080 + assert result["fps"] == 30.0 + assert "has_audio" in result