From 544842b15a6c53211b7a8e73a4eede312d3155cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maple=EF=BC=81?= <mapleee723@gmail.com>
Date: Mon, 4 May 2026 10:26:51 +0800
Subject: [PATCH] fix: auto-fallback to visual description for videos without
 audio stream

Videos from X/Twitter sometimes have no audio stream, causing ffmpeg to
fail with "Output file does not contain any stream" during audio
extraction. Detect audio presence in the video metadata and
automatically switch to visual description mode when no audio is found.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/bilingualsub/api/pipeline.py    | 10 ++++-
 src/bilingualsub/core/downloader.py | 15 +++++++
 src/bilingualsub/utils/ffmpeg.py    |  5 ++-
 tests/unit/api/test_pipeline.py     | 48 +++++++++++++++++++-
 tests/unit/utils/test_ffmpeg.py     | 68 +++++++++++++++++++++++++++++
 5 files changed, 143 insertions(+), 3 deletions(-)

diff --git a/src/bilingualsub/api/pipeline.py b/src/bilingualsub/api/pipeline.py
index e833c37..10b5cba 100644
--- a/src/bilingualsub/api/pipeline.py
+++ b/src/bilingualsub/api/pipeline.py
@@ -223,6 +223,7 @@ async def _acquire_video(
             width=int(meta_dict["width"]),
             height=int(meta_dict["height"]),
             fps=float(meta_dict["fps"]),
+            has_audio=bool(meta_dict.get("has_audio", True)),
         )
         log.info("step_done", step="upload", source=str(video_path))
         return video_path, metadata
@@ -288,7 +289,14 @@ async def run_download(job: Job) -> None:
     try:
         video_path, metadata = await _acquire_video(job, work_dir, log)
         if job.processing_mode != ProcessingMode.VISUAL_DESCRIPTION:
-            await _extract_audio_step(job, video_path, work_dir, log)
+            if not metadata.has_audio:
+                log.info(
+                    "no_audio_stream_detected",
+                    msg="Auto-switching to visual description mode",
+                )
+                job.processing_mode = ProcessingMode.VISUAL_DESCRIPTION
+            else:
+                await _extract_audio_step(job, video_path, work_dir, log)
 
         # Save metadata for subtitle phase
         job.video_width = metadata.width
diff --git a/src/bilingualsub/core/downloader.py b/src/bilingualsub/core/downloader.py
index 28ea255..b81ce3b 100644
--- a/src/bilingualsub/core/downloader.py
+++ b/src/bilingualsub/core/downloader.py
@@ -29,6 +29,7 @@ class VideoMetadata:
     height: int
     fps: float
     description: str = ""
+    has_audio: bool = True
 
     def __post_init__(self) -> None:
         """Validate metadata constraints."""
@@ -279,6 +280,16 @@ def _extract_metadata_from_info_dict(
     if fps is None or fps <= 0:
         fps = 30.0
 
+    # Detect audio: check acodec field and requested_formats
+    acodec = info_dict.get("acodec", "none")
+    has_audio = acodec not in ("none", None)
+    if not has_audio:
+        # Also check requested_formats for separate audio streams
+        requested_formats = info_dict.get("requested_formats") or []
+        has_audio = any(
+            fmt.get("acodec", "none") not in ("none", None) for fmt in requested_formats
+        )
+
     return VideoMetadata(
         title=title,
         duration=float(duration),
@@ -286,6 +297,7 @@ def _extract_metadata_from_info_dict(
         height=int(height),
         fps=float(fps),
         description=_sanitize_description(info_dict.get("description", "")),
+        has_audio=has_audio,
     )
 
 
@@ -320,6 +332,8 @@ def _extract_metadata_with_ffprobe(video_path: Path) -> VideoMetadata:
     if not video_stream:
         raise DownloadError("No video stream found in file")
 
+    has_audio = any(s.get("codec_type") == "audio" for s in data.get("streams", []))
+
     # Extract metadata
     try:
         title = data.get("format", {}).get("tags", {}).get("title", video_path.stem)
@@ -341,4 +355,5 @@ def _extract_metadata_with_ffprobe(video_path: Path) -> VideoMetadata:
         width=width,
         height=height,
         fps=fps,
+        has_audio=has_audio,
     )
diff --git a/src/bilingualsub/utils/ffmpeg.py b/src/bilingualsub/utils/ffmpeg.py
index 71e1e0b..74e4b27 100644
--- a/src/bilingualsub/utils/ffmpeg.py
+++ b/src/bilingualsub/utils/ffmpeg.py
@@ -260,7 +260,7 @@ def extract_video_metadata(video_path: Path) -> dict[str, str | float | int]:
         video_path: Path to the video file
 
     Returns:
-        Dict with keys: title, duration, width, height, fps
+        Dict with keys: title, duration, width, height, fps, has_audio
 
     Raises:
         FFmpegError: If ffprobe fails or no video stream found
@@ -298,6 +298,8 @@ def extract_video_metadata(video_path: Path) -> dict[str, str | float | int]:
     if not video_stream:
         raise FFmpegError(f"No video stream found in {video_path}")
 
+    has_audio = any(s.get("codec_type") == "audio" for s in data.get("streams", []))
+
     try:
         title = data.get("format", {}).get("tags", {}).get("title", video_path.stem)
         duration = float(data.get("format", {}).get("duration", 0))
@@ -317,6 +319,7 @@ def extract_video_metadata(video_path: Path) -> dict[str, str | float | int]:
         "width": width,
         "height": height,
         "fps": fps,
+        "has_audio": has_audio,
     }
 
 
diff --git a/tests/unit/api/test_pipeline.py b/tests/unit/api/test_pipeline.py
index f87f099..ab620d0 100644
--- a/tests/unit/api/test_pipeline.py
+++ b/tests/unit/api/test_pipeline.py
@@ -6,7 +6,7 @@
 
 import pytest
 
-from bilingualsub.api.constants import FileType, JobStatus, SSEEvent
+from bilingualsub.api.constants import FileType, JobStatus, ProcessingMode, SSEEvent
 from bilingualsub.api.jobs import Job
 from bilingualsub.api.pipeline import run_download, run_subtitle
 from bilingualsub.core.downloader import DownloadError, VideoMetadata
@@ -216,6 +216,52 @@ async def test_run_download_extract_audio_failure_sends_error(
         assert "ffmpeg segfault" in error_events[0]["data"]["detail"]
         assert job.status == JobStatus.FAILED
 
+    @patch("bilingualsub.api.pipeline.download_video")
+    async def test_run_download_no_audio_switches_to_visual_description(
+        self, mock_download
+    ) -> None:
+        """When video has no audio stream, auto-switch to visual description mode."""
+        metadata = VideoMetadata(
+            title="Silent Video",
+            duration=60.0,
+            width=1920,
+            height=1080,
+            fps=30.0,
+            has_audio=False,
+        )
+        mock_download.return_value = metadata
+
+        job = _make_job()
+        assert job.processing_mode == ProcessingMode.SUBTITLE
+
+        await run_download(job)
+
+        assert job.processing_mode == ProcessingMode.VISUAL_DESCRIPTION
+        assert job.status == JobStatus.DOWNLOAD_COMPLETE
+
+    @patch("bilingualsub.api.pipeline.extract_audio")
+    @patch("bilingualsub.api.pipeline.download_video")
+    async def test_run_download_with_audio_keeps_subtitle_mode(
+        self, mock_download, mock_extract_audio
+    ) -> None:
+        """When video has audio stream, processing mode stays as SUBTITLE."""
+        metadata = VideoMetadata(
+            title="Normal Video",
+            duration=60.0,
+            width=1920,
+            height=1080,
+            fps=30.0,
+            has_audio=True,
+        )
+        mock_download.return_value = metadata
+
+        job = _make_job()
+        await run_download(job)
+
+        assert job.processing_mode == ProcessingMode.SUBTITLE
+        assert job.status == JobStatus.DOWNLOAD_COMPLETE
+        mock_extract_audio.assert_called_once()
+
 
 @pytest.mark.unit
 @pytest.mark.asyncio
diff --git a/tests/unit/utils/test_ffmpeg.py b/tests/unit/utils/test_ffmpeg.py
index e341d20..faa0e3e 100644
--- a/tests/unit/utils/test_ffmpeg.py
+++ b/tests/unit/utils/test_ffmpeg.py
@@ -9,6 +9,7 @@
     FFmpegError,
     burn_subtitles,
     extract_audio,
+    extract_video_metadata,
     get_audio_duration,
     split_audio,
     trim_video,
@@ -48,6 +49,7 @@ def mock_ffmpeg(self):
                 "height": 1080,
                 "fps": 30.0,
                 "title": "test video",
+                "has_audio": True,
             }
 
             yield {
@@ -804,3 +806,69 @@ def test_non_existent_file_raises_error(self, tmp_path):
 
         with pytest.raises(ValueError, match="Audio file does not exist"):
             split_audio(audio_path, output_dir=tmp_path)
+
+
+def _ffprobe_json(
+    streams: list[dict], duration: float = 120.0, title: str = "test"
+) -> str:
+    """Build a minimal ffprobe JSON output."""
+    return json.dumps(
+        {
+            "streams": streams,
+            "format": {"duration": str(duration), "tags": {"title": title}},
+        }
+    )
+
+
+_VIDEO_STREAM = {
+    "codec_type": "video",
+    "width": 1920,
+    "height": 1080,
+    "r_frame_rate": "30/1",
+}
+_AUDIO_STREAM = {"codec_type": "audio", "codec_name": "aac"}
+
+
+@pytest.mark.unit
+class TestExtractVideoMetadata:
+    """Test cases for extract_video_metadata has_audio detection."""
+
+    @patch("bilingualsub.utils.ffmpeg.subprocess.run")
+    def test_has_audio_true_when_audio_stream_present(self, mock_run, tmp_path):
+        """Given video with audio+video streams, has_audio is True."""
+        mock_run.return_value = MagicMock(
+            stdout=_ffprobe_json([_VIDEO_STREAM, _AUDIO_STREAM]),
+        )
+
+        result = extract_video_metadata(tmp_path / "video.mp4")
+
+        assert result["has_audio"] is True
+
+    @patch("bilingualsub.utils.ffmpeg.subprocess.run")
+    def test_has_audio_false_when_no_audio_stream(self, mock_run, tmp_path):
+        """Given video with only video stream, has_audio is False."""
+        mock_run.return_value = MagicMock(
+            stdout=_ffprobe_json([_VIDEO_STREAM]),
+        )
+
+        result = extract_video_metadata(tmp_path / "video.mp4")
+
+        assert result["has_audio"] is False
+
+    @patch("bilingualsub.utils.ffmpeg.subprocess.run")
+    def test_returns_standard_metadata_fields(self, mock_run, tmp_path):
+        """Given a normal video, all standard metadata fields are returned."""
+        mock_run.return_value = MagicMock(
+            stdout=_ffprobe_json(
+                [_VIDEO_STREAM, _AUDIO_STREAM], duration=60.0, title="My Video"
+            ),
+        )
+
+        result = extract_video_metadata(tmp_path / "video.mp4")
+
+        assert result["title"] == "My Video"
+        assert result["duration"] == 60.0
+        assert result["width"] == 1920
+        assert result["height"] == 1080
+        assert result["fps"] == 30.0
+        assert "has_audio" in result