Mapleeeeeeeeeee · Mapleeeeeeeeeee · Apr 28, 2026 · Apr 24, 2026 · Apr 27, 2026 · Apr 27, 2026
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -42,7 +42,7 @@ repos:
   # Mypy - Static type checking
   # ===========================================
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.10.0
+    rev: v1.19.1
     hooks:
       - id: mypy
         additional_dependencies:
@@ -53,6 +53,7 @@ repos:
           - fastapi>=0.115.0
           - structlog>=24.0.0
           - sse-starlette>=2.0.0
+          - google-genai>=1.0.0
         args: [--config-file=pyproject.toml]
         pass_filenames: false
         entry: bash -c 'mypy src/'

diff --git a/docs/arch/visual-description-mode.md b/docs/arch/visual-description-mode.md
diff --git a/docs/design/visual-description-mode.md b/docs/design/visual-description-mode.md
@@ -0,0 +1,127 @@
+# 視覺描述模式（Visual Description Mode）
+
+## 背景與問題
+
+現有系統僅支援語音轉字幕（Whisper ASR → LLM 翻譯）。遇到無語音的影片（品牌形象影片、純音樂 MV、產品展示動畫等）時，Whisper 會回傳空的 segments，流程隨即拋出 `TranscriptionError`，使用者只會看到一個模糊的「轉錄失敗」錯誤訊息。
+
+這類影片的畫面上往往有大量有價值的視覺資訊——標題文字、產品名稱、UI 介面文字、場景說明——但系統完全無法處理，使用者只能放棄。
+
+不做的後果：整個工具的適用範圍被限縮在「有人說話的影片」，大量品牌內容、教學動畫、產品 Demo 無法使用。
+
+## 使用者角色
+
+**一般觀眾**：想理解外語品牌影片或產品介紹的內容，貼入 URL 後期望系統能產出翻譯後的說明字幕。
+
+## 需求情境
+
+- 一般觀眾：When 我看到一支外語品牌形象影片，畫面上有文字但沒有旁白，I want to 讓系統分析畫面內容並翻譯成我的語言，so I can 理解影片在傳達什麼。
+
+## 設計意圖
+
+- **手動切換而非自動偵測** → 自動偵測需要先跑 Whisper 才能判斷有無語音，浪費時間且判斷邊界模糊（幾句話算「有語音」？）。手動切換讓使用者掌控意圖，流程更直覺。
+- **只產出翻譯後的單語字幕** → 視覺描述的「原文」是畫面內容而非語言文字，雙語對照在此場景沒有意義。
+- **第一版不做混合模式** → 混合模式需要時間軸對齊和內容類型判斷，複雜度高。先做純模式，驗證價值後再擴展。
+- **使用 Gemini 3.1 Flash Lite Preview** → 目前唯一支援原生影片輸入的主流模型，可直接吃整段影片（最長 90 分鐘），同時處理視覺和音訊，不需自行抽 frame。成本低、速度快，適合生產環境。
+
+## User Journey
+
+### Journey 1：觀眾 — 取得品牌影片的視覺描述字幕
+
+前置條件：使用者已開啟 BilingualSub 網頁
+
+1. 使用者看到 URL 輸入框上方的 Toggle，預設為「語音字幕」模式
+2. 使用者將 Toggle 切換到「視覺描述」模式
+   → 頁面提示文字變更，說明此模式會分析畫面內容而非語音
+3. 使用者貼入影片 URL，選擇目標語言，點擊「開始處理」
+   → 系統開始下載影片
+4. 下載完成後，系統將影片送入 Gemini 3.1 Flash Lite Preview 分析
+   → 進度條顯示「分析畫面內容中...」
+5. Gemini 回傳帶時間戳的畫面描述（英文或原始語言）
+   → 系統將描述翻譯成目標語言
+6. 翻譯完成，使用者看到字幕預覽
+   → 字幕以時間軸格式顯示，每條字幕對應一個畫面片段
+7. 使用者可選擇「下載字幕檔」（SRT）或「燒錄進影片」
+   → 與現有語音字幕流程一致的輸出選項
+
+### Journey 2：觀眾 — 切換回語音字幕模式
+
+前置條件：使用者目前在「視覺描述」模式
+
+1. 使用者將 Toggle 切回「語音字幕」
+   → 回到原有的語音字幕流程，所有現有功能不受影響
+
+## 替代流程
+
+- **影片過長（超過 90 分鐘）**：系統提示「影片過長，視覺描述模式最長支援 90 分鐘」，建議使用者裁剪影片或使用時間範圍功能
+- **Gemini 回傳內容極少**：影片畫面資訊不足（如純黑畫面、靜態圖片），系統仍產出結果但字幕數量可能很少，不額外提示
+
+## 錯誤情境
+
+### 系統錯誤
+
+- Gemini API 呼叫失敗（網路、quota、API key 無效）：顯示明確錯誤訊息「視覺分析服務暫時無法使用，請稍後再試」
+- 影片下載失敗：與現有語音模式共用相同的下載錯誤處理
+
+### 使用者誤操作
+
+- 對有大量語音的影片使用視覺描述模式：系統正常執行，只是產出的字幕是畫面描述而非語音轉錄。不阻擋，因為使用者可能確實想要畫面描述
+- 未設定 Gemini API key 就使用視覺描述模式：於使用者送出視覺描述請求、工作啟動時檢查，並提示「請設定 GEMINI_API_KEY 環境變數」
+
+### 惡意行為
+
+- 不適用（無額外攻擊面，影片下載的安全性由現有 yt-dlp 處理）
+
+## Out of Scope
+
+- 語音字幕 + 視覺描述混合模式
+- 自動偵測影片有無語音並切換模式
+- 雙語對照輸出（原文描述 + 翻譯）
+- 自訂 Gemini prompt / 描述風格
+- 支援 Gemini 以外的視覺模型
+
+## 整合點
+
+- **Gemini API**：新增 `GEMINI_API_KEY` 環境變數，透過 Google Gen AI SDK（`google-genai` 套件）呼叫 Gemini 3.1 Flash Lite Preview
+- **現有 Pipeline**：視覺描述模式複用現有的 download → translate → merge → burn 步驟，僅將 transcribe 步驟替換為 Gemini 視覺分析
+- **前端狀態**：`useJob` hook 需支援新的模式參數，Toggle 狀態影響 API 請求的 payload
+- **翻譯模組**：視覺描述的翻譯複用現有的 translator，輸入格式與語音轉錄的字幕條目相同
+
+## Acceptance Criteria
+
+- Given 使用者在首頁
+  When 頁面載入
+  Then 看到 Toggle 預設為「語音字幕」模式
+
+- Given 使用者切換到「視覺描述」模式
+  When 貼入影片 URL 並點擊開始
+  Then 系統使用 Gemini 分析畫面內容，而非 Whisper 語音辨識
+
+- Given 視覺分析完成
+  When 使用者查看結果
+  Then 看到帶時間戳的翻譯後字幕，內容描述畫面中的文字和視覺元素
+
+- Given 視覺描述字幕產出完成
+  When 使用者選擇「燒錄進影片」
+  Then 字幕被燒錄進影片，與語音字幕的燒錄效果一致
+
+- Given 視覺描述字幕產出完成
+  When 使用者選擇「下載字幕檔」
+  Then 下載到 SRT 格式的字幕檔
+
+- Given 使用者切換回「語音字幕」模式
+  When 使用者依原有流程操作（貼入影片 URL、選擇語言並開始處理）
+  Then 所有現有功能不受影響，行為與切換前完全一致
+
+- Given 未設定 GEMINI_API_KEY
+  When 使用者嘗試使用視覺描述模式
+  Then 顯示明確提示要求設定 API key
+
+- Given 影片超過 90 分鐘
+  When 使用者以視覺描述模式處理
+  Then 顯示影片過長的提示訊息
+
+## 開放問題
+
+- Gemini 回傳的時間戳精度是否足夠產出流暢的字幕體驗？需實際測試驗證
+- 視覺描述的翻譯品質是否需要針對描述性文本調整 prompt？與語音轉錄的翻譯 prompt 可能有差異
+- 是否需要讓使用者指定「原始語言」？Gemini 可能需要知道畫面上文字的語言才能更準確辨識
diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx
@@ -120,7 +120,9 @@ function App() {
                 </div>
               </div>
               <button
-                onClick={() => subtitleJob(sourceLang, targetLang)}
+                onClick={() =>
+                  subtitleJob(sourceLang, targetLang, state.processingMode ?? undefined)
+                }
                 className="px-8 py-3 bg-black text-white rounded-full hover:scale-105 transition-transform"
               >
                 {t('app.generate_subtitles')}
@@ -133,13 +135,15 @@ function App() {
                 >
                   {t('app.download_original_video')}
                 </a>
-                <a
-                  href={apiClient.getDownloadUrl(state.jobId!, FileType.AUDIO)}
-                  download
-                  className="text-sm text-gray-400 hover:text-black transition-colors"
-                >
-                  {t('app.download_audio')}
-                </a>
+                {state.processingMode !== 'visual_description' && (
+                  <a
+                    href={apiClient.getDownloadUrl(state.jobId!, FileType.AUDIO)}
+                    download
+                    className="text-sm text-gray-400 hover:text-black transition-colors"
+                  >
+                    {t('app.download_audio')}
+                  </a>
+                )}
                 <button
                   onClick={reset}
                   className="text-sm text-gray-400 hover:text-black transition-colors"
@@ -272,7 +276,11 @@ function App() {
               <div className="lg:col-span-1 space-y-8">
                 <div>
                   <h3 className="text-3xl font-serif mb-6">{t('app.downloads_title')}</h3>
-                  <DownloadLinks jobId={state.jobId!} showVideo={true} />
+                  <DownloadLinks
+                    jobId={state.jobId!}
+                    showVideo={true}
+                    processingMode={state.processingMode}
+                  />
                 </div>
                 <button
                   onClick={backToEdit}

diff --git a/frontend/src/api/client.ts b/frontend/src/api/client.ts
@@ -38,6 +38,7 @@ class ApiClient {
     if (request.target_lang) formData.append('target_lang', request.target_lang);
     if (request.start_time !== undefined) formData.append('start_time', String(request.start_time));
     if (request.end_time !== undefined) formData.append('end_time', String(request.end_time));
+    if (request.processing_mode) formData.append('processing_mode', request.processing_mode);
 
     const response = await fetch(`${this.baseUrl}/api/jobs/upload`, {
       method: 'POST',
@@ -105,11 +106,13 @@ class ApiClient {
   async startSubtitle(
     jobId: string,
     sourceLang?: string,
-    targetLang?: string
+    targetLang?: string,
+    processingMode?: string
   ): Promise<{ status: string }> {
-    const payload: { source_lang?: string; target_lang?: string } = {};
+    const payload: { source_lang?: string; target_lang?: string; processing_mode?: string } = {};
     if (sourceLang) payload.source_lang = sourceLang;
     if (targetLang) payload.target_lang = targetLang;
+    if (processingMode) payload.processing_mode = processingMode;
 
     const response = await fetch(`${this.baseUrl}/api/jobs/${jobId}/subtitle`, {
       method: 'POST',

diff --git a/frontend/src/components/DownloadLinks.tsx b/frontend/src/components/DownloadLinks.tsx
@@ -4,10 +4,12 @@ import { FileType } from '../constants';
 import { apiClient } from '../api/client';
 import { DisclaimerDialog } from './DisclaimerDialog';
 import { triggerDownload } from '../utils/download';
+import type { ProcessingMode } from '../types';
 
 interface DownloadLinksProps {
   jobId: string;
   showVideo?: boolean;
+  processingMode?: ProcessingMode | null;
 }
 
 const FILE_OPTIONS = [
@@ -17,12 +19,20 @@ const FILE_OPTIONS = [
   { type: FileType.AUDIO, labelKey: 'download.audio' },
 ] as const;
 
-export function DownloadLinks({ jobId, showVideo }: DownloadLinksProps) {
+export function DownloadLinks({ jobId, showVideo, processingMode }: DownloadLinksProps) {
   const { t } = useTranslation();
   const [pendingUrl, setPendingUrl] = useState<string | null>(null);
 
-  const visibleOptions =
-    showVideo === false ? FILE_OPTIONS.filter(opt => opt.type !== FileType.VIDEO) : FILE_OPTIONS;
+  let visibleOptions =
+    showVideo === false
+      ? FILE_OPTIONS.filter(opt => opt.type !== FileType.VIDEO)
+      : [...FILE_OPTIONS];
+
+  if (processingMode === 'visual_description') {
+    visibleOptions = visibleOptions.filter(
+      opt => opt.type !== FileType.ASS && opt.type !== FileType.AUDIO
+    );
+  }
 
   return (
     <>

diff --git a/frontend/src/components/ProgressTracker.tsx b/frontend/src/components/ProgressTracker.tsx
@@ -59,7 +59,9 @@ export function ProgressTracker({
           {': '}
           {subtitleSource === SubtitleSource.YOUTUBE_MANUAL
             ? t('progress.subtitleSourceYoutube')
-            : t('progress.subtitleSourceWhisper')}
+            : subtitleSource === SubtitleSource.VISUAL_DESCRIPTION
+              ? t('progress.subtitleSourceVisual')
+              : t('progress.subtitleSourceWhisper')}
         </p>
       )}
     </div>

diff --git a/frontend/src/components/UrlInput.tsx b/frontend/src/components/UrlInput.tsx
@@ -28,6 +28,9 @@ export function UrlInput({ onSubmit, disabled }: UrlInputProps) {
   const [url, setUrl] = useState('');
   const [selectedFile, setSelectedFile] = useState<File | null>(null);
   const fileInputRef = useRef<HTMLInputElement>(null);
+  const [processingMode, setProcessingMode] = useState<'subtitle' | 'visual_description'>(
+    'subtitle'
+  );
   const [rangeEnabled, setRangeEnabled] = useState(false);
   const [startTime, setStartTime] = useState<TimeParts>({
     hours: '00',
@@ -65,6 +68,7 @@ export function UrlInput({ onSubmit, disabled }: UrlInputProps) {
       }
       const request: JobUploadRequest = {
         file: selectedFile,
+        processing_mode: processingMode,
       };
       if (startSeconds !== undefined) request.start_time = startSeconds;
       if (endSeconds !== undefined) request.end_time = endSeconds;
@@ -80,6 +84,7 @@ export function UrlInput({ onSubmit, disabled }: UrlInputProps) {
 
     const request: JobCreateRequest = {
       source_url: url,
+      processing_mode: processingMode,
     };
     if (startSeconds !== undefined) request.start_time = startSeconds;
     if (endSeconds !== undefined) request.end_time = endSeconds;
@@ -153,6 +158,42 @@ export function UrlInput({ onSubmit, disabled }: UrlInputProps) {
         )}
       </div>
 
+      <div className="flex flex-col items-center gap-2 text-gray-400">
+        <div className="flex items-center gap-4">
+          <span className="text-sm text-gray-600 dark:text-gray-400">
+            {t('form.processingModeLabel')}
+          </span>
+          <button
+            type="button"
+            disabled={disabled}
+            onClick={() =>
+              setProcessingMode(prev => (prev === 'subtitle' ? 'visual_description' : 'subtitle'))
+            }
+            className={`relative inline-flex h-6 w-11 items-center rounded-full transition-colors disabled:opacity-50 disabled:cursor-not-allowed ${
+              processingMode === 'visual_description'
+                ? 'bg-blue-600'
+                : 'bg-gray-300 dark:bg-gray-600'
+            }`}
+          >
+            <span
+              className={`inline-block h-4 w-4 transform rounded-full bg-white transition-transform ${
+                processingMode === 'visual_description' ? 'translate-x-6' : 'translate-x-1'
+              }`}
+            />
+          </button>
+          <span className="text-sm text-gray-600 dark:text-gray-400">
+            {processingMode === 'visual_description'
+              ? t('form.processingModeVisual')
+              : t('form.processingModeSubtitle')}
+          </span>
+        </div>
+        {processingMode === 'visual_description' && (
+          <p className="text-xs text-blue-600 dark:text-blue-400">
+            {t('form.processingModeVisualHint')}
+          </p>
+        )}
+      </div>
+
       <div className="flex flex-col items-center gap-4 text-gray-400">
         <div className="flex items-center gap-4">
           <label className="text-xs uppercase tracking-widest font-bold">

diff --git a/frontend/src/constants.ts b/frontend/src/constants.ts
@@ -46,5 +46,6 @@ export const PIPELINE_STEPS = [
 export const SubtitleSource = {
   WHISPER: 'whisper',
   YOUTUBE_MANUAL: 'youtube_manual',
+  VISUAL_DESCRIPTION: 'visual_description',
 } as const;
 export type SubtitleSource = (typeof SubtitleSource)[keyof typeof SubtitleSource];