/**
 * Shared type definitions for ASR (automatic speech recognition) subtitle generation.
 */

/** Machine-readable failure reasons carried by an unsuccessful {@link ASRResult}. */
export type ASRErrorCode =
  | 'NO_API_KEY'
  | 'INVALID_API_KEY'
  | 'QUOTA_EXCEEDED'
  | 'NETWORK_ERROR'
  | 'AUDIO_EXTRACTION_FAILED'
  | 'SUBTITLE_EXTRACTION_FAILED'
  | 'TASK_CANCELLED'
  | 'UNKNOWN_ERROR'

/**
 * Subtitle entry produced by ASR (simplified form; converted to SubtitleItem later).
 */
export interface ASRSubtitleItem {
  /** Position of the entry in the generated list */
  index: number
  /** Start time in seconds */
  startTime: number
  /** End time in seconds */
  endTime: number
  /** Subtitle text */
  text: string
  /** Word-level timestamps (optional) */
  words?: DeepgramWord[]
}

/**
 * Options for one ASR generation request.
 */
export interface ASRGenerateOptions {
  /** Path of the video file */
  videoPath: string
  /** Video ID (used when saving the subtitle record) */
  videoId: number
  /**
   * Target language: an ISO 639-1 code such as 'en', 'zh', 'ja', or the literal
   * 'auto' for automatic language detection.
   * Declared as plain `string`: the former `string | 'auto'` union collapsed to
   * `string` anyway, so the accepted values are unchanged.
   */
  language?: string
  /** Deepgram model to use */
  model?: 'nova-2' | 'nova-3'
  /** Output subtitle file format */
  outputFormat?: 'srt' | 'vtt'
}

/**
 * Stages reported while an ASR task runs.
 */
export enum ASRProgressStage {
  /** Initialising */
  Initializing = 'initializing',
  /** Extracting the audio track */
  ExtractingAudio = 'extracting_audio',
  /** Transcribing */
  Transcribing = 'transcribing',
  /** Formatting subtitles */
  Formatting = 'formatting',
  /** Saving */
  Saving = 'saving',
  /** Finished successfully */
  Complete = 'complete',
  /** Failed */
  Failed = 'failed'
}

/**
 * Progress event payload.
 */
export interface ASRProgress {
  /** Task ID */
  taskId: string
  /** Current stage */
  stage: ASRProgressStage
  /** Progress percentage (0-100) */
  percent: number
  /** Index of the segment being processed (transcription stage) */
  current?: number
  /** Total number of segments (transcription stage) */
  total?: number
  /** Stage message */
  message?: string
  /** Estimated remaining time in seconds */
  eta?: number
}

/**
 * Outcome of an ASR generation request.
 */
export interface ASRResult {
  /** Whether generation succeeded */
  success: boolean
  /** Generated subtitle entries */
  subtitles?: ASRSubtitleItem[]
  /** Path of the written subtitle file (SRT/VTT) */
  outputPath?: string
  /** ID of the subtitle-library record */
  subtitleLibraryId?: number
  /** Error message */
  error?: string
  /** Error code */
  errorCode?: ASRErrorCode
  /** Statistics */
  stats?: {
    /** Audio duration in seconds */
    duration: number
    /** Processing time in seconds */
    processingTime: number
    /** Number of audio segments */
    segmentCount: number
    /** Number of subtitle entries */
    subtitleCount: number
  }
}

/**
 * One slice of extracted audio.
 */
export interface AudioSegment {
  /** Segment index */
  index: number
  /** Start time in seconds */
  start: number
  /** End time in seconds */
  end: number
  /** Duration in seconds */
  duration: number
  /** Path of the audio file */
  filePath: string
}

/**
 * Deepgram word-level timestamp.
 */
export interface DeepgramWord {
  /** Word text */
  word: string
  /** Start time in seconds */
  start: number
  /** End time in seconds */
  end: number
  /** Confidence (0-1) */
  confidence: number
  /** Word form including punctuation */
  punctuated_word?: string
}

/**
 * Deepgram utterance (sentence-like span).
 */
export interface DeepgramUtterance {
  /** Start time in seconds */
  start: number
  /** End time in seconds */
  end: number
  /** Utterance text */
  transcript: string
  /** Confidence (0-1) */
  confidence: number
  /** Words of the utterance */
  words: DeepgramWord[]
}

/**
 * Deepgram API response (simplified).
 */
export interface DeepgramResponse {
  /** Transcription results */
  results: {
    /** Per-channel results */
    channels: Array<{
      /** Alternative transcriptions */
      alternatives: Array<{
        /** Full transcript text */
        transcript: string
        /** Confidence */
        confidence: number
        /** Words of the transcript */
        words: DeepgramWord[]
      }>
      /**
       * Utterances (present when utterances=true).
       * NOTE(review): Deepgram's public response reference appears to place
       * `utterances` directly under `results` rather than inside each channel -
       * confirm whether the transcriber reshapes the payload before relying on this field.
       */
      utterances?: DeepgramUtterance[]
    }>
  }
  /** Response metadata */
  metadata: {
    /** Request ID */
    request_id: string
    /** Audio duration */
    duration: number
    /** Number of channels */
    channels: number
  }
}

/**
 * Transcription result for one audio segment.
 */
export interface TranscriptSegment {
  /** The audio segment that was transcribed */
  audioSegment: AudioSegment
  /** Deepgram response */
  response?: DeepgramResponse
  /** Whether transcription succeeded */
  success: boolean
  /** Error message */
  error?: string
}

/**
 * Result of validating an API key.
 */
export interface ApiKeyValidationResult {
  /** Whether the key is valid */
  valid: boolean
  /** Error message */
  error?: string
  /** Account information (optional) */
  account?: {
    /** Remaining balance / quota */
    remainingBalance?: number
  }
}
/**
 * Main ASR subtitle generation service.
 * Simplified pipeline: the complete audio track is uploaded to Deepgram in a
 * single request, without splitting it into segments.
 */

import type {
  ASRErrorCode,
  ASRGenerateOptions,
  ASRProgress,
  ASRResult,
  ASRSubtitleItem,
  DeepgramResponse,
  DeepgramUtterance,
  DeepgramWord
} from '@shared/types'
import { ASRProgressStage } from '@shared/types'
import { app } from 'electron'
import * as fs from 'fs'
import * as path from 'path'
import { v4 as uuidv4 } from 'uuid'

import { db } from '../db/dao'
import DeepgramTranscriber from './asr/DeepgramTranscriber'
import SubtitleFormatter from './asr/SubtitleFormatter'
import AudioPreprocessor from './audio/AudioPreprocessor'
import { configManager } from './ConfigManager'
import { loggerService } from './LoggerService'

const logger = loggerService.withContext('ASRSubtitleService')

export type ASRProgressCallback = (progress: ASRProgress) => void

/** Bookkeeping for one running generation job. */
interface ActiveTask {
  transcriber: DeepgramTranscriber
  cancelled: boolean
}

/**
 * Sentence-final punctuation at the end of a word: ASCII `. ! ?` plus the CJK
 * ideographic full stop (U+3002) and full-width `!` (U+FF01) / `?` (U+FF1F).
 * Written with escapes so the full-width forms cannot be silently normalised to ASCII.
 */
const SENTENCE_END_PATTERN = /[.!?\u3002\uFF01\uFF1F]$/

/**
 * Clause-level punctuation at the end of a word: ASCII `, ; :` plus their
 * full-width forms (U+FF0C, U+FF1B, U+FF1A).
 */
const CLAUSE_BREAK_PATTERN = /[,;:\uFF0C\uFF1B\uFF1A]$/

/** A pause longer than this (seconds) always ends the sentence. */
const LONG_PAUSE_SECONDS = 0.8
/** Once a sentence is over the soft limit, a pause longer than this (seconds) ends it. */
const SHORT_PAUSE_SECONDS = 0.2
/** Above this duration (seconds) a sentence breaks at the next comma or short pause. */
const SOFT_MAX_SENTENCE_SECONDS = 8
/** Above this duration (seconds) a sentence is broken unconditionally. */
const HARD_MAX_SENTENCE_SECONDS = 10

/**
 * Ordered (substring, code) pairs used to classify an error message.
 * The first matching substring wins, so the order is significant.
 */
const ERROR_CODE_MARKERS: ReadonlyArray<readonly [string, ASRErrorCode]> = [
  ['NO_API_KEY', 'NO_API_KEY'],
  ['API Key 无效', 'INVALID_API_KEY'],
  ['配额', 'QUOTA_EXCEEDED'],
  ['网络', 'NETWORK_ERROR'],
  ['AUDIO_EXTRACTION_FAILED', 'AUDIO_EXTRACTION_FAILED'],
  ['SUBTITLE_EXTRACTION_FAILED', 'SUBTITLE_EXTRACTION_FAILED'],
  ['TASK_CANCELLED', 'TASK_CANCELLED'],
  ['REQUEST_CANCELLED', 'TASK_CANCELLED']
]

class ASRSubtitleService {
  private audioPreprocessor: AudioPreprocessor
  private subtitleFormatter: SubtitleFormatter

  // Jobs that are currently running, keyed by task ID
  private activeTasks: Map<string, ActiveTask> = new Map()

  constructor() {
    this.audioPreprocessor = new AudioPreprocessor()
    this.subtitleFormatter = new SubtitleFormatter()

    logger.info('ASR 字幕服务初始化完成')
  }

  /**
   * Builds the persistent subtitle file path
   * `<userData>/subtitles/<videoId>/<taskId>.<outputFormat>` and makes sure the
   * directory exists.
   *
   * @param videoId Video ID used as the directory name
   * @param taskId Task ID used as the file name
   * @param outputFormat File extension without the dot
   * @returns Full path of the subtitle file (the file itself is not created here)
   */
  private async createPersistentSubtitlePath(
    videoId: string,
    taskId: string,
    outputFormat: string
  ): Promise<string> {
    const userDataPath = app.getPath('userData')
    const subtitlesDir = path.join(userDataPath, 'subtitles', videoId)

    // A recursive mkdir is idempotent, so no separate existence check is needed
    // (avoids a check-then-create race). It resolves to the first directory it
    // had to create, or undefined when everything already existed.
    const created = await fs.promises.mkdir(subtitlesDir, { recursive: true })
    if (created) {
      logger.debug('创建字幕目录', { subtitlesDir })
    }

    return path.join(subtitlesDir, `${taskId}.${outputFormat}`)
  }

  /**
   * Generates subtitles for a video: extracts the audio track, transcribes it
   * with Deepgram, exports an SRT/VTT file and stores a subtitle-library record.
   *
   * Failures inside the pipeline (including cancellation) are reported through
   * the returned result (`success: false` plus `error` / `errorCode`) rather
   * than by rejecting.
   *
   * @param options Generation options (video path/ID, language, model, format)
   * @param progressCallback Optional receiver of stage/percent updates
   * @returns The generation result
   */
  public async generateSubtitle(
    options: ASRGenerateOptions,
    progressCallback?: ASRProgressCallback
  ): Promise<ASRResult> {
    const taskId = uuidv4()
    const startTime = Date.now()

    logger.info('开始生成 ASR 字幕', {
      taskId,
      videoPath: options.videoPath,
      language: options.language
    })

    // Scratch directory for the extracted audio; removed in `finally`
    const tempDir = this.audioPreprocessor.createTempDir(`asr-${taskId}-`)

    try {
      const apiKey = configManager.getDeepgramApiKey()
      if (!apiKey) {
        throw new Error('NO_API_KEY')
      }

      const language = options.language || configManager.getASRDefaultLanguage()
      const model = (options.model || configManager.getASRModel()) as 'nova-2' | 'nova-3'
      const outputFormat = options.outputFormat || 'srt'

      // Register the task before the first progress event so a cancel request
      // issued during audio extraction is not rejected as "unknown task".
      // The cancellation checks read this local reference: cancelTask() removes
      // the map entry, so looking the task up again would miss the flag.
      const task: ActiveTask = { transcriber: new DeepgramTranscriber(1), cancelled: false }
      this.activeTasks.set(taskId, task)
      const throwIfCancelled = (): void => {
        if (task.cancelled) {
          throw new Error('TASK_CANCELLED')
        }
      }

      // Stage 1: initialise
      this.reportProgress(taskId, ASRProgressStage.Initializing, 0, progressCallback)

      // Stage 2: extract audio
      this.reportProgress(taskId, ASRProgressStage.ExtractingAudio, 5, progressCallback)
      logger.info('开始提取音频')

      const extractResult = await this.audioPreprocessor.extractAudioTrack(
        options.videoPath,
        tempDir,
        {
          sampleRate: 16000,
          channels: 1
        }
      )

      if (!extractResult.success || !extractResult.audioPath) {
        throw new Error('AUDIO_EXTRACTION_FAILED')
      }

      const audioDuration = extractResult.duration || 0
      logger.info('音频提取成功', { duration: audioDuration })
      throwIfCancelled()

      // Stage 3: transcribe the complete audio
      this.reportProgress(taskId, ASRProgressStage.Transcribing, 15, progressCallback)
      logger.info('开始转写音频')

      const deepgramResponse = await task.transcriber.transcribeFile(extractResult.audioPath, {
        apiKey,
        model,
        language,
        smartFormat: true,
        utterances: true,
        utteranceEndMs: 1000
      })
      throwIfCancelled()

      logger.info('音频转写完成')

      // Stage 4: turn the response into subtitle entries
      this.reportProgress(taskId, ASRProgressStage.Formatting, 85, progressCallback)
      logger.info('开始格式化字幕')

      const rawSubtitles = this.extractSubtitlesFromResponse(deepgramResponse)
      // Extra formatting through SubtitleFormatter.formatSubtitles is currently
      // disabled; the extracted entries are exported as they are.
      const formattedSubtitles = rawSubtitles
      throwIfCancelled()

      // Stage 5: export the subtitle file straight into the persistent directory
      this.reportProgress(taskId, ASRProgressStage.Saving, 90, progressCallback)
      logger.info('开始导出字幕文件')

      const outputPath = await this.createPersistentSubtitlePath(
        String(options.videoId),
        taskId,
        outputFormat
      )
      if (outputFormat === 'srt') {
        await this.subtitleFormatter.exportToSRT(formattedSubtitles, outputPath)
      } else {
        await this.subtitleFormatter.exportToVTT(formattedSubtitles, outputPath)
      }
      throwIfCancelled()

      // Stage 6: store the subtitle-library record
      this.reportProgress(taskId, ASRProgressStage.Saving, 95, progressCallback)
      logger.info('开始保存字幕到数据库')

      const subtitleLibraryId = await this.saveToSubtitleLibrary(
        options.videoId,
        taskId,
        outputPath,
        formattedSubtitles
      )

      // Done
      const processingTime = (Date.now() - startTime) / 1000
      this.reportProgress(taskId, ASRProgressStage.Complete, 100, progressCallback)

      logger.info('ASR 字幕生成完成', {
        taskId,
        subtitleCount: formattedSubtitles.length,
        processingTime: `${processingTime.toFixed(2)}s`,
        subtitleLibraryId
      })

      return {
        success: true,
        subtitles: formattedSubtitles,
        outputPath,
        subtitleLibraryId,
        stats: {
          duration: audioDuration,
          processingTime,
          segmentCount: 1,
          subtitleCount: formattedSubtitles.length
        }
      }
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : String(error)
      const errorCode = this.getErrorCode(errorMessage)

      // A user-initiated cancel is expected, so it is logged at info level
      if (errorCode === 'TASK_CANCELLED') {
        logger.info('用户取消了 ASR 字幕生成', { taskId })
      } else {
        logger.error('ASR 字幕生成失败', {
          taskId,
          error: errorMessage
        })
      }

      this.reportProgress(taskId, ASRProgressStage.Failed, 0, progressCallback)

      return {
        success: false,
        error: errorMessage,
        errorCode
      }
    } finally {
      // Single clean-up point for both the success and the failure path
      this.activeTasks.delete(taskId)

      try {
        await this.audioPreprocessor.cleanupTempDir(tempDir)
        logger.debug('临时目录清理成功', { tempDir })
      } catch (error) {
        logger.error('临时目录清理失败', {
          tempDir,
          error: error instanceof Error ? error.message : String(error)
        })
      }
    }
  }

  /**
   * Stores the generated subtitles as a subtitle-library record.
   * Best effort: a database failure is logged and reported as `undefined`
   * so the caller can still return the exported file.
   *
   * @returns ID of the created record, or undefined when saving failed
   */
  private async saveToSubtitleLibrary(
    videoId: number,
    taskId: string,
    outputPath: string,
    subtitles: ASRSubtitleItem[]
  ): Promise<number | undefined> {
    // Convert ASRSubtitleItem entries into the SubtitleItem shape
    const subtitleItems = subtitles.map((item) => ({
      id: `${taskId}-${item.index}`,
      startTime: item.startTime,
      endTime: item.endTime,
      originalText: item.text,
      translatedText: undefined,
      words: item.words // keep the word-level timestamps
    }))

    try {
      const result = await db.subtitleLibrary.addSubtitle({
        videoId,
        filePath: outputPath,
        subtitles: JSON.stringify(subtitleItems),
        parsed_at: Date.now()
      })
      logger.info('字幕保存到数据库成功', { subtitleLibraryId: result.id })
      return result.id
    } catch (error) {
      logger.error('保存字幕到数据库失败', {
        error: error instanceof Error ? error.message : String(error)
      })
      return undefined
    }
  }

  /**
   * Extracts subtitle entries from a Deepgram response.
   * Utterances of the first channel are preferred; without them the word list
   * of the first alternative is grouped into sentences.
   *
   * NOTE(review): Deepgram's public response reference appears to place
   * `utterances` under `results`, not inside each channel. Unless the
   * transcriber reshapes the payload the utterance branch may never run -
   * confirm against DeepgramTranscriber.
   *
   * @param response Deepgram response; an empty list is returned when missing
   * @throws Error('SUBTITLE_EXTRACTION_FAILED') when extraction throws
   */
  private extractSubtitlesFromResponse(response?: DeepgramResponse): ASRSubtitleItem[] {
    const subtitles: ASRSubtitleItem[] = []

    try {
      if (!response) {
        logger.warn('Deepgram 响应为空,无法提取字幕')
        return subtitles
      }

      const channel = response.results?.channels?.[0]
      const utterances: DeepgramUtterance[] | undefined = channel?.utterances

      if (utterances && utterances.length > 0) {
        logger.info('使用 utterances 提取字幕', { count: utterances.length })

        subtitles.push(
          ...utterances.map((utterance, index) => ({
            index,
            startTime: utterance.start,
            endTime: utterance.end,
            text: utterance.transcript,
            words: utterance.words // keep the word-level timestamps
          }))
        )
      } else {
        // Fallback: group the word-level result into sentences
        const words: DeepgramWord[] | undefined = channel?.alternatives?.[0]?.words
        if (words && words.length > 0) {
          logger.info('使用 words 提取字幕(智能分段)', { count: words.length })
          subtitles.push(...this.groupWordsIntoSentences(words))
        }
      }

      logger.info('字幕提取完成', { count: subtitles.length })
    } catch (error) {
      logger.error('提取字幕失败', {
        error: error instanceof Error ? error.message : String(error)
      })
      throw new Error('SUBTITLE_EXTRACTION_FAILED')
    }

    return subtitles
  }

  /**
   * Groups words into sentence-like subtitle entries using punctuation, pause
   * length and duration limits (see {@link shouldBreakSentence}).
   * Words are joined with a single space, using the punctuated form when present.
   */
  private groupWordsIntoSentences(words: DeepgramWord[]): ASRSubtitleItem[] {
    const sentences: ASRSubtitleItem[] = []
    let currentWords: DeepgramWord[] = []
    let sentenceStartTime = 0

    for (let i = 0; i < words.length; i++) {
      const word = words[i]
      const nextWord = words[i + 1]

      // The first word of a sentence fixes its start time
      if (currentWords.length === 0) {
        sentenceStartTime = word.start
      }

      currentWords.push(word)

      const shouldBreak = this.shouldBreakSentence(
        word,
        nextWord,
        sentenceStartTime,
        i === words.length - 1
      )

      if (shouldBreak) {
        const text = currentWords.map((w) => w.punctuated_word || w.word).join(' ')
        sentences.push({
          index: sentences.length,
          startTime: sentenceStartTime,
          endTime: word.end,
          text,
          words: currentWords
        })

        currentWords = []
      }
    }

    logger.info('智能分段完成', {
      totalWords: words.length,
      sentenceCount: sentences.length,
      // Guard the empty-input case, which would otherwise log "NaN"
      avgWordsPerSentence:
        sentences.length > 0 ? (words.length / sentences.length).toFixed(1) : '0.0'
    })

    return sentences
  }

  /**
   * Decides whether the sentence should end after `currentWord`.
   * Rules, in priority order:
   *  1. the last word always ends the sentence;
   *  2. sentence-final punctuation ends it regardless of the following pause;
   *  3. a pause longer than 0.8 s ends it;
   *  4. after 8 s, clause punctuation or a pause longer than 0.2 s ends it;
   *  5. after 10 s it ends unconditionally.
   *
   * @param currentWord Word just appended to the sentence
   * @param nextWord Following word, if any (used to measure the pause)
   * @param sentenceStartTime Start time of the current sentence in seconds
   * @param isLastWord Whether `currentWord` is the final word of the input
   */
  private shouldBreakSentence(
    currentWord: DeepgramWord,
    nextWord: DeepgramWord | undefined,
    sentenceStartTime: number,
    isLastWord: boolean
  ): boolean {
    if (isLastWord) {
      return true
    }

    const punctuatedWord = currentWord.punctuated_word || currentWord.word

    if (SENTENCE_END_PATTERN.test(punctuatedWord)) {
      logger.debug('断句:句末标点', {
        word: punctuatedWord
      })
      return true
    }

    // Silence between the end of this word and the start of the next one
    const pauseDuration = nextWord ? nextWord.start - currentWord.end : 0

    if (pauseDuration > LONG_PAUSE_SECONDS) {
      logger.debug('断句:长停顿', {
        word: punctuatedWord,
        pauseDuration: pauseDuration.toFixed(3)
      })
      return true
    }

    const sentenceDuration = currentWord.end - sentenceStartTime

    if (sentenceDuration > SOFT_MAX_SENTENCE_SECONDS) {
      // Prefer breaking at clause punctuation ...
      if (CLAUSE_BREAK_PATTERN.test(punctuatedWord)) {
        logger.debug('断句:超时 + 标点', {
          word: punctuatedWord,
          duration: sentenceDuration.toFixed(2)
        })
        return true
      }
      // ... otherwise at a short pause
      if (pauseDuration > SHORT_PAUSE_SECONDS) {
        logger.debug('断句:超时 + 短停顿', {
          word: punctuatedWord,
          duration: sentenceDuration.toFixed(2)
        })
        return true
      }
    }

    if (sentenceDuration > HARD_MAX_SENTENCE_SECONDS) {
      logger.debug('断句:强制超时', {
        word: punctuatedWord,
        duration: sentenceDuration.toFixed(2)
      })
      return true
    }

    return false
  }

  /**
   * Cancels a running task: marks it cancelled, asks its transcriber to cancel
   * outstanding requests and removes it from the active-task map.
   *
   * @param taskId ID of the task to cancel
   * @returns false when no such task is active, true otherwise
   */
  public async cancelTask(taskId: string): Promise<boolean> {
    const task = this.activeTasks.get(taskId)
    if (!task) {
      return false
    }

    logger.info('取消 ASR 任务', { taskId })
    // Set the flag first: generateSubtitle() polls it between stages
    task.cancelled = true
    await task.transcriber.cancelAll()
    this.activeTasks.delete(taskId)

    return true
  }

  /**
   * Validates a Deepgram API key through a throw-away transcriber.
   * Never rejects: a thrown error becomes `{ valid: false, error }`.
   */
  public async validateApiKey(apiKey: string): Promise<{ valid: boolean; error?: string }> {
    logger.info('验证 Deepgram API Key')

    try {
      const transcriber = new DeepgramTranscriber(1)
      const result = await transcriber.validateApiKey(apiKey)
      return result
    } catch (error) {
      return {
        valid: false,
        error: error instanceof Error ? error.message : 'API Key 验证失败'
      }
    }
  }

  /**
   * Sends a progress update to the callback, if one was supplied.
   * The percentage is rounded to an integer.
   */
  private reportProgress(
    taskId: string,
    stage: ASRProgressStage,
    percent: number,
    callback?: ASRProgressCallback,
    current?: number,
    total?: number
  ): void {
    if (callback) {
      callback({
        taskId,
        stage,
        percent: Math.round(percent),
        current,
        total
      })
    }
  }

  /**
   * Maps an error message to an error code by substring match
   * (first match in ERROR_CODE_MARKERS wins; 'UNKNOWN_ERROR' otherwise).
   */
  private getErrorCode(errorMessage: string): ASRErrorCode {
    const match = ERROR_CODE_MARKERS.find(([marker]) => errorMessage.includes(marker))
    return match ? match[1] : 'UNKNOWN_ERROR'
  }
}

export default ASRSubtitleService
[ConfigKeys.ASRModel]: 'nova-3' } export class ConfigManager { @@ -206,7 +214,32 @@ export class ConfigManager { * @param defaultValue 默认值 */ get(key: string, defaultValue?: T) { - return this.store.get(key, defaultValue ? defaultValue : defaultValues[key]) as T + return this.store.get(key, defaultValue ?? defaultValues[key]) as T + } + + // ASR 相关配置方法 + getDeepgramApiKey(): string { + return this.get(ConfigKeys.DeepgramApiKey, '') + } + + setDeepgramApiKey(apiKey: string) { + this.setAndNotify(ConfigKeys.DeepgramApiKey, apiKey) + } + + getASRDefaultLanguage(): string { + return this.get(ConfigKeys.ASRDefaultLanguage, 'en') + } + + setASRDefaultLanguage(language: string) { + this.setAndNotify(ConfigKeys.ASRDefaultLanguage, language) + } + + getASRModel(): string { + return this.get(ConfigKeys.ASRModel, 'nova-3') + } + + setASRModel(model: string) { + this.setAndNotify(ConfigKeys.ASRModel, model) } } diff --git a/src/main/services/__tests__/ASRSubtitleService.shouldBreakSentence.test.ts b/src/main/services/__tests__/ASRSubtitleService.shouldBreakSentence.test.ts new file mode 100644 index 00000000..53d89af8 --- /dev/null +++ b/src/main/services/__tests__/ASRSubtitleService.shouldBreakSentence.test.ts @@ -0,0 +1,430 @@ +import type { DeepgramWord } from '@shared/types' +import { beforeEach, describe, expect, it, vi } from 'vitest' + +// Mock logger using vi.hoisted to ensure it's available before imports +const mockLogger = vi.hoisted(() => ({ + info: vi.fn(), + debug: vi.fn(), + warn: vi.fn(), + error: vi.fn() +})) + +vi.mock('../LoggerService', () => ({ + loggerService: { + withContext: () => mockLogger + } +})) + +// Mock Electron +vi.mock('electron', () => ({ + app: { + getPath: vi.fn(() => '/mock/path'), + getVersion: vi.fn(() => '1.0.0') + }, + ipcMain: { + on: vi.fn(), + handle: vi.fn() + } +})) + +// Mock electron-conf +vi.mock('electron-conf/main', () => ({ + Conf: vi.fn().mockImplementation(() => ({ + get: vi.fn(), + set: vi.fn(), + has: vi.fn() + })) +})) 
+ +// Mock other dependencies +vi.mock('../ConfigManager', () => ({ + configManager: { + getDeepgramApiKey: vi.fn(() => 'mock-api-key'), + getASRDefaultLanguage: vi.fn(() => 'en'), + getASRModel: vi.fn(() => 'nova-2') + } +})) + +vi.mock('../../db/dao', () => ({ + db: { + subtitleLibrary: { + addSubtitle: vi.fn() + } + } +})) + +// Mock AudioPreprocessor +vi.mock('../audio/AudioPreprocessor', () => ({ + default: vi.fn().mockImplementation(() => ({ + createTempDir: vi.fn(() => '/mock/temp'), + extractAudioTrack: vi.fn(), + cleanupTempDir: vi.fn() + })) +})) + +// Mock SubtitleFormatter +vi.mock('../asr/SubtitleFormatter', () => ({ + default: vi.fn().mockImplementation(() => ({ + formatSubtitles: vi.fn(), + exportToSRT: vi.fn(), + exportToVTT: vi.fn() + })) +})) + +// Mock DeepgramTranscriber +vi.mock('../asr/DeepgramTranscriber', () => ({ + default: vi.fn().mockImplementation(() => ({ + transcribeFile: vi.fn(), + validateApiKey: vi.fn(), + cancelAll: vi.fn() + })) +})) + +describe('ASRSubtitleService - shouldBreakSentence', () => { + let ASRSubtitleService: any + let service: any + + beforeEach(async () => { + vi.clearAllMocks() + + // Dynamically import the service + const module = await import('../ASRSubtitleService') + ASRSubtitleService = module.default + service = new ASRSubtitleService() + }) + + /** + * Helper function to access the private shouldBreakSentence method + */ + const testShouldBreakSentence = ( + currentWord: DeepgramWord, + nextWord: DeepgramWord | undefined, + sentenceStartTime: number, + isLastWord: boolean + ): boolean => { + // Access private method using bracket notation + return service['shouldBreakSentence'](currentWord, nextWord, sentenceStartTime, isLastWord) + } + + /** + * Helper function to create a mock DeepgramWord + */ + const createWord = ( + word: string, + punctuated_word: string, + start: number, + end: number + ): DeepgramWord => ({ + word, + punctuated_word, + start, + end, + confidence: 0.95 + }) + + describe('Last word 
detection', () => { + it('should break at the last word', () => { + const word = createWord('world', 'world.', 1.0, 1.5) + const result = testShouldBreakSentence(word, undefined, 0, true) + expect(result).toBe(true) + }) + }) + + describe('Sentence ending punctuation (no pause required)', () => { + it('should break on period with any pause', () => { + const currentWord = createWord('world', 'world.', 1.0, 1.5) + const nextWord = createWord('Hello', 'Hello', 2.0, 2.5) // 500ms pause + const result = testShouldBreakSentence(currentWord, nextWord, 0, false) + expect(result).toBe(true) + }) + + it('should break on question mark with any pause', () => { + const currentWord = createWord('you', 'you?', 1.0, 1.5) + const nextWord = createWord('Yes', 'Yes', 2.0, 2.5) // 500ms pause + const result = testShouldBreakSentence(currentWord, nextWord, 0, false) + expect(result).toBe(true) + }) + + it('should break on exclamation mark with any pause', () => { + const currentWord = createWord('amazing', 'amazing!', 1.0, 1.5) + const nextWord = createWord('Really', 'Really', 2.0, 2.5) // 500ms pause + const result = testShouldBreakSentence(currentWord, nextWord, 0, false) + expect(result).toBe(true) + }) + + it('should break on period even with short pause (< 300ms)', () => { + const currentWord = createWord('world', 'world.', 1.0, 1.5) + const nextWord = createWord('Hello', 'Hello', 1.7, 2.2) // 200ms pause + const result = testShouldBreakSentence(currentWord, nextWord, 0, false) + expect(result).toBe(true) // Now breaks on punctuation alone + }) + + it('should break on Chinese period regardless of pause', () => { + const currentWord = createWord('世界', '世界。', 1.0, 1.5) + const nextWord = createWord('你好', '你好', 2.0, 2.5) // 500ms pause + const result = testShouldBreakSentence(currentWord, nextWord, 0, false) + expect(result).toBe(true) + }) + }) + + describe('Long pause detection', () => { + it('should break on pause > 800ms', () => { + const currentWord = createWord('hello', 
'hello', 1.0, 1.5) + const nextWord = createWord('world', 'world', 2.5, 3.0) // 1000ms pause + const result = testShouldBreakSentence(currentWord, nextWord, 0, false) + expect(result).toBe(true) + }) + + it('should NOT break on pause < 800ms', () => { + const currentWord = createWord('hello', 'hello', 1.0, 1.5) + const nextWord = createWord('world', 'world', 2.0, 2.5) // 500ms pause + const result = testShouldBreakSentence(currentWord, nextWord, 0, false) + expect(result).toBe(false) + }) + }) + + describe('Duration-based breaking (8-10 seconds)', () => { + it('should break at 8s+ duration with comma', () => { + const currentWord = createWord('word', 'word,', 8.5, 9.0) + const nextWord = createWord('next', 'next', 9.2, 9.7) + const result = testShouldBreakSentence(currentWord, nextWord, 0, false) + expect(result).toBe(true) + }) + + it('should break at 8s+ duration with pause > 200ms', () => { + const currentWord = createWord('word', 'word', 8.5, 9.0) + const nextWord = createWord('next', 'next', 9.5, 10.0) // 500ms pause + const result = testShouldBreakSentence(currentWord, nextWord, 0, false) + expect(result).toBe(true) + }) + + it('should NOT break at 8s+ duration without punctuation or pause', () => { + const currentWord = createWord('word', 'word', 8.5, 9.0) + const nextWord = createWord('next', 'next', 9.1, 9.6) // 100ms pause, no punctuation + const result = testShouldBreakSentence(currentWord, nextWord, 0, false) + expect(result).toBe(false) + }) + + it('should force break at 10s+ duration regardless of punctuation', () => { + const currentWord = createWord('word', 'word', 10.5, 11.0) + const nextWord = createWord('next', 'next', 11.1, 11.6) // No punctuation, short pause + const result = testShouldBreakSentence(currentWord, nextWord, 0, false) + expect(result).toBe(true) + }) + }) + + describe('Edge cases', () => { + it('should handle word without punctuated_word field', () => { + const currentWord: DeepgramWord = { + word: 'hello', + start: 1.0, + end: 
1.5, + confidence: 0.95 + } + const nextWord = createWord('world', 'world', 2.0, 2.5) + const result = testShouldBreakSentence(currentWord, nextWord, 0, false) + expect(result).toBe(false) + }) + + it('should handle zero pause duration', () => { + const currentWord = createWord('hello', 'hello', 1.0, 1.5) + const nextWord = createWord('world', 'world', 1.5, 2.0) // 0ms pause + const result = testShouldBreakSentence(currentWord, nextWord, 0, false) + expect(result).toBe(false) + }) + + it('should handle very short sentence with period and pause', () => { + const currentWord = createWord('Hi', 'Hi.', 0.5, 1.0) + const nextWord = createWord('Bye', 'Bye', 1.5, 2.0) // 500ms pause + const result = testShouldBreakSentence(currentWord, nextWord, 0, false) + expect(result).toBe(true) + }) + }) + + describe('Real-world test cases', () => { + it('should break on sentence-ending punctuation even with minimal pause', () => { + // Real data from user: "tell." followed by "It's" with 0ms pause + // This should break because "tell." has sentence-ending punctuation + const currentWord = createWord('tell', 'tell.', 50.745, 51.225) + const nextWord = createWord("it's", "It's", 51.225, 51.385002) // 0ms pause + const result = testShouldBreakSentence(currentWord, nextWord, 49.785, false) + + // ✅ Fixed: Now breaks on sentence-ending punctuation + expect(result).toBe(true) + }) + + it('should break on sentence-ending punctuation with short pause (80ms)', () => { + // Real data: "with." 
followed by "Come" with 80ms pause + const currentWord = createWord('with', 'with.', 52.345, 52.825) + const nextWord = createWord('come', 'Come', 52.905, 53.465) // 80ms pause + const result = testShouldBreakSentence(currentWord, nextWord, 49.785, false) + + // ✅ Fixed: Now breaks on sentence-ending punctuation + expect(result).toBe(true) + }) + + it('should handle complete real-world sentence sequence', () => { + // Test the complete sequence from user's example + const words: DeepgramWord[] = [ + { + word: "there's", + start: 49.785, + end: 50.265, + confidence: 0.9863529, + punctuated_word: "There's" + }, + { + word: 'nothing', + start: 50.265, + end: 50.505, + confidence: 0.99985075, + punctuated_word: 'nothing' + }, + { word: 'to', start: 50.505, end: 50.745, confidence: 0.99971515, punctuated_word: 'to' }, + { + word: 'tell', + start: 50.745, + end: 51.225, + confidence: 0.9826325, + punctuated_word: 'tell.' + }, + { + word: "it's", + start: 51.225, + end: 51.385002, + confidence: 0.8772707, + punctuated_word: "It's" + }, + { + word: 'just', + start: 51.385002, + end: 51.545, + confidence: 0.99974877, + punctuated_word: 'just' + }, + { + word: 'some', + start: 51.545, + end: 51.705, + confidence: 0.99927837, + punctuated_word: 'some' + }, + { + word: 'guy', + start: 51.705, + end: 51.864998, + confidence: 0.999765, + punctuated_word: 'guy' + }, + { word: 'i', start: 51.864998, end: 52.105, confidence: 0.9979578, punctuated_word: 'I' }, + { + word: 'work', + start: 52.105, + end: 52.345, + confidence: 0.98591065, + punctuated_word: 'work' + }, + { + word: 'with', + start: 52.345, + end: 52.825, + confidence: 0.9990688, + punctuated_word: 'with.' + }, + { + word: 'come', + start: 52.905, + end: 53.465, + confidence: 0.9908832, + punctuated_word: 'Come' + }, + { word: 'on', start: 53.465, end: 54.025, confidence: 0.96900225, punctuated_word: 'on.' 
}, + { + word: "you're", + start: 54.025, + end: 54.505, + confidence: 0.99452776, + punctuated_word: "You're" + }, + { + word: 'going', + start: 54.505, + end: 54.745, + confidence: 0.9983157, + punctuated_word: 'going' + }, + { word: 'out', start: 54.745, end: 54.905, confidence: 0.9927585, punctuated_word: 'out' }, + { + word: 'with', + start: 54.905, + end: 55.065, + confidence: 0.9994931, + punctuated_word: 'with' + }, + { word: 'the', start: 55.065, end: 55.145, confidence: 0.8043892, punctuated_word: 'the' }, + { + word: 'guy', + start: 55.145, + end: 55.465, + confidence: 0.99105775, + punctuated_word: 'guy.' + }, + { + word: "there's", + start: 55.465, + end: 55.625, + confidence: 0.9992779, + punctuated_word: "There's" + }, + { + word: 'gotta', + start: 55.625, + end: 55.785, + confidence: 0.7105091, + punctuated_word: 'gotta' + }, + { word: 'be', start: 55.785, end: 55.945, confidence: 0.9926218, punctuated_word: 'be' }, + { + word: 'something', + start: 55.945, + end: 56.265, + confidence: 0.99703157, + punctuated_word: 'something' + }, + { + word: 'wrong', + start: 56.265, + end: 56.505, + confidence: 0.99949515, + punctuated_word: 'wrong' + }, + { + word: 'with', + start: 56.505, + end: 56.665, + confidence: 0.9996629, + punctuated_word: 'with' + }, + { word: 'him', start: 56.665, end: 56.905, confidence: 0.9875486, punctuated_word: 'him.' } + ] + + // Use the private method to group words into sentences + const sentences = service['groupWordsIntoSentences'](words) + + // ✅ Fixed: Now correctly produces 5 sentences based on sentence-ending punctuation + // Expected sentences: + // 1. "There's nothing to tell." + // 2. "It's just some guy I work with." + // 3. "Come on." + // 4. "You're going out with the guy." + // 5. "There's gotta be something wrong with him." 
describe('parseFFmpegDuration', () => {
  // Each case is an FFmpeg "Duration:" banner and the number of seconds it encodes.
  // The fractional part may have one, two or three digits, or be absent.
  const validCases = [
    { label: 'a two-digit fraction (centiseconds)', output: 'Duration: 01:23:45.67', expected: 5025.67 },
    { label: 'a single-digit fraction (tenths)', output: 'Duration: 02:34:56.7', expected: 9296.7 },
    { label: 'a three-digit fraction (milliseconds)', output: 'Duration: 00:12:34.567', expected: 754.567 },
    { label: 'no fractional part', output: 'Duration: 03:45:00', expected: 13500 },
    { label: 'all-zero fields', output: 'Duration: 00:00:00.00', expected: 0 },
    { label: 'the maximum two-digit hour value', output: 'Duration: 99:59:59.999', expected: 359999.999 }
  ]

  it.each(validCases)('should parse duration with $label', ({ output, expected }) => {
    const result = parseFFmpegDuration(output)

    // Compare with a tolerance: the expected values are decimal fractions that are
    // not exactly representable as doubles, so exact equality would depend on the
    // order in which the implementation sums the fields.
    expect(result).not.toBeNull()
    expect(result as number).toBeCloseTo(expected, 6)
  })

  it('should handle malformed duration gracefully', () => {
    const result = parseFFmpegDuration('Duration: XX:YY:ZZ.invalid')
    expect(result).toBeNull()
  })

  it('should extract duration from complex FFmpeg output', () => {
    const complexOutput = `
      Input #0, mp3, from 'test.mp3':
        Metadata:
          title           : Test Audio
          artist          : Test Artist
        Duration: 00:01:23.456, bitrate: 128 kb/s
          Stream #0:0: Audio: mp3, 44100 Hz, stereo, fltp, 128 kb/s
    `
    const result = parseFFmpegDuration(complexOutput)

    expect(result).not.toBeNull()
    expect(result as number).toBeCloseTo(83.456, 6)
  })
})

describe('AudioPreprocessor', () => {
  let audioPreprocessor: AudioPreprocessor

  beforeEach(() => {
    audioPreprocessor = new AudioPreprocessor()
  })

  describe('buildFFmpegArgs', () => {
    /**
     * Invoke the private `buildFFmpegArgs` method.
     *
     * The existence check is an assertion rather than an `if` guard: if the method
     * is renamed or removed, these tests must fail instead of silently passing.
     * No temporary directory is needed because only the argument list is inspected.
     */
    const buildArgs = (...args: unknown[]): string[] => {
      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- reaching a private method from a test
      const method = (audioPreprocessor as any).buildFFmpegArgs
      expect(typeof method).toBe('function')
      return method.call(audioPreprocessor, ...args)
    }

    it('should handle MP3 format correctly', () => {
      const mp3Args = buildArgs('/test/input.mp4', '/test/output.mp3', 16000, 1, 'mp3')

      // MP3 output must select the LAME encoder with an explicit bitrate
      expect(mp3Args).toContain('libmp3lame')
      expect(mp3Args).toContain('-b:a')
      expect(mp3Args).toContain('128k')
      expect(mp3Args).not.toContain('pcm_s16le')
    })

    it('should handle WAV format correctly', () => {
      const wavArgs = buildArgs('/test/input.mp4', '/test/output.wav', 16000, 1, 'wav')

      // WAV output must use 16-bit PCM and no bitrate flag
      expect(wavArgs).toContain('pcm_s16le')
      expect(wavArgs).not.toContain('libmp3lame')
      expect(wavArgs).not.toContain('-b:a')
    })

    it('should use WAV as default format', () => {
      // The format argument is omitted on purpose
      const defaultArgs = buildArgs('/test/input.mp4', '/test/output.wav', 16000, 1)

      expect(defaultArgs).toContain('pcm_s16le')
      expect(defaultArgs).not.toContain('libmp3lame')
    })
  })
})
+ /** 语言 */ + language?: string + /** 是否启用智能格式化 */ + smartFormat?: boolean + /** 是否启用句段检测 */ + utterances?: boolean + /** 句段结束静音时长(毫秒) */ + utteranceEndMs?: number + /** 提示文本(前文上下文) */ + prompt?: string +} + +export interface TranscriptionProgress { + /** 已完成的段数 */ + completed: number + /** 总段数 */ + total: number + /** 当前段索引 */ + current: number +} + +class DeepgramTranscriber { + private queue: PQueue + private activeRequests: Set = new Set() + private abortController: AbortController = new AbortController() + private currentRequestAbortController: AbortController | null = null + + constructor(concurrency: number = 3) { + this.queue = new PQueue({ concurrency }) + logger.info('Deepgram 转写器初始化', { concurrency }) + } + + /** + * 重置取消状态(用于开始新的转写任务) + */ + private resetCancellationState(): void { + if (this.abortController.signal.aborted) { + this.abortController = new AbortController() + } + this.currentRequestAbortController = null + } + + /** + * 批量转写多个音频段 + */ + public async transcribeSegments( + segments: AudioSegment[], + options: DeepgramOptions, + onProgress?: (progress: TranscriptionProgress) => void + ): Promise { + logger.info('开始批量转写', { segmentCount: segments.length }) + + // 重置取消状态,准备新的转写任务 + this.resetCancellationState() + + const results: TranscriptSegment[] = [] + let completed = 0 + + // 上一段的末尾文本(用作提示) + let previousTranscript = '' + + for (let i = 0; i < segments.length; i++) { + const segment = segments[i] + + // 添加到队列 + const promise = this.queue.add(async () => { + try { + // 携带上一段末尾文本作为提示 + const promptText = + previousTranscript.length > 200 ? 
previousTranscript.slice(-200) : previousTranscript + + const response = await this.transcribeSingleSegment(segment.filePath, { + ...options, + prompt: promptText + }) + + completed++ + onProgress?.({ completed, total: segments.length, current: i }) + + // 更新上一段文本 + if (response.results.channels[0]?.alternatives[0]?.transcript) { + const fullTranscript = response.results.channels[0].alternatives[0].transcript + previousTranscript = fullTranscript + } + + return { + audioSegment: segment, + response, + success: true + } as TranscriptSegment + } catch (error) { + completed++ + onProgress?.({ completed, total: segments.length, current: i }) + + logger.error('段转写失败', { + index: segment.index, + error: error instanceof Error ? error.message : String(error) + }) + + return { + audioSegment: segment, + success: false, + error: error instanceof Error ? error.message : String(error) + } as TranscriptSegment + } + }) + + if (promise) { + results.push((await promise) as TranscriptSegment) + } + } + + // 等待所有任务完成 + await this.queue.onIdle() + + const successCount = results.filter((r) => r.success).length + logger.info('批量转写完成', { + total: results.length, + success: successCount, + failed: results.length - successCount + }) + + return results + } + + /** + * 转写单个音频段 + */ + private async transcribeSingleSegment( + audioPath: string, + options: DeepgramOptions, + retries: number = 3 + ): Promise { + const { + apiKey, + model = 'nova-3', + language = 'en', + smartFormat = true, + utterances = true, + utteranceEndMs = 1000, + prompt + } = options + + let lastError: Error | null = null + + for (let attempt = 0; attempt < retries; attempt++) { + try { + // 在重试前检查是否被取消 + if (this.abortController.signal.aborted) { + logger.debug('检测到请求被取消,停止重试') + throw new Error('REQUEST_CANCELLED') + } + + if (attempt > 0) { + // 指数退避 + const delay = Math.pow(2, attempt) * 1000 + logger.debug('重试前等待', { attempt, delay }) + await new Promise((resolve) => setTimeout(resolve, delay)) + + // 等待后再次检查取消状态 
+ if (this.abortController.signal.aborted) { + logger.debug('等待期间检测到取消,停止重试') + throw new Error('REQUEST_CANCELLED') + } + } + + logger.debug('调用 Deepgram API', { audioPath, model, language, attempt }) + + const response = await this.callDeepgramAPI(audioPath, { + apiKey, + model, + language, + smartFormat, + utterances, + utteranceEndMs, + prompt + }) + + logger.debug('Deepgram API 调用成功', { audioPath }) + return response + } catch (error) { + lastError = error instanceof Error ? error : new Error(String(error)) + + // 如果是请求取消错误,直接抛出,不重试 + if ( + lastError.message === 'REQUEST_CANCELLED' || + lastError.message.includes('socket hang up') || + lastError.message.includes('请求被中断') || + lastError.message.includes('socket was destroyed') + ) { + logger.info('用户取消了 ASR 任务,停止处理', { error: lastError.message }) + throw new Error('REQUEST_CANCELLED') + } + + logger.warn('Deepgram API 调用失败', { + attempt: attempt + 1, + maxRetries: retries, + error: lastError.message + }) + } + } + + throw lastError || new Error('Deepgram API 调用失败') + } + + /** + * 调用 Deepgram API + */ + private async callDeepgramAPI( + audioPath: string, + options: DeepgramOptions + ): Promise { + // 构建查询参数 + const queryParams = new URLSearchParams({ + model: options.model || 'nova-3', + smart_format: String(options.smartFormat !== false), + punctuate: 'true', + utterances: String(options.utterances !== false), + utterance_end_ms: String(options.utteranceEndMs || 1000) + }) + + // 处理语言参数:如果是 'auto',使用 detect_language;否则使用 language + if (options.language === 'auto') { + queryParams.append('detect_language', 'true') + } else if (options.language) { + queryParams.append('language', options.language) + } + + const url = `https://api.deepgram.com/v1/listen?${queryParams.toString()}` + + // 获取音频文件的 MIME 类型 + const ext = path.extname(audioPath).toLowerCase() + const mimeTypes: Record = { + '.wav': 'audio/wav', + '.mp3': 'audio/mpeg', + '.m4a': 'audio/mp4', + '.flac': 'audio/flac', + '.ogg': 'audio/ogg', + '.opus': 
'audio/opus', + '.webm': 'audio/webm' + } + const contentType = mimeTypes[ext] || 'audio/wav' + + // 获取文件大小(用于 Content-Length) + const stats = await fsPromises.stat(audioPath) + const fileSize = stats.size + + // 创建读取流 + const readStream = fs.createReadStream(audioPath) + + return new Promise((resolve, reject) => { + // 发送请求 + const req = https.request( + url, + { + method: 'POST', + headers: { + 'Content-Type': contentType, + 'Content-Length': fileSize, + Authorization: `Token ${options.apiKey}` + } + }, + (res) => { + let responseData = '' + + res.on('data', (chunk) => { + responseData += chunk.toString() + }) + + res.on('end', () => { + // 请求完成后从活动请求列表中移除 + this.activeRequests.delete(req) + + if (res.statusCode === 200) { + try { + const parsed = JSON.parse(responseData) as DeepgramResponse + resolve(parsed) + } catch (error) { + reject(new Error(`解析 Deepgram 响应失败: ${error}`)) + } + } else if (res.statusCode === 401) { + reject(new Error('API Key 无效')) + } else if (res.statusCode === 402) { + reject(new Error('API 配额不足')) + } else if (res.statusCode === 429) { + reject(new Error('API 调用频率超限')) + } else { + reject(new Error(`Deepgram API 错误 (${res.statusCode}): ${responseData}`)) + } + }) + } + ) + + // 将请求添加到活动请求列表 + this.activeRequests.add(req) + + // 请求错误处理 + req.on('error', (error) => { + this.activeRequests.delete(req) + readStream.destroy() + reject(new Error(`网络错误: ${error.message}`)) + }) + + // 设置超时(10分钟,符合 Deepgram 文档的最大处理时间) + req.setTimeout(10 * 60 * 1000, () => { + this.activeRequests.delete(req) + readStream.destroy() + req.destroy() + reject(new Error('请求超时(超过10分钟)')) + }) + + // 读取流错误处理 + readStream.on('error', (error) => { + this.activeRequests.delete(req) + req.destroy() + reject(new Error(`读取音频文件失败: ${error.message}`)) + }) + + // 将读取流管道连接到请求 + readStream.pipe(req) + }) + } + + /** + * 转写完整音频文件 + */ + public async transcribeFile( + audioPath: string, + options: DeepgramOptions + ): Promise { + logger.info('开始转写完整音频文件', { audioPath }) + + try { 
+ const response = await this.transcribeSingleSegment(audioPath, options) + logger.info('音频文件转写成功', { audioPath }) + return response + } catch (error) { + const errorMessage = error instanceof Error ? error.message : String(error) + + // 如果是用户取消,使用 info 级别日志 + if (errorMessage === 'REQUEST_CANCELLED') { + logger.info('用户取消了音频文件转写', { audioPath }) + } else { + logger.error('音频文件转写失败', { + audioPath, + error: errorMessage + }) + } + + throw error + } + } + + /** + * 验证 API Key + */ + public async validateApiKey(apiKey: string): Promise<{ valid: boolean; error?: string }> { + try { + logger.info('验证 Deepgram API Key') + + if (!apiKey || apiKey.length < 10) { + return { valid: false, error: 'API Key 格式无效' } + } + + // 调用 Deepgram 官方验证端点 + const result = await this.makeValidationRequest(apiKey) + return result + } catch (error) { + return { + valid: false, + error: error instanceof Error ? error.message : 'API Key 验证失败' + } + } + } + + /** + * 发送验证请求到 Deepgram API + */ + private makeValidationRequest(apiKey: string): Promise<{ valid: boolean; error?: string }> { + return new Promise((resolve) => { + const requestOptions = { + hostname: 'api.deepgram.com', + port: 443, + path: '/v1/auth/token', + method: 'GET', + headers: { + Authorization: `Token ${apiKey}`, + 'Content-Type': 'application/json' + }, + timeout: 8000 // 8秒超时 + } + + const req = https.request(requestOptions, (res) => { + let responseBody = '' + + res.on('data', (chunk) => { + responseBody += chunk + }) + + res.on('end', () => { + // 根据状态码返回相应结果 + if (res.statusCode === 200) { + logger.info('API Key 验证成功') + resolve({ valid: true }) + } else if (res.statusCode === 401) { + try { + const errorData = JSON.parse(responseBody) + const errorCode = errorData.err_code || 'UNKNOWN' + + if (errorCode === 'INVALID_AUTH') { + resolve({ valid: false, error: 'API Key 无效' }) + } else if (errorCode === 'INSUFFICIENT_PERMISSIONS') { + resolve({ valid: false, error: 'API Key 权限不足' }) + } else { + resolve({ valid: false, 
error: 'API Key 认证失败' }) + } + } catch { + resolve({ valid: false, error: 'API Key 认证失败' }) + } + } else if (res.statusCode === 403) { + resolve({ valid: false, error: 'API Key 权限不足或访问被拒绝' }) + } else { + logger.warn('API Key 验证收到意外状态码', { + statusCode: res.statusCode, + body: responseBody + }) + resolve({ + valid: false, + error: `验证失败 (HTTP ${res.statusCode})` + }) + } + }) + }) + + req.on('error', (error) => { + logger.error('API Key 验证请求失败', { error: error.message }) + resolve({ + valid: false, + error: '网络连接失败,请检查网络设置' + }) + }) + + req.on('timeout', () => { + req.destroy() + logger.error('API Key 验证请求超时') + resolve({ + valid: false, + error: '验证请求超时,请稍后重试' + }) + }) + + req.end() + }) + } + + /** + * 取消所有待处理的任务 + */ + public async cancelAll(): Promise { + // 设置取消标志 + this.abortController.abort() + + // 如果有当前请求的控制器,也取消它 + if (this.currentRequestAbortController) { + this.currentRequestAbortController.abort() + } + + // 清空队列,防止新任务开始 + this.queue.clear() + + // 中断所有正在进行的 HTTP 请求 + const requestCount = this.activeRequests.size + for (const req of this.activeRequests) { + try { + req.destroy() + } catch (error) { + logger.warn('中断请求失败', { + error: error instanceof Error ? 
export interface FormatOptions {
  /** Maximum duration of a single subtitle cue, in seconds (default 8). */
  maxDuration?: number
  /** Maximum number of characters in a single cue (default 42). */
  maxCharsPerLine?: number
}

/**
 * Splits ASR subtitle items so they respect length and duration limits, and
 * writes them out as SRT or VTT files.
 */
class SubtitleFormatter {
  /**
   * Return a re-indexed copy of `items` in which over-long cues have been split.
   *
   * A cue is kept as is when both its duration and its text length are within
   * the limits; otherwise it is split at word boundaries. A single word that
   * exceeds a limit on its own is never broken up. The input array and its
   * items are not modified.
   *
   * @param items - Subtitle items in playback order.
   * @param options - Limits; see `FormatOptions` for defaults.
   * @returns New items with `index` running from 0.
   */
  public formatSubtitles(items: ASRSubtitleItem[], options: FormatOptions = {}): ASRSubtitleItem[] {
    const { maxDuration = 8, maxCharsPerLine = 42 } = options

    logger.info('开始格式化字幕', {
      itemCount: items.length,
      maxDuration,
      maxCharsPerLine
    })

    const formatted: ASRSubtitleItem[] = []

    for (const item of items) {
      const duration = item.endTime - item.startTime
      const fits = duration <= maxDuration && item.text.length <= maxCharsPerLine
      const pieces = fits ? [item] : this.splitSubtitle(item, maxDuration, maxCharsPerLine)

      for (const piece of pieces) {
        formatted.push({ ...piece, index: formatted.length })
      }
    }

    logger.info('字幕格式化完成', { outputCount: formatted.length })

    return formatted
  }

  /**
   * Split one cue that exceeds a limit.
   *
   * Uses word-level timestamps when the item has them. Otherwise the text is
   * split on whitespace and the cue's time span is distributed in proportion
   * to the number of characters in each piece.
   *
   * @returns Pieces with `index` set to 0; the caller assigns final indices.
   */
  private splitSubtitle(
    item: ASRSubtitleItem,
    maxDuration: number,
    maxCharsPerLine: number
  ): ASRSubtitleItem[] {
    if (item.words && item.words.length > 0) {
      return this.splitSubtitleWithWordTimestamps(item, maxCharsPerLine, maxDuration)
    }

    // Drop empty tokens produced by leading/trailing or repeated whitespace
    const tokens = item.text.split(/\s+/).filter((token) => token.length > 0)
    const normalized = tokens.join(' ')
    const totalChars = normalized.length

    // Nothing to split on: zero or one word
    if (tokens.length <= 1 || totalChars === 0) {
      return [{ index: 0, startTime: item.startTime, endTime: item.endTime, text: normalized }]
    }

    const span = Math.max(item.endTime - item.startTime, 0)
    const secondsPerChar = span / totalChars

    // Time is handed out per character, so the duration limit becomes a second
    // character budget. One character is reserved for the separating space.
    let charBudget = maxCharsPerLine
    if (secondsPerChar > 0 && Number.isFinite(maxDuration) && maxDuration > 0) {
      const byDuration = Math.floor(maxDuration / secondsPerChar) - 1
      charBudget = Math.min(charBudget, Math.max(1, byDuration))
    }

    // Greedy packing: a line of exactly `charBudget` characters is allowed
    const lines: string[] = []
    let line = ''
    for (const token of tokens) {
      const candidate = line === '' ? token : `${line} ${token}`
      if (line !== '' && candidate.length > charBudget) {
        lines.push(line)
        line = token
      } else {
        line = candidate
      }
    }
    lines.push(line)

    // Assign times by cumulative character offset so pieces are contiguous and
    // the last piece ends exactly at the original end time.
    const result: ASRSubtitleItem[] = []
    let consumedChars = 0
    let segmentStart = item.startTime
    lines.forEach((text, position) => {
      consumedChars += text.length + 1 // + 1 for the space that followed this piece
      const isLast = position === lines.length - 1
      const segmentEnd = isLast
        ? item.endTime
        : Math.min(item.startTime + (span * consumedChars) / totalChars, item.endTime)

      result.push({ index: 0, startTime: segmentStart, endTime: segmentEnd, text })
      segmentStart = segmentEnd
    })

    return result
  }

  /**
   * Split one cue using its word-level timestamps.
   *
   * Words are packed greedily; a new piece starts when adding the next word
   * would exceed `maxCharsPerLine` characters or make the piece last longer
   * than `maxDuration` seconds. Each piece starts at its first word's start
   * and ends at its last word's end.
   *
   * @param maxDuration - Optional duration limit in seconds; unlimited when omitted.
   * @returns Pieces with `index` set to 0; the caller assigns final indices.
   */
  private splitSubtitleWithWordTimestamps(
    item: ASRSubtitleItem,
    maxCharsPerLine: number,
    maxDuration: number = Number.POSITIVE_INFINITY
  ): ASRSubtitleItem[] {
    const words = item.words ?? []
    const result: ASRSubtitleItem[] = []

    let group: typeof words = []
    let groupLength = 0
    let groupStart = 0
    let groupEnd = 0

    const flush = (): void => {
      if (group.length === 0) return
      result.push({
        index: 0, // re-indexed by the caller
        startTime: groupStart,
        endTime: groupEnd,
        text: group.map((w) => w.punctuated_word || w.word).join(' '),
        words: group
      })
      group = []
      groupLength = 0
    }

    for (const word of words) {
      const token = word.punctuated_word || word.word
      // Length of the piece if this word were appended (space only between words)
      const lengthWithWord = group.length === 0 ? token.length : groupLength + 1 + token.length

      if (group.length > 0) {
        const exceedsChars = lengthWithWord > maxCharsPerLine
        const exceedsDuration = word.end - groupStart > maxDuration
        if (exceedsChars || exceedsDuration) {
          flush()
        }
      }

      if (group.length === 0) {
        groupStart = word.start
        groupLength = token.length
      } else {
        groupLength = lengthWithWord
      }
      group.push(word)
      groupEnd = word.end
    }
    flush()

    logger.debug('使用单词级时间戳拆分字幕', {
      originalLength: item.text.length,
      segmentCount: result.length
    })

    return result
  }

  /**
   * Write `items` to `outputPath` as an SRT file (UTF-8).
   *
   * @throws Re-throws any file-system error after logging it.
   */
  public async exportToSRT(items: ASRSubtitleItem[], outputPath: string): Promise<void> {
    logger.info('导出 SRT 文件', { outputPath, itemCount: items.length })

    try {
      // Every cue is followed by a blank line
      const srtContent = this.buildSrtCues(items)
        .map((cue) => `${cue}\n`)
        .join('')

      await fs.promises.writeFile(outputPath, srtContent, 'utf-8')

      logger.info('SRT 文件导出成功', { outputPath })
    } catch (error) {
      logger.error('SRT 文件导出失败', {
        error: error instanceof Error ? error.message : String(error)
      })
      throw error
    }
  }

  /**
   * Write `items` to `outputPath` as a WebVTT file (UTF-8).
   *
   * The cues are first rendered as SRT and then converted with `subsrt-ts`.
   *
   * @throws Re-throws any conversion or file-system error after logging it.
   */
  public async exportToVTT(items: ASRSubtitleItem[], outputPath: string): Promise<void> {
    logger.info('导出 VTT 文件', { outputPath, itemCount: items.length })

    try {
      const srtContent = this.buildSrtCues(items).join('\n')
      const vttContent = convert(srtContent, { from: 'srt', to: 'vtt' })

      await fs.promises.writeFile(outputPath, vttContent, 'utf-8')

      logger.info('VTT 文件导出成功', { outputPath })
    } catch (error) {
      logger.error('VTT 文件导出失败', {
        error: error instanceof Error ? error.message : String(error)
      })
      throw error
    }
  }

  /** Render each item as an SRT cue: 1-based number, time range, text, newline. */
  private buildSrtCues(items: ASRSubtitleItem[]): string[] {
    return items.map(
      (item) =>
        `${item.index + 1}\n` +
        `${this.formatTime(item.startTime)} --> ${this.formatTime(item.endTime)}\n` +
        `${item.text}\n`
    )
  }

  /**
   * Format a time in seconds as an SRT timestamp (`HH:MM:SS,mmm`).
   *
   * The value is rounded to whole milliseconds before being split into fields.
   * Truncating each field separately loses a millisecond for values such as
   * 1.001, whose fractional part is stored as 0.000999…. Negative and
   * non-finite inputs are formatted as zero.
   */
  private formatTime(seconds: number): string {
    const totalMillis = Number.isFinite(seconds) ? Math.max(0, Math.round(seconds * 1000)) : 0

    const millis = totalMillis % 1000
    const totalSeconds = Math.floor(totalMillis / 1000)
    const secs = totalSeconds % 60
    const minutes = Math.floor(totalSeconds / 60) % 60
    const hours = Math.floor(totalSeconds / 3600)

    const pad = (value: number, width: number): string => String(value).padStart(width, '0')

    return `${pad(hours, 2)}:${pad(minutes, 2)}:${pad(secs, 2)},${pad(millis, 3)}`
  }
}

export default SubtitleFormatter
/**
 * Build a stand-in for a file read stream.
 *
 * `on` records listeners in `handlers` (keyed by event name) instead of
 * attaching them anywhere, so a test can fire them by hand.
 */
const createFakeReadStream = () => {
  const handlers: HandlerMap = {}

  const stream = {
    pipe: vi.fn(),
    destroy: vi.fn(),
    on: vi.fn((eventName: string, listener: Handler) => {
      const registered = (handlers[eventName] ||= [])
      registered.push(listener)
      return stream
    })
  }

  return { stream, handlers }
}

/**
 * Build a stand-in for an HTTP client request.
 *
 * Listeners passed to `on` are recorded per event name; the callback passed to
 * `setTimeout` is recorded under the `timeout` key. Nothing is ever sent.
 */
const createFakeRequest = () => {
  const handlers: HandlerMap = {}

  const remember = (eventName: string, listener: Handler): void => {
    const registered = (handlers[eventName] ||= [])
    registered.push(listener)
  }

  const req = {
    on: vi.fn((eventName: string, listener: Handler) => {
      remember(eventName, listener)
      return req
    }),
    setTimeout: vi.fn((_: number, listener: Handler) => {
      remember('timeout', listener)
      return req
    }),
    destroy: vi.fn(),
    end: vi.fn()
  }

  return { req, handlers }
}

/**
 * Simulate the arrival of an HTTP response.
 *
 * The status code is set immediately. On the next tick every chunk is
 * delivered to the recorded `data` listeners, followed by the `end` listeners.
 */
const emitResponse = (
  response: { statusCode: number },
  handlers: HandlerMap,
  statusCode: number,
  bodyChunks: Array<string | Buffer> = []
) => {
  response.statusCode = statusCode

  process.nextTick(() => {
    bodyChunks.forEach((chunk) => {
      for (const onData of handlers.data ?? []) {
        onData(chunk)
      }
    })

    for (const onEnd of handlers.end ?? []) {
      onEnd()
    }
  })
}
= vi.spyOn(https, 'request') + }) + + afterEach(() => { + vi.restoreAllMocks() + statMock.mockReset() + createReadStreamMock.mockReset() + }) + + it('resolves with parsed response when Deepgram returns 200', async () => { + const audioPath = '/tmp/audio.m4a' + const callOptions = { + apiKey: 'test-key', + model: 'nova-2' as const, + language: 'en', + smartFormat: true, + utterances: true, + utteranceEndMs: 750 + } + + const fakeResponse: DeepgramResponse = { + results: { + channels: [ + { + alternatives: [ + { + transcript: 'hello world', + confidence: 0.95, + words: [] + } + ], + utterances: [] + } + ] + }, + metadata: { + request_id: 'req-123', + duration: 1.23, + channels: 1 + } + } + + const readStream = createFakeReadStream() + createReadStreamMock.mockReturnValue(readStream.stream as any) + + let currentRequest: ReturnType | undefined + + requestSpy.mockImplementation((url: string, options: any, callback: any) => { + const searchParams = new URL(url).searchParams + expect(searchParams.get('model')).toBe('nova-2') + expect(searchParams.get('language')).toBe('en') + expect(searchParams.get('detect_language')).toBeNull() + expect(searchParams.get('smart_format')).toBe('true') + expect(searchParams.get('utterances')).toBe('true') + expect(searchParams.get('utterance_end_ms')).toBe('750') + + expect(options.method).toBe('POST') + expect(options.headers['Content-Type']).toBe('audio/mp4') + expect(options.headers['Content-Length']).toBe(1024) + expect(options.headers.Authorization).toBe(`Token ${callOptions.apiKey}`) + + const responseHandlers: HandlerMap = {} + const response = { + statusCode: 200, + on: vi.fn((event: string, handler: Handler) => { + ;(responseHandlers[event] ||= []).push(handler) + return response + }) + } + + callback?.(response as any) + + currentRequest = createFakeRequest() + readStream.stream.pipe.mockReturnValue(currentRequest.req as any) + + emitResponse(response, responseHandlers, 200, [Buffer.from(JSON.stringify(fakeResponse))]) + + 
return currentRequest.req as any + }) + + const result = await (transcriber as any).callDeepgramAPI(audioPath, callOptions) + + expect(result).toEqual(fakeResponse) + expect(statMock).toHaveBeenCalledWith(audioPath) + expect(readStream.stream.pipe).toHaveBeenCalledWith(currentRequest?.req) + }) + + it('rejects with specific error when Deepgram returns 401', async () => { + const audioPath = '/tmp/audio.wav' + const callOptions = { + apiKey: 'test-key', + model: 'nova-3' as const, + language: 'auto' as const, + smartFormat: false, + utterances: false, + utteranceEndMs: 500 + } + + const readStream = createFakeReadStream() + createReadStreamMock.mockReturnValue(readStream.stream as any) + + requestSpy.mockImplementation((url: string, options: any, callback: any) => { + const params = new URL(url).searchParams + expect(params.get('model')).toBe('nova-3') + expect(params.get('language')).toBeNull() + expect(params.get('detect_language')).toBe('true') + expect(params.get('smart_format')).toBe('false') + expect(params.get('utterances')).toBe('false') + expect(params.get('utterance_end_ms')).toBe('500') + + expect(options.headers['Content-Type']).toBe('audio/wav') + + const responseHandlers: HandlerMap = {} + const response = { + statusCode: 401, + on: vi.fn((event: string, handler: Handler) => { + ;(responseHandlers[event] ||= []).push(handler) + return response + }) + } + + callback?.(response as any) + + const requestState = createFakeRequest() + readStream.stream.pipe.mockReturnValue(requestState.req as any) + + emitResponse(response, responseHandlers, 401, ['Unauthorized']) + + return requestState.req as any + }) + + await expect((transcriber as any).callDeepgramAPI(audioPath, callOptions)).rejects.toThrow( + 'API Key 无效' + ) + }) + + it('rejects when Deepgram returns invalid JSON body', async () => { + const audioPath = '/tmp/audio.flac' + const callOptions = { + apiKey: 'test-key' + } + + const readStream = createFakeReadStream() + 
createReadStreamMock.mockReturnValue(readStream.stream as any) + + requestSpy.mockImplementation((_url: string, _options: any, callback: any) => { + const responseHandlers: HandlerMap = {} + const response = { + statusCode: 200, + on: vi.fn((event: string, handler: Handler) => { + ;(responseHandlers[event] ||= []).push(handler) + return response + }) + } + + callback?.(response as any) + + const requestState = createFakeRequest() + readStream.stream.pipe.mockReturnValue(requestState.req as any) + + emitResponse(response, responseHandlers, 200, ['{ invalid json']) + + return requestState.req as any + }) + + await expect((transcriber as any).callDeepgramAPI(audioPath, callOptions)).rejects.toThrow( + '解析 Deepgram 响应失败' + ) + }) + + it('rejects on network error and destroys read stream', async () => { + const audioPath = '/tmp/audio.ogg' + const callOptions = { + apiKey: 'test-key' + } + + const readStream = createFakeReadStream() + createReadStreamMock.mockReturnValue(readStream.stream as any) + + requestSpy.mockImplementation((_url: string, _options: any, callback: any) => { + const responseHandlers: HandlerMap = {} + const response = { + statusCode: 200, + on: vi.fn((event: string, handler: Handler) => { + ;(responseHandlers[event] ||= []).push(handler) + return response + }) + } + + callback?.(response as any) + + const requestState = createFakeRequest() + readStream.stream.pipe.mockReturnValue(requestState.req as any) + + process.nextTick(() => { + requestState.handlers.error?.forEach((handler) => handler(new Error('connection reset'))) + }) + + return requestState.req as any + }) + + await expect((transcriber as any).callDeepgramAPI(audioPath, callOptions)).rejects.toThrow( + '网络错误: connection reset' + ) + + expect(readStream.stream.destroy).toHaveBeenCalledTimes(1) + }) +}) diff --git a/src/main/services/asr/__tests__/DeepgramTranscriber.makeValidationRequest.test.ts b/src/main/services/asr/__tests__/DeepgramTranscriber.makeValidationRequest.test.ts new file 
import https from 'https'
import { afterEach, beforeAll, beforeEach, describe, expect, it, vi } from 'vitest'

type SpyInstance = any

// Logger double shared by every test; its call history is cleared in `beforeEach`.
const mockLogger = {
  info: vi.fn(),
  debug: vi.fn(),
  warn: vi.fn(),
  error: vi.fn()
}

vi.mock('../../LoggerService', () => ({
  loggerService: {
    withContext: () => mockLogger
  }
}))

type Handler = (...args: any[]) => void
type HandlerMap = Record<string, Handler[]>
type BodyChunk = string | Buffer

/** Minimal stand-in for the object handed to the `https.request` callback. */
interface FakeResponse {
  statusCode: number
  on: (event: string, handler: Handler) => FakeResponse
}

/** What the stubbed request does once `https.request` has returned. */
type RequestOutcome =
  | { kind: 'respond'; body?: BodyChunk[] }
  | { kind: 'error'; error: Error }
  | { kind: 'timeout' }

/**
 * Builds a fake request object that records the handlers registered on it.
 * Handlers passed to `setTimeout` are stored under the `timeout` key.
 */
const createFakeRequest = () => {
  const handlers: HandlerMap = {}
  const req = {
    on: vi.fn((event: string, handler: Handler) => {
      ;(handlers[event] ||= []).push(handler)
      return req
    }),
    setTimeout: vi.fn((_: number, handler: Handler) => {
      ;(handlers.timeout ||= []).push(handler)
      return req
    }),
    destroy: vi.fn(),
    end: vi.fn()
  }

  return { req, handlers }
}

/**
 * Sets the status code and, on the next tick, replays the body chunks to the
 * registered `data` handlers followed by the `end` handlers.
 */
const emitResponse = (
  response: { statusCode: number },
  handlers: HandlerMap,
  statusCode: number,
  bodyChunks: BodyChunk[] = []
) => {
  response.statusCode = statusCode

  process.nextTick(() => {
    for (const chunk of bodyChunks) {
      handlers.data?.forEach((handler) => handler(chunk))
    }

    handlers.end?.forEach((handler) => handler())
  })
}

/**
 * Installs an `https.request` implementation on `requestSpy` that hands a fake
 * response with `statusCode` to the callback and then plays out `outcome`.
 *
 * @param requestSpy spy created with `vi.spyOn(https, 'request')`
 * @param statusCode status code exposed on the fake response
 * @param outcome whether to deliver a body, emit a request error, or fire the timeout handlers
 * @param inspectOptions optional hook that receives the request options for assertions
 */
const stubHttpsRequest = (
  requestSpy: SpyInstance,
  statusCode: number,
  outcome: RequestOutcome,
  inspectOptions?: (options: any) => void
) => {
  requestSpy.mockImplementation((options: any, callback: any) => {
    inspectOptions?.(options)

    const responseHandlers: HandlerMap = {}
    const response: FakeResponse = {
      statusCode,
      on: vi.fn((event: string, handler: Handler) => {
        ;(responseHandlers[event] ||= []).push(handler)
        return response
      })
    }

    callback?.(response as any)

    const requestState = createFakeRequest()

    switch (outcome.kind) {
      case 'respond':
        emitResponse(response, responseHandlers, statusCode, outcome.body)
        break
      case 'error': {
        const { error } = outcome
        // Deferred so the handlers attached after `https.request` returns are in place.
        process.nextTick(() => {
          requestState.handlers.error?.forEach((handler) => handler(error))
        })
        break
      }
      case 'timeout':
        // Simulate the timeout callback firing.
        process.nextTick(() => {
          requestState.handlers.timeout?.forEach((handler) => handler())
        })
        break
    }

    return requestState.req as any
  })
}

type DeepgramTranscriberClass = typeof import('../DeepgramTranscriber').default
let DeepgramTranscriber: DeepgramTranscriberClass

// Loaded dynamically, after the LoggerService mock above has been declared.
beforeAll(async () => {
  DeepgramTranscriber = (await import('../DeepgramTranscriber')).default
})

describe('DeepgramTranscriber.makeValidationRequest', () => {
  let transcriber: InstanceType<DeepgramTranscriberClass>
  let requestSpy: SpyInstance

  beforeEach(() => {
    // Drop logger calls recorded by earlier tests so `toHaveBeenCalledWith`
    // assertions only see calls made by the test that is running.
    vi.clearAllMocks()
    transcriber = new DeepgramTranscriber(1)
    requestSpy = vi.spyOn(https, 'request')
  })

  afterEach(() => {
    vi.restoreAllMocks()
  })

  it('返回验证成功当 Deepgram 返回 200', async () => {
    const apiKey = 'valid-api-key'

    stubHttpsRequest(
      requestSpy,
      200,
      { kind: 'respond', body: [Buffer.from('{"access_token": "test-token"}')] },
      (options) => {
        expect(options.hostname).toBe('api.deepgram.com')
        expect(options.path).toBe('/v1/auth/token')
        expect(options.method).toBe('GET')
        expect(options.headers.Authorization).toBe(`Token ${apiKey}`)
        expect(options.headers['Content-Type']).toBe('application/json')
        expect(options.timeout).toBe(8000)
      }
    )

    const result = await (transcriber as any).makeValidationRequest(apiKey)

    expect(result).toEqual({ valid: true })
    expect(mockLogger.info).toHaveBeenCalledWith('API Key 验证成功')
  })

  it('返回验证失败当 Deepgram 返回 401 INVALID_AUTH', async () => {
    const apiKey = 'invalid-api-key'
    const errorBody = JSON.stringify({
      err_code: 'INVALID_AUTH',
      message: 'Invalid credentials.'
    })

    stubHttpsRequest(requestSpy, 401, { kind: 'respond', body: [Buffer.from(errorBody)] })

    const result = await (transcriber as any).makeValidationRequest(apiKey)

    expect(result).toEqual({ valid: false, error: 'API Key 无效' })
  })

  it('返回验证失败当 Deepgram 返回 401 INSUFFICIENT_PERMISSIONS', async () => {
    const apiKey = 'insufficient-permissions-key'
    const errorBody = JSON.stringify({
      err_code: 'INSUFFICIENT_PERMISSIONS',
      message: 'Insufficient permissions.'
    })

    stubHttpsRequest(requestSpy, 401, { kind: 'respond', body: [Buffer.from(errorBody)] })

    const result = await (transcriber as any).makeValidationRequest(apiKey)

    expect(result).toEqual({ valid: false, error: 'API Key 权限不足' })
  })

  it('返回验证失败当 Deepgram 返回 401 但响应体不是有效 JSON', async () => {
    const apiKey = 'invalid-api-key'

    stubHttpsRequest(requestSpy, 401, { kind: 'respond', body: ['Invalid credentials'] })

    const result = await (transcriber as any).makeValidationRequest(apiKey)

    expect(result).toEqual({ valid: false, error: 'API Key 认证失败' })
  })

  it('返回验证失败当 Deepgram 返回 403', async () => {
    const apiKey = 'forbidden-api-key'

    stubHttpsRequest(requestSpy, 403, { kind: 'respond' })

    const result = await (transcriber as any).makeValidationRequest(apiKey)

    expect(result).toEqual({ valid: false, error: 'API Key 权限不足或访问被拒绝' })
  })

  it('返回验证失败当收到其他 HTTP 状态码', async () => {
    const apiKey = 'test-api-key'

    stubHttpsRequest(requestSpy, 500, {
      kind: 'respond',
      body: [Buffer.from('Internal Server Error')]
    })

    const result = await (transcriber as any).makeValidationRequest(apiKey)

    expect(result).toEqual({ valid: false, error: '验证失败 (HTTP 500)' })
    expect(mockLogger.warn).toHaveBeenCalledWith('API Key 验证收到意外状态码', {
      statusCode: 500,
      body: 'Internal Server Error'
    })
  })

  it('返回验证失败当网络请求出错', async () => {
    const apiKey = 'test-api-key'

    stubHttpsRequest(requestSpy, 200, { kind: 'error', error: new Error('ECONNRESET') })

    const result = await (transcriber as any).makeValidationRequest(apiKey)

    expect(result).toEqual({ valid: false, error: '网络连接失败,请检查网络设置' })
    expect(mockLogger.error).toHaveBeenCalledWith('API Key 验证请求失败', { error: 'ECONNRESET' })
  })

  it('返回验证失败当请求超时', async () => {
    const apiKey = 'test-api-key'

    stubHttpsRequest(requestSpy, 200, { kind: 'timeout' })

    const result = await (transcriber as any).makeValidationRequest(apiKey)

    expect(result).toEqual({ valid: false, error: '验证请求超时,请稍后重试' })
    expect(mockLogger.error).toHaveBeenCalledWith('API Key 验证请求超时')
    expect(requestSpy.mock.results[0].value.destroy).toHaveBeenCalled()
  })
})
/**
 * Extracts the media duration from FFmpeg's textual output.
 *
 * Looks for the first `Duration: HH:MM:SS[.fraction]` token. The hour field
 * may have more than two digits (inputs of 100 hours or longer) and the
 * fractional part may have any number of digits; a missing fraction counts
 * as zero.
 *
 * @param output text emitted by FFmpeg (typically its stderr)
 * @returns the duration in seconds, or `null` when no duration token is
 *   present (for example `Duration: N/A`)
 */
export function parseFFmpegDuration(output: string): number | null {
  const match = /Duration: (\d{2,}):(\d{2}):(\d{2})(?:\.(\d+))?/.exec(output)
  if (match === null) {
    return null
  }

  const hours = Number(match[1])
  const minutes = Number(match[2])
  const seconds = Number(match[3])
  const fractionDigits = match[4] ?? ''

  // Scale the fraction by its own digit count so ".5", ".50" and ".500" agree.
  const fractionalSeconds =
    fractionDigits.length > 0 ? Number(fractionDigits) / 10 ** fractionDigits.length : 0

  return hours * 3600 + minutes * 60 + seconds + fractionalSeconds
}
/**
 * Runs FFmpeg with the given arguments and waits for the process to finish.
 *
 * The returned promise never rejects; every failure is reported through the
 * `success` / `error` fields. It settles exactly once, on whichever happens
 * first: the process closing, a spawn/process error, or the timeout. Later
 * events for the same process are ignored, so a timeout is not followed by a
 * second "failed" log entry when the killed process eventually closes.
 *
 * @param ffmpegPath path of the FFmpeg executable to spawn
 * @param args command-line arguments passed to FFmpeg
 * @param timeoutMs time after which the process is killed; defaults to 5 minutes
 * @returns `success: true` with the duration parsed from stderr (when present)
 *   if FFmpeg exits with code 0, otherwise `success: false` with an error message
 */
private async runFFmpegExtract(
  ffmpegPath: string,
  args: string[],
  timeoutMs: number = 5 * 60 * 1000
): Promise<{ success: boolean; duration?: number; error?: string }> {
  return new Promise((resolve) => {
    const ffmpeg = spawn(ffmpegPath, args)

    let stderrOutput = ''
    let settled = false

    // Single exit point: the first outcome wins and the timer is released.
    const settle = (result: { success: boolean; duration?: number; error?: string }) => {
      if (settled) {
        return
      }
      settled = true
      clearTimeout(timeoutHandle)
      resolve(result)
    }

    ffmpeg.stderr?.on('data', (chunk) => {
      stderrOutput += chunk.toString()
    })

    const timeoutHandle = setTimeout(() => {
      if (!ffmpeg.killed) {
        ffmpeg.kill('SIGKILL')
      }
      logger.error('FFmpeg 提取超时')
      settle({ success: false, error: 'FFmpeg 提取超时' })
    }, timeoutMs)

    ffmpeg.on('close', (code) => {
      if (settled) {
        // Already reported (timeout or process error); avoid a second log entry.
        return
      }

      if (code === 0) {
        // Parse the accumulated stderr once, so a "Duration:" token that was
        // split across two data chunks is still recognised in full.
        const duration = parseFFmpegDuration(stderrOutput) ?? undefined
        logger.debug('FFmpeg 提取成功', { code, duration })
        settle({ success: true, duration })
      } else {
        logger.error('FFmpeg 提取失败', {
          code,
          error: stderrOutput.slice(-500)
        })
        settle({ success: false, error: `FFmpeg 退出码: ${code}` })
      }
    })

    ffmpeg.on('error', (error) => {
      if (settled) {
        return
      }
      logger.error('FFmpeg 进程错误', { error: error.message })
      settle({ success: false, error: error.message })
    })
  })
}
error.message : String(error) + }) + } + } +} + +export default AudioPreprocessor diff --git a/src/preload/index.ts b/src/preload/index.ts index 13deba01..d77bd60a 100644 --- a/src/preload/index.ts +++ b/src/preload/index.ts @@ -2,6 +2,7 @@ import { electronAPI } from '@electron-toolkit/preload' import { UpgradeChannel } from '@shared/config/constant' import { LogLevel, LogSourceWithContext } from '@shared/config/logger' import { IpcChannel } from '@shared/IpcChannel' +import type { ASRGenerateOptions, ASRProgress, ASRResult } from '@shared/types' import { DictionaryResponse, FFmpegVideoInfo, Shortcut, ThemeMode } from '@types' import { contextBridge, ipcRenderer, OpenDialogOptions, shell, webUtils } from 'electron' import type { @@ -268,6 +269,20 @@ const api = { timeoutMs ) }, + asr: { + generate: (options: ASRGenerateOptions): Promise => + ipcRenderer.invoke(IpcChannel.ASR_Generate, options), + cancel: (taskId: string): Promise => ipcRenderer.invoke(IpcChannel.ASR_Cancel, taskId), + validateApiKey: (apiKey: string): Promise => + ipcRenderer.invoke(IpcChannel.ASR_ValidateApiKey, apiKey), + onProgress: (listener: (progress: ASRProgress) => void) => { + const handler = (_event: unknown, payload: ASRProgress) => listener(payload) + ipcRenderer.on(IpcChannel.ASR_Progress, handler) + return () => { + ipcRenderer.removeListener(IpcChannel.ASR_Progress, handler) + } + } + }, uv: { checkInstallation: (): Promise<{ exists: boolean @@ -584,79 +599,6 @@ const api = { ipcRenderer.invoke(IpcChannel.DB_PlayerSettings_Has, videoId) } } - // Binary related APIs - // isBinaryExist: (name: string) => ipcRenderer.invoke(IpcChannel.App_IsBinaryExist, name), - // getBinaryPath: (name: string) => ipcRenderer.invoke(IpcChannel.App_GetBinaryPath, name), - // installUVBinary: () => ipcRenderer.invoke(IpcChannel.App_InstallUvBinary), - // installBunBinary: () => ipcRenderer.invoke(IpcChannel.App_InstallBunBinary), - - // searchService: { - // openSearchWindow: (uid: string) => 
ipcRenderer.invoke(IpcChannel.SearchWindow_Open, uid), - // closeSearchWindow: (uid: string) => ipcRenderer.invoke(IpcChannel.SearchWindow_Close, uid), - // openUrlInSearchWindow: (uid: string, url: string) => - // ipcRenderer.invoke(IpcChannel.SearchWindow_OpenUrl, uid, url) - // }, - // webview: { - // setOpenLinkExternal: (webviewId: number, isExternal: boolean) => - // ipcRenderer.invoke(IpcChannel.Webview_SetOpenLinkExternal, webviewId, isExternal), - // setSpellCheckEnabled: (webviewId: number, isEnable: boolean) => - // ipcRenderer.invoke(IpcChannel.Webview_SetSpellCheckEnabled, webviewId, isEnable) - // }, - // storeSync: { - // subscribe: () => ipcRenderer.invoke(IpcChannel.StoreSync_Subscribe), - // unsubscribe: () => ipcRenderer.invoke(IpcChannel.StoreSync_Unsubscribe), - // onUpdate: (action: any) => ipcRenderer.invoke(IpcChannel.StoreSync_OnUpdate, action) - // }, - // selection: { - // hideToolbar: () => ipcRenderer.invoke(IpcChannel.Selection_ToolbarHide), - // writeToClipboard: (text: string) => - // ipcRenderer.invoke(IpcChannel.Selection_WriteToClipboard, text), - // determineToolbarSize: (width: number, height: number) => - // ipcRenderer.invoke(IpcChannel.Selection_ToolbarDetermineSize, width, height), - // setEnabled: (enabled: boolean) => ipcRenderer.invoke(IpcChannel.Selection_SetEnabled, enabled), - // setTriggerMode: (triggerMode: string) => - // ipcRenderer.invoke(IpcChannel.Selection_SetTriggerMode, triggerMode), - // setFollowToolbar: (isFollowToolbar: boolean) => - // ipcRenderer.invoke(IpcChannel.Selection_SetFollowToolbar, isFollowToolbar), - // setRemeberWinSize: (isRemeberWinSize: boolean) => - // ipcRenderer.invoke(IpcChannel.Selection_SetRemeberWinSize, isRemeberWinSize), - // setFilterMode: (filterMode: string) => - // ipcRenderer.invoke(IpcChannel.Selection_SetFilterMode, filterMode), - // setFilterList: (filterList: string[]) => - // ipcRenderer.invoke(IpcChannel.Selection_SetFilterList, filterList), - // processAction: 
(actionItem: ActionItem, isFullScreen: boolean = false) => - // ipcRenderer.invoke(IpcChannel.Selection_ProcessAction, actionItem, isFullScreen), - // closeActionWindow: () => ipcRenderer.invoke(IpcChannel.Selection_ActionWindowClose), - // minimizeActionWindow: () => ipcRenderer.invoke(IpcChannel.Selection_ActionWindowMinimize), - // pinActionWindow: (isPinned: boolean) => - // ipcRenderer.invoke(IpcChannel.Selection_ActionWindowPin, isPinned) - // }, - // quoteToMainWindow: (text: string) => ipcRenderer.invoke(IpcChannel.App_QuoteToMain, text), - // setDisableHardwareAcceleration: (isDisable: boolean) => - // ipcRenderer.invoke(IpcChannel.App_SetDisableHardwareAcceleration, isDisable), - // trace: { - // saveData: (topicId: string) => ipcRenderer.invoke(IpcChannel.TRACE_SAVE_DATA, topicId), - // getData: (topicId: string, traceId: string, modelName?: string) => - // ipcRenderer.invoke(IpcChannel.TRACE_GET_DATA, topicId, traceId, modelName), - // saveEntity: (entity: SpanEntity) => ipcRenderer.invoke(IpcChannel.TRACE_SAVE_ENTITY, entity), - // getEntity: (spanId: string) => ipcRenderer.invoke(IpcChannel.TRACE_GET_ENTITY, spanId), - // bindTopic: (topicId: string, traceId: string) => - // ipcRenderer.invoke(IpcChannel.TRACE_BIND_TOPIC, topicId, traceId), - // tokenUsage: (spanId: string, usage: TokenUsage) => - // ipcRenderer.invoke(IpcChannel.TRACE_TOKEN_USAGE, spanId, usage), - // cleanHistory: (topicId: string, traceId: string, modelName?: string) => - // ipcRenderer.invoke(IpcChannel.TRACE_CLEAN_HISTORY, topicId, traceId, modelName), - // cleanTopic: (topicId: string, traceId?: string) => - // ipcRenderer.invoke(IpcChannel.TRACE_CLEAN_TOPIC, topicId, traceId), - // openWindow: (topicId: string, traceId: string, autoOpen?: boolean, modelName?: string) => - // ipcRenderer.invoke(IpcChannel.TRACE_OPEN_WINDOW, topicId, traceId, autoOpen, modelName), - // setTraceWindowTitle: (title: string) => ipcRenderer.invoke(IpcChannel.TRACE_SET_TITLE, title), - // 
addEndMessage: (spanId: string, modelName: string, context: string) => - // ipcRenderer.invoke(IpcChannel.TRACE_ADD_END_MESSAGE, spanId, modelName, context), - // cleanLocalData: () => ipcRenderer.invoke(IpcChannel.TRACE_CLEAN_LOCAL_DATA), - // addStreamMessage: (spanId: string, modelName: string, context: string, message: any) => - // ipcRenderer.invoke(IpcChannel.TRACE_ADD_STREAM_MESSAGE, spanId, modelName, context, message) - // } } // Use `contextBridge` APIs to expose Electron APIs to diff --git a/src/renderer/src/i18n/locales/en-us.json b/src/renderer/src/i18n/locales/en-us.json index 3e56423c..b8850941 100644 --- a/src/renderer/src/i18n/locales/en-us.json +++ b/src/renderer/src/i18n/locales/en-us.json @@ -1,5 +1,6 @@ { "common": { + "confirm": "Yes", "favorites": "Favorites", "favorites_developing": "This feature is under development", "home": "Home", @@ -12,6 +13,50 @@ "title": "Documentation" }, "player": { + "asr": { + "errors": { + "apiQuotaExceeded": "API quota exceeded, please check your Deepgram account", + "audioExtractionFailed": "Audio extraction failed, please check the video file", + "invalidApiKey": "Invalid API Key, please check settings", + "networkError": "Network error, please check your connection and retry", + "noApiKey": "Please configure Deepgram API Key in settings first", + "transcriptionFailed": "Transcription failed, please retry", + "unknown": "Generation failed: {{message}}" + }, + "progress": { + "cancel": "Cancel", + "cancelConfirm": "Are you sure you want to cancel subtitle generation?", + "cancelConfirmDescription": "All temporary files will be cleaned up and generated content will not be saved.", + "confirmCancel": "Confirm Cancel", + "estimatedTime": "Estimated remaining time: {{minutes}} minutes", + "stages": { + "complete": "Generation complete!", + "extracting": "Extracting audio...", + "failed": "Generation failed", + "formatting": "Formatting subtitles...", + "initializing": "Initializing...", + "saving": "Saving 
subtitles...", + "transcribing": "Transcribing..." + }, + "title": "Generating Subtitles" + }, + "prompt": { + "configureApiKey": "Please configure Deepgram API Key in settings first", + "description": "Use Deepgram speech recognition technology to automatically generate accurate subtitles with multi-language support and word-level timestamps.", + "dontShowAgain": "Don't show again", + "estimatedTime": "Estimated time: {{minutes}} minutes", + "generate": "Generate", + "language": "Select Language", + "later": "Later", + "message": "Would you like to automatically generate subtitles using AI?", + "title": "No Subtitles Detected" + }, + "success": { + "autoLoaded": "Subtitles have been automatically loaded", + "message": "Successfully generated {{count}} subtitle items", + "title": "Subtitle Generation Successful" + } + }, "controls": { "subtitle": { "background-type": { @@ -90,23 +135,24 @@ "subtitleList": { "empty": { "description": "Choose a way to start adding subtitles", - "title": "No matching subtitle file found", "options": { + "ai": { + "action": "Coming Soon", + "actionEnabled": "Generate", + "description": "Generate word-level subtitles based on speech recognition", + "title": "AI-Generated Subtitles" + }, "embedded": { - "title": "Use Embedded Subtitles", + "action": "Select", "description": "Video file contains subtitle tracks that can be imported directly", - "action": "Select" + "title": "Use Embedded Subtitles" }, "external": { - "title": "Import External Subtitles", - "description": "Import SRT, VTT, and other subtitle formats from local files" - }, - "ai": { - "title": "AI-Generated Subtitles", - "description": "Generate word-level subtitles based on speech recognition", - "action": "Coming Soon" + "description": "Import SRT, VTT, and other subtitle formats from local files", + "title": "Import External Subtitles" } - } + }, + "title": "No matching subtitle file found" }, "search": { "count": "Found {{count}} subtitle", @@ -120,11 +166,21 @@ } }, 
"subtitleTrackSelector": { - "title": "Import Embedded Subtitle Tracks", + "actions": { + "cancel": "Cancel", + "import": "Import" + }, "empty": "No subtitle tracks detected", + "messages": { + "extractFailed": "Failed to extract subtitle track {{index}}", + "importFailed": "Failed to extract subtitle tracks, please try again", + "importMultipleSuccess": "Imported {{tracks}} subtitle tracks ({{count}} items total)", + "importSuccess": "Imported subtitle: {{source}} ({{count}} items)", + "selectAtLeastOne": "Please select at least one subtitle track" + }, "sections": { - "text": "Text Subtitle Tracks", - "image": "PGS Subtitle Tracks (Image Subtitles)" + "image": "PGS Subtitle Tracks (Image Subtitles)", + "text": "Text Subtitle Tracks" }, "stream": { "label": "Stream {{index}}", @@ -134,19 +190,9 @@ "unsupported": "Unsupported" } }, + "title": "Import Embedded Subtitle Tracks", "warning": { "pgs": "PGS is an image-based subtitle format that requires OCR technology support. Import is not currently supported." - }, - "actions": { - "cancel": "Cancel", - "import": "Import" - }, - "messages": { - "selectAtLeastOne": "Please select at least one subtitle track", - "extractFailed": "Failed to extract subtitle track {{index}}", - "importFailed": "Failed to extract subtitle tracks, please try again", - "importSuccess": "Imported subtitle: {{source}} ({{count}} items)", - "importMultipleSuccess": "Imported {{tracks}} subtitle tracks ({{count}} items total)" } } }, @@ -189,6 +235,43 @@ "appearance": { "title": "Appearance Settings" }, + "asr": { + "apiKey": { + "description": "Get an API Key from Deepgram to use AI subtitle generation features", + "getKey": "Get API Key", + "invalid": "API Key is invalid", + "label": "Deepgram API Key", + "placeholder": "Enter your Deepgram API Key", + "saveFailed": "Failed to save", + "saved": "API Key saved", + "valid": "API Key is valid", + "validate": "Validate", + "validating": "Validating..." 
+ }, + "defaultLanguage": { + "description": "Default language for automatic subtitle generation", + "label": "Default Language" + }, + "description": "Configure AI subtitle auto-generation using Deepgram speech recognition technology to generate accurate subtitles for videos", + "languages": { + "auto": "Auto Detect", + "de": "German", + "en": "English", + "es": "Spanish", + "fr": "French", + "ja": "Japanese", + "ko": "Korean", + "ru": "Russian", + "zh": "Chinese" + }, + "model": { + "description": "Select Deepgram transcription model", + "label": "Transcription Model", + "nova2": "Nova 2 (Recommended)", + "nova3": "Nova 3 (Latest)" + }, + "title": "Speech Recognition" + }, "developer": { "enable_developer_mode": "Enable developer mode", "title": "Developer mode" diff --git a/src/renderer/src/i18n/locales/zh-cn.json b/src/renderer/src/i18n/locales/zh-cn.json index 2432b31d..a3138f57 100644 --- a/src/renderer/src/i18n/locales/zh-cn.json +++ b/src/renderer/src/i18n/locales/zh-cn.json @@ -4,6 +4,7 @@ }, "common": { "cancel": "取消", + "confirm": "确认", "disabled": "已关闭", "enabled": "已开启", "favorites": "收藏", @@ -44,6 +45,50 @@ } }, "player": { + "asr": { + "errors": { + "apiQuotaExceeded": "API 配额已用尽,请检查您的 Deepgram 账户", + "audioExtractionFailed": "音频提取失败,请检查视频文件", + "invalidApiKey": "API Key 无效,请检查设置", + "networkError": "网络错误,请检查连接后重试", + "noApiKey": "请先在设置中配置 Deepgram API Key", + "transcriptionFailed": "转写失败,请重试", + "unknown": "生成失败:{{message}}" + }, + "progress": { + "cancel": "取消", + "cancelConfirm": "确定要取消字幕生成吗?", + "cancelConfirmDescription": "取消后将清理所有临时文件,已生成的内容将不会保存。", + "confirmCancel": "确认取消", + "estimatedTime": "预计剩余时间:{{minutes}} 分钟", + "stages": { + "complete": "生成完成!", + "extracting": "提取音频中...", + "failed": "生成失败", + "formatting": "格式化字幕中...", + "initializing": "初始化中...", + "saving": "保存字幕中...", + "transcribing": "转写中..." 
+ }, + "title": "正在生成字幕" + }, + "prompt": { + "configureApiKey": "请先在设置中配置 Deepgram API Key", + "description": "使用 Deepgram 语音识别技术自动生成准确的字幕,支持多语言和词级时间戳。", + "dontShowAgain": "不再提示", + "estimatedTime": "预计需要 {{minutes}} 分钟", + "generate": "开始生成", + "language": "选择语言", + "later": "稍后", + "message": "是否使用 AI 自动生成字幕?", + "title": "未检测到字幕" + }, + "success": { + "autoLoaded": "字幕已自动加载", + "message": "已成功生成 {{count}} 条字幕", + "title": "字幕生成成功" + } + }, "controls": { "auto_pause": { "disabled": "字幕未加载", @@ -209,49 +254,55 @@ "subtitle": "检测到当前视频格式不受支持", "title": "视频格式不兼容" }, - "subtitles": { - "hide": "隐藏字幕列表", - "search": "搜索字幕", - "show": "展开字幕列表" - }, "subtitleList": { "empty": { - "title": "在视频文件同目录下未找到匹配的字幕文件", "description": "选择一种方式开始添加字幕", "options": { + "ai": { + "action": "即将推出", + "actionEnabled": "生成", + "description": "基于语音识别生成单词级字幕", + "title": "AI 生成字幕" + }, "embedded": { - "title": "使用内嵌字幕", + "action": "选择", "description": "视频文件包含字幕轨道,可直接导入", - "action": "选择" + "title": "使用内嵌字幕" }, "external": { - "title": "导入外挂字幕", - "description": "从本地文件导入 SRT、VTT 等格式字幕" - }, - "ai": { - "title": "AI 生成字幕", - "description": "基于语音识别生成单词级字幕", - "action": "即将推出" + "description": "从本地文件导入 SRT、VTT 等格式字幕", + "title": "导入外挂字幕" } - } + }, + "title": "在视频文件同目录下未找到匹配的字幕文件" }, "search": { - "placeholder": "搜索字幕...", - "pending": "搜索中...", "count": "找到 {{count}} 条字幕", "count_one": "找到 {{count}} 条字幕", "count_other": "找到 {{count}} 条字幕", - "none": "未找到匹配的字幕", + "emptySubtitle": "请尝试其他关键词", "emptyTitle": "未找到匹配结果", - "emptySubtitle": "请尝试其他关键词" + "none": "未找到匹配的字幕", + "pending": "搜索中...", + "placeholder": "搜索字幕..." 
} }, "subtitleTrackSelector": { - "title": "导入内嵌字幕轨道", + "actions": { + "cancel": "取消", + "import": "导入" + }, "empty": "未检测到字幕轨道", + "messages": { + "extractFailed": "提取字幕轨道 {{index}} 失败", + "importFailed": "字幕轨道提取失败,请重试", + "importMultipleSuccess": "已导入 {{tracks}} 个字幕轨道(共 {{count}} 条)", + "importSuccess": "已导入字幕:{{source}}(共 {{count}} 条)", + "selectAtLeastOne": "请选择至少一个字幕轨道" + }, "sections": { - "text": "文本字幕轨道", - "image": "PGS 字幕轨(图像字幕)" + "image": "PGS 字幕轨(图像字幕)", + "text": "文本字幕轨道" }, "stream": { "label": "Stream {{index}}", @@ -261,20 +312,15 @@ "unsupported": "暂不支持" } }, + "title": "导入内嵌字幕轨道", "warning": { "pgs": "PGS 是图像格式字幕,需要 OCR 技术支持,暂不支持导入。" - }, - "actions": { - "cancel": "取消", - "import": "导入" - }, - "messages": { - "selectAtLeastOne": "请选择至少一个字幕轨道", - "extractFailed": "提取字幕轨道 {{index}} 失败", - "importFailed": "字幕轨道提取失败,请重试", - "importSuccess": "已导入字幕:{{source}}(共 {{count}} 条)", - "importMultipleSuccess": "已导入 {{tracks}} 个字幕轨道(共 {{count}} 条)" } + }, + "subtitles": { + "hide": "隐藏字幕列表", + "search": "搜索字幕", + "show": "展开字幕列表" } }, "search": { @@ -336,6 +382,43 @@ "appearance": { "title": "外观设置" }, + "asr": { + "apiKey": { + "description": "从 Deepgram 获取 API Key 以使用 AI 字幕生成功能", + "getKey": "获取 API Key", + "invalid": "API Key 无效", + "label": "Deepgram API Key", + "placeholder": "输入您的 Deepgram API Key", + "saveFailed": "保存失败", + "saved": "API Key 已保存", + "valid": "API Key 有效", + "validate": "验证", + "validating": "验证中..." 
+ }, + "defaultLanguage": { + "description": "自动生成字幕时使用的默认语言", + "label": "默认语言" + }, + "description": "配置 AI 字幕自动生成功能,使用 Deepgram 语音识别技术为视频生成准确的字幕", + "languages": { + "auto": "自动检测", + "de": "德语", + "en": "英语", + "es": "西班牙语", + "fr": "法语", + "ja": "日语", + "ko": "韩语", + "ru": "俄语", + "zh": "中文" + }, + "model": { + "description": "选择 Deepgram 转写模型", + "label": "转写模型", + "nova2": "Nova 2(推荐)", + "nova3": "Nova 3(最新)" + }, + "title": "语音识别" + }, "developer": { "enable_developer_mode": "启用开发者模式", "title": "开发者模式" diff --git a/src/renderer/src/infrastructure/types/subtitle.ts b/src/renderer/src/infrastructure/types/subtitle.ts index be82280d..e14b51a4 100644 --- a/src/renderer/src/infrastructure/types/subtitle.ts +++ b/src/renderer/src/infrastructure/types/subtitle.ts @@ -6,6 +6,15 @@ * Based on existing EchoPlayer project's subtitle processing features */ +// 单词级时间戳接口 / Word-level Timestamp Interface +export interface WordTimestamp { + readonly word: string + readonly start: number + readonly end: number + readonly confidence: number + readonly punctuated_word?: string +} + // 字幕项接口 / Subtitle Item Interface export interface SubtitleItem { readonly id: string @@ -13,6 +22,7 @@ export interface SubtitleItem { readonly endTime: number readonly originalText: string readonly translatedText?: string + readonly words?: WordTimestamp[] } // 字幕格式枚举 / Subtitle Format Enum diff --git a/src/renderer/src/pages/player/PlayerPage.tsx b/src/renderer/src/pages/player/PlayerPage.tsx index 80e4f4a9..8b042ad2 100644 --- a/src/renderer/src/pages/player/PlayerPage.tsx +++ b/src/renderer/src/pages/player/PlayerPage.tsx @@ -34,6 +34,8 @@ import styled from 'styled-components' import { NavbarIcon } from '.' 
import { + ASRProgressModal, + ASRSubtitlePrompt, ControllerPanel, PlayerSelector, ProgressBar, @@ -42,6 +44,7 @@ import { SubtitleTrackSelector, VideoErrorRecovery } from './components' +import { useASRSubtitle } from './hooks/useASRSubtitle' import { disposeGlobalOrchestrator } from './hooks/usePlayerEngine' import { PlayerPageProvider } from './state/player-page.provider' @@ -117,6 +120,18 @@ function PlayerPage() { // 保存原始文件路径用于字幕检测(不是 HLS 播放源) const originalFilePathRef = useRef(null) + // ASR subtitle generation + const { + asrEnabled, + showAsrPrompt, + showAsrProgress, + asrProgress, + handleOpenASRGenerator, + handleGenerateSubtitle, + handleCancelAsr, + handleAsrLater + } = useASRSubtitle(videoId, originalFilePathRef.current) + // 加载视频数据 useEffect(() => { let cancelled = false @@ -723,6 +738,8 @@ function PlayerPage() { subtitleStreams !== null && subtitleStreams.streams.length > 0 } onOpenEmbeddedSubtitleSelector={() => setShowSubtitleTrackSelector(true)} + asrEnabled={asrEnabled} + onOpenASRGenerator={handleOpenASRGenerator} /> @@ -759,6 +776,21 @@ function PlayerPage() { onImported={() => setShowSubtitleTrackSelector(false)} onDismiss={() => setUserDismissedEmbeddedSubtitles(true)} /> + + {/* ASR 字幕生成提示弹窗 */} + + + {/* ASR 字幕生成进度弹窗 */} + ) diff --git a/src/renderer/src/pages/player/components/ASRProgressModal.tsx b/src/renderer/src/pages/player/components/ASRProgressModal.tsx new file mode 100644 index 00000000..78a6c54f --- /dev/null +++ b/src/renderer/src/pages/player/components/ASRProgressModal.tsx @@ -0,0 +1,144 @@ +import { + ANIMATION_DURATION, + FONT_SIZES, + FONT_WEIGHTS, + SPACING +} from '@renderer/infrastructure/styles/theme' +import { ASRProgress, ASRProgressStage } from '@shared/types' +import { Button, Flex, Modal, Progress } from 'antd' +import { FC, useEffect, useState } from 'react' +import { useTranslation } from 'react-i18next' +import styled from 'styled-components' + +interface ASRProgressModalProps { + open: boolean + progress: 
ASRProgress + onCancel: () => void +} + +const Section = styled.div`` + +const StageTitle = styled.div` + font-size: ${FONT_SIZES.BASE}px; + font-weight: ${FONT_WEIGHTS.MEDIUM}; + margin-bottom: ${SPACING.MD}px; +` + +const EstimatedText = styled.div` + font-size: ${FONT_SIZES.SM}px; + color: var(--ant-color-text-secondary); +` + +const MessageText = styled.div` + font-size: ${FONT_SIZES.XS}px; + color: var(--ant-color-text-tertiary); +` + +const CancelButton = styled(Button)<{ $confirmMode: boolean }>` + transition: + color ${ANIMATION_DURATION.SLOW} ease-in-out, + border-color ${ANIMATION_DURATION.SLOW} ease-in-out; + + ${(props) => + props.$confirmMode && + ` + border-color: var(--ant-color-error) !important; + color: var(--color-error-text) !important; + + &:hover { + border-color: var(--ant-color-error) !important; + } + `} +` + +const ASRProgressModal: FC = ({ open, progress, onCancel }) => { + const { t } = useTranslation() + const [confirmMode, setConfirmMode] = useState(false) + + const getStageText = () => { + switch (progress.stage) { + case ASRProgressStage.Initializing: + return t('player.asr.progress.stages.initializing') + case ASRProgressStage.ExtractingAudio: + return t('player.asr.progress.stages.extracting') + case ASRProgressStage.Transcribing: + return t('player.asr.progress.stages.transcribing') + case ASRProgressStage.Formatting: + return t('player.asr.progress.stages.formatting') + case ASRProgressStage.Saving: + return t('player.asr.progress.stages.saving') + case ASRProgressStage.Complete: + return t('player.asr.progress.stages.complete') + case ASRProgressStage.Failed: + return t('player.asr.progress.stages.failed') + default: + return '' + } + } + + const handleCancel = () => { + if (confirmMode) { + onCancel() + setConfirmMode(false) + } else { + setConfirmMode(true) + } + } + + const handleCancelMouseLeave = () => { + if (confirmMode) { + setConfirmMode(false) + } + } + + useEffect(() => { + if (confirmMode) { + const timer = 
setTimeout(() => { + setConfirmMode(false) + }, 3000) + return () => clearTimeout(timer) + } + return undefined + }, [confirmMode]) + + const estimatedMinutes = progress.eta ? Math.ceil(progress.eta / 60) : undefined + + return ( + + +
+ {getStageText()} + +
+ + {estimatedMinutes !== undefined && estimatedMinutes > 0 && ( + + {t('player.asr.progress.estimatedTime', { minutes: estimatedMinutes })} + + )} + + {progress.message && {progress.message}} + + + + {confirmMode ? t('player.asr.progress.confirmCancel') : t('player.asr.progress.cancel')} + + +
+
+ ) +} + +export default ASRProgressModal diff --git a/src/renderer/src/pages/player/components/ASRSubtitlePrompt.tsx b/src/renderer/src/pages/player/components/ASRSubtitlePrompt.tsx new file mode 100644 index 00000000..aea74e28 --- /dev/null +++ b/src/renderer/src/pages/player/components/ASRSubtitlePrompt.tsx @@ -0,0 +1,85 @@ +import { useTheme } from '@renderer/contexts' +import { Button, Flex, Modal, Select } from 'antd' +import { FC, useState } from 'react' +import { useTranslation } from 'react-i18next' + +interface ASRSubtitlePromptProps { + open: boolean + onGenerate: (language: string) => void + onLater: () => void + estimatedMinutes?: number +} + +const ASRSubtitlePrompt: FC = ({ + open, + onGenerate, + onLater, + estimatedMinutes = 5 +}) => { + const { t } = useTranslation() + const { theme } = useTheme() + const [selectedLanguage, setSelectedLanguage] = useState('auto') + + const languageOptions = [ + { value: 'auto', label: t('settings.asr.languages.auto') }, + { value: 'en', label: t('settings.asr.languages.en') }, + { value: 'zh', label: t('settings.asr.languages.zh') }, + { value: 'ja', label: t('settings.asr.languages.ja') }, + { value: 'es', label: t('settings.asr.languages.es') }, + { value: 'fr', label: t('settings.asr.languages.fr') }, + { value: 'de', label: t('settings.asr.languages.de') }, + { value: 'ko', label: t('settings.asr.languages.ko') }, + { value: 'ru', label: t('settings.asr.languages.ru') } + ] + + const handleGenerate = () => { + onGenerate(selectedLanguage) + } + + return ( + + +
+

{t('player.asr.prompt.message')}

+

+ {t('player.asr.prompt.description')} +

+
+ + + +