From 244f1a82b2a813ce12f3f2c5406a827269f83222 Mon Sep 17 00:00:00 2001 From: dingyi Date: Sun, 7 Jun 2026 02:41:24 +0800 Subject: [PATCH 1/2] [Fix] normalize multimodal message content --- .../src/middlewares/chat/read_chat_message.ts | 351 +++++++----------- .../service-multimodal/src/plugins/audio.ts | 35 +- .../service-multimodal/src/plugins/image.ts | 7 +- 3 files changed, 149 insertions(+), 244 deletions(-) diff --git a/packages/core/src/middlewares/chat/read_chat_message.ts b/packages/core/src/middlewares/chat/read_chat_message.ts index 3bfa4f6bd..24d978d20 100644 --- a/packages/core/src/middlewares/chat/read_chat_message.ts +++ b/packages/core/src/middlewares/chat/read_chat_message.ts @@ -48,6 +48,10 @@ const SUPPORTED_AUDIO_MIME_TYPES = new Set([ ]) export function apply(ctx: Context, config: Config, chain: ChatChain) { + const forwardHistory = new WeakMap() + const fileSizes = new WeakMap() + const handledAudio = new WeakSet() + chain .middleware('read_chat_message', async (session, context) => { let message = @@ -153,21 +157,16 @@ export function apply(ctx: Context, config: Config, chain: ChatChain) { ) if (config.attachForwardMsgIdToContext) { - const kwargs = transformedMessage.additional_kwargs - const state = kwargs?.[forwardHistoryInternalKey] as - | ForwardHistoryState - | undefined + const state = forwardHistory.get(transformedMessage) if (state?.hasForwardHistory) { if (state.ids.length > 0) { - transformedMessage.additional_kwargs!.forwardMessageIds = + transformedMessage.additional_kwargs ??= {} + transformedMessage.additional_kwargs.forwardMessageIds = state.ids } addMessageContent(transformedMessage, '[聊天记录]') } - - // Internal-only state, should not leak outside this middleware. - delete kwargs?.[forwardHistoryInternalKey] } if ( @@ -214,7 +213,7 @@ export function apply(ctx: Context, config: Config, chain: ChatChain) { 'forward', async (session, element, message) => { if (!config.attachForwardMsgIdToContext) return - trackForwardId(element, message) + trackForwardId(forwardHistory, element, message) } ) @@ -223,7 +222,7 @@ export function apply(ctx: Context, config: Config, chain: ChatChain) { async (session, element, message) => { if (!config.attachForwardMsgIdToContext) return if (!isForwardMessageElement(element)) return - trackForwardId(element, message) + trackForwardId(forwardHistory, element, message) } ) @@ -243,20 +242,12 @@ export function apply(ctx: Context, config: Config, chain: ChatChain) { ctx.chatluna.messageTransformer.intercept( 'img', async (session, element, message, model) => { - const parsedModelInfo = - model != null - ? ctx.chatluna.platform.findModel(model) - : undefined + const supportsImage = modelSupportsElement(ctx, model, 'img') const isInstalledImageService = ctx.chatluna.getPlugin('multimodal-service') != null - if ( - parsedModelInfo?.value != null && - !parsedModelInfo.value.capabilities.includes( - ModelCapabilities.ImageInput - ) - ) { + if (!supportsImage) { if (!isInstalledImageService) { logger.warn( `Model "${model}" does not support image input. ` + @@ -264,27 +255,20 @@ export function apply(ctx: Context, config: Config, chain: ChatChain) { 'or install chatluna-multimodal-service (multimodal-service) plugin to enable image description.' ) } - return false } const url = (element.attrs.url ?? element.attrs.src) as string + const hash = await hashString(url, 8) + element.attrs['imageHash'] = hash const displayUrl = url.length > 100 ? url.substring(0, 100) + '...' : url logger.debug(`Processing image: ${displayUrl}`) - if (!ctx.chatluna_storage) { - return await oldImageRead( - ctx, - url, - message, - element, - isInstalledImageService - ) - } - - const { buffer, ext } = await readImage(ctx, url) + const image = await readImage(ctx, url) + const buffer = image.buffer + const ext = image.ext - if (ext == null) { + if (ext == null || buffer == null) { return false } @@ -292,33 +276,52 @@ export function apply(ctx: Context, config: Config, chain: ChatChain) { if (ext === 'image/gif') { if (!isInstalledImageService) { logger.warn( - `Detected GIF image, which is not supported by most models. Please install chatluna-multimodal-service (multimodal-service) plugin to parse GIF animations.` + 'Detected GIF image, which is not supported by most ' + + 'models. Please install chatluna-multimodal-service ' + + '(multimodal-service) plugin to parse GIF animations.' ) } + if (ctx.chatluna_storage) + setElementUrl( + element, + ( + await ctx.chatluna_storage.createTempFile( + buffer, + `${hash}.gif` + ) + ).url + ) return false } - const fileExt = ext.includes('/') ? ext.split('/')[1] : ext - element.attrs['ext'] = fileExt - let fileName = element.attrs['filename'] if (fileName == null || fileName.length > 50) { - fileName = `${await hashString(url, 8)}.${fileExt}` + fileName = `${hash}.${ + ext.includes('/') ? ext.split('/')[1] : ext + }` } + element.attrs['ext'] = ext.includes('/') ? ext.split('/')[1] : ext logger.debug(`Saving image as temp file: ${fileName}`) - const tempFile = await ctx.chatluna_storage.createTempFile( - buffer, - fileName - ) + const tempFile = ctx.chatluna_storage + ? await ctx.chatluna_storage.createTempFile(buffer, fileName) + : null + const imageUrl = tempFile?.url ?? image.base64Source + + if (tempFile) setElementUrl(element, tempFile.url) - ensureContentArray(message, `[image:${tempFile.url}]`) + if (!supportsImage) { + addTextPart(message, `[image:${imageUrl}]`) + return false + } + + addTextPart(message, `[image:${imageUrl}]`) ;(message.content as MessageContentComplex[]).push({ type: 'image_url', - image_url: { url: tempFile.url } + image_url: { url: imageUrl } }) - element.attrs['imageUrl'] = tempFile.url + return false }, -100 ) @@ -340,7 +343,7 @@ export function apply(ctx: Context, config: Config, chain: ChatChain) { ? ctx.chatluna.platform.findModel(model) : undefined - if (isAudioHandled(message, element)) { + if (handledAudio.has(element)) { logger.debug( 'Skip sst audio2text because audio is already handled.' ) @@ -362,7 +365,8 @@ export function apply(ctx: Context, config: Config, chain: ChatChain) { const content = await ctx.sst.audio2text(session) logger.debug(`audio2text: ${content}`) addMessageContent(message, content) - markAudioHandled(message, element) + handledAudio.add(element) + return false } ), -100 @@ -376,26 +380,9 @@ export function apply(ctx: Context, config: Config, chain: ChatChain) { ctx.chatluna.messageTransformer.intercept( 'file', async (session, element, message, model) => { - const modelInfo = - model != null - ? ctx.chatluna.platform.findModel(model) - : undefined - - if ( - modelInfo?.value != null && - !modelInfo.value.capabilities.includes( - ModelCapabilities.FileInput - ) - ) { - addMessageContent( - message, - `[file: ${element.attrs['file'] ?? element.attrs['filename'] ?? 'attachment'} (skipped: model does not support file input)]` - ) - return - } - await handleFileElement( ctx, + fileSizes, session, element, message, @@ -408,26 +395,9 @@ export function apply(ctx: Context, config: Config, chain: ChatChain) { ctx.chatluna.messageTransformer.intercept( 'video', async (session, element, message, model) => { - const modelInfo = - model != null - ? ctx.chatluna.platform.findModel(model) - : undefined - - if ( - modelInfo?.value != null && - !modelInfo.value.capabilities.includes( - ModelCapabilities.VideoInput - ) - ) { - addMessageContent( - message, - `[video: ${element.attrs['file'] ?? element.attrs['filename'] ?? 'attachment'} (skipped: model does not support video input)]` - ) - return - } - await handleFileElement( ctx, + fileSizes, session, element, message, @@ -440,30 +410,9 @@ export function apply(ctx: Context, config: Config, chain: ChatChain) { ctx.chatluna.messageTransformer.intercept( 'audio', async (session, element, message, model) => { - if (isAudioHandled(message, element)) { - logger.debug( - 'Skip audio file handler because audio is already handled.' - ) - return false - } - - const modelInfo = - model != null - ? ctx.chatluna.platform.findModel(model) - : undefined - - // If model doesn't support audio input, skip (sst handles fallback) - if ( - modelInfo?.value != null && - !modelInfo.value.capabilities.includes( - ModelCapabilities.AudioInput - ) - ) { - return false - } - const handled = await handleFileElement( ctx, + fileSizes, session, element, message, @@ -472,7 +421,7 @@ export function apply(ctx: Context, config: Config, chain: ChatChain) { ) if (handled) { - markAudioHandled(message, element) + handledAudio.add(element) } return false @@ -562,22 +511,14 @@ async function resolveSourceUrl( async function handleFileElement( ctx: Context, + fileSizes: WeakMap, session: Session, element: h, message: Message, model: string | undefined, elementType: 'file' | 'video' | 'audio' ): Promise { - const displayElementType = elementType === 'audio' ? 'voice' : elementType - - if (elementType === 'audio' && isAudioHandled(message, element)) { - logger.debug( - 'Skip handling audio file because audio is already handled.' - ) - return false - } - - const fileName: string = + const name: string = element.attrs['file'] ?? element.attrs['name'] ?? element.attrs['filename'] @@ -615,8 +556,34 @@ async function handleFileElement( return false } - const mimeType = - responseMimeType ?? getMimeTypeFromSource(sourceUrl, fileName) + const mimeType = responseMimeType ?? getMimeTypeFromSource(sourceUrl, name) + const fileName = name ?? 'attachment' + const label = + elementType === 'audio' + ? 'voice' + : elementType === 'video' + ? 'video' + : 'file' + + const file = ctx.chatluna_storage + ? await ctx.chatluna_storage.createTempFile(buffer, fileName) + : null + const fileUrl = file + ? file.url + : `data:${mimeType ?? 'application/octet-stream'};base64,${buffer.toString('base64')}` + + element.attrs['file'] = file?.name ?? fileName + element.attrs['filename'] = file?.name ?? fileName + element.attrs['chatluna_file_url'] = file?.url ?? sourceUrl + + addTextPart(message, `[${label}:${file?.name ?? fileName}]`) + + if (!modelSupportsElement(ctx, model, elementType)) { + logger.warn( + `Model "${model}" does not support ${label} input. The file was saved and fallback text was kept.` + ) + return false + } // For audio elements, check if the format is supported natively if (elementType === 'audio' && mimeType != null) { @@ -638,7 +605,7 @@ async function handleFileElement( if (mimeType != null && !fileConfig.supportedMimeTypes.has(mimeType)) { addMessageContent( message, - `[${displayElementType}: ${fileName ?? 'attachment'} (skipped: unsupported MIME type "${mimeType}")]` + `[${label}: ${file?.name ?? fileName} (skipped: unsupported MIME type "${mimeType}")]` ) return false } @@ -658,54 +625,24 @@ async function handleFileElement( if (encodedSize > maxSize) { addMessageContent( message, - `[${displayElementType}: ${fileName ?? 'attachment'} (skipped: file size ${encodedSize} bytes exceeds limit ${maxSize} bytes)]` + `[${label}: ${file?.name ?? fileName} (skipped: file size ${encodedSize} bytes exceeds limit ${maxSize} bytes)]` ) return false } // Check total size across all inline files - const currentTotal = getFileTotalSize(message) - const newTotal = currentTotal + encodedSize + const size = (fileSizes.get(message) ?? 0) + encodedSize - if (newTotal > fileConfig.maxTotalSizeBytes) { + if (size > fileConfig.maxTotalSizeBytes) { addMessageContent( message, - `[${displayElementType}: ${fileName ?? 'attachment'} (skipped: total inline size would exceed limit)]` + `[${label}: ${file?.name ?? fileName} (skipped: total inline size would exceed limit)]` ) return false } - } - - // Default path: store in storage (url) or fallback to base64 inline - const resolvedFileName = fileName ?? 'attachment' - element.attrs['file'] = resolvedFileName - element.attrs['filename'] = resolvedFileName - element.attrs['chatluna_file_url'] = sourceUrl - - const label = - elementType === 'audio' - ? 'voice' - : elementType === 'video' - ? 'video' - : 'file' - let fileUrl: string - if (ctx.chatluna_storage) { - const file = await ctx.chatluna_storage.createTempFile( - buffer, - resolvedFileName - ) - const displayFileName = fileName ?? file.name - element.attrs['file'] = displayFileName - element.attrs['filename'] = displayFileName - element.attrs['chatluna_file_url'] = file.url - fileUrl = file.url - ensureContentArray(message, `[${label}:${displayFileName}]`) - } else { - // No storage service — inline as base64 data URL, same as oldImageRead - const base64 = buffer.toString('base64') - fileUrl = `data:${mimeType ?? 'application/octet-stream'};base64,${base64}` - ensureContentArray(message, `[${label}:${resolvedFileName}]`) + fileSizes.set(message, size) + addFileSize(message, size) } // Add typed content part alongside text @@ -717,47 +654,6 @@ async function handleFileElement( // #region image reading -async function oldImageRead( - ctx: Context, - url: string, - message: Message, - element: h, - isInstalledImageService: boolean -) { - const imageHash = await hashString(url, 8) - element.attrs['imageHash'] = imageHash - - try { - const { base64Source, ext } = await readImage(ctx, url) - - if (ext == null) { - return false - } - - if (ext === 'image/gif') { - if (!isInstalledImageService) { - logger.warn( - `Detected GIF image, which is not supported by most models. Please install chatluna-multimodal-service (multimodal-service) plugin to parse GIF animations.` - ) - } - return false - } - - ensureContentArray(message, `[image:${imageHash}]`) - ;(message.content as MessageContentComplex[]).push({ - type: 'image_url', - image_url: { url: base64Source, hash: imageHash } - } as unknown as MessageContentComplex) - } catch (error) { - const displayUrl = - url.length > 100 ? url.substring(0, 100) + '...' : url - logger.warn( - `Failed to read image from ${displayUrl}. Please check your Koishi chat adapter.`, - error - ) - } -} - async function readImage(ctx: Context, url: string) { if (url.startsWith('data:image') && url.includes('base64')) { const buffer = Buffer.from(url.split(',')[1], 'base64') @@ -824,12 +720,6 @@ function pushTypedContent( } } -function getFileTotalSize(message: Message): number { - const kwargs = (message.additional_kwargs ?? {}) as Record - const value = kwargs['__file_total_size'] - return typeof value === 'number' && Number.isFinite(value) ? value : 0 -} - function toContentParts( content: MessageContent | null | undefined ): MessageContentComplex[] { @@ -846,23 +736,14 @@ function toContentParts( : [content as unknown as MessageContentComplex] } -function ensureContentArray(message: Message, fallbackText: string) { +function addTextPart(message: Message, text: string) { const parts = toContentParts(message.content) - - if (parts.length > 0) { - message.content = parts - return - } - - message.content = [{ type: 'text', text: fallbackText }] -} - -function isAudioHandled(_message: Message, element: h): boolean { - return element.attrs['_audioHandled'] === true + message.content = [...parts, { type: 'text', text }] } -function markAudioHandled(_message: Message, element: h) { - element.attrs['_audioHandled'] = true +function addFileSize(message: Message, size: number) { + message.additional_kwargs ??= {} + message.additional_kwargs['__file_total_size'] = size } function addMessageContent( @@ -889,14 +770,42 @@ function addMessageContent( // #region forward message tracking -const forwardHistoryInternalKey = '__chatluna_forwardHistory' +function modelSupportsElement( + ctx: Context, + model: string | undefined, + type: 'img' | 'file' | 'video' | 'audio' +) { + const info = model != null ? ctx.chatluna.platform.findModel(model) : null + if (info?.value == null) return true + + return info.value.capabilities.includes( + type === 'img' + ? ModelCapabilities.ImageInput + : type === 'audio' + ? ModelCapabilities.AudioInput + : type === 'video' + ? ModelCapabilities.VideoInput + : ModelCapabilities.FileInput + ) +} + +function setElementUrl(element: h, url: string) { + element.attrs['imageUrl'] = url + element.attrs['src'] = url + element.attrs['url'] = url +} -function trackForwardId(element: h, message: Message) { - const kwargs = (message.additional_kwargs ??= {}) - const state = (kwargs[forwardHistoryInternalKey] ??= { +function trackForwardId( + history: WeakMap, + element: h, + message: Message +) { + const state = history.get(message) ?? { ids: [], hasForwardHistory: false - }) as ForwardHistoryState + } + + history.set(message, state) state.hasForwardHistory = true diff --git a/packages/service-multimodal/src/plugins/audio.ts b/packages/service-multimodal/src/plugins/audio.ts index 8d0a6ea15..df4eaa07a 100644 --- a/packages/service-multimodal/src/plugins/audio.ts +++ b/packages/service-multimodal/src/plugins/audio.ts @@ -60,38 +60,35 @@ export function apply(ctx: Context, config: Config) { element.attrs['mime'] as string | null ) - let outBuffer = buffer - let outMime = detected ?? 'audio/mpeg' - - if (!detected || !NATIVE_AUDIO_MIMES.has(detected)) { - const converted = await convertAudioToMp3(ctx, buffer) - if (!converted) { - logger.warn( - `Skip audio: format ${detected ?? 'unknown'} not natively supported and ffmpeg conversion failed.` - ) - return false - } - outBuffer = converted - outMime = 'audio/mpeg' + if (detected && NATIVE_AUDIO_MIMES.has(detected)) { + return false } - const dataUrl = `data:${outMime};base64,${outBuffer.toString('base64')}` - const ext = MIME_TO_EXT[outMime] ?? 'mp3' + const converted = await convertAudioToMp3(ctx, buffer) + if (!converted) { + logger.warn( + `Skip audio: format ${detected ?? 'unknown'} not natively supported and ffmpeg conversion failed.` + ) + return false + } + + const dataUrl = `data:audio/mpeg;base64,${converted.toString('base64')}` + const ext = MIME_TO_EXT['audio/mpeg'] const fileName = `${stripExtension(audioName(element))}.${ext}` element.attrs['file'] = fileName element.attrs['filename'] = fileName element.attrs['chatluna_file_url'] = sourceUrl - ensureContentArray(message, `[voice:${fileName}]`) + ensureContentArray(message) ;(message.content as MessageContentComplex[]).push({ type: 'audio_url', - audio_url: { url: dataUrl, mimeType: outMime } + audio_url: { url: dataUrl, mimeType: 'audio/mpeg' } } as unknown as MessageContentComplex) logger.debug( - `Injected audio for ${model}: ${fileName} (${outMime}, ${outBuffer.byteLength} bytes)` + `Injected audio for ${model}: ${fileName} (audio/mpeg, ${converted.byteLength} bytes)` ) - return true + return false }, 100 ) diff --git a/packages/service-multimodal/src/plugins/image.ts b/packages/service-multimodal/src/plugins/image.ts index f1b64e51a..6012e6a8a 100644 --- a/packages/service-multimodal/src/plugins/image.ts +++ b/packages/service-multimodal/src/plugins/image.ts @@ -48,13 +48,11 @@ export async function apply( if (isGif) { await injectGifFrames(message, imageData.buffer, config) addTextToContent(message, '[image: GIF]') - } else if (imageData.base64Source) { - addImageToContent(message, imageData.base64Source) } - return true + return false } - return describeAndInject( + await describeAndInject( message, imageData, isGif, @@ -62,6 +60,7 @@ export async function apply( imageUnderstandModel.value, url ) + return false }, 100 ) From 91cf1760e95dfb326d4f9852a0f3b590cf9b2c3f Mon Sep 17 00:00:00 2001 From: dingyi Date: Sun, 7 Jun 2026 14:42:54 +0800 Subject: [PATCH 2/2] [Fix] detect multimodal file types from content --- .../src/middlewares/chat/read_chat_message.ts | 47 ++++++---- packages/service-multimodal/package.json | 1 + .../service-multimodal/src/plugins/audio.ts | 2 +- .../src/plugins/read_files.ts | 17 ++-- packages/service-multimodal/src/utils.ts | 85 +++---------------- 5 files changed, 50 insertions(+), 102 deletions(-) diff --git a/packages/core/src/middlewares/chat/read_chat_message.ts b/packages/core/src/middlewares/chat/read_chat_message.ts index 24d978d20..1ae073370 100644 --- a/packages/core/src/middlewares/chat/read_chat_message.ts +++ b/packages/core/src/middlewares/chat/read_chat_message.ts @@ -308,15 +308,16 @@ export function apply(ctx: Context, config: Config, chain: ChatChain) { ? await ctx.chatluna_storage.createTempFile(buffer, fileName) : null const imageUrl = tempFile?.url ?? image.base64Source + const imageText = tempFile?.url ?? hash if (tempFile) setElementUrl(element, tempFile.url) if (!supportsImage) { - addTextPart(message, `[image:${imageUrl}]`) + addTextPart(message, `[image:${imageText}]`) return false } - addTextPart(message, `[image:${imageUrl}]`) + addTextPart(message, `[image:${imageText}]`) ;(message.content as MessageContentComplex[]).push({ type: 'image_url', image_url: { url: imageUrl } @@ -558,12 +559,17 @@ async function handleFileElement( const mimeType = responseMimeType ?? getMimeTypeFromSource(sourceUrl, name) const fileName = name ?? 'attachment' - const label = - elementType === 'audio' - ? 'voice' - : elementType === 'video' - ? 'video' - : 'file' + let label: 'file' | 'video' | 'voice' + switch (elementType) { + case 'audio': + label = 'voice' + break + case 'video': + label = 'video' + break + default: + label = 'file' + } const file = ctx.chatluna_storage ? await ctx.chatluna_storage.createTempFile(buffer, fileName) @@ -778,15 +784,22 @@ function modelSupportsElement( const info = model != null ? ctx.chatluna.platform.findModel(model) : null if (info?.value == null) return true - return info.value.capabilities.includes( - type === 'img' - ? ModelCapabilities.ImageInput - : type === 'audio' - ? ModelCapabilities.AudioInput - : type === 'video' - ? ModelCapabilities.VideoInput - : ModelCapabilities.FileInput - ) + switch (type) { + case 'img': + return info.value.capabilities.includes( + ModelCapabilities.ImageInput + ) + case 'audio': + return info.value.capabilities.includes( + ModelCapabilities.AudioInput + ) + case 'video': + return info.value.capabilities.includes( + ModelCapabilities.VideoInput + ) + default: + return info.value.capabilities.includes(ModelCapabilities.FileInput) + } } function setElementUrl(element: h, url: string) { diff --git a/packages/service-multimodal/package.json b/packages/service-multimodal/package.json index 1a3973067..e277227ba 100644 --- a/packages/service-multimodal/package.json +++ b/packages/service-multimodal/package.json @@ -49,6 +49,7 @@ ], "dependencies": { "@langchain/core": "^0.3.80", + "file-type": "16.5.4", "jimp": "^1.6.0", "omggif": "^1.0.10", "zod": "3.25.76" diff --git a/packages/service-multimodal/src/plugins/audio.ts b/packages/service-multimodal/src/plugins/audio.ts index df4eaa07a..749b8e107 100644 --- a/packages/service-multimodal/src/plugins/audio.ts +++ b/packages/service-multimodal/src/plugins/audio.ts @@ -55,7 +55,7 @@ export function apply(ctx: Context, config: Config) { const buffer = await downloadAudio(ctx, sourceUrl) if (!buffer) return false - const detected = detectAudioMimeType( + const detected = await detectAudioMimeType( buffer, element.attrs['mime'] as string | null ) diff --git a/packages/service-multimodal/src/plugins/read_files.ts b/packages/service-multimodal/src/plugins/read_files.ts index d9d73aac6..4fd24f1b2 100644 --- a/packages/service-multimodal/src/plugins/read_files.ts +++ b/packages/service-multimodal/src/plugins/read_files.ts @@ -19,8 +19,6 @@ import { convertAudioToMp3, detectAudioMimeType, IMAGE_MIME_TYPES, - inferMimeTypeFromUrl, - normalizeMimeType, parseGifToFrames, processImageWithModel } from '../utils' @@ -63,7 +61,7 @@ export class ReadFilesTool extends StructuredTool { .max(10) ) .describe( - 'A list of files to read (max 10). File format: { url: string }. MIME type is inferred from response headers, then URL extension.' + 'A list of files to read (max 10). File format: { url: string }. MIME type is inferred from file content and response headers.' ) }) @@ -118,14 +116,11 @@ export class ReadFilesTool extends StructuredTool { continue } - const declared = - normalizeMimeType(fetched.contentType) ?? - inferMimeTypeFromUrl(sourceUrl) - const detectedAudio = detectAudioMimeType( - fetched.buffer, - declared - ) - const mime = detectedAudio ?? declared + const declared = fetched.contentType + ?.split(';')[0] + ?.trim() + ?.toLowerCase() + const mime = await detectAudioMimeType(fetched.buffer, declared) if (!mime) { pushError( diff --git a/packages/service-multimodal/src/utils.ts b/packages/service-multimodal/src/utils.ts index 55ecfdbb4..817a36715 100644 --- a/packages/service-multimodal/src/utils.ts +++ b/packages/service-multimodal/src/utils.ts @@ -15,6 +15,7 @@ import type {} from 'koishi-plugin-ffmpeg-path' import { Config, logger } from '.' import { GifReader } from 'omggif' import { Jimp } from 'jimp' +import fileType from 'file-type' // --------------------------------------------------------------------------- // MIME helpers @@ -28,69 +29,22 @@ export const IMAGE_MIME_TYPES = new Set([ 'image/gif' ]) -const FILE_EXTENSION_TO_MIME_TYPE: Record = { - '.png': 'image/png', - '.jpg': 'image/jpeg', - '.jpeg': 'image/jpeg', - '.bmp': 'image/bmp', - '.webp': 'image/webp', - '.gif': 'image/gif', - '.pdf': 'application/pdf', - '.txt': 'text/plain', - '.md': 'text/markdown', - '.html': 'text/html', - '.htm': 'text/html', - '.css': 'text/css', - '.xml': 'text/xml', - '.csv': 'text/csv', - '.rtf': 'text/rtf', - '.js': 'text/javascript', - '.mjs': 'text/javascript', - '.json': 'application/json', - '.mp4': 'video/mp4', - '.mpeg': 'video/mpeg', - '.mov': 'video/mov', - '.avi': 'video/avi', - '.flv': 'video/x-flv', - '.webm': 'video/webm', - '.wmv': 'video/wmv', - '.3gp': 'video/3gpp', - '.3gpp': 'video/3gpp', - '.mp3': 'audio/mpeg', - '.aiff': 'audio/aiff', - '.aac': 'audio/aac', - '.flac': 'audio/flac', - '.wav': 'audio/wav', - '.ogg': 'audio/ogg', - '.m4a': 'audio/mp4' -} - -export function inferMimeTypeFromUrl(url: string): string | null { - try { - const path = new URL(url).pathname.toLowerCase() - const dot = path.lastIndexOf('.') - return dot < 0 - ? null - : (FILE_EXTENSION_TO_MIME_TYPE[path.slice(dot)] ?? null) - } catch { - return null - } -} - -export function normalizeMimeType( - raw: string | null | undefined -): string | null { - return raw?.split(';')[0]?.trim()?.toLowerCase() || null +export async function detectFileType( + buffer: Buffer +): Promise<{ mime: string; ext: string } | undefined> { + const result = await fileType.fromBuffer(buffer) + if (!result) return undefined + return { mime: result.mime, ext: result.ext } } /** - * Detect audio MIME from buffer header. Recognises QQ Silk + AMR + common - * audio container magic bytes. Falls back to the declared MIME otherwise. + * Detect audio MIME from buffer header. Recognises QQ Silk + AMR, then uses + * file-type and falls back to the declared MIME otherwise. */ -export function detectAudioMimeType( +export async function detectAudioMimeType( buffer: Buffer, declared?: string | null -): string | null { +): Promise { const head = buffer.subarray(0, 16).toString('latin1') if (head.startsWith('#!AMR')) return 'audio/amr' @@ -102,23 +56,8 @@ export function detectAudioMimeType( ) { return 'audio/silk' } - // MP3 frame sync: 0xFFEx. Reject JPEG (0xFFD8) by checking the full sync word. - if ( - head.startsWith('ID3') || - (buffer[0] === 0xff && (buffer[1] & 0xe0) === 0xe0) - ) { - return 'audio/mpeg' - } - if ( - head.startsWith('RIFF') && - buffer.subarray(8, 12).toString('latin1') === 'WAVE' - ) { - return 'audio/wav' - } - if (head.startsWith('fLaC')) return 'audio/flac' - if (head.startsWith('OggS')) return 'audio/ogg' - return declared ?? null + return (await detectFileType(buffer))?.mime ?? declared ?? null } // ---------------------------------------------------------------------------