From 5b2ab3d9ba46c3853148a79ed61e4801b0314658 Mon Sep 17 00:00:00 2001
From: MrRoboto
Date: Sat, 25 Apr 2026 20:08:11 +0200
Subject: [PATCH] feat(llm): auto-describe images via vision fallback when
 active model lacks vision support

When a user pastes an image into the chat and the selected model does not
support image input, automatically find a vision-capable model from any
available provider, use it to describe the image(s), and inject the
description as text before the main model processes the request.

- Add `vision_model` config field to pin a specific vision model
- Add `Provider.getVisionModel()` that scans all providers (skipping the
  current model) for one with `capabilities.input.image`
- Add vision fallback logic in `LLM.run()` with full error handling so
  failures degrade gracefully without breaking the main stream
---
 packages/opencode/src/config/config.ts     |   4 +
 packages/opencode/src/provider/provider.ts |  29 ++++-
 packages/opencode/src/session/llm.ts       | 137 ++++++++++++++++++++-
 packages/opencode/test/fake/provider.ts    |   1 +
 4 files changed, 166 insertions(+), 5 deletions(-)

diff --git a/packages/opencode/src/config/config.ts b/packages/opencode/src/config/config.ts
index f1ceb1b4ed39..13af8d9d6677 100644
--- a/packages/opencode/src/config/config.ts
+++ b/packages/opencode/src/config/config.ts
@@ -142,6 +142,10 @@ export const Info = Schema.Struct({
   small_model: Schema.optional(ConfigModelID).annotate({
     description: "Small model to use for tasks like title generation in the format of provider/model",
   }),
+  vision_model: Schema.optional(ConfigModelID).annotate({
+    description:
+      "Vision model to use for describing images when the active model does not support image input, e.g. openai/gpt-4o. Auto-detected if not set.",
+  }),
   default_agent: Schema.optional(Schema.String).annotate({
     description:
       "Default agent to use when none is specified. Must be a primary agent. Falls back to 'build' if not set or if the specified agent is invalid.",
diff --git a/packages/opencode/src/provider/provider.ts b/packages/opencode/src/provider/provider.ts
index 0fe53e6e47f0..91e68d6a934d 100644
--- a/packages/opencode/src/provider/provider.ts
+++ b/packages/opencode/src/provider/provider.ts
@@ -932,6 +932,7 @@ export interface Interface {
     query: string[],
   ) => Effect.Effect<{ providerID: ProviderID; modelID: string } | undefined>
   readonly getSmallModel: (providerID: ProviderID) => Effect.Effect<Model | undefined>
+  readonly getVisionModel: (model: Model) => Effect.Effect<Model | undefined>
   readonly defaultModel: () => Effect.Effect<{ providerID: ProviderID; modelID: ModelID }>
 }
 
@@ -1646,6 +1647,32 @@ const layer: Layer.Layer<
       return undefined
     })
 
+    // Resolves the model used to describe images on behalf of `current`, a model without
+    // image input. A `vision_model` pinned in the config always wins; otherwise the first
+    // image-capable model found across all providers is used. Yields undefined when no
+    // candidate exists.
+    const getVisionModel = Effect.fn("Provider.getVisionModel")(function* (current: Model) {
+      const cfg = yield* config.get()
+      if (cfg.vision_model) {
+        const parsed = parseModel(cfg.vision_model)
+        return yield* getModel(parsed.providerID, parsed.modelID)
+      }
+
+      const s = yield* InstanceState.get(state)
+
+      // Scan all providers for a vision-capable model, skipping the current model itself.
+      // Do NOT prefer the same provider — if the user has run out of credits on that
+      // provider, picking another model from it would fail for the same reason.
+      for (const [pid, prov] of Object.entries(s.providers)) {
+        for (const [modelID, model] of Object.entries(prov.models)) {
+          if (pid === current.providerID && modelID === current.id) continue
+          if (model.capabilities.input.image) return yield* getModel(ProviderID.make(pid), ModelID.make(modelID))
+        }
+      }
+
+      return undefined
+    })
+
     const defaultModel = Effect.fn("Provider.defaultModel")(function* () {
       const cfg = yield* config.get()
       if (cfg.model) return parseModel(cfg.model)
@@ -1680,7 +1707,7 @@ const layer: Layer.Layer<
       }
     })
 
-    return Service.of({ list, getProvider, getModel, getLanguage, closest, getSmallModel, defaultModel })
+    return Service.of({ list, getProvider, getModel, getLanguage, closest, getSmallModel, getVisionModel, defaultModel })
   }),
 )
 
diff --git a/packages/opencode/src/session/llm.ts b/packages/opencode/src/session/llm.ts
index b72f873de01d..aa7c40905662 100644
--- a/packages/opencode/src/session/llm.ts
+++ b/packages/opencode/src/session/llm.ts
@@ -2,7 +2,7 @@ import { Provider } from "@/provider"
 import { Log } from "@/util"
 import { Context, Effect, Layer, Record } from "effect"
 import * as Stream from "effect/Stream"
-import { streamText, wrapLanguageModel, type ModelMessage, type Tool, tool, jsonSchema } from "ai"
+import { generateText, streamText, wrapLanguageModel, type ModelMessage, type Tool, tool, jsonSchema } from "ai"
 import { mergeDeep, pipe } from "remeda"
 import { GitLabWorkflowLanguageModel } from "gitlab-ai-provider"
 import { ProviderTransform } from "@/provider"
@@ -56,6 +56,78 @@ export interface Interface {
 
 export class Service extends Context.Service<Service, Interface>()("@opencode/LLM") {}
 
+// Loosely typed message content part, used by the image helpers below.
+// eslint-disable-next-line @typescript-eslint/no-explicit-any
+type AnyPart = { type: string; [k: string]: any }
+
+// Stands in for image parts dropped from a message that would otherwise have no content left.
+const IMAGE_OMITTED_PLACEHOLDER = "[Image omitted: see the image description earlier in the conversation]"
+
+/** Best-effort human-readable message for an unknown thrown value, for log output. */
+function errorMessage(value: unknown): string {
+  return value instanceof Error ? value.message : String(value)
+}
+
+/** True for parts that carry image data: `image` parts, or `file` parts with an `image/*` media type. */
+function isImageContentPart(part: AnyPart): boolean {
+  return (
+    part.type === "image" ||
+    (part.type === "file" && typeof part.mediaType === "string" && part.mediaType.startsWith("image/"))
+  )
+}
+
+/** True when at least one user message carries an image part in array-form content. */
+function hasImageParts(msgs: ModelMessage[]): boolean {
+  return msgs.some(
+    (msg) =>
+      msg.role === "user" &&
+      Array.isArray(msg.content) &&
+      (msg.content as unknown as AnyPart[]).some(isImageContentPart),
+  )
+}
+
+/** Collects every image part from the user messages, in conversation order. */
+function extractImageParts(msgs: ModelMessage[]): AnyPart[] {
+  const parts: AnyPart[] = []
+  for (const msg of msgs) {
+    if (msg.role !== "user" || !Array.isArray(msg.content)) continue
+    for (const part of msg.content as unknown as AnyPart[]) {
+      if (isImageContentPart(part)) parts.push(part)
+    }
+  }
+  return parts
+}
+
+/**
+ * Returns a copy of `msgs` in which the first image part is replaced by a text part holding
+ * `description` and every other image part is removed. Non-user messages and string content
+ * are passed through untouched.
+ */
+function replaceImagePartsWithText(msgs: ModelMessage[], description: string): ModelMessage[] {
+  let replaced = false
+  return msgs.map((msg) => {
+    if (msg.role !== "user" || !Array.isArray(msg.content)) return msg
+    const newContent: AnyPart[] = []
+    for (const part of msg.content as unknown as AnyPart[]) {
+      if (isImageContentPart(part)) {
+        if (!replaced) {
+          replaced = true
+          newContent.push({ type: "text", text: description })
+        }
+        // drop subsequent image parts — already covered by the single description
+      } else {
+        newContent.push(part)
+      }
+    }
+    // A message made up solely of dropped images would be left with an empty content array,
+    // which provider APIs may reject; leave a pointer to the description instead.
+    if (newContent.length === 0 && msg.content.length > 0) {
+      newContent.push({ type: "text", text: IMAGE_OMITTED_PLACEHOLDER })
+    }
+    return { ...msg, content: newContent } as unknown as ModelMessage
+  })
+}
+
 const live: Layer.Layer<
   Service,
   never,
@@ -144,11 +216,68 @@ const live: Layer.Layer<
         options.instructions = system.join("\n")
       }
 
+      // Vision fallback: describe images via a vision-capable model when the selected model
+      // doesn't support image input, so the main model still gets useful context.
+      // Every failure along the way only skips the fallback, leaving the messages unchanged.
+      let processedMessages = input.messages
+      if (!input.model.capabilities.input.image && hasImageParts(input.messages)) {
+        // A pinned `vision_model` that cannot be resolved must not take the main request
+        // down with it, so lookup defects are logged and treated as "no vision model".
+        const visionModel = yield* provider.getVisionModel(input.model).pipe(
+          Effect.catchDefect((defect) => {
+            l.error("vision-fallback model lookup failed", { error: errorMessage(defect) })
+            return Effect.succeed(undefined)
+          }),
+        )
+        if (visionModel) {
+          const visionLanguage = yield* provider.getLanguage(visionModel).pipe(
+            Effect.catchDefect(() => Effect.succeed(null)),
+          )
+          if (visionLanguage) {
+            const imageParts = extractImageParts(input.messages)
+            const visionPrompt = {
+              role: "user" as const,
+              content: [
+                ...imageParts,
+                {
+                  type: "text" as const,
+                  text: "Describe the content of the image(s) above in detail so that a text-only AI assistant can understand them and help the user. Be thorough and precise.",
+                },
+              ],
+            } as unknown as ModelMessage
+            const description = yield* Effect.tryPromise({
+              // Forward the interruption signal so cancelling the run also cancels the vision request.
+              try: (signal) =>
+                generateText({ model: visionLanguage, messages: [visionPrompt], abortSignal: signal }).then(
+                  (r) => r.text,
+                ),
+              // Keep the raw rejection so the log below reports the real cause, not a generic wrapper.
+              catch: (cause) => cause,
+            }).pipe(
+              Effect.catch((cause) => {
+                l.error("vision-fallback failed", {
+                  visionModelID: visionModel.id,
+                  error: errorMessage(cause),
+                })
+                return Effect.succeed<string | null>(null)
+              }),
+            )
+            if (description !== null) {
+              processedMessages = replaceImagePartsWithText(
+                input.messages,
+                `[Image description by ${visionModel.id}]:\n${description}`,
+              )
+              l.info("vision-fallback applied", { visionModelID: visionModel.id })
+            }
+          }
+        }
+      }
+
       const isWorkflow = language instanceof GitLabWorkflowLanguageModel
       const messages = isOpenaiOauth
-        ? input.messages
+        ? processedMessages
         : isWorkflow
-          ? input.messages
+          ? processedMessages
           : [
               ...system.map(
                 (x): ModelMessage => ({
@@ -156,7 +285,7 @@ const live: Layer.Layer<
                   content: x,
                 }),
               ),
-              ...input.messages,
+              ...processedMessages,
             ]
 
       const params = yield* plugin.trigger(
diff --git a/packages/opencode/test/fake/provider.ts b/packages/opencode/test/fake/provider.ts
index bfb185a4b1bf..0d61127ffa41 100644
--- a/packages/opencode/test/fake/provider.ts
+++ b/packages/opencode/test/fake/provider.ts
@@ -70,6 +70,7 @@ export namespace ProviderTest {
         getSmallModel: Effect.fn("TestProvider.getSmallModel")((providerID) =>
           Effect.succeed(providerID === row.id ? mdl : undefined),
         ),
+        getVisionModel: Effect.fn("TestProvider.getVisionModel")((_model) => Effect.succeed(undefined)),
         defaultModel: Effect.fn("TestProvider.defaultModel")(() =>
           Effect.succeed({ providerID: row.id, modelID: mdl.id }),
         ),