From 5b2ab3d9ba46c3853148a79ed61e4801b0314658 Mon Sep 17 00:00:00 2001
From: MrRoboto <matteo.rancilio@gmail.com>
Date: Sat, 25 Apr 2026 20:08:11 +0200
Subject: [PATCH] feat(llm): auto-describe images via vision fallback when
 active model lacks vision support

When a user pastes an image into the chat and the selected model does not
support image input, automatically find a vision-capable model from any
available provider, use it to describe the image(s), and substitute that
description (as text) for the image parts before the main model runs.

- Add `vision_model` config field to pin a specific vision model
- Add `Provider.getVisionModel()` that scans all providers (skipping the
  current model) for one with `capabilities.input.image`
- Add vision fallback logic in `LLM.run()`; if describing fails, the error is
  logged and the original messages are sent to the main model unchanged
---
 packages/opencode/src/config/config.ts     |  4 +
 packages/opencode/src/provider/provider.ts | 25 +++++-
 packages/opencode/src/session/llm.ts       | 96 +++++++++++++++++++++-
 packages/opencode/test/fake/provider.ts    |  1 +
 4 files changed, 121 insertions(+), 5 deletions(-)
diff --git a/packages/opencode/src/config/config.ts b/packages/opencode/src/config/config.ts
index f1ceb1b4ed39..13af8d9d6677 100644
--- a/packages/opencode/src/config/config.ts
+++ b/packages/opencode/src/config/config.ts
@@ -142,6 +142,10 @@ export const Info = Schema.Struct({
   small_model: Schema.optional(ConfigModelID).annotate({
     description: "Small model to use for tasks like title generation in the format of provider/model",
   }),
+  vision_model: Schema.optional(ConfigModelID).annotate({
+    description:
+      "Vision model to use for describing images when the active model does not support image input, e.g. openai/gpt-4o. Auto-detected if not set.",
+  }),
   default_agent: Schema.optional(Schema.String).annotate({
     description:
       "Default agent to use when none is specified. Must be a primary agent. Falls back to 'build' if not set or if the specified agent is invalid.",
diff --git a/packages/opencode/src/provider/provider.ts b/packages/opencode/src/provider/provider.ts
index 0fe53e6e47f0..91e68d6a934d 100644
--- a/packages/opencode/src/provider/provider.ts
+++ b/packages/opencode/src/provider/provider.ts
@@ -932,6 +932,7 @@ export interface Interface {
     query: string[],
   ) => Effect.Effect<{ providerID: ProviderID; modelID: string } | undefined>
   readonly getSmallModel: (providerID: ProviderID) => Effect.Effect<Model | undefined>
+  readonly getVisionModel: (model: Model) => Effect.Effect<Model | undefined>
   readonly defaultModel: () => Effect.Effect<{ providerID: ProviderID; modelID: ModelID }>
 }
 
@@ -1646,6 +1647,28 @@ const layer: Layer.Layer<
       return undefined
     })
 
+    const getVisionModel = Effect.fn("Provider.getVisionModel")(function* (current: Model) {
+      // An explicitly configured `vision_model` always wins over auto-detection.
+      const pinned = (yield* config.get()).vision_model
+      if (pinned) {
+        const { providerID, modelID } = parseModel(pinned)
+        return yield* getModel(providerID, modelID)
+      }
+
+      // Otherwise take the first image-capable model from any provider, other than
+      // `current` itself. The same provider is deliberately NOT preferred: if the user
+      // has run out of credits there, another of its models would fail the same way.
+      const { providers } = yield* InstanceState.get(state)
+      for (const [pid, prov] of Object.entries(providers)) {
+        for (const [modelID, model] of Object.entries(prov.models)) {
+          const isCurrent = pid === current.providerID && modelID === current.id
+          if (isCurrent || !model.capabilities.input.image) continue
+          return yield* getModel(ProviderID.make(pid), ModelID.make(modelID))
+        }
+      }
+      return undefined
+    })
+
     const defaultModel = Effect.fn("Provider.defaultModel")(function* () {
       const cfg = yield* config.get()
       if (cfg.model) return parseModel(cfg.model)
@@ -1680,7 +1703,7 @@ const layer: Layer.Layer<
       }
     })
 
-    return Service.of({ list, getProvider, getModel, getLanguage, closest, getSmallModel, defaultModel })
+    return Service.of({ list, getProvider, getModel, getLanguage, closest, getSmallModel, getVisionModel, defaultModel })
   }),
 )
 
diff --git a/packages/opencode/src/session/llm.ts b/packages/opencode/src/session/llm.ts
index b72f873de01d..aa7c40905662 100644
--- a/packages/opencode/src/session/llm.ts
+++ b/packages/opencode/src/session/llm.ts
@@ -2,7 +2,7 @@ import { Provider } from "@/provider"
 import { Log } from "@/util"
 import { Context, Effect, Layer, Record } from "effect"
 import * as Stream from "effect/Stream"
-import { streamText, wrapLanguageModel, type ModelMessage, type Tool, tool, jsonSchema } from "ai"
+import { generateText, streamText, wrapLanguageModel, type ModelMessage, type Tool, tool, jsonSchema } from "ai"
 import { mergeDeep, pipe } from "remeda"
 import { GitLabWorkflowLanguageModel } from "gitlab-ai-provider"
 import { ProviderTransform } from "@/provider"
@@ -56,6 +56,56 @@ export interface Interface {
 
 export class Service extends Context.Service<Service, Interface>()("@opencode/LLM") {}
 
+/** Loosely typed content part: only `type` is known up front; other fields must be narrowed before use. */
+type AnyPart = { type: string; [k: string]: unknown }
+
+/** True for `image` parts and for `file` parts whose media type starts with `image/`. */
+function isImageContentPart(part: AnyPart): boolean {
+  if (part.type === "image") return true
+  if (part.type !== "file") return false
+  return typeof part.mediaType === "string" && part.mediaType.startsWith("image/")
+}
+
+/** Reports whether any user message with array content carries at least one image part. */
+function hasImageParts(msgs: ModelMessage[]): boolean {
+  for (const msg of msgs) {
+    if (msg.role !== "user" || !Array.isArray(msg.content)) continue
+    if ((msg.content as unknown as AnyPart[]).some(isImageContentPart)) return true
+  }
+  return false
+}
+
+/**
+ * Collects every image content part found in user messages, preserving the
+ * order in which the parts appear in the conversation.
+ */
+function extractImageParts(msgs: ModelMessage[]): AnyPart[] {
+  return msgs.flatMap((msg) => {
+    if (msg.role !== "user" || !Array.isArray(msg.content)) return []
+    return (msg.content as unknown as AnyPart[]).filter(isImageContentPart)
+  })
+}
+
+/**
+ * Replaces the first image part in user messages with `description` and drops the rest.
+ * A message emptied by this gets a pointer text, since providers may reject empty content.
+ */
+function replaceImagePartsWithText(msgs: ModelMessage[], description: string): ModelMessage[] {
+  let replaced = false
+  return msgs.map((msg) => {
+    if (msg.role !== "user" || !Array.isArray(msg.content)) return msg
+    const kept: AnyPart[] = []
+    for (const part of msg.content as unknown as AnyPart[]) {
+      if (!isImageContentPart(part)) kept.push(part)
+      else if (!replaced) kept.push({ type: "text", text: description })
+      if (isImageContentPart(part)) replaced = true
+    }
+    const emptied = kept.length === 0 && msg.content.length > 0
+    if (emptied) kept.push({ type: "text", text: "[Image omitted; see the image description above]" })
+    return { ...msg, content: kept } as unknown as ModelMessage
+  })
+}
+
 const live: Layer.Layer<
   Service,
   never,
@@ -144,11 +194,49 @@ const live: Layer.Layer<
         options.instructions = system.join("\n")
       }
 
+      // Vision fallback: when the selected model can't take images, have a vision-capable model
+      // describe them and substitute that text. Best-effort: on failure the messages stay as-is.
+      let processedMessages = input.messages
+      if (!input.model.capabilities.input.image && hasImageParts(input.messages)) {
+        const visionModel = yield* provider.getVisionModel(input.model)
+        if (visionModel) {
+          const visionLanguage = yield* provider.getLanguage(visionModel).pipe(
+            Effect.catchDefect(() => Effect.succeed(null)),
+          )
+          if (visionLanguage) {
+            const imageParts = extractImageParts(input.messages)
+            const visionPrompt = {
+              role: "user" as const,
+              content: [
+                ...imageParts,
+                {
+                  type: "text" as const,
+                  text: "Describe the content of the image(s) above in detail so that a text-only AI assistant can understand them and help the user. Be thorough and precise.",
+                },
+              ],
+            } as unknown as ModelMessage
+            // Forward the Effect-provided AbortSignal so interrupting this fiber also cancels
+            // the in-flight vision request instead of letting it run to completion unobserved.
+            const description = yield* Effect.tryPromise((signal) =>
+              generateText({ model: visionLanguage, messages: [visionPrompt], abortSignal: signal }).then((r) => r.text),
+            ).pipe(Effect.catch((err) => {
+              l.error("vision-fallback failed", { visionModelID: visionModel.id, error: String(err) })
+              return Effect.succeed(null as string | null)
+            }))
+            if (description !== null) {
+              const label = `[Image description by ${visionModel.id}]:`
+              processedMessages = replaceImagePartsWithText(input.messages, `${label}\n${description}`)
+              l.info("vision-fallback applied", { visionModelID: visionModel.id })
+            }
+          }
+        }
+      }
+
       const isWorkflow = language instanceof GitLabWorkflowLanguageModel
       const messages = isOpenaiOauth
-        ? input.messages
+        ? processedMessages
         : isWorkflow
-          ? input.messages
+          ? processedMessages
           : [
               ...system.map(
                 (x): ModelMessage => ({
@@ -156,7 +244,7 @@ const live: Layer.Layer<
                   content: x,
                 }),
               ),
-              ...input.messages,
+              ...processedMessages,
             ]
 
       const params = yield* plugin.trigger(
diff --git a/packages/opencode/test/fake/provider.ts b/packages/opencode/test/fake/provider.ts
index bfb185a4b1bf..0d61127ffa41 100644
--- a/packages/opencode/test/fake/provider.ts
+++ b/packages/opencode/test/fake/provider.ts
@@ -70,6 +70,7 @@ export namespace ProviderTest {
           getSmallModel: Effect.fn("TestProvider.getSmallModel")((providerID) =>
             Effect.succeed(providerID === row.id ? mdl : undefined),
           ),
+          getVisionModel: Effect.fn("TestProvider.getVisionModel")((_model) => Effect.succeed(undefined)),
           defaultModel: Effect.fn("TestProvider.defaultModel")(() =>
             Effect.succeed({ providerID: row.id, modelID: mdl.id }),
           ),