Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions packages/opencode/src/config/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,10 @@ export const Info = Schema.Struct({
small_model: Schema.optional(ConfigModelID).annotate({
description: "Small model to use for tasks like title generation in the format of provider/model",
}),
vision_model: Schema.optional(ConfigModelID).annotate({
description:
"Vision model to use for describing images when the active model does not support image input, e.g. openai/gpt-4o. Auto-detected if not set.",
}),
default_agent: Schema.optional(Schema.String).annotate({
description:
"Default agent to use when none is specified. Must be a primary agent. Falls back to 'build' if not set or if the specified agent is invalid.",
Expand Down
25 changes: 24 additions & 1 deletion packages/opencode/src/provider/provider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -932,6 +932,7 @@ export interface Interface {
query: string[],
) => Effect.Effect<{ providerID: ProviderID; modelID: string } | undefined>
readonly getSmallModel: (providerID: ProviderID) => Effect.Effect<Model | undefined>
readonly getVisionModel: (model: Model) => Effect.Effect<Model | undefined>
readonly defaultModel: () => Effect.Effect<{ providerID: ProviderID; modelID: ModelID }>
}

Expand Down Expand Up @@ -1646,6 +1647,28 @@ const layer: Layer.Layer<
return undefined
})

// Resolve a model capable of image input to stand in for `current`, which
// cannot accept images itself. Returns undefined when nothing suitable exists.
const getVisionModel = Effect.fn("Provider.getVisionModel")(function* (current: Model) {
  const cfg = yield* config.get()
  // Explicit user configuration wins unconditionally.
  // NOTE(review): the configured model is not validated here — it may lack
  // image support or even be the same model as `current`; confirm callers
  // tolerate that, or validate before returning.
  if (cfg.vision_model) {
    const parsed = parseModel(cfg.vision_model)
    return yield* getModel(parsed.providerID, parsed.modelID)
  }

  const s = yield* InstanceState.get(state)

  // Scan all providers for a vision-capable model, skipping the current model itself.
  // Do NOT prefer the same provider — if the user has run out of credits on that
  // provider, picking another model from it would fail for the same reason.
  // Iteration follows object-key insertion order, so the pick is deterministic
  // for a given provider state but otherwise arbitrary among capable models.
  for (const [pid, prov] of Object.entries(s.providers)) {
    for (const [modelID, model] of Object.entries(prov.models)) {
      if (pid === current.providerID && modelID === current.id) continue
      if (model.capabilities.input.image) return yield* getModel(ProviderID.make(pid), ModelID.make(modelID))
    }
  }

  // Neither configured nor discoverable: caller must degrade gracefully.
  return undefined
})

const defaultModel = Effect.fn("Provider.defaultModel")(function* () {
const cfg = yield* config.get()
if (cfg.model) return parseModel(cfg.model)
Expand Down Expand Up @@ -1680,7 +1703,7 @@ const layer: Layer.Layer<
}
})

return Service.of({ list, getProvider, getModel, getLanguage, closest, getSmallModel, defaultModel })
return Service.of({ list, getProvider, getModel, getLanguage, closest, getSmallModel, getVisionModel, defaultModel })
}),
)

Expand Down
96 changes: 92 additions & 4 deletions packages/opencode/src/session/llm.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import { Provider } from "@/provider"
import { Log } from "@/util"
import { Context, Effect, Layer, Record } from "effect"
import * as Stream from "effect/Stream"
import { streamText, wrapLanguageModel, type ModelMessage, type Tool, tool, jsonSchema } from "ai"
import { generateText, streamText, wrapLanguageModel, type ModelMessage, type Tool, tool, jsonSchema } from "ai"
import { mergeDeep, pipe } from "remeda"
import { GitLabWorkflowLanguageModel } from "gitlab-ai-provider"
import { ProviderTransform } from "@/provider"
Expand Down Expand Up @@ -56,6 +56,56 @@ export interface Interface {

export class Service extends Context.Service<Service, Interface>()("@opencode/LLM") {}

// Loosely-typed view of a message content part; the AI SDK's part unions are
// too narrow to probe generically, so arbitrary extra fields are allowed.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
type AnyPart = { type: string; [k: string]: any }

/**
 * True when a content part carries image data: either a native "image" part,
 * or a "file" part whose media type is in the image/* family.
 */
function isImageContentPart(part: AnyPart): boolean {
  if (part.type === "image") return true
  if (part.type !== "file") return false
  const media = part.mediaType
  return typeof media === "string" && media.startsWith("image/")
}

/** Whether any user message in the conversation carries at least one image part. */
function hasImageParts(msgs: ModelMessage[]): boolean {
  for (const msg of msgs) {
    if (msg.role !== "user" || !Array.isArray(msg.content)) continue
    if ((msg.content as unknown as AnyPart[]).some(isImageContentPart)) return true
  }
  return false
}

/** Collect every image part found in user messages, preserving conversation order. */
function extractImageParts(msgs: ModelMessage[]): AnyPart[] {
  return msgs
    .filter((msg) => msg.role === "user" && Array.isArray(msg.content))
    .flatMap((msg) => (msg.content as unknown as AnyPart[]).filter(isImageContentPart))
}

/**
 * Replace all image parts across user messages with a single text description.
 *
 * The first image part encountered is swapped for a text part containing
 * `description`; every subsequent image part is dropped, since one description
 * covers all images. Non-image parts and non-user messages pass through
 * untouched. The input array and its messages are not mutated.
 *
 * Fix: a user message whose content consisted solely of dropped image parts
 * previously ended up with an EMPTY content array, which provider APIs
 * commonly reject — such messages now receive a short placeholder text part.
 */
function replaceImagePartsWithText(msgs: ModelMessage[], description: string): ModelMessage[] {
  let replaced = false
  return msgs.map((msg) => {
    if (msg.role !== "user" || !Array.isArray(msg.content)) return msg
    const original = msg.content as unknown as AnyPart[]
    const newContent: AnyPart[] = []
    for (const part of original) {
      if (isImageContentPart(part)) {
        if (!replaced) {
          replaced = true
          newContent.push({ type: "text", text: description })
        }
        // drop subsequent image parts — already covered by the single description
      } else {
        newContent.push(part)
      }
    }
    // Never emit an empty content array: substitute a minimal placeholder so
    // the message remains valid for providers that require non-empty content.
    if (newContent.length === 0 && original.length > 0) {
      newContent.push({ type: "text", text: "[image omitted — described in an earlier message]" })
    }
    return { ...msg, content: newContent } as unknown as ModelMessage
  })
}

const live: Layer.Layer<
Service,
never,
Expand Down Expand Up @@ -144,19 +194,57 @@ const live: Layer.Layer<
options.instructions = system.join("\n")
}

// Vision fallback: describe images via a vision-capable model when the selected model
// doesn't support image input, so the main model still gets useful context.
let processedMessages = input.messages
if (!input.model.capabilities.input.image && hasImageParts(input.messages)) {
const visionModel = yield* provider.getVisionModel(input.model)
if (visionModel) {
const visionLanguage = yield* provider.getLanguage(visionModel).pipe(
Effect.catchDefect(() => Effect.succeed(null)),
)
if (visionLanguage) {
const imageParts = extractImageParts(input.messages)
const visionPrompt = {
role: "user" as const,
content: [
...imageParts,
{
type: "text" as const,
text: "Describe the content of the image(s) above in detail so that a text-only AI assistant can understand them and help the user. Be thorough and precise.",
},
],
} as unknown as ModelMessage
const description = yield* Effect.tryPromise(() =>
generateText({ model: visionLanguage, messages: [visionPrompt] }).then((r) => r.text),
).pipe(Effect.catch((err) => {
l.error("vision-fallback failed", { visionModelID: visionModel.id, error: String(err) })
return Effect.succeed(null as string | null)
}))
if (description !== null) {
processedMessages = replaceImagePartsWithText(
input.messages,
`[Image description by ${visionModel.id}]:\n${description}`,
)
l.info("vision-fallback applied", { visionModelID: visionModel.id })
}
}
}
}

const isWorkflow = language instanceof GitLabWorkflowLanguageModel
const messages = isOpenaiOauth
? input.messages
? processedMessages
: isWorkflow
? input.messages
? processedMessages
: [
...system.map(
(x): ModelMessage => ({
role: "system",
content: x,
}),
),
...input.messages,
...processedMessages,
]

const params = yield* plugin.trigger(
Expand Down
1 change: 1 addition & 0 deletions packages/opencode/test/fake/provider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ export namespace ProviderTest {
getSmallModel: Effect.fn("TestProvider.getSmallModel")((providerID) =>
Effect.succeed(providerID === row.id ? mdl : undefined),
),
getVisionModel: Effect.fn("TestProvider.getVisionModel")(_model => Effect.succeed(undefined)),
defaultModel: Effect.fn("TestProvider.defaultModel")(() =>
Effect.succeed({ providerID: row.id, modelID: mdl.id }),
),
Expand Down
Loading