diff --git a/CHANGELOG.md b/CHANGELOG.md index 7e0ce3f..fd4f936 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,21 @@ All notable changes to `@stackbilt/llm-providers` are documented here. Format follows [Keep a Changelog](https://keepachangelog.com/). Versions use [Semantic Versioning](https://semver.org/). +## [1.3.0] — 2026-04-16 + +### Added +- **Cloudflare Workers AI vision support** — `CloudflareProvider` now accepts `request.images` and routes to vision-capable models. Previously image data was silently dropped on the CF path. +- **Three new CF vision models**: + - `@cf/google/gemma-4-26b-a4b-it` — 256K context, vision + function calling + reasoning + - `@cf/meta/llama-4-scout-17b-16e-instruct` — natively multimodal, tool calling + - `@cf/meta/llama-3.2-11b-vision-instruct` — image understanding +- **`CloudflareProvider.supportsVision = true`** — factory's `analyzeImage` now dispatches to CF when configured. +- **Factory default vision fallback** — `getDefaultVisionModel()` falls back to `@cf/google/gemma-4-26b-a4b-it` when neither Anthropic nor OpenAI is configured, enabling CF-only deployments to use `analyzeImage()`. + +### Changed +- Images are passed to CF using the OpenAI-compatible `image_url` content-part shape (base64 data URIs). HTTP image URLs throw a helpful `ConfigurationError` — fetch the image and pass bytes in `image.data`. +- Attempting `request.images` on a non-vision CF model throws a `ConfigurationError` naming the vision-capable alternatives. + ## [1.2.0] — 2026-04-01 ### Added diff --git a/package.json b/package.json index 8bcb13d..cf29004 100755 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@stackbilt/llm-providers", - "version": "1.2.0", + "version": "1.3.0", "description": "Multi-LLM failover with circuit breakers, cost tracking, and intelligent retry. 
Cloudflare Workers native.", "author": "Stackbilt ", "license": "Apache-2.0", diff --git a/src/__tests__/cloudflare.test.ts b/src/__tests__/cloudflare.test.ts index 9d09719..e3624ec 100644 --- a/src/__tests__/cloudflare.test.ts +++ b/src/__tests__/cloudflare.test.ts @@ -271,4 +271,97 @@ describe('CloudflareProvider', () => { ]); }); }); + + describe('vision', () => { + it('advertises vision capability on the provider', () => { + expect(provider.supportsVision).toBe(true); + }); + + it('marks Gemma 4, Llama 4 Scout, and Llama 3.2 Vision as vision-capable', () => { + const capabilities = provider.exposeModelCapabilities(); + expect(capabilities['@cf/google/gemma-4-26b-a4b-it'].supportsVision).toBe(true); + expect(capabilities['@cf/meta/llama-4-scout-17b-16e-instruct'].supportsVision).toBe(true); + expect(capabilities['@cf/meta/llama-3.2-11b-vision-instruct'].supportsVision).toBe(true); + }); + + it('attaches images to the last user message as OpenAI image_url parts', async () => { + mockAiRun.mockResolvedValueOnce({ + choices: [{ message: { role: 'assistant', content: 'A ripe tomato.' }, finish_reason: 'stop' }] + }); + + await provider.generateResponse({ + model: '@cf/google/gemma-4-26b-a4b-it', + messages: [{ role: 'user', content: 'What is in this image?' }], + images: [{ data: 'QUJD', mimeType: 'image/png' }], + maxTokens: 256 + }); + + const [modelArg, body] = mockAiRun.mock.calls[0]; + expect(modelArg).toBe('@cf/google/gemma-4-26b-a4b-it'); + expect(body.messages).toHaveLength(1); + const userMsg = body.messages[0]; + expect(userMsg.role).toBe('user'); + expect(Array.isArray(userMsg.content)).toBe(true); + expect(userMsg.content[0]).toEqual({ type: 'text', text: 'What is in this image?' 
}); + expect(userMsg.content[1]).toEqual({ + type: 'image_url', + image_url: { url: 'data:image/png;base64,QUJD' } + }); + }); + + it('appends multiple images as separate image_url parts', async () => { + mockAiRun.mockResolvedValueOnce({ + choices: [{ message: { content: 'Two tomatoes.' }, finish_reason: 'stop' }] + }); + + await provider.generateResponse({ + model: '@cf/meta/llama-4-scout-17b-16e-instruct', + messages: [{ role: 'user', content: 'compare' }], + images: [ + { data: 'QQ==', mimeType: 'image/jpeg' }, + { data: 'Qg==', mimeType: 'image/jpeg' } + ] + }); + + const [, body] = mockAiRun.mock.calls[0]; + const content = body.messages[body.messages.length - 1].content; + expect(content.filter((p: { type: string }) => p.type === 'image_url')).toHaveLength(2); + }); + + it('accepts pre-formed data: URLs via image.url', async () => { + mockAiRun.mockResolvedValueOnce({ + choices: [{ message: { content: 'ok' }, finish_reason: 'stop' }] + }); + + await provider.generateResponse({ + model: '@cf/meta/llama-3.2-11b-vision-instruct', + messages: [{ role: 'user', content: 'x' }], + images: [{ url: 'data:image/webp;base64,ZEFUQQ==' }] + }); + + const [, body] = mockAiRun.mock.calls[0]; + const imagePart = body.messages[0].content[1]; + expect(imagePart.image_url.url).toBe('data:image/webp;base64,ZEFUQQ=='); + }); + + it('rejects HTTP image URLs (requires base64 bytes)', async () => { + await expect( + provider.generateResponse({ + model: '@cf/google/gemma-4-26b-a4b-it', + messages: [{ role: 'user', content: 'x' }], + images: [{ url: 'https://example.com/img.jpg' }] + }) + ).rejects.toThrow(/HTTP image URLs are not supported/); + }); + + it('rejects images on non-vision models with a helpful error', async () => { + await expect( + provider.generateResponse({ + model: '@cf/meta/llama-3.1-8b-instruct', + messages: [{ role: 'user', content: 'x' }], + images: [{ data: 'QUJD', mimeType: 'image/png' }] + }) + ).rejects.toThrow(/does not support image input/); + }); + }); 
}); diff --git a/src/factory.ts b/src/factory.ts index 714eba6..a0d48ce 100755 --- a/src/factory.ts +++ b/src/factory.ts @@ -1105,6 +1105,7 @@ export class LLMProviderFactory { if (this.config.defaultVisionModel) return this.config.defaultVisionModel; if (this.providers.has('anthropic')) return 'claude-haiku-4-5-20251001'; if (this.providers.has('openai')) return 'gpt-4o-mini'; + if (this.providers.has('cloudflare')) return '@cf/google/gemma-4-26b-a4b-it'; return undefined; } diff --git a/src/providers/cloudflare.ts b/src/providers/cloudflare.ts index 774b531..fbb280e 100755 --- a/src/providers/cloudflare.ts +++ b/src/providers/cloudflare.ts @@ -6,6 +6,7 @@ import type { LLMRequest, LLMResponse, + LLMImageInput, CloudflareConfig, ModelCapabilities, TokenUsage, @@ -17,9 +18,15 @@ import { ModelNotFoundError } from '../errors'; +interface CloudflareContentPart { + type: 'text' | 'image_url'; + text?: string; + image_url?: { url: string }; +} + interface CloudflareMessage { role: 'system' | 'user' | 'assistant' | 'tool'; - content: string | null; + content: string | null | CloudflareContentPart[]; tool_calls?: ToolCall[]; tool_call_id?: string; } @@ -94,11 +101,15 @@ export class CloudflareProvider extends BaseProvider { '@cf/qwen/qwen1.5-0.5b-chat', '@cf/qwen/qwen1.5-1.8b-chat', '@cf/qwen/qwen1.5-14b-chat-awq', - '@cf/qwen/qwen1.5-7b-chat-awq' + '@cf/qwen/qwen1.5-7b-chat-awq', + '@cf/google/gemma-4-26b-a4b-it', + '@cf/meta/llama-4-scout-17b-16e-instruct', + '@cf/meta/llama-3.2-11b-vision-instruct' ]; supportsStreaming = true; supportsTools = true; supportsBatching = true; + supportsVision = true; private ai: Ai; private accountId?: string; @@ -307,6 +318,38 @@ export class CloudflareProvider extends BaseProvider { inputTokenCost: 0.0000001, outputTokenCost: 0.0000001, description: 'Qwen 1.5 7B - Optimized performance' + }, + '@cf/google/gemma-4-26b-a4b-it': { + maxContextLength: 256000, + supportsStreaming: true, + supportsTools: true, + toolCalling: true, + 
supportsVision: true, + supportsBatching: true, + inputTokenCost: 0.0000001, + outputTokenCost: 0.0000003, + description: 'Gemma 4 26B — vision + tools + reasoning, 256K context' + }, + '@cf/meta/llama-4-scout-17b-16e-instruct': { + maxContextLength: 131000, + supportsStreaming: true, + supportsTools: true, + toolCalling: true, + supportsVision: true, + supportsBatching: true, + inputTokenCost: 0.0000003, + outputTokenCost: 0.0000009, + description: 'Llama 4 Scout 17B — natively multimodal, tool calling' + }, + '@cf/meta/llama-3.2-11b-vision-instruct': { + maxContextLength: 128000, + supportsStreaming: true, + supportsTools: false, + supportsVision: true, + supportsBatching: true, + inputTokenCost: 0.0000005, + outputTokenCost: 0.0000005, + description: 'Llama 3.2 11B Vision — image understanding' } }; } @@ -326,6 +369,14 @@ export class CloudflareProvider extends BaseProvider { ); } + const hasImages = (request.images?.length ?? 0) > 0; + if (hasImages && !capabilities?.supportsVision) { + throw new ConfigurationError( + this.name, + `Model '${model}' does not support image input. Use a vision-capable model like @cf/google/gemma-4-26b-a4b-it, @cf/meta/llama-4-scout-17b-16e-instruct, or @cf/meta/llama-3.2-11b-vision-instruct.` + ); + } + const messages: CloudflareMessage[] = []; const jsonMode = request.response_format?.type === 'json_object'; const jsonInstruction = '\n\nYou must respond with valid JSON only. 
No markdown fences, no commentary, no text outside the JSON.'; @@ -381,6 +432,10 @@ } } + if (hasImages) { + this.attachImagesToLastUserMessage(messages, request.images!, model); + } + const cloudflareRequest: CloudflareRequest = { messages, temperature: request.temperature, @@ -402,6 +457,51 @@ return cloudflareRequest; } + private attachImagesToLastUserMessage( + messages: CloudflareMessage[], + images: NonNullable<LLMRequest['images']>, + model: string + ): void { + const lastUserIndex = (() => { + for (let i = messages.length - 1; i >= 0; i--) { + if (messages[i].role === 'user') return i; + } + return -1; + })(); + + if (lastUserIndex === -1) { + throw new ConfigurationError( + this.name, + `Vision request must include at least one user message (model: ${model})` + ); + } + + const existing = messages[lastUserIndex].content; + const text = typeof existing === 'string' ? existing : ''; + + const parts: CloudflareContentPart[] = [{ type: 'text', text }]; + for (const image of images) { + const url = this.buildImageDataUrl(image, model); + parts.push({ type: 'image_url', image_url: { url } }); + } + + messages[lastUserIndex].content = parts; + } + + private buildImageDataUrl(image: LLMImageInput, model: string): string { + if (image.data) { + const mime = image.mimeType ?? 'image/jpeg'; + return `data:${mime};base64,${image.data}`; + } + if (image.url?.startsWith('data:')) { + return image.url; + } + throw new ConfigurationError( + this.name, + `Cloudflare vision models (${model}) require base64 image data or a data: URL. HTTP image URLs are not supported — fetch the image and pass bytes in image.data.` + ); + } + private formatResponse( result: WorkersAIResult, model: string,