From a19a3bd75cb1916bf3a5c06ee77d4b7227c8b24f Mon Sep 17 00:00:00 2001 From: qer Date: Wed, 17 Jun 2026 18:13:50 +0800 Subject: [PATCH] feat: FetchURL tool supports downloading images - Extend UrlFetchResult to support image responses (kind='image') - LocalFetchURLProvider detects image/* content-type and returns base64 data - FetchURLTool emits image_url ContentPart for multimodal models - Sniff image dimensions for coordinate guidance - Update tool description to mention image support - Add tests for PNG/JPEG image fetching and ContentPart output Closes #626 --- .../src/tools/builtin/web/fetch-url.md | 2 +- .../src/tools/builtin/web/fetch-url.ts | 43 ++++++++++-- .../src/tools/providers/local-fetch-url.ts | 22 +++++- .../agent-core/test/tools/fetch-url.test.ts | 63 +++++++++++++++++ .../tools/providers/local-fetch-url.test.ts | 67 +++++++++++++++++++ 5 files changed, 191 insertions(+), 6 deletions(-) diff --git a/packages/agent-core/src/tools/builtin/web/fetch-url.md b/packages/agent-core/src/tools/builtin/web/fetch-url.md index f2356e690..d69d47976 100644 --- a/packages/agent-core/src/tools/builtin/web/fetch-url.md +++ b/packages/agent-core/src/tools/builtin/web/fetch-url.md @@ -1,3 +1,3 @@ -Fetch content from a URL. Returns the main text content extracted from the page. Use this when you need to read a specific web page. +Fetch content from a URL. Returns the main text content extracted from the page, or the image if the URL points to an image file. Use this when you need to read a specific web page or view an image from the web. Only public `http`/`https` URLs are supported. Requests to private, loopback, or link-local addresses are refused, and responses larger than 10 MiB are rejected. 
diff --git a/packages/agent-core/src/tools/builtin/web/fetch-url.ts b/packages/agent-core/src/tools/builtin/web/fetch-url.ts index 9ea5b126c..01390b4b3 100644 --- a/packages/agent-core/src/tools/builtin/web/fetch-url.ts +++ b/packages/agent-core/src/tools/builtin/web/fetch-url.ts @@ -6,11 +6,13 @@ * should not be registered (not exposed to the LLM). */ +import type { ContentPart } from '@moonshot-ai/kosong'; import { z } from 'zod'; import type { BuiltinTool } from '../../../agent/tool'; import { ToolAccesses } from '../../../loop/tool-access'; import type { ExecutableToolContext, ExecutableToolResult, ToolExecution } from '../../../loop/types'; +import { sniffImageDimensions } from '../../support/file-type'; import { toInputJsonSchema } from '../../support/input-schema'; import { literalRulePattern, matchesGlobRuleSubject } from '../../support/rule-match'; import { ToolResultBuilder } from '../../support/result-builder'; @@ -26,13 +28,24 @@ import DESCRIPTION from './fetch-url.md?raw'; * - `extracted` — the body was an HTML page; only the main article text * was extracted and returned. */ -export type UrlFetchKind = 'passthrough' | 'extracted'; +export type UrlFetchKind = 'passthrough' | 'extracted' | 'image'; + +export interface UrlFetchImage { + /** Base64-encoded image data. */ + data: string; + /** Image MIME type (e.g. image/png). */ + mimeType: string; + /** Original pixel dimensions, if detectable. */ + dimensions: { width: number; height: number } | null; +} export interface UrlFetchResult { - /** The text handed to the LLM. */ + /** The text handed to the LLM (for text/HTML responses). */ content: string; - /** Whether `content` is a verbatim passthrough or extracted main text. */ + /** Whether content is verbatim, extracted, or this is an image response. */ kind: UrlFetchKind; + /** Image data when kind === 'image'. 
*/ + image?: UrlFetchImage | undefined; } export interface UrlFetcher { @@ -89,7 +102,29 @@ export class FetchURLTool implements BuiltinTool { }: ExecutableToolContext, ): Promise { try { - const { content, kind } = await this.fetcher.fetch(args.url, { toolCallId }); + const { content, kind, image } = await this.fetcher.fetch(args.url, { toolCallId }); + + if (kind === 'image' && image !== undefined) { + const output: ContentPart[] = [ + { + type: 'text', + text: `Fetched image from ${args.url}. Mime type: ${image.mimeType}. Original dimensions: ${image.dimensions ? `${image.dimensions.width}x${image.dimensions.height}` : 'unknown'} pixels.`, + }, + { + type: 'text', + text: ``, + }, + { + type: 'image_url', + imageUrl: { url: `data:${image.mimeType};base64,${image.data}` }, + }, + { + type: 'text', + text: '', + }, + ]; + return { output, isError: false }; + } if (!content) { return { diff --git a/packages/agent-core/src/tools/providers/local-fetch-url.ts b/packages/agent-core/src/tools/providers/local-fetch-url.ts index af10a8ca3..e917c7cca 100644 --- a/packages/agent-core/src/tools/providers/local-fetch-url.ts +++ b/packages/agent-core/src/tools/providers/local-fetch-url.ts @@ -18,6 +18,7 @@ import { Readability } from '@mozilla/readability'; import { parseHTML as rawParseHTML } from 'linkedom'; import { HttpFetchError, type UrlFetcher, type UrlFetchResult } from '../builtin'; +import { sniffImageDimensions } from '../support/file-type'; // Readability's .d.ts references the global `Document` type, but this // package compiles with `lib: ES2023` (no DOM). Extracting the @@ -172,6 +173,26 @@ export class LocalFetchURLProvider implements UrlFetcher { } } + const contentType = (response.headers.get('content-type') ?? '').toLowerCase(); + + // Image responses — read binary data and return as base64. 
+ if (contentType.startsWith('image/')) { + const arrayBuffer = await response.arrayBuffer(); + const data = Buffer.from(arrayBuffer); + if (data.length > this.maxBytes) { + throw new Error( + `Image too large: ${String(data.length)} bytes exceeds maxBytes (${String(this.maxBytes)}).`, + ); + } + const base64 = data.toString('base64'); + const dimensions = sniffImageDimensions(data); + return { + content: '', + kind: 'image', + image: { data: base64, mimeType: contentType.split(';')[0]!.trim(), dimensions }, + }; + } + const body = await response.text(); // Servers may omit content-length — measure again defensively. @@ -182,7 +203,6 @@ export class LocalFetchURLProvider implements UrlFetcher { ); } - const contentType = (response.headers.get('content-type') ?? '').toLowerCase(); if (contentType.startsWith('text/plain') || contentType.startsWith('text/markdown')) { return { content: body, kind: 'passthrough' }; } diff --git a/packages/agent-core/test/tools/fetch-url.test.ts b/packages/agent-core/test/tools/fetch-url.test.ts index 0e5c55ee6..9cfa49139 100644 --- a/packages/agent-core/test/tools/fetch-url.test.ts +++ b/packages/agent-core/test/tools/fetch-url.test.ts @@ -259,6 +259,69 @@ describe('FetchURLTool', () => { const message = (result as { message?: string }).message ?? 
''; expect(message).toContain('full response body'); }); + + it('returns image_url ContentPart array when fetcher returns an image', async () => { + const fetcher: UrlFetcher = { + fetch: vi.fn().mockResolvedValue({ + content: '', + kind: 'image', + image: { + data: 'aGVsbG8=', + mimeType: 'image/png', + dimensions: { width: 100, height: 200 }, + }, + }), + }; + const tool = new FetchURLTool(fetcher); + + const result = await executeTool(tool, { + turnId: 't1', + toolCallId: 'c_img', + args: { url: 'https://example.com/chart.png' }, + signal, + }); + + expect(result.isError).toBe(false); + expect(Array.isArray(result.output)).toBe(true); + const parts = result.output as Array<{ type: string; text?: string; imageUrl?: { url: string } }>; + expect(parts.length).toBe(4); + expect(parts[0]!.type).toBe('text'); + expect(parts[0]!.text).toContain('Fetched image'); + expect(parts[0]!.text).toContain('image/png'); + expect(parts[0]!.text).toContain('100x200'); + expect(parts[1]!.type).toBe('text'); + expect(parts[1]!.text).toBe(''); + expect(parts[2]!.type).toBe('image_url'); + expect(parts[2]!.imageUrl!.url).toBe('data:image/png;base64,aGVsbG8='); + expect(parts[3]!.type).toBe('text'); + expect(parts[3]!.text).toBe(''); + }); + + it('returns image without dimensions when fetcher returns null dimensions', async () => { + const fetcher: UrlFetcher = { + fetch: vi.fn().mockResolvedValue({ + content: '', + kind: 'image', + image: { + data: 'aGVsbG8=', + mimeType: 'image/webp', + dimensions: null, + }, + }), + }; + const tool = new FetchURLTool(fetcher); + + const result = await executeTool(tool, { + turnId: 't1', + toolCallId: 'c_img2', + args: { url: 'https://example.com/photo.webp' }, + signal, + }); + + expect(result.isError).toBe(false); + const parts = result.output as Array<{ type: string; text?: string }>; + expect(parts[0]!.text).toContain('unknown'); + }); }); describe('MoonshotFetchURLProvider', () => { diff --git 
a/packages/agent-core/test/tools/providers/local-fetch-url.test.ts b/packages/agent-core/test/tools/providers/local-fetch-url.test.ts index 2c0ce931f..077df0d74 100644 --- a/packages/agent-core/test/tools/providers/local-fetch-url.test.ts +++ b/packages/agent-core/test/tools/providers/local-fetch-url.test.ts @@ -17,6 +17,35 @@ function htmlResponse(body: string, contentType: string): Response { }); } +function imageResponse(data: Buffer, contentType: string): Response { + return new Response(data, { + status: 200, + headers: { 'content-type': contentType }, + }); +} + +/** Build a minimal 2x3 PNG (IHDR + IDAT + IEND) for dimension sniffing. */ +function tinyPng(): Buffer { + // PNG signature + const sig = Buffer.from([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]); + // IHDR chunk: width=2 (big-endian), height=3, bitDepth=8, colorType=2, compression=0, filter=0, interlace=0 + const ihdrData = Buffer.from([ + 0x00, 0x00, 0x00, 0x02, // width + 0x00, 0x00, 0x00, 0x03, // height + 0x08, 0x02, 0x00, 0x00, 0x00, // bitDepth, colorType, compression, filter, interlace + ]); + const ihdrLen = Buffer.from([0x00, 0x00, 0x00, 0x0d]); // 13 bytes + const ihdrType = Buffer.from('IHDR'); + const ihdrCrc = Buffer.from([0x00, 0x00, 0x00, 0x00]); // fake CRC for test + const ihdr = Buffer.concat([ihdrLen, ihdrType, ihdrData, ihdrCrc]); + // IEND chunk + const iendLen = Buffer.from([0x00, 0x00, 0x00, 0x00]); + const iendType = Buffer.from('IEND'); + const iendCrc = Buffer.from([0x00, 0x00, 0x00, 0x00]); // fake CRC + const iend = Buffer.concat([iendLen, iendType, iendCrc]); + return Buffer.concat([sig, ihdr, iend]); +} + describe('LocalFetchURLProvider content kind', () => { it('reports text/plain bodies as a verbatim passthrough', async () => { const fetchImpl = vi @@ -55,4 +84,42 @@ describe('LocalFetchURLProvider content kind', () => { expect(result.kind).toBe('extracted'); expect(result.content).toContain('quick brown fox'); }); + + it('reports image responses as base64 with 
kind image and dimensions', async () => { + const png = tinyPng(); + const fetchImpl = vi + .fn() + .mockResolvedValue(imageResponse(png, 'image/png')); + const provider = new LocalFetchURLProvider({ fetchImpl }); + + const result = await provider.fetch('https://example.com/img.png'); + + expect(result.kind).toBe('image'); + expect(result.image).toBeDefined(); + expect(result.image!.mimeType).toBe('image/png'); + expect(result.image!.data).toBe(png.toString('base64')); + expect(result.image!.dimensions).toEqual({ width: 2, height: 3 }); + }); + + it('reports image/jpeg responses as base64 with kind image', async () => { + // Minimal JPEG: SOI + APP0 (JFIF) + SOF0 + DHT + SOS + EOI + const jpeg = Buffer.from([ + 0xff, 0xd8, // SOI + 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01, // APP0 JFIF + 0x01, 0x00, 0x00, 0x01, 0x00, 0x01, 0x00, 0x00, + 0xff, 0xc0, 0x00, 0x0b, 0x08, 0x00, 0x01, 0x00, 0x01, 0x01, 0x01, 0x11, 0x00, // SOF0 1x1 + 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x00, 0x3f, 0x00, // SOS + 0xff, 0xd9, // EOI + ]); + const fetchImpl = vi + .fn() + .mockResolvedValue(imageResponse(jpeg, 'image/jpeg; charset=utf-8')); + const provider = new LocalFetchURLProvider({ fetchImpl }); + + const result = await provider.fetch('https://example.com/img.jpg'); + + expect(result.kind).toBe('image'); + expect(result.image!.mimeType).toBe('image/jpeg'); + expect(result.image!.dimensions).toEqual({ width: 1, height: 1 }); + }); });