From 8c783af17b61ab5529294a5f6d06c0952b631ec1 Mon Sep 17 00:00:00 2001 From: Pranav P <74554594+pranavp311@users.noreply.github.com> Date: Tue, 2 Jun 2026 18:56:22 +0800 Subject: [PATCH] feat: send PDFs as native document attachments Co-Authored-By: blackfloofie-a codegraff agent <265516171+blackfloofie@users.noreply.github.com> --- Cargo.lock | 12 +-- crates/codegraff-tui/src/main.rs | 33 ++++++-- crates/forge_app/src/dto/anthropic/request.rs | 29 +++++++ .../forge_app/src/dto/anthropic/response.rs | 1 + crates/forge_app/src/dto/openai/request.rs | 40 +++++++++- crates/forge_app/src/tool_registry.rs | 1 + crates/forge_domain/src/image.rs | 8 ++ crates/forge_domain/src/model.rs | 3 + ..._definition__usage__tests__tool_usage.snap | 2 +- .../src/tools/descriptions/fs_read.md | 4 +- ..._catalog__tests__tool_definition_json.snap | 3 - crates/forge_repo/src/provider/bedrock.rs | 7 ++ .../src/provider/openai_responses/request.rs | 8 ++ ...s__openai_responses_all_catalog_tools.snap | 21 +----- ...nthropic__tests__fetch_models_success.snap | 6 +- crates/forge_services/src/attachment.rs | 4 +- .../src/tool_services/fs_read.rs | 75 ++++++++++++++++++- 17 files changed, 210 insertions(+), 47 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 99e0e295a..3e7b72f51 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4118,7 +4118,7 @@ dependencies = [ "hmac 0.13.0", "http 1.4.0", "jsonwebtoken", - "reqwest 0.13.2", + "reqwest 0.13.4", "rustc_version", "rustls 0.23.40", "rustls-pki-types", @@ -6350,7 +6350,7 @@ dependencies = [ "chrono", "derive_builder 0.20.2", "regex", - "reqwest 0.13.2", + "reqwest 0.13.4", "semver", "serde", "serde_json", @@ -7067,9 +7067,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.13.2" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab3f43e3283ab1488b624b44b0e988d0acea0b3214e694730a055cb6b2efa801" +checksum = "219c5811de6525e5416c7d5d53bb656d3afdbc6c5af816e0802bcfa42dbdc1c3" dependencies = [ "base64 0.22.1", "bytes", @@ -8596,9 +8596,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.52.1" +version = "1.52.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b67dee974fe86fd92cc45b7a95fdd2f99a36a6d7b0d431a231178d3d670bbcc6" +checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe" dependencies = [ "bytes", "libc", diff --git a/crates/codegraff-tui/src/main.rs b/crates/codegraff-tui/src/main.rs index 9eaa977d9..44fe45124 100644 --- a/crates/codegraff-tui/src/main.rs +++ b/crates/codegraff-tui/src/main.rs @@ -5578,9 +5578,9 @@ fn parse_image_command(input: &str) -> ImageCommand { return ImageCommand::Invalid("Usage: /image ".to_string()); } - if !is_supported_image_path(Path::new(path)) { + if !is_supported_image_path(Path::new(path)) && !is_supported_pdf_path(Path::new(path)) { return ImageCommand::Invalid(format!( - "Unsupported image type: {path}. Supported: png, jpg, jpeg, webp" + "Unsupported attachment type: {path}. Supported: png, jpg, jpeg, webp, pdf" )); } @@ -5599,6 +5599,15 @@ fn is_supported_image_path(path: &Path) -> bool { .unwrap_or(false) } +/// PDFs are attached as document references (`@[path]`) and sent to the model as +/// native document blocks, not images. +fn is_supported_pdf_path(path: &Path) -> bool { + path.extension() + .and_then(|extension| extension.to_str()) + .map(|extension| extension.eq_ignore_ascii_case("pdf")) + .unwrap_or(false) +} + fn build_chat_event(prompt: &str, images: &[ImageAttachment]) -> Event { Event::new(build_chat_prompt(prompt, images)) } @@ -5612,7 +5621,7 @@ fn build_chat_prompt(prompt: &str, images: &[ImageAttachment]) -> String { match (prompt.trim().is_empty(), tags.is_empty()) { (true, true) => String::new(), - (true, false) => format!("Please analyze the attached image(s).\n\n{tags}"), + (true, false) => format!("Please analyze the attached file(s).\n\n{tags}"), (false, true) => prompt.trim().to_string(), (false, false) => format!("{}\n\n{tags}", prompt.trim()), } @@ -5730,7 +5739,8 @@ fn unescape_shell_path(text: &str) -> String { } fn is_readable_supported_image_path(path: &Path) -> bool { - is_supported_image_path(path) && path.is_file() && image::open(path).is_ok() + (is_supported_image_path(path) && path.is_file() && image::open(path).is_ok()) + || (is_supported_pdf_path(path) && path.is_file()) } fn normalize_paste_text(text: &str) -> String { @@ -6690,12 +6700,23 @@ mod tests { let fixture = "/image /tmp/archive.zip"; let actual = parse_image_command(fixture); let expected = ImageCommand::Invalid( - "Unsupported image type: /tmp/archive.zip. Supported: png, jpg, jpeg, webp".to_string(), + "Unsupported attachment type: /tmp/archive.zip. Supported: png, jpg, jpeg, webp, pdf" + .to_string(), ); assert_eq!(actual, expected); } + #[test] + fn image_command_accepts_pdf_as_document_attachment() { + // Intent: PDFs are attachable via /image and flow as @[path] document + // references — they are NOT rejected as "unsupported image". + let actual = parse_image_command("/image /tmp/report.pdf"); + let expected = ImageCommand::Attach(ImageAttachment::new("/tmp/report.pdf")); + + assert_eq!(actual, expected); + } + #[test] fn chat_prompt_includes_image_tags_for_backend_attachments() { let fixture = vec![ @@ -7103,7 +7124,7 @@ mod tests { fn build_chat_prompt_sends_image_only_prompt_when_text_is_blank() { let fixture = vec![ImageAttachment::new("/tmp/a.png")]; let actual = build_chat_prompt(" ", &fixture); - let expected = "Please analyze the attached image(s).\n\n@[/tmp/a.png]"; + let expected = "Please analyze the attached file(s).\n\n@[/tmp/a.png]"; assert_eq!(actual, expected); } diff --git a/crates/forge_app/src/dto/anthropic/request.rs b/crates/forge_app/src/dto/anthropic/request.rs index 72aa4496e..1976dfbdf 100644 --- a/crates/forge_app/src/dto/anthropic/request.rs +++ b/crates/forge_app/src/dto/anthropic/request.rs @@ -295,6 +295,7 @@ impl Message { .find_map(|(idx, content)| match content { Content::Text { .. } | Content::Image { .. } + | Content::Document { .. } | Content::ToolUse { .. } | Content::ToolResult { .. } => Some(idx), _ => None, @@ -320,6 +321,19 @@ impl Default for Message { impl From for Content { fn from(value: Image) -> Self { + // PDFs reuse the image byte-carrier but must be sent as a native + // `document` block, not an `image` block (Anthropic rejects + // application/pdf in an image source). + if value.is_pdf() { + return Content::Document { + source: DocumentSource { + type_: "base64".to_string(), + media_type: "application/pdf".to_string(), + data: value.data().into(), + }, + cache_control: None, + }; + } Content::Image { source: ImageSource { type_: "base64".to_string(), @@ -344,6 +358,14 @@ pub struct ImageSource { pub url: Option, } +#[derive(Serialize)] +pub struct DocumentSource { + #[serde(rename = "type")] + pub type_: String, + pub media_type: String, + pub data: String, +} + #[derive(Serialize)] #[serde(rename_all = "snake_case", tag = "type")] pub enum Content { @@ -352,6 +374,11 @@ pub enum Content { #[serde(skip_serializing_if = "Option::is_none")] cache_control: Option, }, + Document { + source: DocumentSource, + #[serde(skip_serializing_if = "Option::is_none")] + cache_control: Option, + }, Text { text: String, #[serde(skip_serializing_if = "Option::is_none")] @@ -400,6 +427,7 @@ impl Content { Content::ToolResult { tool_use_id, content, is_error, cache_control } } Content::Image { source, .. } => Content::Image { source, cache_control }, + Content::Document { source, .. } => Content::Document { source, cache_control }, // TODO: verify this Thinking variants don't support cache control Content::Thinking { signature, thinking } => Content::Thinking { signature, thinking }, } @@ -411,6 +439,7 @@ impl Content { Content::ToolUse { cache_control, .. } => cache_control.is_some(), Content::ToolResult { cache_control, .. } => cache_control.is_some(), Content::Image { cache_control, .. } => cache_control.is_some(), + Content::Document { cache_control, .. } => cache_control.is_some(), Content::Thinking { .. } => false, } } diff --git a/crates/forge_app/src/dto/anthropic/response.rs b/crates/forge_app/src/dto/anthropic/response.rs index 38b5e17bd..3b37b1d87 100644 --- a/crates/forge_app/src/dto/anthropic/response.rs +++ b/crates/forge_app/src/dto/anthropic/response.rs @@ -38,6 +38,7 @@ impl From for forge_domain::Model { vec![ forge_domain::InputModality::Text, forge_domain::InputModality::Image, + forge_domain::InputModality::Pdf, ] } else { vec![forge_domain::InputModality::Text] diff --git a/crates/forge_app/src/dto/openai/request.rs b/crates/forge_app/src/dto/openai/request.rs index fbc98e76d..979bbb976 100644 --- a/crates/forge_app/src/dto/openai/request.rs +++ b/crates/forge_app/src/dto/openai/request.rs @@ -21,6 +21,14 @@ pub struct ImageUrl { pub detail: Option, } +/// A file (e.g. PDF) attachment sent inline via the OpenAI chat completions +/// `file` content part. `file_data` is a `data:;base64,...` URI. +#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)] +pub struct FileData { + pub filename: String, + pub file_data: String, +} + #[derive(Debug, Deserialize, Serialize, Clone)] pub struct Message { pub role: Role, @@ -109,6 +117,11 @@ pub enum ContentPart { #[serde(skip_serializing_if = "Option::is_none")] cache_control: Option, }, + File { + file: FileData, + #[serde(skip_serializing_if = "Option::is_none")] + cache_control: Option, + }, } impl ContentPart { @@ -120,6 +133,9 @@ impl ContentPart { ContentPart::ImageUrl { cache_control, .. } => { *cache_control = None; } + ContentPart::File { cache_control, .. } => { + *cache_control = None; + } } } @@ -133,6 +149,9 @@ impl ContentPart { ContentPart::ImageUrl { cache_control, .. } => { *cache_control = src_cache_control; } + ContentPart::File { cache_control, .. } => { + *cache_control = src_cache_control; + } } } } @@ -495,10 +514,23 @@ impl From for Message { extra_content: None, }, ContextMessage::Image(img) => { - let content = vec![ContentPart::ImageUrl { - image_url: ImageUrl { url: img.url().clone(), detail: None }, - cache_control: None, - }]; + // PDFs reuse the image byte-carrier but must be sent as a `file` + // content part, not `image_url` (OpenAI rejects PDFs as images). + let part = if img.is_pdf() { + ContentPart::File { + file: FileData { + filename: "document.pdf".to_string(), + file_data: img.url().clone(), + }, + cache_control: None, + } + } else { + ContentPart::ImageUrl { + image_url: ImageUrl { url: img.url().clone(), detail: None }, + cache_control: None, + } + }; + let content = vec![part]; Message { role: Role::User, content: Some(MessageContent::Parts(content)), diff --git a/crates/forge_app/src/tool_registry.rs b/crates/forge_app/src/tool_registry.rs index 97eb70bc4..75b804c16 100644 --- a/crates/forge_app/src/tool_registry.rs +++ b/crates/forge_app/src/tool_registry.rs @@ -513,6 +513,7 @@ impl ToolRegistry { .map(|im| match im { InputModality::Text => "text".to_string(), InputModality::Image => "image".to_string(), + InputModality::Pdf => "pdf".to_string(), }) .collect::>() .join(", ") diff --git a/crates/forge_domain/src/image.rs b/crates/forge_domain/src/image.rs index a4770e460..ea1393740 100644 --- a/crates/forge_domain/src/image.rs +++ b/crates/forge_domain/src/image.rs @@ -27,4 +27,12 @@ impl Image { let content = format!("data:{mime_type};base64,{base64_encoded}"); Self { url: content, mime_type } } + + /// Whether this carrier holds a PDF document rather than an image. PDFs + /// reuse the image byte-carrier but are serialized as native document + /// blocks (Anthropic `document`, OpenAI `file`, Google `inline_data`) + /// rather than image blocks. + pub fn is_pdf(&self) -> bool { + self.mime_type == "application/pdf" + } } diff --git a/crates/forge_domain/src/model.rs b/crates/forge_domain/src/model.rs index fb06eed2c..33eb05755 100644 --- a/crates/forge_domain/src/model.rs +++ b/crates/forge_domain/src/model.rs @@ -15,6 +15,9 @@ pub enum InputModality { Text, /// Image input (vision-capable models) Image, + /// PDF document input (models that accept native PDF/document attachments, + /// e.g. Anthropic Claude, Google Gemini, OpenAI file-capable models) + Pdf, } /// Default input modalities when not specified (text-only) diff --git a/crates/forge_domain/src/tools/definition/snapshots/forge_domain__tools__definition__usage__tests__tool_usage.snap b/crates/forge_domain/src/tools/definition/snapshots/forge_domain__tools__definition__usage__tests__tool_usage.snap index 43a97d625..012d6e5ca 100644 --- a/crates/forge_domain/src/tools/definition/snapshots/forge_domain__tools__definition__usage__tests__tool_usage.snap +++ b/crates/forge_domain/src/tools/definition/snapshots/forge_domain__tools__definition__usage__tests__tool_usage.snap @@ -2,7 +2,7 @@ source: crates/forge_domain/src/tools/definition/usage.rs expression: prompt --- -{"name":"read","description":"Reads a file from the local filesystem. You can access any file directly by using this tool. Assume this tool is able to read all files on the machine. If the User provides a path to a file assume that path is valid. It is okay to read a file that does not exist; an error will be returned.\n\nUsage:\n- The file_path parameter must be an absolute path, not a relative path\n- By default, it reads up to {{config.maxReadSize}} lines starting from the beginning of the file\n- You can optionally specify a line start_line and end_line (especially handy for long files), but it's recommended to read the whole file by not providing these parameters\n- Any lines longer than {{config.maxLineLength}} characters will be truncated\n- Results are returned using rg \"\" -n format, with line numbers starting at 1\n{{#if (contains model.input_modalities \"image\")}}\n- This tool allows Forge Code to read images (eg PNG, JPG, etc). When reading an image file the contents are presented visually.\n- PDFs, Automatically encoded as base64 and sent as visual content for LLM to analyze pages. Any PDFs larger than {{config.maxImageSize}} bytes will return error\n{{/if}}\n- Jupyter notebooks (.ipynb files) are read as plain JSON text - you can parse the cell structure, outputs, and embedded content directly from the JSON\n- This tool can only read files, not directories. To read a directory, use an ls command via the `{{tool_names.shell}}` tool.\n- You can call multiple tools in a single response. It is always better to speculatively read multiple potentially useful files in parallel.","arguments":{"file_path":{"description":"Absolute path to the file to read.","type":"string","is_required":true},"range":{"description":"Optional line range for partial reads.","type":"object","is_required":false},"show_line_numbers":{"description":"If true, prefixes each line with its line index (starting at 1).\nDefaults to true.","type":"boolean","is_required":false}}} +{"name":"read","description":"Reads a file from the local filesystem. You can access any file directly by using this tool. Assume this tool is able to read all files on the machine. If the User provides a path to a file assume that path is valid. It is okay to read a file that does not exist; an error will be returned.\n\nUsage:\n- The file_path parameter must be an absolute path, not a relative path\n- By default, it reads up to {{config.maxReadSize}} lines starting from the beginning of the file\n- You can optionally specify a line start_line and end_line (especially handy for long files), but it's recommended to read the whole file by not providing these parameters\n- Any lines longer than {{config.maxLineLength}} characters will be truncated\n- Results are returned using rg \"\" -n format, with line numbers starting at 1\n{{#if (contains model.input_modalities \"image\")}}\n- This tool allows Forge Code to read images (eg PNG, JPG, etc). When reading an image file the contents are presented visually.\n{{/if}}\n{{#if (contains model.input_modalities \"pdf\")}}\n- PDF files (.pdf) are sent to the model as native document attachments so the document's pages can be read directly. PDFs larger than {{config.maxImageSize}} bytes will return an error.\n{{/if}}\n- Jupyter notebooks (.ipynb files) are read as plain JSON text - you can parse the cell structure, outputs, and embedded content directly from the JSON\n- This tool can only read files, not directories. To read a directory, use an ls command via the `{{tool_names.shell}}` tool.\n- You can call multiple tools in a single response. It is always better to speculatively read multiple potentially useful files in parallel.","arguments":{"file_path":{"description":"Absolute path to the file to read.","type":"string","is_required":true},"range":{"description":"Optional line range for partial reads.","type":"object","is_required":false},"show_line_numbers":{"description":"If true, prefixes each line with its line index (starting at 1).\nDefaults to true.","type":"boolean","is_required":false}}} {"name":"write","description":"Writes a file to the local filesystem.\n\nUsage:\n- This tool will overwrite the existing file if there is one at the provided path.\n- If this is an existing file, you MUST use the {{tool_names.read}} tool first to read the file's contents and use this tool with 'overwrite' as true . This tool will fail if you did not read the file first or don't set overwrite parameter to true.\n- ALWAYS prefer {{tool_names.patch}} on existing files in the codebase. NEVER write new files unless explicitly required.\n- NEVER proactively create documentation files (*.md) or README files. Only create documentation files if explicitly requested by the User.\n- Only use emojis if the user explicitly requests it. Avoid writing emojis to files unless asked.","arguments":{"content":{"description":"The content to write to the file","type":"string","is_required":true},"file_path":{"description":"The absolute path to the file to write (must be absolute, not relative)","type":"string","is_required":true},"overwrite":{"description":"If set to true, existing files will be overwritten. If not set and the\nfile exists, an error will be returned with the content of the\nexisting file.","type":"boolean","is_required":false}}} {"name":"fs_search","description":"A powerful search tool built on ripgrep\n\nUsage:\n- ALWAYS use `{{tool_names.fs_search}}` for search tasks. NEVER invoke `grep` or `rg` as a Bash command. The `{{tool_names.fs_search}}` tool has been optimized for correct permissions and access.\n- Supports full regex syntax (e.g., \"log.*Error\", \"function\\\\s+\\\\w+\")\n- Filter files with glob parameter (e.g., \"*.js\", \"**/*.tsx\") or type parameter (e.g., \"js\", \"py\", \"rust\")\n- Output modes: \"content\" shows matching lines, \"files_with_matches\" shows only file paths (default), \"count\" shows match counts\n- Use Task tool for open-ended searches requiring multiple rounds\n- Pattern syntax: Uses ripgrep (not grep) - literal braces need escaping (use `interface\\\\{\\\\}` to find `interface{}` in Go code)\n- Multiline matching: By default patterns match within single lines only. For cross-line patterns like `struct \\\\{[\\\\s\\\\S]*?field`, use `multiline: true`","arguments":{"-A":{"description":"Number of lines to show after each match (rg -A). Requires output_mode:\n\"content\", ignored otherwise.","type":"integer","is_required":false},"-B":{"description":"Number of lines to show before each match (rg -B). Requires output_mode:\n\"content\", ignored otherwise.","type":"integer","is_required":false},"-C":{"description":"Number of lines to show before and after each match (rg -C). Requires\noutput_mode: \"content\", ignored otherwise.","type":"integer","is_required":false},"-i":{"description":"Case insensitive search (rg -i)","type":"boolean","is_required":false},"-n":{"description":"Show line numbers in output (rg -n). Requires output_mode: \"content\",\nignored otherwise.","type":"boolean","is_required":false},"glob":{"description":"Glob pattern to filter files (e.g. \"*.js\", \"*.{ts,tsx}\") - maps to rg\n--glob","type":"string","is_required":false},"head_limit":{"description":"Limit output to first N lines/entries, equivalent to \"| head -N\". Works\nacross all output modes: content (limits output lines),\nfiles_with_matches (limits file paths), count (limits count entries).\nWhen unspecified, shows all results from ripgrep.","type":"integer","is_required":false},"multiline":{"description":"Enable multiline mode where . matches newlines and patterns can span\nlines (rg -U --multiline-dotall). Default: false.","type":"boolean","is_required":false},"offset":{"description":"Skip first N lines/entries before applying head_limit","type":"integer","is_required":false},"output_mode":{"description":"Output mode: \"content\" shows matching lines (supports -A/-B/-C context,\n-n line numbers, head_limit), \"files_with_matches\" shows file paths\n(supports head_limit), \"count\" shows match counts (supports head_limit).\nDefaults to \"files_with_matches\".","type":"string","is_required":false},"path":{"description":"File or directory to search in (rg PATH). Defaults to current working\ndirectory.","type":"string","is_required":false},"pattern":{"description":"The regular expression pattern to search for in file contents.","type":"string","is_required":true},"type":{"description":"File type to search (rg --type). Common types: js, py, rust, go, java,\netc. More efficient than include for standard file types.","type":"string","is_required":false}}} {"name":"sem_search","description":"AI-powered semantic code search. YOUR DEFAULT TOOL for code discovery and exploration when searching within {{env.cwd}}. Use this when you need to find code locations, understand implementations, discover patterns, or explore unfamiliar code - it works with natural language about behavior and concepts, not just keyword matching.\n\n**WHEN TO USE sem_search:**\n- Finding implementation of specific features or algorithms\n- Understanding how a system works across multiple files\n- Discovering architectural patterns and design approaches\n- Locating test examples or fixtures\n- Finding where specific technologies/libraries are used\n- Exploring unfamiliar codebases to learn structure\n- Finding documentation files (README, guides, API docs)\n\n**WHEN NOT TO USE (use {{tool_names.fs_search}} instead):**\n- Searching for exact strings, TODOs, or specific function names\n- Finding all occurrences of a variable or identifier\n- Searching in specific file paths or with regex patterns\n- When you know the exact text to search for\n\nIMPORTANT: Only searches within {{env.cwd}} and subdirectories. For paths outside this scope, use {{tool_names.fs_search}} with path parameter.\n\n**TIPS FOR SUCCESS:**\n- Use 2-3 varied queries to capture different aspects (e.g., \"OAuth token refresh\", \"JWT expiry handling\", \"authentication middleware\")\n- Balance specificity (focused results) with generality (don't miss relevant code)\n- Avoid overly broad queries like \"authentication\" or \"tools\" - be specific about what aspect you need\n- Keep queries targeted - too many broad queries can cause timeouts\n- **Match your intent**: If seeking documentation, use doc-focused keywords (\"setup guide\", \"configuration README\"); if seeking code, use implementation terms (\"token refresh logic\", \"error handling implementation\")\n\nReturns the topK most relevant file:line locations with code context. Each query is ranked independently, then reranked by relevance to your stated intent.","arguments":{"queries":{"description":"List of search queries to execute in parallel. Using multiple queries\n(2-3) with varied phrasings significantly improves results - each query\ncaptures different aspects of what you're looking for. Each query pairs\na search term with a use_case for reranking. Example: for\nauthentication, try \"user login verification\", \"token generation\",\n\"OAuth flow\".","type":"array","is_required":true}}} diff --git a/crates/forge_domain/src/tools/descriptions/fs_read.md b/crates/forge_domain/src/tools/descriptions/fs_read.md index 53a800dac..fe5f7a564 100644 --- a/crates/forge_domain/src/tools/descriptions/fs_read.md +++ b/crates/forge_domain/src/tools/descriptions/fs_read.md @@ -8,7 +8,9 @@ Usage: - Results are returned using rg "" -n format, with line numbers starting at 1 {{#if (contains model.input_modalities "image")}} - This tool allows Forge Code to read images (eg PNG, JPG, etc). When reading an image file the contents are presented visually. -- PDFs, Automatically encoded as base64 and sent as visual content for LLM to analyze pages. Any PDFs larger than {{config.maxImageSize}} bytes will return error +{{/if}} +{{#if (contains model.input_modalities "pdf")}} +- PDF files (.pdf) are sent to the model as native document attachments so the document's pages can be read directly. PDFs larger than {{config.maxImageSize}} bytes will return an error. {{/if}} - Jupyter notebooks (.ipynb files) are read as plain JSON text - you can parse the cell structure, outputs, and embedded content directly from the JSON - This tool can only read files, not directories. To read a directory, use an ls command via the `{{tool_names.shell}}` tool. diff --git a/crates/forge_domain/src/tools/snapshots/forge_domain__tools__catalog__tests__tool_definition_json.snap b/crates/forge_domain/src/tools/snapshots/forge_domain__tools__catalog__tests__tool_definition_json.snap index a224081ba..14729ac83 100644 --- a/crates/forge_domain/src/tools/snapshots/forge_domain__tools__catalog__tests__tool_definition_json.snap +++ b/crates/forge_domain/src/tools/snapshots/forge_domain__tools__catalog__tests__tool_definition_json.snap @@ -469,12 +469,10 @@ expression: tools ] } { - "title": "ToolsListInput", "description": "Meta-tool: Lists all available tool names with brief descriptions.\nUse this to discover which tools are available before calling one.", "type": "object" } { - "title": "ToolsInfoInput", "description": "Meta-tool: Returns the full ToolDefinition (including input schema) for\na specific tool. Use this after `tools_list` to get the complete schema\nfor a tool you want to call.", "type": "object", "properties": { @@ -488,7 +486,6 @@ expression: tools ] } { - "title": "CallToolInput", "description": "Meta-tool: Calls any tool by name with the provided arguments.\nUse this to execute a tool after looking up its schema via `tools_info`.\nThe `arguments` field must match the input schema of the tool being called.", "type": "object", "properties": { diff --git a/crates/forge_repo/src/provider/bedrock.rs b/crates/forge_repo/src/provider/bedrock.rs index 7133a3094..4e31a17fe 100644 --- a/crates/forge_repo/src/provider/bedrock.rs +++ b/crates/forge_repo/src/provider/bedrock.rs @@ -873,6 +873,13 @@ impl FromDomain for aws_sdk_bedrockruntime::types: .map_err(|e| anyhow::anyhow!("Failed to build tool result message: {}", e)) } forge_domain::ContextMessage::Image(img) => { + // PDFs reuse the image byte-carrier but Bedrock has no document + // content block here; fail loudly rather than mislabel as image. + if img.is_pdf() { + anyhow::bail!( + "PDF attachments are not supported on Amazon Bedrock. Use an Anthropic, OpenAI, or Google model to read PDFs." + ); + } let image_block = ImageBlock::builder() .source(ImageSource::Bytes(Blob::new( base64::Engine::decode( diff --git a/crates/forge_repo/src/provider/openai_responses/request.rs b/crates/forge_repo/src/provider/openai_responses/request.rs index 0e75abb20..d02292680 100644 --- a/crates/forge_repo/src/provider/openai_responses/request.rs +++ b/crates/forge_repo/src/provider/openai_responses/request.rs @@ -299,6 +299,14 @@ impl FromDomain for oai::CreateResponse { ))); } ContextMessage::Image(img) => { + // PDFs reuse the image byte-carrier but this path only emits + // image input; fail loudly rather than mislabel a PDF as an + // image. (The Responses API could carry PDFs via input_file.) + if img.is_pdf() { + anyhow::bail!( + "PDF attachments are not supported on this OpenAI Responses model. Use an Anthropic, OpenAI Chat Completions, or Google model to read PDFs." + ); + } // Mirror the Chat Completions request path: represent image input // as a user message with structured content. items.push(oai::InputItem::EasyMessage(oai::EasyInputMessage { diff --git a/crates/forge_repo/src/provider/openai_responses/snapshots/forge_repo__provider__openai_responses__request__tests__openai_responses_all_catalog_tools.snap b/crates/forge_repo/src/provider/openai_responses/snapshots/forge_repo__provider__openai_responses__request__tests__openai_responses_all_catalog_tools.snap index e5ed6e730..3df2bd576 100644 --- a/crates/forge_repo/src/provider/openai_responses/snapshots/forge_repo__provider__openai_responses__request__tests__openai_responses_all_catalog_tools.snap +++ b/crates/forge_repo/src/provider/openai_responses/snapshots/forge_repo__provider__openai_responses__request__tests__openai_responses_all_catalog_tools.snap @@ -7,7 +7,6 @@ expression: actual.tools "type": "function", "name": "read", "parameters": { - "title": "FSRead", "type": "object", "additionalProperties": false, "required": [ @@ -68,13 +67,12 @@ expression: actual.tools } }, "strict": true, - "description": "Reads a file from the local filesystem. You can access any file directly by using this tool. Assume this tool is able to read all files on the machine. If the User provides a path to a file assume that path is valid. It is okay to read a file that does not exist; an error will be returned.\n\nUsage:\n- The file_path parameter must be an absolute path, not a relative path\n- By default, it reads up to {{config.maxReadSize}} lines starting from the beginning of the file\n- You can optionally specify a line start_line and end_line (especially handy for long files), but it's recommended to read the whole file by not providing these parameters\n- Any lines longer than {{config.maxLineLength}} characters will be truncated\n- Results are returned using rg \"\" -n format, with line numbers starting at 1\n{{#if (contains model.input_modalities \"image\")}}\n- This tool allows Forge Code to read images (eg PNG, JPG, etc). When reading an image file the contents are presented visually.\n- PDFs, Automatically encoded as base64 and sent as visual content for LLM to analyze pages. Any PDFs larger than {{config.maxImageSize}} bytes will return error\n{{/if}}\n- Jupyter notebooks (.ipynb files) are read as plain JSON text - you can parse the cell structure, outputs, and embedded content directly from the JSON\n- This tool can only read files, not directories. To read a directory, use an ls command via the `{{tool_names.shell}}` tool.\n- You can call multiple tools in a single response. It is always better to speculatively read multiple potentially useful files in parallel." + "description": "Reads a file from the local filesystem. You can access any file directly by using this tool. Assume this tool is able to read all files on the machine. If the User provides a path to a file assume that path is valid. It is okay to read a file that does not exist; an error will be returned.\n\nUsage:\n- The file_path parameter must be an absolute path, not a relative path\n- By default, it reads up to {{config.maxReadSize}} lines starting from the beginning of the file\n- You can optionally specify a line start_line and end_line (especially handy for long files), but it's recommended to read the whole file by not providing these parameters\n- Any lines longer than {{config.maxLineLength}} characters will be truncated\n- Results are returned using rg \"\" -n format, with line numbers starting at 1\n{{#if (contains model.input_modalities \"image\")}}\n- This tool allows Forge Code to read images (eg PNG, JPG, etc). When reading an image file the contents are presented visually.\n{{/if}}\n{{#if (contains model.input_modalities \"pdf\")}}\n- PDF files (.pdf) are sent to the model as native document attachments so the document's pages can be read directly. PDFs larger than {{config.maxImageSize}} bytes will return an error.\n{{/if}}\n- Jupyter notebooks (.ipynb files) are read as plain JSON text - you can parse the cell structure, outputs, and embedded content directly from the JSON\n- This tool can only read files, not directories. To read a directory, use an ls command via the `{{tool_names.shell}}` tool.\n- You can call multiple tools in a single response. It is always better to speculatively read multiple potentially useful files in parallel." }, { "type": "function", "name": "write", "parameters": { - "title": "FSWrite", "type": "object", "additionalProperties": false, "required": [ @@ -104,7 +102,6 @@ expression: actual.tools "type": "function", "name": "fs_search", "parameters": { - "title": "FSSearch", "type": "object", "additionalProperties": false, "required": [ @@ -278,7 +275,6 @@ expression: actual.tools "type": "function", "name": "sem_search", "parameters": { - "title": "SemanticSearch", "type": "object", "additionalProperties": false, "required": [ @@ -317,7 +313,6 @@ expression: actual.tools "type": "function", "name": "remove", "parameters": { - "title": "FSRemove", "type": "object", "additionalProperties": false, "required": [ @@ -337,7 +332,6 @@ expression: actual.tools "type": "function", "name": "patch", "parameters": { - "title": "FSPatch", "type": "object", "additionalProperties": false, "required": [ @@ -373,7 +367,6 @@ expression: actual.tools "type": "function", "name": "multi_patch", "parameters": { - "title": "FSMultiPatch", "type": "object", "additionalProperties": false, "required": [ @@ -423,7 +416,6 @@ expression: actual.tools "type": "function", "name": "undo", "parameters": { - "title": "FSUndo", "type": "object", "additionalProperties": false, "required": [ @@ -443,7 +435,6 @@ expression: actual.tools "type": "function", "name": "shell", "parameters": { - "title": "Shell", "type": "object", "additionalProperties": false, "required": [ @@ -507,7 +498,6 @@ expression: actual.tools "type": "function", "name": "fetch", "parameters": { - "title": "NetFetch", "description": "Input type for the net fetch tool", "type": "object", "additionalProperties": false, @@ -540,7 +530,6 @@ expression: actual.tools "type": "function", "name": "followup", "parameters": { - "title": "Followup", "type": "object", "additionalProperties": false, "required": [ @@ -632,7 +621,6 @@ expression: actual.tools "type": "function", "name": "plan", "parameters": { - "title": "PlanCreate", "type": "object", "additionalProperties": false, "required": [ @@ -662,7 +650,6 @@ expression: actual.tools "type": "function", "name": "skill", "parameters": { - "title": "SkillFetch", "type": "object", "additionalProperties": false, "required": [ @@ -682,7 +669,6 @@ expression: actual.tools "type": "function", "name": "todo_write", "parameters": { - "title": "TodoWrite", "type": "object", "additionalProperties": false, "required": [ @@ -727,7 +713,6 @@ expression: actual.tools "type": "function", "name": "todo_read", "parameters": { - "title": "TodoRead", "type": "object", "additionalProperties": false, "required": [], @@ -740,7 +725,6 @@ expression: actual.tools "type": "function", "name": "task", "parameters": { - "title": "TaskInput", "description": "Input structure for the Task tool - delegates work to specialized agents", "type": "object", "additionalProperties": false, @@ -793,7 +777,6 @@ expression: actual.tools "type": "function", "name": "tools_list", "parameters": { - "title": "ToolsListInput", "description": "Meta-tool: Lists all available tool names with brief descriptions.\nUse this to discover which tools are available before calling one.", "type": "object", "additionalProperties": false, @@ -807,7 +790,6 @@ expression: actual.tools "type": "function", "name": "tools_info", "parameters": { - "title": "ToolsInfoInput", "description": "Meta-tool: Returns the full ToolDefinition (including input schema) for\na specific tool. Use this after `tools_list` to get the complete schema\nfor a tool you want to call.", "type": "object", "additionalProperties": false, @@ -828,7 +810,6 @@ expression: actual.tools "type": "function", "name": "call_tool", "parameters": { - "title": "CallToolInput", "description": "Meta-tool: Calls any tool by name with the provided arguments.\nUse this to execute a tool after looking up its schema via `tools_info`.\nThe `arguments` field must match the input schema of the tool being called.", "type": "object", "additionalProperties": false, diff --git a/crates/forge_repo/src/provider/snapshots/forge_repo__provider__anthropic__tests__fetch_models_success.snap b/crates/forge_repo/src/provider/snapshots/forge_repo__provider__anthropic__tests__fetch_models_success.snap index 5f259372d..bdb42e215 100644 --- a/crates/forge_repo/src/provider/snapshots/forge_repo__provider__anthropic__tests__fetch_models_success.snap +++ b/crates/forge_repo/src/provider/snapshots/forge_repo__provider__anthropic__tests__fetch_models_success.snap @@ -13,7 +13,8 @@ expression: actual "supports_reasoning": null, "input_modalities": [ "text", - "image" + "image", + "pdf" ] }, { @@ -26,7 +27,8 @@ expression: actual "supports_reasoning": null, "input_modalities": [ "text", - "image" + "image", + "pdf" ] } ] diff --git a/crates/forge_services/src/attachment.rs b/crates/forge_services/src/attachment.rs index 5ebfe6bf2..466605cac 100644 --- a/crates/forge_services/src/attachment.rs +++ b/crates/forge_services/src/attachment.rs @@ -72,11 +72,13 @@ impl< }); } - // Determine file type (text or image with format) + // Determine binary attachment type (image or PDF). PDFs reuse the image + // byte-carrier but are serialized as native document blocks downstream. let mime_type = extension.and_then(|ext| match ext.as_str() { "jpeg" | "jpg" => Some("image/jpeg".to_string()), "png" => Some("image/png".to_string()), "webp" => Some("image/webp".to_string()), + "pdf" => Some("application/pdf".to_string()), _ => None, }); diff --git a/crates/forge_services/src/tool_services/fs_read.rs b/crates/forge_services/src/tool_services/fs_read.rs index a5cf78868..7077e5afc 100644 --- a/crates/forge_services/src/tool_services/fs_read.rs +++ b/crates/forge_services/src/tool_services/fs_read.rs @@ -55,7 +55,8 @@ fn detect_mime_type(path: &Path, content: &[u8]) -> String { .to_string() } -/// Checks if a MIME type represents visual content (images or PDFs) +/// Checks if a MIME type represents content sent to the model as binary bytes +/// rather than UTF-8 text (images and PDFs). fn is_visual_content(mime_type: &str) -> bool { mime_type.starts_with("image/") || mime_type == "application/pdf" } @@ -132,7 +133,9 @@ impl + I // Detect MIME type let mime_type = detect_mime_type(path, &raw_content); - // Handle visual content (PDFs and images) + // Handle visual/binary content (images and PDFs) as base64 bytes for the + // model. PDFs are carried with their application/pdf mime type and are + // serialized as native document blocks by the provider layer (not images). if is_visual_content(&mime_type) { // Validate against image-specific size limit (may be different from // max_file_size) @@ -146,7 +149,8 @@ impl + I } })?; - // Convert to base64 image + // Carry the raw bytes + mime type. For PDFs the provider layer emits a + // document block; for images, an image block. let image = Image::new_bytes(raw_content, mime_type.clone()); let hash = compute_hash(image.url()); @@ -211,6 +215,12 @@ impl + I #[cfg(test)] mod tests { + use std::collections::BTreeMap; + use std::path::PathBuf; + + use fake::{Fake, Faker}; + use forge_app::domain::Environment; + use forge_domain::ConfigOperation; use pretty_assertions::assert_eq; use tempfile::NamedTempFile; use tokio::fs; @@ -218,6 +228,37 @@ mod tests { use super::*; use crate::attachment::tests::MockFileService; + impl EnvironmentInfra for MockFileService { + type Config = forge_config::ForgeConfig; + + fn get_env_var(&self, _key: &str) -> Option { + None + } + + fn get_env_vars(&self) -> BTreeMap { + BTreeMap::new() + } + + fn get_environment(&self) -> Environment { + let fixture: Environment = Faker.fake(); + fixture.cwd(PathBuf::from("/test")) + } + + fn get_config(&self) -> anyhow::Result { + Ok(forge_config::ForgeConfig { + max_read_lines: 2000, + max_line_chars: 2000, + max_file_size_bytes: 10 * 1024 * 1024, + max_image_size_bytes: 10 * 1024 * 1024, + ..Default::default() + }) + } + + async fn update_environment(&self, _ops: Vec) -> anyhow::Result<()> { + unimplemented!() + } + } + // Helper to create a temporary file with specific content size async fn create_test_file_with_size(size: usize) -> anyhow::Result { let file = NamedTempFile::new()?; @@ -426,4 +467,32 @@ mod tests { assert_eq!(actual.len(), 50); // 12 bytes + truncation message assert!(actual.contains("truncated")); } + + #[tokio::test] + async fn test_read_pdf_returns_image_carrier_with_pdf_mime() { + // Intent: reading a PDF through the fs_read tool must return the raw bytes + // tagged as application/pdf (Content::Image carrier), NOT UTF-8 text and + // NOT a mislabeled image. The provider layer turns this carrier into a + // native document block; tagging it image/* here is the bug we fixed. + let infra = Arc::new(MockFileService::new()); + // Minimal content beginning with the %PDF magic so detect_mime_type + // resolves application/pdf. We no longer parse the PDF, so the body is + // irrelevant. + infra.add_file( + PathBuf::from("/test/sample.pdf"), + "%PDF-1.4\n1 0 obj<<>>endobj\n".to_string(), + ); + + let fs_read = ForgeFsRead::new(infra); + let output = fs_read + .read("/test/sample.pdf".to_string(), None, None) + .await + .unwrap(); + + let image = match output.content { + Content::Image(image) => image, + Content::File(text) => panic!("expected PDF byte carrier, got text: {text:?}"), + }; + assert_eq!(image.mime_type(), "application/pdf"); + } }