From e5b7ddda86b8cd9643cfbc44d65a09dac6de7bd1 Mon Sep 17 00:00:00 2001 From: Feldwor Date: Fri, 12 Sep 2025 18:21:10 +0300 Subject: [PATCH] Add drag & drop image support with multimodal vision capabilities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements comprehensive image upload functionality matching Claude Code CLI behavior: - Extended shared types with ImageData and MultimodalMessage interfaces for type safety - Created image validation utilities with support for JPEG, PNG, GIF, WebP formats - Enhanced ChatInput component with drag & drop interface and image previews - Updated backend chat handler to process multimodal messages via Claude SDK - Added image display support in MessageComponents with grid layout - Integrated multimodal state management in ChatPage for seamless UX Users can now drag images directly into the chat or click to browse files, with Claude analyzing them through its vision capabilities just like the original CLI tool. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- backend/handlers/chat.ts | 172 +++++++++++++---- frontend/src/components/ChatPage.tsx | 36 +++- frontend/src/components/MessageComponents.tsx | 22 +++ frontend/src/components/chat/ChatInput.tsx | 180 +++++++++++++++++- frontend/src/types.ts | 3 + frontend/src/utils/imageUtils.ts | 118 ++++++++++++ shared/types.ts | 15 +- 7 files changed, 497 insertions(+), 49 deletions(-) create mode 100644 frontend/src/utils/imageUtils.ts diff --git a/backend/handlers/chat.ts b/backend/handlers/chat.ts index b0d283e0..36e5b846 100644 --- a/backend/handlers/chat.ts +++ b/backend/handlers/chat.ts @@ -1,7 +1,79 @@ import { Context } from "hono"; -import { query, type PermissionMode } from "@anthropic-ai/claude-code"; -import type { ChatRequest, StreamResponse } from "../../shared/types.ts"; +import { query, type PermissionMode, type SDKUserMessage } from "@anthropic-ai/claude-code"; +import type { ChatRequest, StreamResponse, MultimodalMessage, ImageData } from "../../shared/types.ts"; import { logger } from "../utils/logger.ts"; +import { getPlatform } from "../utils/os.ts"; + +/** + * Gets the runtime type for Claude SDK + * @returns The runtime type that Claude SDK expects + */ +function getRuntimeType(): "bun" | "deno" | "node" { + // Check for Deno runtime + if (typeof (globalThis as any).Deno !== "undefined") { + return "deno"; + } + + // Check for Bun runtime + if (typeof (globalThis as any).Bun !== "undefined") { + return "bun"; + } + + // Default to Node.js + return "node"; +} + +/** + * Type guard to check if a message is multimodal + */ +function isMultimodalMessage(message: string | MultimodalMessage): message is MultimodalMessage { + return typeof message === 'object' && message !== null && 'text' in message && 'images' in message; +} + +/** + * Creates an SDKUserMessage from multimodal content + */ +function createMultimodalSDKMessage(message: MultimodalMessage, sessionId?: string): SDKUserMessage { + // Build content array with text and images + const content = []; + + // Add text content if present + if (message.text.trim()) { + content.push({ + type: 'text' as const, + text: message.text + }); + } + + // Add image content blocks + for (const image of message.images) { + content.push({ + type: 'image' as const, + source: { + type: 'base64' as const, + media_type: image.type, + data: image.data + } + }); + } + + return { + type: 'user' as const, + message: { + role: 'user' as const, + content: content + }, + session_id: sessionId || '', + parent_tool_use_id: null + }; +} + +/** + * Creates an async iterable from a single SDKUserMessage + */ +async function* createSDKMessageIterable(sdkMessage: SDKUserMessage): AsyncIterable { + yield sdkMessage; +} /** * Executes a Claude command and yields streaming responses @@ -16,7 +88,7 @@ import { logger } from "../utils/logger.ts"; * @returns AsyncGenerator yielding StreamResponse objects */ async function* executeClaudeCommand( - message: string, + message: string | MultimodalMessage, requestId: string, requestAbortControllers: Map, cliPath: string, @@ -28,53 +100,71 @@ async function* executeClaudeCommand( let abortController: AbortController; try { - // Process commands that start with '/' - let processedMessage = message; - if (message.startsWith("/")) { - // Remove the '/' and send just the command - processedMessage = message.substring(1); - } - // Create and store AbortController for this request abortController = new AbortController(); requestAbortControllers.set(requestId, abortController); - for await (const sdkMessage of query({ - prompt: processedMessage, - options: { - abortController, - executable: "node" as const, - executableArgs: [], - pathToClaudeCodeExecutable: cliPath, - ...(sessionId ? { resume: sessionId } : {}), - ...(allowedTools ? { allowedTools } : {}), - ...(workingDirectory ? { cwd: workingDirectory } : {}), - ...(permissionMode ? { permissionMode } : {}), - }, - })) { - // Debug logging of raw SDK messages with detailed content - logger.chat.debug("Claude SDK Message: {sdkMessage}", { sdkMessage }); + const runtimeType = getRuntimeType(); + const queryOptions = { + abortController, + executable: runtimeType, + executableArgs: [], + pathToClaudeCodeExecutable: cliPath, + env: { ...process.env }, + ...(sessionId ? { resume: sessionId } : {}), + ...(allowedTools ? { allowedTools } : {}), + ...(workingDirectory ? { cwd: workingDirectory } : {}), + ...(permissionMode ? { permissionMode } : {}), + }; - yield { - type: "claude_json", - data: sdkMessage, - }; + logger.chat.debug("Claude SDK query options: {options}", { options: queryOptions }); + + // Handle multimodal vs text-only messages + if (isMultimodalMessage(message)) { + // Multimodal message with images + logger.chat.debug("Processing multimodal message with {imageCount} images", { imageCount: message.images.length }); + + const sdkMessage = createMultimodalSDKMessage(message, sessionId); + const messageIterable = createSDKMessageIterable(sdkMessage); + + for await (const sdkMessage of query({ + prompt: messageIterable, + options: queryOptions, + })) { + logger.chat.debug("Claude SDK Message: {sdkMessage}", { sdkMessage }); + yield { + type: "claude_json", + data: sdkMessage, + }; + } + } else { + // Text-only message + let processedMessage = message; + if (message.startsWith("/")) { + processedMessage = message.substring(1); + } + + logger.chat.debug("Processing text-only message"); + + for await (const sdkMessage of query({ + prompt: processedMessage, + options: queryOptions, + })) { + logger.chat.debug("Claude SDK Message: {sdkMessage}", { sdkMessage }); + yield { + type: "claude_json", + data: sdkMessage, + }; + } } yield { type: "done" }; } catch (error) { - // Check if error is due to abort - // TODO: Re-enable when AbortError is properly exported from Claude SDK - // if (error instanceof AbortError) { - // yield { type: "aborted" }; - // } else { - { - logger.chat.error("Claude Code execution failed: {error}", { error }); - yield { - type: "error", - error: error instanceof Error ? error.message : String(error), - }; - } + logger.chat.error("Claude Code execution failed: {error}", { error }); + yield { + type: "error", + error: error instanceof Error ? error.message : String(error), + }; } finally { // Clean up AbortController from map if (requestAbortControllers.has(requestId)) { diff --git a/frontend/src/components/ChatPage.tsx b/frontend/src/components/ChatPage.tsx index a005e877..27990987 100644 --- a/frontend/src/components/ChatPage.tsx +++ b/frontend/src/components/ChatPage.tsx @@ -7,6 +7,7 @@ import type { ProjectInfo, PermissionMode, } from "../types"; +import type { ConversationSummary, ImageData, MultimodalMessage } from "../../../shared/types"; import { useClaudeStreaming } from "../hooks/useClaudeStreaming"; import { useChatState } from "../hooks/chat/useChatState"; import { usePermissions } from "../hooks/chat/usePermissions"; @@ -30,6 +31,13 @@ export function ChatPage() { const [searchParams] = useSearchParams(); const [projects, setProjects] = useState([]); const [isSettingsOpen, setIsSettingsOpen] = useState(false); + const [currentConversation, setCurrentConversation] = useState<{ + title: string; + fullTitle: string; + projectEncodedName: string; + } | null>(null); + // State for uploaded images + const [uploadedImages, setUploadedImages] = useState([]); // Extract and normalize working directory from URL const workingDirectory = (() => { @@ -148,10 +156,23 @@ export function ChatPage() { overridePermissionMode?: PermissionMode, ) => { const content = messageContent || input.trim(); - if (!content || isLoading) return; + if ((!content && uploadedImages.length === 0) || isLoading) return; const requestId = generateRequestId(); + // Prepare message payload - either string or multimodal + let messagePayload: string | MultimodalMessage; + if (uploadedImages.length > 0 && !messageContent) { + // Create multimodal message with images + messagePayload = { + text: content, + images: uploadedImages + }; + } else { + // Regular text-only message + messagePayload = content; + } + // Only add user message to chat if not hidden if (!hideUserMessage) { const userMessage: ChatMessage = { @@ -159,11 +180,16 @@ export function ChatPage() { role: "user", content: content, timestamp: Date.now(), + // Include images if this is a multimodal message + ...(uploadedImages.length > 0 && !messageContent ? { images: uploadedImages } : {}), }; addMessage(userMessage); } - if (!messageContent) clearInput(); + if (!messageContent) { + clearInput(); + setUploadedImages([]); // Clear images after sending + } startRequest(); try { @@ -171,7 +197,7 @@ export function ChatPage() { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ - message: content, + message: messagePayload, requestId, ...(currentSessionId ? { sessionId: currentSessionId } : {}), allowedTools: tools || allowedTools, @@ -259,6 +285,8 @@ export function ChatPage() { processStreamLine, handlePermissionError, createAbortHandler, + uploadedImages, + setUploadedImages, ], ); @@ -580,6 +608,8 @@ export function ChatPage() { showPermissions={isPermissionMode} permissionData={permissionData} planPermissionData={planPermissionData} + images={uploadedImages} + onImagesChange={setUploadedImages} /> )} diff --git a/frontend/src/components/MessageComponents.tsx b/frontend/src/components/MessageComponents.tsx index 71f3954a..ea6240de 100644 --- a/frontend/src/components/MessageComponents.tsx +++ b/frontend/src/components/MessageComponents.tsx @@ -66,6 +66,28 @@ export function ChatMessageComponent({ message }: ChatMessageComponentProps) { }`} /> + + {/* Display images if present (for user messages) */} + {message.images && message.images.length > 0 && ( +
+
+ {message.images.map((image) => ( +
+ {image.name} +
+ {image.name} +
+
+ ))} +
+
+ )} +
         {message.content}
       
diff --git a/frontend/src/components/chat/ChatInput.tsx b/frontend/src/components/chat/ChatInput.tsx index 6b1493cb..cf967bf1 100644 --- a/frontend/src/components/chat/ChatInput.tsx +++ b/frontend/src/components/chat/ChatInput.tsx @@ -1,10 +1,17 @@ import React, { useRef, useEffect, useState } from "react"; -import { StopIcon } from "@heroicons/react/24/solid"; +import { StopIcon, PhotoIcon, XMarkIcon } from "@heroicons/react/24/solid"; import { UI_CONSTANTS, KEYBOARD_SHORTCUTS } from "../../utils/constants"; import { useEnterBehavior } from "../../hooks/useSettings"; import { PermissionInputPanel } from "./PermissionInputPanel"; import { PlanPermissionInputPanel } from "./PlanPermissionInputPanel"; import type { PermissionMode } from "../../types"; +import type { ImageData } from "../../../../shared/types"; +import { + validateImageFiles, + fileToImageData, + formatFileSize, + type ImageValidationError +} from "../../utils/imageUtils"; interface PermissionData { patterns: string[]; @@ -50,6 +57,9 @@ interface ChatInputProps { showPermissions?: boolean; permissionData?: PermissionData; planPermissionData?: PlanPermissionData; + // Image upload props + images?: ImageData[]; + onImagesChange?: (images: ImageData[]) => void; } export function ChatInput({ @@ -64,9 +74,14 @@ export function ChatInput({ showPermissions = false, permissionData, planPermissionData, + images = [], + onImagesChange, }: ChatInputProps) { const inputRef = useRef(null); + const fileInputRef = useRef(null); const [isComposing, setIsComposing] = useState(false); + const [isDragging, setIsDragging] = useState(false); + const [uploadErrors, setUploadErrors] = useState([]); const { enterBehavior } = useEnterBehavior(); // Focus input when not loading and not in permission mode @@ -148,6 +163,72 @@ export function ChatInput({ setTimeout(() => setIsComposing(false), 0); }; + // Image handling functions + const handleFilesSelected = async (files: FileList | null) => { + if (!files || !onImagesChange) return; + + const fileArray = Array.from(files); + const { validFiles, errors } = validateImageFiles(fileArray, images.length); + + setUploadErrors(errors); + + if (validFiles.length > 0) { + try { + const imageDataPromises = validFiles.map(fileToImageData); + const newImageData = await Promise.all(imageDataPromises); + onImagesChange([...images, ...newImageData]); + } catch (error) { + console.error('Error processing images:', error); + setUploadErrors(prev => [...prev, { + type: 'read_error', + message: 'Failed to process one or more images' + }]); + } + } + }; + + const handleFileInputChange = (e: React.ChangeEvent) => { + handleFilesSelected(e.target.files); + // Reset input so same file can be selected again + if (e.target) { + e.target.value = ''; + } + }; + + const handleDragOver = (e: React.DragEvent) => { + e.preventDefault(); + e.stopPropagation(); + setIsDragging(true); + }; + + const handleDragLeave = (e: React.DragEvent) => { + e.preventDefault(); + e.stopPropagation(); + // Only set dragging to false if we're leaving the component entirely + if (!e.currentTarget.contains(e.relatedTarget as Node)) { + setIsDragging(false); + } + }; + + const handleDrop = (e: React.DragEvent) => { + e.preventDefault(); + e.stopPropagation(); + setIsDragging(false); + + const files = e.dataTransfer.files; + handleFilesSelected(files); + }; + + const removeImage = (imageId: string) => { + if (onImagesChange) { + onImagesChange(images.filter(img => img.id !== imageId)); + } + }; + + const openFileDialog = () => { + fileInputRef.current?.click(); + }; + // Get permission mode status indicator (CLI-style) const getPermissionModeIndicator = (mode: PermissionMode): string => { switch (mode) { @@ -210,7 +291,76 @@ export function ChatInput({ return (
-
+ {/* Image previews */} + {images.length > 0 && ( +
+
+ + + {images.length} image{images.length > 1 ? 's' : ''} attached + +
+
+ {images.map((image) => ( +
+ {image.name} + +
+ {image.name} ({formatFileSize(image.size)}) +
+
+ ))} +
+
+ )} + + {/* Upload errors */} + {uploadErrors.length > 0 && ( +
+
+ Upload errors: +
+ {uploadErrors.map((error, index) => ( +
+ {error.fileName && `${error.fileName}: `}{error.message} +
+ ))} +
+ )} + + {/* Drag overlay */} + {isDragging && ( +
+
+ +
+ Drop images here +
+
+ Supports JPEG, PNG, GIF, WebP (max 5MB each) +
+
+
+ )} + +