diff --git a/.changeset/image-and-video-inputs.md b/.changeset/image-and-video-inputs.md
new file mode 100644
index 000000000..e6a324899
--- /dev/null
+++ b/.changeset/image-and-video-inputs.md
@@ -0,0 +1,30 @@
+---
+'@tanstack/ai': minor
+'@tanstack/ai-openai': minor
+'@tanstack/ai-gemini': minor
+'@tanstack/ai-fal': minor
+'@tanstack/ai-grok': minor
+'@tanstack/ai-openrouter': minor
+'@tanstack/ai-client': minor
+'@tanstack/ai-event-client': patch
+---
+
+`generateImage()` and `generateVideo()` now accept a multimodal `prompt`: a plain string, or an ordered array of content parts (`TextPart` / `ImagePart` / `VideoPart` / `AudioPart`) for image-conditioned generation, image-to-image, multi-reference, image-to-video, and edit / inpaint flows. Part order is meaningful — "not like this _(image)_, more like this _(image)_" — and each media part may carry a `metadata.role` hint (`'reference' | 'mask' | 'control' | 'start_frame' | 'end_frame' | 'character'`) that adapters use to route to the provider-specific field, plus an informational `metadata.tag` label for your own bookkeeping. The accepted part types are narrowed per model at compile time via each adapter's input-modality map, so passing an image part to a text-only model is a type error (with a clear runtime throw as backstop).
+
+Prompt text is always sent **verbatim** — the SDK never injects or rewrites in-prompt referencing markers. To reference inputs from your prompt, write the provider's own convention (fal Kling / Seedance `@Image1`, OpenAI / FLUX.2 `"image 1"` prose, Gemini content descriptions); see the image-generation docs for the per-provider table.
+
+Provider behavior in this release:
+
+- **OpenAI image** — Prompts with image parts route `gpt-image-2` / `gpt-image-1` / `gpt-image-1-mini` to `images.edit()` (up to 16 source images plus optional mask); `dall-e-2` routes to `images.edit()` with one source image; `dall-e-3` rejects image parts at compile time and at runtime.
+- **OpenAI video** — Sora-2 / Sora-2-Pro accept a single image part as `input_reference`; passing more than one throws.
+- **Gemini image** — Native models (`gemini-*-flash-image`, "nano-banana") map prompt parts 1:1 onto multimodal `contents`, preserving interleaved order. Imagen is text-only (compile-time + runtime rejection).
+- **fal.ai** — Field names resolve per endpoint from a map generated from the fal SDK's endpoint types (362 endpoints with nonstandard fields, e.g. nano-banana edit → `image_urls`, Kling i2v start frame → `image_url`, Veo first-last-frame → `first_frame_url` / `last_frame_url`). Defaults for endpoints not in the map: single → `image_url`, multiple → `image_urls`; `role: 'mask'` → `mask_url`; `role: 'control'` → `control_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`; video `role: 'start_frame'` / `'end_frame'` → `start_image_url` / `end_image_url`. Per-model prompt modalities are derived at the type level from the SDK's endpoint input types. Regenerate the map after a fal SDK bump with `pnpm generate:fal-image-fields` (a unit test fails when it goes stale). In `FalImageProviderOptions` / `FalVideoProviderOptions`, media-conditioning fields the mappers can populate (`image_url`, `start_image_url`, `video_url`, `audio_url`, …) are demoted from required to optional — supply them as prompt parts, or keep passing them explicitly via `modelOptions`.
+- **Grok** — New `grok-imagine-image` / `grok-imagine-image-quality` models. Prompts with image parts route to xAI's JSON `/v1/images/edits` endpoint (up to 3 source images, addressed by xAI in request order; the prompt is sent verbatim). `role: 'mask'` / `'control'` throw. Their `size` uses an `aspectRatio_resolution` template (`'16:9_2k'`, suffix optional) mirroring Gemini's native image models. `grok-2-image-1212` remains text-to-image only.
+- **OpenRouter** — Prompt parts map 1:1 onto multimodal `text` / `image_url` chat content parts, preserving interleaved order, and are forwarded to the underlying image model. URL sources pass through verbatim (no fetching or re-encoding in your process); `data` sources become data URIs.
+- **Anthropic** — Unchanged (no image generation API).
+
+A new `resolveMediaPrompt()` utility (exported from `@tanstack/ai`) is the single conversion point from the canonical interleaved prompt shape to flattened text plus per-modality part buckets; it is intended for adapter authors.
+
+On the client side, `ImageGenerateInput.prompt` and `VideoGenerateInput.prompt` (`@tanstack/ai-client`, and the `useGenerateImage` / `useGenerateVideo` hooks built on them) are widened from `string` to the same `MediaPrompt` shape, so prompt parts can be sent from the browser through your server route to `generateImage()` / `generateVideo()`.
+
+Closes #618.
diff --git a/.gitignore b/.gitignore
index 6678fb779..b261f62d1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -78,3 +78,4 @@ solo.yml
# Agent scratch output (gap-analysis reports, triage notes — generated locally)
.agent/gap-analysis/
.agent/triage/
+.agent/research/
diff --git a/.prettierignore b/.prettierignore
index c72af168a..a4770926f 100644
--- a/.prettierignore
+++ b/.prettierignore
@@ -5,6 +5,7 @@
**/coverage
**/dist
**/docs
+packages/ai-fal/src/image/generated/
pnpm-lock.yaml
.angular
diff --git a/docs/adapters/grok.md b/docs/adapters/grok.md
index 528226903..7103895b4 100644
--- a/docs/adapters/grok.md
+++ b/docs/adapters/grok.md
@@ -160,6 +160,51 @@ const result = await generateImage({
console.log(result.images);
```
+The grok-imagine models (`grok-imagine-image`, `grok-imagine-image-quality`)
+are sized by aspect ratio — `size` takes an `aspectRatio_resolution` template
+like `"16:9_2k"` (the resolution suffix, here `_2k`, is optional):
+
+```typescript
+const result = await generateImage({
+ adapter: grokImage("grok-imagine-image"),
+ prompt: "A futuristic cityscape at sunset",
+ size: "16:9_2k",
+});
+```
+
+### Image Editing (image-to-image)
+
+The grok-imagine models accept image prompt parts for image-conditioned
+generation via xAI's `/v1/images/edits` endpoint — up to 3 source images,
+addressed by xAI in the order they appear in the prompt. Per xAI's docs
+there is no in-prompt referencing syntax; write the prompt naturally and
+your text is sent verbatim:
+
+```typescript
+const result = await generateImage({
+ adapter: grokImage("grok-imagine-image"),
+ prompt: [
+ {
+ type: "text",
+ content: "Render the product in the style of the second image",
+ },
+ {
+ type: "image",
+ source: { type: "url", value: "https://example.com/product.png" },
+ },
+ {
+ type: "image",
+ source: { type: "url", value: "https://example.com/style.png" },
+ },
+ ],
+});
+```
+
+URL sources are fetched by xAI's servers, so they must be publicly
+reachable; use a `data` source for private images. `grok-2-image-1212` is
+text-to-image only — image prompt parts are a compile-time type error and
+throw at runtime.
+
## Text-to-Speech
Generate speech with Grok TTS:
diff --git a/docs/media/image-generation.md b/docs/media/image-generation.md
index d8af2e816..9f5d1fbba 100644
--- a/docs/media/image-generation.md
+++ b/docs/media/image-generation.md
@@ -22,7 +22,7 @@ TanStack AI provides support for image generation through dedicated image adapte
Image generation is handled by image adapters that follow the same tree-shakeable architecture as other adapters in TanStack AI. The image adapters support:
-- **OpenAI**: DALL-E 2, DALL-E 3, GPT-Image-1, and GPT-Image-1-Mini models
+- **OpenAI**: DALL-E 2, DALL-E 3, GPT-Image-1, GPT-Image-1-Mini, and GPT-Image-2 models
- **Gemini**: Gemini native image models (NanoBanana) and Imagen 3/4 models
- **fal.ai**: 600+ models including Nano Banana Pro, FLUX, and more
@@ -76,7 +76,7 @@ All image adapters support these common options:
| Option | Type | Description |
|--------|------|-------------|
| `adapter` | `ImageAdapter` | Image adapter instance with model (required) |
-| `prompt` | `string` | Text description of the image to generate (required) |
+| `prompt` | `string \| MediaPromptPart[]` | Description of the image to generate (required). A plain string, or — on models that support image-conditioned generation — an ordered array of content parts interleaving text with image inputs. See [Image-Conditioned Generation](#image-conditioned-generation) below. |
| `numberOfImages` | `number` | Number of images to generate |
| `size` | `string` | Size of the generated image in WIDTHxHEIGHT format |
| `modelOptions?` | `object` | Model-specific options (renamed from `providerOptions`) |
@@ -130,6 +130,169 @@ const result = await generateImage({
})
```
+## Image-Conditioned Generation
+
+For image-to-image, reference-guided, multi-reference, and edit / inpaint
+flows, pass the `prompt` as an ordered array of content parts — the same
+`TextPart` / `ImagePart` shapes used elsewhere for multimodal content:
+
+```typescript
+import { generateImage } from '@tanstack/ai'
+import { openaiImage } from '@tanstack/ai-openai'
+
+await generateImage({
+ adapter: openaiImage('gpt-image-2'),
+ prompt: [
+ { type: 'text', content: 'Turn this into a cinematic product photo' },
+ {
+ type: 'image',
+ source: { type: 'url', value: 'https://example.com/product.png' },
+ },
+ ],
+})
+```
+
+Part order is meaningful. Providers with natively multimodal prompts
+(Gemini image models, OpenRouter) receive the parts exactly as written, so
+text can refer to its neighbouring images:
+
+```typescript
+await generateImage({
+ adapter: geminiImage('gemini-3.1-flash-image-preview'),
+ prompt: [
+ { type: 'text', content: 'Not like this' },
+ { type: 'image', source: { type: 'url', value: badExampleUrl } },
+ { type: 'text', content: 'more like this' },
+ { type: 'image', source: { type: 'url', value: goodExampleUrl } },
+ ],
+})
+```
+
+Providers with named request fields (OpenAI, fal, xAI) extract the image
+parts and flatten the text (text parts are joined verbatim, each as its
+own paragraph).
+
+The accepted part types are narrowed **per model at compile time**: passing
+an image part to a text-only model (e.g. `dall-e-3`, Imagen) is a type
+error, not just a runtime throw.
+
+### Referencing images from your prompt
+
+**Your prompt text is always sent verbatim — the SDK never injects or
+rewrites referencing markers.** When you want the text to refer to specific
+input images, write the provider's own convention yourself:
+
+| Provider | Convention | Example |
+| -------- | ---------- | ------- |
+| **OpenAI** (gpt-image) | Indexed prose, per OpenAI's prompting guide | `"apply the style of image 2 to image 1"` |
+| **FLUX.2 on fal / BFL** | Indexed prose (BFL's docs parse `image N`) | `"subject from image 1, style from image 2"` |
+| **Gemini** (native image models) | Describe the reference by content/role | `"using the attached fabric sample as the texture"` |
+| **fal Kling / Seedance endpoints** | `@`-tags, 1-indexed by input order | `"Put @Image1 in the style of @Image2"` |
+| **xAI grok-imagine** | No in-prompt syntax — images addressed in request order | `"render the product in the style of the second image"` |
+
+To keep track of which part you meant by "image 2" or `@Image2`, you can
+label parts with the informational `metadata.tag` field — the SDK ignores
+it, but it keeps your code self-documenting:
+
+```typescript
+prompt: [
+ { type: 'text', content: 'Put @Image1 in the style of @Image2' },
+ { type: 'image', source: { type: 'url', value: productUrl },
+ metadata: { tag: 'product' } },
+ { type: 'image', source: { type: 'url', value: styleUrl },
+ metadata: { tag: 'style' } },
+]
+```
+
+### Source format
+
+`ImagePart.source` is a discriminated union supporting both URLs and inline
+base64 data — pass whichever you have:
+
+```typescript
+// URL source
+{ type: 'image', source: { type: 'url', value: 'https://example.com/img.png' } }
+
+// Inline base64 data (mimeType required)
+{ type: 'image', source: { type: 'data', value: base64String, mimeType: 'image/png' } }
+```
+
+OpenAI's edit endpoint requires file uploads; the adapter fetches URL sources
+and converts base64 to a `File` automatically.
+
+### Role hints via `metadata.role`
+
+When a generation has multiple inputs with different roles (mask vs reference
+vs start/end frame), set `metadata.role` on each part. Adapters route by role
+to the provider-specific field; parts without a role fall back to positional
+mapping.
+
+| Role | Maps to |
+| --------------- | -------------------------------------------------------------------------------------- |
+| `'reference'` | fal `reference_image_urls`; Gemini multimodal part; positional fallback |
+| `'character'` | Same as `'reference'`; Veo `referenceImages` slot (planned — no Veo adapter yet) |
+| `'mask'` | OpenAI `mask` (gpt-image-2, gpt-image-1, dall-e-2); fal `mask_url` |
+| `'control'` | fal `control_image_url` (ControlNet / depth / pose conditioning) |
+| `'start_frame'` | fal `start_image_url`; Veo `image` (planned). Used by `generateVideo`.                 |
+| `'end_frame'`   | fal `end_image_url`; Veo `lastFrame` (planned). Used by `generateVideo`.               |
+
+#### Inpaint / edit with a mask
+
+```typescript
+await generateImage({
+ adapter: openaiImage('gpt-image-2'),
+ prompt: [
+ { type: 'text', content: 'Replace the masked region with a tree' },
+ {
+ type: 'image',
+ source: { type: 'url', value: photoUrl },
+ },
+ {
+ type: 'image',
+ source: { type: 'url', value: maskUrl },
+ metadata: { role: 'mask' },
+ },
+ ],
+})
+```
+
+#### Multi-reference composition
+
+```typescript
+await generateImage({
+ adapter: geminiImage('gemini-3.1-flash-image-preview'),
+ prompt: [
+ {
+ type: 'text',
+ content:
+ 'Generate a new image of the product using the style of the second reference',
+ },
+ {
+ type: 'image',
+ source: { type: 'url', value: 'https://example.com/product.png' },
+ },
+ {
+ type: 'image',
+ source: { type: 'url', value: 'https://example.com/style.png' },
+ },
+ ],
+})
+```
+
+### Provider support
+
+| Provider | Behavior |
+| ------------ | --------------------------------------------------------------------------------------------------------- |
+| **OpenAI** | `gpt-image-2` / `gpt-image-1` / `gpt-image-1-mini` → routes to `images.edit()`, up to 16 source images plus optional mask. `dall-e-2` → `images.edit()` with 1 source image only. `dall-e-3` → throws (no edit support). |
+| **Gemini** | Native models (`gemini-*-flash-image`, "nano-banana", etc.) → prompt parts map 1:1 onto multimodal `contents`, preserving interleaved order. Up to ~14 input images (provider limit, not enforced by the SDK). Imagen models → throws (text-to-image only). |
+| **fal.ai** | Field names resolve per endpoint from a map generated from the fal SDK's endpoint types (e.g. nano-banana edit gets `image_urls`, Fooocus masks get `mask_image_url`). Defaults for unknown endpoints: 1 input → `image_url`; multiple → `image_urls`; `role: 'mask'` → `mask_url`; `role: 'control'` → `control_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`. Override with `modelOptions` for endpoint-specific fields. |
+| **Grok** | grok-imagine models → xAI's `/v1/images/edits` (up to 3 source images, addressed by xAI in request order; prompt sent verbatim). `role: 'mask'` / `'control'` throw (no Imagine API equivalent). `grok-2-image-1212` throws (text-to-image only). |
+| **OpenRouter** | Prompt parts map 1:1 onto multimodal `image_url` / `text` content parts, preserving interleaved order, and are forwarded to the underlying image model. |
+| **Anthropic** | n/a — no image generation API. |
+
+Adapters that don't support image-conditioned generation throw a clear
+runtime error so calls fail fast rather than silently dropping the inputs.
+
## Model Options
### OpenAI Model Options
diff --git a/docs/media/video-generation.md b/docs/media/video-generation.md
index 4af93f020..3c497b053 100644
--- a/docs/media/video-generation.md
+++ b/docs/media/video-generation.md
@@ -363,11 +363,93 @@ And returns:
| Option | Type | Description |
|--------|------|-------------|
| `adapter` | `VideoAdapter` | Video adapter instance with model (required) |
-| `prompt` | `string` | Text description of the video to generate (required) |
+| `prompt` | `string \| MediaPromptPart[]` | Description of the video to generate (required). A plain string, or — on models that support conditioned generation — an ordered array of content parts interleaving text with image / video / audio inputs. See [Image-to-Video](#image-to-video) below. |
| `size` | `string` | Video resolution in WIDTHxHEIGHT format |
| `duration` | `number` | Video duration in seconds (maps to `seconds` parameter in API) |
| `modelOptions?` | `object` | Model-specific options (renamed from `providerOptions`) |
+## Image-to-Video
+
+For starting-frame, ending-frame, and reference-image conditioned video
+generation, pass the `prompt` as an array of content parts:
+
+```typescript
+import { generateVideo } from '@tanstack/ai'
+import { openaiVideo } from '@tanstack/ai-openai'
+
+const { jobId } = await generateVideo({
+ adapter: openaiVideo('sora-2'),
+ prompt: [
+ {
+ type: 'text',
+ content:
+ 'Animate this still into a slow cinematic push-in with subtle motion',
+ },
+ {
+ type: 'image',
+ source: {
+ type: 'data',
+ value: base64Image,
+ mimeType: 'image/png',
+ },
+ },
+ ],
+})
+```
+
+The accepted part types are narrowed **per model at compile time** — fal
+endpoints, for example, only admit image / video / audio parts that their
+SDK input type actually declares fields for.
+
+Prompt text is always sent **verbatim** — the SDK never injects or rewrites
+in-prompt referencing markers. Some fal video endpoints have their own
+referencing syntax you can write directly in your text (e.g. Kling v3
+elements as `@Element1`, Seedance 2.0 reference-to-video as `@Image1` /
+`@Video1` / `@Audio1`, 1-indexed by input order); Veo and Sora take
+reference images as plain inputs with naturally written prompts. See
+[Referencing images from your prompt](./image-generation.md#referencing-images-from-your-prompt)
+for the per-provider table.
+
+### Role hints
+
+Each `ImagePart` can carry an optional `metadata.role` hint that the
+adapter uses to route the input to the provider-specific field:
+
+| Role | Maps to |
+| --------------- | ------------------------------------------------------------- |
+| `'start_frame'` | fal `start_image_url`, or the endpoint's own field where it differs (e.g. `image_url`, `first_frame_url`); positional default for the first input |
+| `'end_frame'` | fal `end_image_url` (Veo `lastFrame` planned — no Veo adapter yet) |
+| `'reference'` | fal `reference_image_urls` (Veo `referenceImages` planned) |
+| `'character'` | Same as `'reference'` — character consistency images |
+
+```typescript
+import { falVideo } from '@tanstack/ai-fal'
+
+await generateVideo({
+ adapter: falVideo('fal-ai/kling-video/v3/pro/image-to-video'),
+ prompt: [
+ { type: 'image', source: { type: 'url', value: firstFrameUrl } },
+ { type: 'text', content: 'Slow cinematic push-in then a hard cut' },
+ {
+ type: 'image',
+ source: { type: 'url', value: lastFrameUrl },
+ metadata: { role: 'end_frame' },
+ },
+ ],
+})
+```
+
+### Provider support
+
+| Provider | Image-to-Video Behavior |
+| ------------ | -------------------------------------------------------------------------------------------------------- |
+| **OpenAI** | Sora-2 / Sora-2-Pro → the image part goes to `input_reference`; flattened text is the prompt. Single image only — throws if more than one. |
+| **fal.ai** | Field names resolve per endpoint from a map generated from the fal SDK's endpoint types — e.g. `role: 'start_frame'` lands on `image_url` for Kling/Veo image-to-video, `first_frame_url` for first-last-frame endpoints, and `start_image_url` otherwise. Defaults: single input → `image_url` (start frame); `role: 'end_frame'` → `end_image_url`; `role: 'reference'` / `'character'` → `reference_image_urls`. Override per-endpoint via `modelOptions` — the media-conditioning fields are typed optional there (even when the endpoint requires them) since they usually arrive as prompt parts. |
+| **Gemini** | Veo adapter not yet implemented — image prompt parts will be supported when Veo lands. |
+
+Adapters whose underlying API can't accept image inputs throw a clear
+runtime error so calls fail fast.
+
### Supported Sizes
Based on [OpenAI API docs](https://platform.openai.com/docs/api-reference/videos/create):
diff --git a/examples/ts-react-media/src/components/ImageGenerator.tsx b/examples/ts-react-media/src/components/ImageGenerator.tsx
index 484df42c9..81dd1a5f3 100644
--- a/examples/ts-react-media/src/components/ImageGenerator.tsx
+++ b/examples/ts-react-media/src/components/ImageGenerator.tsx
@@ -1,10 +1,13 @@
-import { useState } from 'react'
-import { ImageIcon, Loader2, Shuffle } from 'lucide-react'
+import { useRef, useState } from 'react'
+import { ImageIcon, Loader2, Plus, Shuffle, X } from 'lucide-react'
import type { ImageGenerationResult } from '@tanstack/ai'
+import type { MediaPrompt } from '@tanstack/ai/client'
import { generateImageFn } from '@/lib/server-functions'
import { getRandomImagePrompt } from '@/lib/prompts'
import { IMAGE_MODELS } from '@/lib/models'
+import { readImageFile, toImagePart } from '@/lib/media'
+import type { AttachedImage } from '@/lib/media'
interface ImageGeneratorProps {
onImageGenerated?: (imageUrl: string) => void
@@ -32,11 +35,37 @@ export default function ImageGenerator({
const [selectedModel, setSelectedModel] = useState('all')
const [isLoading, setIsLoading] = useState(false)
const [results, setResults] = useState>({})
+ const [images, setImages] = useState>([])
+ const fileInputRef = useRef(null)
const currentModel = IMAGE_MODELS.find((m) => m.id === selectedModel)
+ // When images are attached, send an ordered parts array (text first, then one
+ // image part per attachment). Otherwise send the plain string. Only image-capable
+ // models accept image inputs — unsupported models surface a server error.
+ const buildPrompt = (): MediaPrompt => {
+ if (images.length === 0) return prompt
+ return [
+ { type: 'text', content: prompt },
+ ...images.map((image) => toImagePart(image)),
+ ]
+ }
+
+ const handleImageSelect = async (e: React.ChangeEvent) => {
+ const files = Array.from(e.target.files ?? [])
+ if (fileInputRef.current) fileInputRef.current.value = ''
+ if (files.length === 0) return
+ const attached = await Promise.all(files.map((file) => readImageFile(file)))
+ setImages((prev) => [...prev, ...attached])
+ }
+
+ const removeImage = (id: string) => {
+ setImages((prev) => prev.filter((image) => image.id !== id))
+ }
+
const handleGenerate = async () => {
if (!prompt.trim()) return
+ const builtPrompt = buildPrompt()
setIsLoading(true)
setResults({})
@@ -53,7 +82,7 @@ export default function ImageGenerator({
const promises = IMAGE_MODELS.map(async (model) => {
try {
const response = await generateImageFn({
- data: { prompt, model: model.id },
+ data: { prompt: builtPrompt, model: model.id },
})
setResults((prev) => ({
...prev,
@@ -83,7 +112,7 @@ export default function ImageGenerator({
try {
const response = await generateImageFn({
- data: { prompt, model: selectedModel },
+ data: { prompt: builtPrompt, model: selectedModel },
})
setResults({ [selectedModel]: { status: 'success', result: response } })
const image = response.images[0]
@@ -162,6 +191,55 @@ export default function ImageGenerator({
/>
+
+
+
+
+ Supported by Gemini native (NanoBanana) models only
+
+