Skip to content

Commit 1110db1

Browse files
committed
Update media
1 parent dc4b7fa commit 1110db1

8 files changed

Lines changed: 183 additions & 49 deletions

File tree

apps/sim/lib/copilot/generated/tool-catalog-v1.ts

Lines changed: 21 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1421,7 +1421,8 @@ export const GenerateAudio: ToolCatalogEntry = {
14211421
properties: {
14221422
duration: {
14231423
type: 'number',
1424-
description: 'Approximate duration in seconds for music/sfx where supported.',
1424+
description:
1425+
'Approximate duration in seconds for sfx (and music models that support it). MiniMax music ignores this — fit music to a video with the ffmpeg tool instead.',
14251426
},
14261427
inputs: {
14271428
type: 'object',
@@ -1486,6 +1487,16 @@ export const GenerateAudio: ToolCatalogEntry = {
14861487
},
14871488
},
14881489
},
1490+
instrumental: {
1491+
type: 'boolean',
1492+
description:
1493+
'For music: true = instrumental, no vocals (default); false = a song with vocals.',
1494+
},
1495+
lyrics: {
1496+
type: 'string',
1497+
description:
1498+
'For music with vocals: the lyrics to sing (optional; supports [Verse]/[Chorus] tags). Setting this implies instrumental=false.',
1499+
},
14891500
model: {
14901501
type: 'string',
14911502
description:
@@ -1698,7 +1709,8 @@ export const GenerateVideo: ToolCatalogEntry = {
16981709
},
16991710
generateAudio: {
17001711
type: 'boolean',
1701-
description: 'Generate native audio when the model supports it (default true for Veo).',
1712+
description:
1713+
"Toggle Veo's native audio (dialogue/SFX/ambience/music generated from the prompt). Default true. Set false when you will add your own voiceover/music via the ffmpeg tool.",
17021714
},
17031715
inputs: {
17041716
type: 'object',
@@ -1766,12 +1778,11 @@ export const GenerateVideo: ToolCatalogEntry = {
17661778
model: {
17671779
type: 'string',
17681780
description:
1769-
'Optional model override. Defaults to veo-3.1-fast (native audio, cheapest Veo tier). Use veo-3.1 for 4K/cinematic; seedance-2.0 for longer narrative.',
1781+
"Optional model override, keyed to the video's goal: veo-3.1-lite (prototype/quick test, cheapest), veo-3.1-fast (reasonable draft — default, good video), veo-3.1 Standard (final cut / premium quality). Stay on Veo unless the user explicitly asks for another model; seedance-2.0 for >8s narrative, kling-v3-pro for specific looks.",
17701782
enum: [
17711783
'veo-3.1',
17721784
'veo-3.1-fast',
1773-
'sora-2',
1774-
'sora-2-pro',
1785+
'veo-3.1-lite',
17751786
'seedance-2.0',
17761787
'seedance-2.0-fast',
17771788
'kling-v3-pro',
@@ -1780,6 +1791,11 @@ export const GenerateVideo: ToolCatalogEntry = {
17801791
'ltx-2.3',
17811792
],
17821793
},
1794+
negativePrompt: {
1795+
type: 'string',
1796+
description:
1797+
'Things to exclude from the video/audio (Veo models), e.g. "no background music" to keep dialogue but drop Veo\'s invented music before overlaying your own track.',
1798+
},
17831799
outputs: {
17841800
type: 'object',
17851801
description:

apps/sim/lib/copilot/generated/tool-schemas-v1.ts

Lines changed: 21 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1217,7 +1217,8 @@ export const TOOL_RUNTIME_SCHEMAS: Record<string, ToolRuntimeSchemaEntry> = {
12171217
properties: {
12181218
duration: {
12191219
type: 'number',
1220-
description: 'Approximate duration in seconds for music/sfx where supported.',
1220+
description:
1221+
'Approximate duration in seconds for sfx (and music models that support it). MiniMax music ignores this — fit music to a video with the ffmpeg tool instead.',
12211222
},
12221223
inputs: {
12231224
type: 'object',
@@ -1288,6 +1289,16 @@ export const TOOL_RUNTIME_SCHEMAS: Record<string, ToolRuntimeSchemaEntry> = {
12881289
},
12891290
},
12901291
},
1292+
instrumental: {
1293+
type: 'boolean',
1294+
description:
1295+
'For music: true = instrumental, no vocals (default); false = a song with vocals.',
1296+
},
1297+
lyrics: {
1298+
type: 'string',
1299+
description:
1300+
'For music with vocals: the lyrics to sing (optional; supports [Verse]/[Chorus] tags). Setting this implies instrumental=false.',
1301+
},
12911302
model: {
12921303
type: 'string',
12931304
description:
@@ -1497,7 +1508,8 @@ export const TOOL_RUNTIME_SCHEMAS: Record<string, ToolRuntimeSchemaEntry> = {
14971508
},
14981509
generateAudio: {
14991510
type: 'boolean',
1500-
description: 'Generate native audio when the model supports it (default true for Veo).',
1511+
description:
1512+
"Toggle Veo's native audio (dialogue/SFX/ambience/music generated from the prompt). Default true. Set false when you will add your own voiceover/music via the ffmpeg tool.",
15011513
},
15021514
inputs: {
15031515
type: 'object',
@@ -1571,12 +1583,11 @@ export const TOOL_RUNTIME_SCHEMAS: Record<string, ToolRuntimeSchemaEntry> = {
15711583
model: {
15721584
type: 'string',
15731585
description:
1574-
'Optional model override. Defaults to veo-3.1-fast (native audio, cheapest Veo tier). Use veo-3.1 for 4K/cinematic; seedance-2.0 for longer narrative.',
1586+
"Optional model override, keyed to the video's goal: veo-3.1-lite (prototype/quick test, cheapest), veo-3.1-fast (reasonable draft — default, good video), veo-3.1 Standard (final cut / premium quality). Stay on Veo unless the user explicitly asks for another model; seedance-2.0 for >8s narrative, kling-v3-pro for specific looks.",
15751587
enum: [
15761588
'veo-3.1',
15771589
'veo-3.1-fast',
1578-
'sora-2',
1579-
'sora-2-pro',
1590+
'veo-3.1-lite',
15801591
'seedance-2.0',
15811592
'seedance-2.0-fast',
15821593
'kling-v3-pro',
@@ -1585,6 +1596,11 @@ export const TOOL_RUNTIME_SCHEMAS: Record<string, ToolRuntimeSchemaEntry> = {
15851596
'ltx-2.3',
15861597
],
15871598
},
1599+
negativePrompt: {
1600+
type: 'string',
1601+
description:
1602+
'Things to exclude from the video/audio (Veo models), e.g. "no background music" to keep dialogue but drop Veo\'s invented music before overlaying your own track.',
1603+
},
15881604
outputs: {
15891605
type: 'object',
15901606
description:

apps/sim/lib/copilot/tools/server/media/generate-audio.ts

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,10 @@ interface GenerateAudioArgs {
2323
model?: string
2424
voice?: string
2525
duration?: number
26+
/** For music: explicit lyrics for a vocal track. */
27+
lyrics?: string
28+
/** For music: true = instrumental (default), false = vocal track. */
29+
instrumental?: boolean
2630
/** Optional reference voice sample (workspace audio file) for zero-shot voice cloning. */
2731
inputs?: { files?: Array<{ path: string }> }
2832
outputs?: {
@@ -103,6 +107,8 @@ export const generateAudioServerTool: BaseServerTool<GenerateAudioArgs, Generate
103107
model: params.model,
104108
voice: params.voice,
105109
duration: params.duration,
110+
lyrics: params.lyrics,
111+
instrumental: params.instrumental,
106112
voiceSampleDataUri,
107113
})
108114

apps/sim/lib/copilot/tools/server/media/generate-video.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@ interface GenerateVideoArgs {
2222
resolution?: string
2323
duration?: number
2424
generateAudio?: boolean
25+
negativePrompt?: string
2526
promptOptimizer?: boolean
2627
inputs?: { files?: Array<{ path: string }> }
2728
outputs?: {
@@ -83,6 +84,7 @@ export const generateVideoServerTool: BaseServerTool<GenerateVideoArgs, Generate
8384
resolution: params.resolution,
8485
duration: params.duration,
8586
generateAudio: params.generateAudio,
87+
negativePrompt: params.negativePrompt,
8688
promptOptimizer: params.promptOptimizer,
8789
imageDataUri,
8890
})

apps/sim/lib/media/falai-audio.ts

Lines changed: 34 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,10 @@ export interface GenerateFalAudioParams {
2020
model?: string
2121
voice?: string
2222
duration?: number
23+
/** For music: explicit lyrics (with optional [Verse]/[Chorus] tags). Implies a vocal track. */
24+
lyrics?: string
25+
/** For music: true = instrumental (no vocals, default); false = vocal track. */
26+
instrumental?: boolean
2327
/** When set, clones the voice from this reference sample (data URI) via a zero-shot clone model. */
2428
voiceSampleDataUri?: string
2529
}
@@ -33,7 +37,11 @@ export interface GeneratedAudio {
3337
cost: FalAICostMetadata
3438
}
3539

36-
function buildInput(type: AudioType, params: GenerateFalAudioParams): Record<string, unknown> {
40+
function buildInput(
41+
type: AudioType,
42+
params: GenerateFalAudioParams,
43+
model: string
44+
): Record<string, unknown> {
3745
const input: Record<string, unknown> = {}
3846
if (type === 'speech') {
3947
// Gemini 3.1 Flash TTS takes the text (with optional inline tags) in `prompt`.
@@ -44,9 +52,31 @@ function buildInput(type: AudioType, params: GenerateFalAudioParams): Record<str
4452
input.text = params.prompt
4553
if (params.duration !== undefined) input.duration_seconds = params.duration
4654
} else {
47-
// Music models take a `prompt` describing the track.
55+
// Music. Two modes, both supported:
56+
// - instrumental bed (default): no vocals, no lyrics required
57+
// - song with vocals: explicit `lyrics`, or auto-written from the prompt
4858
input.prompt = params.prompt
49-
if (params.duration !== undefined) input.duration = params.duration
59+
const wantsVocals = params.instrumental === false || Boolean(params.lyrics)
60+
if (model.includes('minimax')) {
61+
// MiniMax Music 2.6 requires `lyrics` unless is_instrumental=true, and rejects a
62+
// top-level `duration` (that combination is the 422 we were hitting on every call).
63+
if (wantsVocals) {
64+
input.is_instrumental = false
65+
if (params.lyrics) input.lyrics = params.lyrics
66+
else input.lyrics_optimizer = true
67+
} else {
68+
input.is_instrumental = true
69+
}
70+
} else if (model.includes('elevenlabs/music')) {
71+
if (!wantsVocals) input.force_instrumental = true
72+
if (params.lyrics) input.prompt = `${params.prompt}\n\nLyrics:\n${params.lyrics}`
73+
if (params.duration !== undefined) input.music_length_ms = Math.round(params.duration * 1000)
74+
} else {
75+
// Other music models: best-effort passthrough.
76+
if (params.instrumental !== undefined) input.instrumental = params.instrumental
77+
if (params.lyrics) input.lyrics = params.lyrics
78+
if (params.duration !== undefined) input.duration = params.duration
79+
}
5080
}
5181
return input
5282
}
@@ -80,7 +110,7 @@ export async function generateFalAudio(params: GenerateFalAudioParams): Promise<
80110
}
81111

82112
const model = params.model || DEFAULT_AUDIO_MODELS[type]
83-
const input = buildInput(type, params)
113+
const input = buildInput(type, params, model)
84114

85115
// For fal audio models the model ID is the queue endpoint.
86116
const { requestId, data } = await runFalQueue(model, input, apiKey)

apps/sim/lib/media/falai-video.ts

Lines changed: 18 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@ interface FalVideoModelConfig {
1818
supportsAspectRatio?: boolean
1919
supportsResolution?: boolean
2020
supportsGenerateAudio?: boolean
21+
supportsNegativePrompt?: boolean
2122
supportsPromptOptimizer?: boolean
2223
}
2324

@@ -31,6 +32,7 @@ const VIDEO_MODELS: Record<string, FalVideoModelConfig> = {
3132
supportsAspectRatio: true,
3233
supportsResolution: true,
3334
supportsGenerateAudio: true,
35+
supportsNegativePrompt: true,
3436
},
3537
'veo-3.1-fast': {
3638
endpoint: 'fal-ai/veo3.1/fast',
@@ -39,18 +41,16 @@ const VIDEO_MODELS: Record<string, FalVideoModelConfig> = {
3941
supportsAspectRatio: true,
4042
supportsResolution: true,
4143
supportsGenerateAudio: true,
44+
supportsNegativePrompt: true,
4245
},
43-
'sora-2': {
44-
endpoint: 'fal-ai/sora-2/text-to-video',
45-
durationFormat: 'number',
46-
supportsAspectRatio: true,
47-
supportsResolution: true,
48-
},
49-
'sora-2-pro': {
50-
endpoint: 'fal-ai/sora-2/text-to-video/pro',
51-
durationFormat: 'number',
46+
'veo-3.1-lite': {
47+
endpoint: 'fal-ai/veo3.1/lite',
48+
i2vEndpoint: 'fal-ai/veo3.1/lite/image-to-video',
49+
durationFormat: 'seconds',
5250
supportsAspectRatio: true,
5351
supportsResolution: true,
52+
supportsGenerateAudio: true,
53+
supportsNegativePrompt: true,
5454
},
5555
'seedance-2.0': {
5656
endpoint: 'bytedance/seedance-2.0/text-to-video',
@@ -93,9 +93,10 @@ const VIDEO_MODELS: Record<string, FalVideoModelConfig> = {
9393
},
9494
}
9595

96-
// Default to the Fast tier: same Veo 3.1 family with native audio, but the
97-
// cheapest tier on fal (~$0.10-0.15/s vs ~$0.40/s Standard, and cheaper than
98-
// Seedance ~$0.24-0.30/s). Agent can override to veo-3.1 (4K/cinematic) etc.
96+
// Default to Veo 3.1 Fast: the same Veo model family — good 1080p video with native
97+
// 48kHz audio + lip-sync — at ~1/3 the cost of Standard (~$0.15/s vs ~$0.40/s). The
98+
// gap is surface detail / 4K, not "good vs bad". The agent overrides to veo-3.1
99+
// (Standard) only when the user explicitly asks for very high / premium quality.
99100
export const DEFAULT_VIDEO_MODEL = 'veo-3.1-fast'
100101

101102
export interface GenerateFalVideoParams {
@@ -105,6 +106,8 @@ export interface GenerateFalVideoParams {
105106
resolution?: string
106107
duration?: number
107108
generateAudio?: boolean
109+
/** Things to exclude from the generation, e.g. "no background music" (Veo models). */
110+
negativePrompt?: string
108111
promptOptimizer?: boolean
109112
/** Optional start-frame image as a data URI; when set, routes to the model's image-to-video endpoint. */
110113
imageDataUri?: string
@@ -162,6 +165,9 @@ export async function generateFalVideo(params: GenerateFalVideoParams): Promise<
162165
if (config.supportsGenerateAudio && params.generateAudio !== undefined) {
163166
input.generate_audio = params.generateAudio
164167
}
168+
if (config.supportsNegativePrompt && params.negativePrompt) {
169+
input.negative_prompt = params.negativePrompt
170+
}
165171
if (config.supportsPromptOptimizer && params.promptOptimizer !== undefined) {
166172
input.prompt_optimizer = params.promptOptimizer
167173
}

apps/sim/lib/media/falai.ts

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -121,11 +121,13 @@ export async function runFalQueue(
121121

122122
const statusResponse = await fetch(statusUrl, { headers: { Authorization: `Key ${apiKey}` } })
123123
if (!statusResponse.ok) {
124-
await readResponseTextWithLimit(statusResponse, {
124+
const body = await readResponseTextWithLimit(statusResponse, {
125125
maxBytes: DEFAULT_MAX_ERROR_BODY_BYTES,
126126
label: 'Fal.ai status error response',
127127
}).catch(() => '')
128-
throw new Error(`Fal.ai status check failed: ${statusResponse.status}`)
128+
throw new Error(
129+
`Fal.ai status check failed: ${statusResponse.status}${body ? ` - ${body}` : ''}`
130+
)
129131
}
130132

131133
const statusData = await readResponseJsonWithLimit(statusResponse, {
@@ -143,11 +145,13 @@ export async function runFalQueue(
143145
headers: { Authorization: `Key ${apiKey}` },
144146
})
145147
if (!resultResponse.ok) {
146-
await readResponseTextWithLimit(resultResponse, {
148+
const body = await readResponseTextWithLimit(resultResponse, {
147149
maxBytes: DEFAULT_MAX_ERROR_BODY_BYTES,
148150
label: 'Fal.ai result error response',
149151
}).catch(() => '')
150-
throw new Error(`Failed to fetch Fal.ai result: ${resultResponse.status}`)
152+
throw new Error(
153+
`Failed to fetch Fal.ai result: ${resultResponse.status}${body ? ` - ${body}` : ''}`
154+
)
151155
}
152156
const resultData = await readResponseJsonWithLimit(resultResponse, {
153157
maxBytes: MAX_MEDIA_JSON_BYTES,

0 commit comments

Comments
 (0)