Update media tools: music vocals/lyrics, Veo negative prompt and veo-3.1-lite, fal.ai error details

Sg312 · Sg312 · commit 1110db1c8f17 · 2026-06-03T18:13:29.000-07:00
diff --git a/apps/sim/lib/copilot/generated/tool-catalog-v1.ts b/apps/sim/lib/copilot/generated/tool-catalog-v1.ts
@@ -1421,7 +1421,8 @@ export const GenerateAudio: ToolCatalogEntry = {
     properties: {
       duration: {
         type: 'number',
-        description: 'Approximate duration in seconds for music/sfx where supported.',
+        description:
+          'Approximate duration in seconds for sfx (and music models that support it). MiniMax music ignores this — fit music to a video with the ffmpeg tool instead.',
       },
       inputs: {
         type: 'object',
@@ -1486,6 +1487,16 @@ export const GenerateAudio: ToolCatalogEntry = {
           },
         },
       },
+      instrumental: {
+        type: 'boolean',
+        description:
+          'For music: true = instrumental, no vocals (default); false = a song with vocals.',
+      },
+      lyrics: {
+        type: 'string',
+        description:
+          'For music with vocals: the lyrics to sing (optional; supports [Verse]/[Chorus] tags). Setting this implies instrumental=false.',
+      },
       model: {
         type: 'string',
         description:
@@ -1698,7 +1709,8 @@ export const GenerateVideo: ToolCatalogEntry = {
       },
       generateAudio: {
         type: 'boolean',
-        description: 'Generate native audio when the model supports it (default true for Veo).',
+        description:
+          "Toggle Veo's native audio (dialogue/SFX/ambience/music generated from the prompt). Default true. Set false when you will add your own voiceover/music via the ffmpeg tool.",
       },
       inputs: {
         type: 'object',
@@ -1766,12 +1778,11 @@ export const GenerateVideo: ToolCatalogEntry = {
       model: {
         type: 'string',
         description:
-          'Optional model override. Defaults to veo-3.1-fast (native audio, cheapest Veo tier). Use veo-3.1 for 4K/cinematic; seedance-2.0 for longer narrative.',
+          "Optional model override, keyed to the video's goal: veo-3.1-lite (prototype/quick test, cheapest), veo-3.1-fast (reasonable draft — default, good video), veo-3.1 Standard (final cut / premium quality). Stay on Veo unless the user explicitly asks for another model; seedance-2.0 for >8s narrative, kling-v3-pro for specific looks.",
         enum: [
           'veo-3.1',
           'veo-3.1-fast',
-          'sora-2',
-          'sora-2-pro',
+          'veo-3.1-lite',
           'seedance-2.0',
           'seedance-2.0-fast',
           'kling-v3-pro',
@@ -1780,6 +1791,11 @@ export const GenerateVideo: ToolCatalogEntry = {
           'ltx-2.3',
         ],
       },
+      negativePrompt: {
+        type: 'string',
+        description:
+          'Things to exclude from the video/audio (Veo models), e.g. "no background music" to keep dialogue but drop Veo\'s invented music before overlaying your own track.',
+      },
       outputs: {
         type: 'object',
         description:
diff --git a/apps/sim/lib/copilot/generated/tool-schemas-v1.ts b/apps/sim/lib/copilot/generated/tool-schemas-v1.ts
@@ -1217,7 +1217,8 @@ export const TOOL_RUNTIME_SCHEMAS: Record<string, ToolRuntimeSchemaEntry> = {
       properties: {
         duration: {
           type: 'number',
-          description: 'Approximate duration in seconds for music/sfx where supported.',
+          description:
+            'Approximate duration in seconds for sfx (and music models that support it). MiniMax music ignores this — fit music to a video with the ffmpeg tool instead.',
         },
         inputs: {
           type: 'object',
@@ -1288,6 +1289,16 @@ export const TOOL_RUNTIME_SCHEMAS: Record<string, ToolRuntimeSchemaEntry> = {
             },
           },
         },
+        instrumental: {
+          type: 'boolean',
+          description:
+            'For music: true = instrumental, no vocals (default); false = a song with vocals.',
+        },
+        lyrics: {
+          type: 'string',
+          description:
+            'For music with vocals: the lyrics to sing (optional; supports [Verse]/[Chorus] tags). Setting this implies instrumental=false.',
+        },
         model: {
           type: 'string',
           description:
@@ -1497,7 +1508,8 @@ export const TOOL_RUNTIME_SCHEMAS: Record<string, ToolRuntimeSchemaEntry> = {
         },
         generateAudio: {
           type: 'boolean',
-          description: 'Generate native audio when the model supports it (default true for Veo).',
+          description:
+            "Toggle Veo's native audio (dialogue/SFX/ambience/music generated from the prompt). Default true. Set false when you will add your own voiceover/music via the ffmpeg tool.",
         },
         inputs: {
           type: 'object',
@@ -1571,12 +1583,11 @@ export const TOOL_RUNTIME_SCHEMAS: Record<string, ToolRuntimeSchemaEntry> = {
         model: {
           type: 'string',
           description:
-            'Optional model override. Defaults to veo-3.1-fast (native audio, cheapest Veo tier). Use veo-3.1 for 4K/cinematic; seedance-2.0 for longer narrative.',
+            "Optional model override, keyed to the video's goal: veo-3.1-lite (prototype/quick test, cheapest), veo-3.1-fast (reasonable draft — default, good video), veo-3.1 Standard (final cut / premium quality). Stay on Veo unless the user explicitly asks for another model; seedance-2.0 for >8s narrative, kling-v3-pro for specific looks.",
           enum: [
             'veo-3.1',
             'veo-3.1-fast',
-            'sora-2',
-            'sora-2-pro',
+            'veo-3.1-lite',
             'seedance-2.0',
             'seedance-2.0-fast',
             'kling-v3-pro',
@@ -1585,6 +1596,11 @@ export const TOOL_RUNTIME_SCHEMAS: Record<string, ToolRuntimeSchemaEntry> = {
             'ltx-2.3',
           ],
         },
+        negativePrompt: {
+          type: 'string',
+          description:
+            'Things to exclude from the video/audio (Veo models), e.g. "no background music" to keep dialogue but drop Veo\'s invented music before overlaying your own track.',
+        },
         outputs: {
           type: 'object',
           description:
diff --git a/apps/sim/lib/copilot/tools/server/media/generate-audio.ts b/apps/sim/lib/copilot/tools/server/media/generate-audio.ts
@@ -23,6 +23,10 @@ interface GenerateAudioArgs {
   model?: string
   voice?: string
   duration?: number
+  /** For music: explicit lyrics for a vocal track. */
+  lyrics?: string
+  /** For music: true = instrumental (default), false = vocal track. */
+  instrumental?: boolean
   /** Optional reference voice sample (workspace audio file) for zero-shot voice cloning. */
   inputs?: { files?: Array<{ path: string }> }
   outputs?: {
@@ -103,6 +107,8 @@ export const generateAudioServerTool: BaseServerTool<GenerateAudioArgs, Generate
         model: params.model,
         voice: params.voice,
         duration: params.duration,
+        lyrics: params.lyrics,
+        instrumental: params.instrumental,
         voiceSampleDataUri,
       })
 
diff --git a/apps/sim/lib/copilot/tools/server/media/generate-video.ts b/apps/sim/lib/copilot/tools/server/media/generate-video.ts
@@ -22,6 +22,7 @@ interface GenerateVideoArgs {
   resolution?: string
   duration?: number
   generateAudio?: boolean
+  negativePrompt?: string
   promptOptimizer?: boolean
   inputs?: { files?: Array<{ path: string }> }
   outputs?: {
@@ -83,6 +84,7 @@ export const generateVideoServerTool: BaseServerTool<GenerateVideoArgs, Generate
         resolution: params.resolution,
         duration: params.duration,
         generateAudio: params.generateAudio,
+        negativePrompt: params.negativePrompt,
         promptOptimizer: params.promptOptimizer,
         imageDataUri,
       })
diff --git a/apps/sim/lib/media/falai-audio.ts b/apps/sim/lib/media/falai-audio.ts
@@ -20,6 +20,10 @@ export interface GenerateFalAudioParams {
   model?: string
   voice?: string
   duration?: number
+  /** For music: explicit lyrics (with optional [Verse]/[Chorus] tags). Implies a vocal track. */
+  lyrics?: string
+  /** For music: true = instrumental (no vocals, default); false = vocal track. On the MiniMax/ElevenLabs paths a non-empty `lyrics` selects a vocal track even when this is true. */
+  instrumental?: boolean
   /** When set, clones the voice from this reference sample (data URI) via a zero-shot clone model. */
   voiceSampleDataUri?: string
 }
@@ -33,7 +37,11 @@ export interface GeneratedAudio {
   cost: FalAICostMetadata
 }
 
-function buildInput(type: AudioType, params: GenerateFalAudioParams): Record<string, unknown> {
+function buildInput(
+  type: AudioType,
+  params: GenerateFalAudioParams,
+  model: string
+): Record<string, unknown> {
   const input: Record<string, unknown> = {}
   if (type === 'speech') {
     // Gemini 3.1 Flash TTS takes the text (with optional inline tags) in `prompt`.
@@ -44,9 +52,31 @@ function buildInput(type: AudioType, params: GenerateFalAudioParams): Record<str
     input.text = params.prompt
     if (params.duration !== undefined) input.duration_seconds = params.duration
   } else {
-    // Music models take a `prompt` describing the track.
+    // Music. Two modes, both supported:
+    //  - instrumental bed (default): no vocals, no lyrics required
+    //  - song with vocals: explicit `lyrics`, or auto-written from the prompt
     input.prompt = params.prompt
-    if (params.duration !== undefined) input.duration = params.duration
+    const wantsVocals = params.instrumental === false || Boolean(params.lyrics)
+    if (model.includes('minimax')) {
+      // MiniMax Music 2.6 requires `lyrics` unless is_instrumental=true, and rejects a
+      // top-level `duration` (that combination is the 422 we were hitting on every call).
+      if (wantsVocals) {
+        input.is_instrumental = false
+        if (params.lyrics) input.lyrics = params.lyrics
+        else input.lyrics_optimizer = true
+      } else {
+        input.is_instrumental = true
+      }
+    } else if (model.includes('elevenlabs/music')) {
+      if (!wantsVocals) input.force_instrumental = true
+      if (params.lyrics) input.prompt = `${params.prompt}\n\nLyrics:\n${params.lyrics}`
+      if (params.duration !== undefined) input.music_length_ms = Math.round(params.duration * 1000)
+    } else {
+      // Other music models: best-effort passthrough.
+      if (params.instrumental !== undefined) input.instrumental = params.instrumental
+      if (params.lyrics) input.lyrics = params.lyrics
+      if (params.duration !== undefined) input.duration = params.duration
+    }
   }
   return input
 }
@@ -80,7 +110,7 @@ export async function generateFalAudio(params: GenerateFalAudioParams): Promise<
   }
 
   const model = params.model || DEFAULT_AUDIO_MODELS[type]
-  const input = buildInput(type, params)
+  const input = buildInput(type, params, model)
 
   // For fal audio models the model ID is the queue endpoint.
   const { requestId, data } = await runFalQueue(model, input, apiKey)
diff --git a/apps/sim/lib/media/falai-video.ts b/apps/sim/lib/media/falai-video.ts
@@ -18,6 +18,7 @@ interface FalVideoModelConfig {
   supportsAspectRatio?: boolean
   supportsResolution?: boolean
   supportsGenerateAudio?: boolean
+  supportsNegativePrompt?: boolean
   supportsPromptOptimizer?: boolean
 }
 
@@ -31,6 +32,7 @@ const VIDEO_MODELS: Record<string, FalVideoModelConfig> = {
     supportsAspectRatio: true,
     supportsResolution: true,
     supportsGenerateAudio: true,
+    supportsNegativePrompt: true,
   },
   'veo-3.1-fast': {
     endpoint: 'fal-ai/veo3.1/fast',
@@ -39,18 +41,16 @@ const VIDEO_MODELS: Record<string, FalVideoModelConfig> = {
     supportsAspectRatio: true,
     supportsResolution: true,
     supportsGenerateAudio: true,
+    supportsNegativePrompt: true,
   },
-  'sora-2': {
-    endpoint: 'fal-ai/sora-2/text-to-video',
-    durationFormat: 'number',
-    supportsAspectRatio: true,
-    supportsResolution: true,
-  },
-  'sora-2-pro': {
-    endpoint: 'fal-ai/sora-2/text-to-video/pro',
-    durationFormat: 'number',
+  'veo-3.1-lite': {
+    endpoint: 'fal-ai/veo3.1/lite',
+    i2vEndpoint: 'fal-ai/veo3.1/lite/image-to-video',
+    durationFormat: 'seconds',
     supportsAspectRatio: true,
     supportsResolution: true,
+    supportsGenerateAudio: true,
+    supportsNegativePrompt: true,
   },
   'seedance-2.0': {
     endpoint: 'bytedance/seedance-2.0/text-to-video',
@@ -93,9 +93,10 @@ const VIDEO_MODELS: Record<string, FalVideoModelConfig> = {
   },
 }
 
-// Default to the Fast tier: same Veo 3.1 family with native audio, but the
-// cheapest tier on fal (~$0.10-0.15/s vs ~$0.40/s Standard, and cheaper than
-// Seedance ~$0.24-0.30/s). Agent can override to veo-3.1 (4K/cinematic) etc.
+// Default to Veo 3.1 Fast: the same Veo model family — good 1080p video with native
+// 48kHz audio + lip-sync — at ~1/3 the cost of Standard (~$0.15/s vs ~$0.40/s). The
+// gap is surface detail / 4K, not "good vs bad". The agent overrides to veo-3.1
+// (Standard) only when the user explicitly asks for very high / premium quality.
 export const DEFAULT_VIDEO_MODEL = 'veo-3.1-fast'
 
 export interface GenerateFalVideoParams {
@@ -105,6 +106,8 @@ export interface GenerateFalVideoParams {
   resolution?: string
   duration?: number
   generateAudio?: boolean
+  /** Things to exclude from the generation, e.g. "no background music" (Veo models). */
+  negativePrompt?: string
   promptOptimizer?: boolean
   /** Optional start-frame image as a data URI; when set, routes to the model's image-to-video endpoint. */
   imageDataUri?: string
@@ -162,6 +165,9 @@ export async function generateFalVideo(params: GenerateFalVideoParams): Promise<
   if (config.supportsGenerateAudio && params.generateAudio !== undefined) {
     input.generate_audio = params.generateAudio
   }
+  if (config.supportsNegativePrompt && params.negativePrompt) {
+    input.negative_prompt = params.negativePrompt
+  }
   if (config.supportsPromptOptimizer && params.promptOptimizer !== undefined) {
     input.prompt_optimizer = params.promptOptimizer
   }
diff --git a/apps/sim/lib/media/falai.ts b/apps/sim/lib/media/falai.ts
@@ -121,11 +121,13 @@ export async function runFalQueue(
 
     const statusResponse = await fetch(statusUrl, { headers: { Authorization: `Key ${apiKey}` } })
     if (!statusResponse.ok) {
-      await readResponseTextWithLimit(statusResponse, {
+      const body = await readResponseTextWithLimit(statusResponse, {
         maxBytes: DEFAULT_MAX_ERROR_BODY_BYTES,
         label: 'Fal.ai status error response',
       }).catch(() => '')
-      throw new Error(`Fal.ai status check failed: ${statusResponse.status}`)
+      throw new Error(
+        `Fal.ai status check failed: ${statusResponse.status}${body ? ` - ${body}` : ''}`
+      )
     }
 
     const statusData = await readResponseJsonWithLimit(statusResponse, {
@@ -143,11 +145,13 @@ export async function runFalQueue(
         headers: { Authorization: `Key ${apiKey}` },
       })
       if (!resultResponse.ok) {
-        await readResponseTextWithLimit(resultResponse, {
+        const body = await readResponseTextWithLimit(resultResponse, {
           maxBytes: DEFAULT_MAX_ERROR_BODY_BYTES,
           label: 'Fal.ai result error response',
         }).catch(() => '')
-        throw new Error(`Failed to fetch Fal.ai result: ${resultResponse.status}`)
+        throw new Error(
+          `Failed to fetch Fal.ai result: ${resultResponse.status}${body ? ` - ${body}` : ''}`
+        )
       }
       const resultData = await readResponseJsonWithLimit(resultResponse, {
         maxBytes: MAX_MEDIA_JSON_BYTES,
diff --git a/apps/sim/lib/media/ffmpeg.ts b/apps/sim/lib/media/ffmpeg.ts