diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index a9fa241..3b09d80 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -7,9 +7,29 @@ }, "plugins": [ { - "name": "replicate", - "source": "./", - "description": "Skills for building AI-powered apps with Replicate" + "name": "find-models", + "source": "skills/find-models", + "description": "Find AI models on Replicate using search and curated collections." + }, + { + "name": "compare-models", + "source": "skills/compare-models", + "description": "Compare Replicate models by cost, speed, quality, and capabilities." + }, + { + "name": "run-models", + "source": "skills/run-models", + "description": "Run AI models on Replicate via predictions, webhooks, and streaming." + }, + { + "name": "prompt-images", + "source": "skills/prompt-images", + "description": "Prompting techniques for AI image generation and editing models on Replicate." + }, + { + "name": "prompt-videos", + "source": "skills/prompt-videos", + "description": "Prompting techniques for AI video generation models on Replicate." 
} ] } diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index 0b7044f..adf7249 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "replicate", - "description": "Skills for building AI-powered apps with Replicate", - "version": "1.0.0", + "description": "Skills for finding, comparing, running, and prompting AI models on Replicate", + "version": "2.0.0", "author": { "name": "Replicate" } diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d220407..66ec33c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -33,4 +33,8 @@ jobs: CLAWHUB_TOKEN: ${{ secrets.CLAWHUB_TOKEN }} run: | npx clawhub@latest login --token "$CLAWHUB_TOKEN" - npx clawhub@latest publish skills/replicate --slug replicate --name "Replicate" + npx clawhub@latest publish skills/find-models --slug replicate-find-models --name "Replicate: Find Models" + npx clawhub@latest publish skills/compare-models --slug replicate-compare-models --name "Replicate: Compare Models" + npx clawhub@latest publish skills/run-models --slug replicate-run-models --name "Replicate: Run Models" + npx clawhub@latest publish skills/prompt-images --slug replicate-prompt-images --name "Replicate: Prompt Images" + npx clawhub@latest publish skills/prompt-videos --slug replicate-prompt-videos --name "Replicate: Prompt Videos" diff --git a/AGENTS.md b/AGENTS.md index 67f2277..0cb636b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -2,27 +2,32 @@ ## Purpose -This repo publishes a single Agent Skills document for Replicate. - -Keep it short and focused: a human- and agent-readable guide to discovering models, inspecting schemas, running predictions, and handling outputs. +This repo publishes Agent Skills for Replicate: focused guides for finding, comparing, running, and prompting AI models. ## Files that matter -- `skills/replicate/SKILL.md` is the canonical skill. -- `.mcp.json` points to the remote MCP server. 
-- `.claude-plugin/` contains marketplace metadata for Claude Code. +- `skills/find-models/SKILL.md` — search, collections, schemas, picking the right model. +- `skills/compare-models/SKILL.md` — evaluating models by cost, speed, quality, and capabilities. +- `skills/run-models/SKILL.md` — predictions, polling, webhooks, streaming, file I/O, concurrency, multi-model workflows. +- `skills/prompt-images/SKILL.md` — prompting techniques for image generation and editing models. +- `skills/prompt-videos/SKILL.md` — prompting techniques for video generation models. +- `script/lint` — validates the skills. +- `.mcp.json` — points to the remote MCP server. +- `.claude-plugin/` — marketplace metadata for Claude Code. ## Editing guidelines -- Keep `SKILL.md` concise and practical. Prefer bullet lists over long prose. -- Treat `https://api.replicate.com/openapi.json` as the source of truth. -- Keep mentions of deprecated or unofficial endpoints out of the skill. +- Keep each `SKILL.md` concise and practical. Prefer bullet lists over long prose. +- Every code snippet must be runnable. The test runner executes them all. +- Treat `https://api.replicate.com/openapi.json` as the source of truth for API details. - Do not add language-specific client guidance unless explicitly requested. ## Linting -Lint before committing changes: - ``` -script/lint +script/lint skills/find-models +script/lint skills/compare-models +script/lint skills/run-models +script/lint skills/prompt-images +script/lint skills/prompt-videos ``` diff --git a/README.md b/README.md index de3c278..9d50887 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,13 @@ # Replicate Skills +A collection of [Agent Skills](https://agentskills.io) for building AI-powered apps with [Replicate](https://replicate.com). -A collection of [Agent Skills](https://agentskills.io) for building AI-powered apps with [Replicate](https://replicate.com). +Skills included: -Discover, compare, and run AI models using Replicate's API. 
+- **find-models** — search for models, browse collections, read schemas, pick the right model +- **compare-models** — evaluate models by cost, speed, quality, and capabilities +- **run-models** — create predictions, poll, use webhooks, stream, handle files, run concurrently +- **prompt-images** — prompting techniques for image generation and editing models +- **prompt-videos** — prompting techniques for video generation models ## Installing diff --git a/script/lint b/script/lint index 2347e47..bad9837 100755 --- a/script/lint +++ b/script/lint @@ -12,23 +12,35 @@ cd "$(dirname "$0")/.." [ -z "$DEBUG" ] || set -x -SKILL_PATH="${1:-skills/replicate}" - -echo "==> Validating Agent Skills in $SKILL_PATH" - -if command -v uv >/dev/null 2>&1; then - uvx --from skills-ref agentskills validate "$SKILL_PATH" - exit 0 +if [ $# -gt 0 ]; then + SKILL_PATHS="$*" +else + SKILL_PATHS="" + for d in skills/*/; do + [ -f "$d/SKILL.md" ] && SKILL_PATHS="$SKILL_PATHS $d" + done fi -PYTHON_BIN="${PYTHON_BIN:-python3}" -if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then - if command -v python >/dev/null 2>&1; then - PYTHON_BIN=python - else - echo "python not found. Install Python 3 or uv to run skills-ref." 
>&2 + exit 1 + fi + fi + + "$PYTHON_BIN" -m skills_ref validate "$1" +} + +for SKILL_PATH in $SKILL_PATHS; do + echo "==> Validating Agent Skills in $SKILL_PATH" + validate "$SKILL_PATH" +done diff --git a/skills/compare-models/SKILL.md b/skills/compare-models/SKILL.md new file mode 100644 index 0000000..f1b60b1 --- /dev/null +++ b/skills/compare-models/SKILL.md @@ -0,0 +1,47 @@ +--- +name: compare-models +description: Compare Replicate models by cost, speed, quality, and capabilities. +--- + +## Docs + +- Reference: https://replicate.com/docs/llms.txt +- OpenAPI schema: https://api.replicate.com/openapi.json +- MCP server: https://mcp.replicate.com +- Per-model docs: `https://replicate.com/{owner}/{model}/llms.txt` +- Set `Accept: text/markdown` when requesting docs pages for Markdown responses. + +## Workflow + +1. Search or browse collections to build a shortlist of candidate models. +2. Fetch each model's schema to compare inputs, outputs, and capabilities. +3. Check pricing from model metadata or the Replicate website. +4. Run a small batch of test predictions to compare output quality. +5. Pick the model that best fits your constraints (cost, latency, quality). + +## What to compare + +- **Speed**: Check `metrics.predict_time` on completed predictions for actual inference time. Official models are always warm. Community models can cold-boot. +- **Cost**: Official models have predictable per-run pricing. Community models charge by compute time (GPU-seconds). Run a few predictions and check the `metrics` field for actual cost. +- **Quality**: Run the same prompts through each model and compare outputs. Quality is subjective. Match it to your use case, not a leaderboard. +- **Capabilities**: Compare input schemas for supported features (reference images, masks, aspect ratios, streaming, multi-image input). Check output formats. + +## Key tradeoffs + +- Lowest cost: smaller/distilled models. Accept slower inference and lower quality. +- Lowest latency: official models or schnell/turbo variants. Accept higher cost per run. 
+- Highest quality: pro/max/quality variants. Accept slower inference and higher cost. +- Most control: models with ControlNet, masks, or reference images. Accept more complex input setup. + +## Official vs community models + +- Official models: always warm, stable APIs, predictable pricing, maintained by Replicate. +- Community models: may cold-boot, require version pinning, maintained by the author. +- If a community model meets your needs and an official model doesn't, consider creating a deployment for consistent uptime. + +## Prompting guidance + +For prompting techniques and task-specific guidance: + +- Image generation and editing: see the [prompt-images](../prompt-images/SKILL.md) skill. +- Video generation: see the [prompt-videos](../prompt-videos/SKILL.md) skill. diff --git a/skills/find-models/SKILL.md b/skills/find-models/SKILL.md new file mode 100644 index 0000000..fc6b79b --- /dev/null +++ b/skills/find-models/SKILL.md @@ -0,0 +1,47 @@ +--- +name: find-models +description: Find AI models on Replicate using search and curated collections. +--- + +## Docs + +- Reference: https://replicate.com/docs/llms.txt +- OpenAPI schema: https://api.replicate.com/openapi.json +- MCP server: https://mcp.replicate.com +- Per-model docs: `https://replicate.com/{owner}/{model}/llms.txt` +- Set `Accept: text/markdown` when requesting docs pages for Markdown responses. + +## Search + +- Use the search API (`GET /v1/search?query=...`) to find models by task. Returns models, collections, and docs. +- Search returns metadata for each model including `tags`, `generated_description`, and `run_count`. +- The search API also returns matching collections alongside model results. +- Avoid listing all models via API. It's a firehose. Use targeted queries. + +## Collections + +- Collections are curated groups of models maintained by Replicate staff. +- The `official` collection contains always-warm models with stable APIs and predictable pricing. +- Use collections to narrow a shortlist before deep comparison. +- List collections with `GET /v1/collections`. 
Get one by slug with `GET /v1/collections/{slug}`. + +## Reading model schemas + +- Every model exposes its input/output schema via the models API (`GET /v1/models/{owner}/{name}`). +- Schema path: `model.latest_version.openapi_schema.components.schemas.Input.properties` +- Each property may include: `type`, `description`, `default`, `minimum`/`maximum`, `enum`, `format` (e.g. `uri` for file inputs). +- Always fetch the schema before running a model. Schemas change. + +## Picking the right model + +- Prefer official models. They're always warm (no cold boot), have stable APIs, and predictable pricing. +- Prefer the latest version. If search returns v2.5 and v3.0, use v3. +- Run count can be misleading. Old models accumulate runs over time but may be outdated. A model with 10M runs from 2023 is likely worse than a model with 100K runs from 2025. +- Prefer recently released models. The AI space moves fast. +- Check model tags to help filter by task (`image-generation`, `video`, `audio`, etc.). + +## Model identifiers + +- **Official models** use `owner/name` format (e.g. `black-forest-labs/flux-2-klein-9b`). Routes to the latest version automatically. +- **Community models** require `owner/name:version_id`. You must pin a specific version. Community models can cold-boot and take time to start. +- If you must use a community model, be aware that it can take a long time to boot. You can create always-on deployments, but you pay for model uptime. diff --git a/skills/prompt-images/SKILL.md b/skills/prompt-images/SKILL.md new file mode 100644 index 0000000..3ba3723 --- /dev/null +++ b/skills/prompt-images/SKILL.md @@ -0,0 +1,200 @@ +--- +name: prompt-images +description: > + Prompting techniques for AI image generation and editing models on Replicate. + Use when writing prompts for image models or building image generation features. +--- + +# Prompting image models on Replicate + +Distilled from Replicate's blog posts on prompting image models (2024-2026). 
Techniques are model-agnostic and focus on transferable principles. For model selection, pricing, and feature comparison, see the [compare-models](../compare-models/SKILL.md) skill. + + +## Writing prompts + +### Use natural language, not keyword lists + +Write full sentences describing what you want. Modern image models understand grammar and context far better than keyword-stuffed prompts. + +Good: "A woman standing in a Tokyo alleyway at dusk, neon signs reflecting off wet pavement" +Bad: "woman, Tokyo, alleyway, dusk, neon, wet pavement" + +### Be specific and unambiguous + +Name exact colors, materials, lighting setups, camera equipment, and spatial relationships. Vague terms like "make it better" or "artistic" give unpredictable results. + +Good: "A brutalist concrete building reflected in a perfectly still puddle after rain. A single figure with a red umbrella walks along the edge, the only color in an otherwise monochrome scene. Overcast sky, flat diffused light, tilt-shift lens effect on the edges." +Bad: "Cool building with a person near it, rainy day" + +### Name subjects directly + +Use descriptive phrases like "the woman with short black hair" or "the red car." Avoid pronouns, which are often too ambiguous for image models. + +### Use long, detailed prompts + +Most modern models accept thousands of tokens. Long descriptive prompts with clear structure outperform short ones. A prompt with 12+ specific requirements (text on objects, labeled diagrams, color-coded elements, specific materials) can work if each requirement is stated clearly. But be aware: the longer and more complex the prompt, the more likely something will be missed. + +### Start simple, then iterate + +Begin with basic changes. Test small edits first, then build on what works. Most editing models support iterative editing, so take advantage of that. + + +## Photographic language + +Modern image models understand camera and photography terminology deeply. 
Using this vocabulary gives you precise control over the look. + +### Camera and lens + +- Film stocks: Kodak Portra 800, Fuji Velvia 50, Ilford HP5 +- Lens characteristics: 50mm Summilux wide open, 85mm f/1.4, 24mm wide-angle +- Depth of field: shallow (subject sharp, background blurred), deep (everything in focus) +- Shooting techniques: golden hour, blue hour, long exposure, double exposure + +### Lighting setups + +- Rembrandt lighting: classic portrait lighting with a triangle of light on the cheek +- Soft diffused studio lighting: crisp highlights and gentle shadows +- Rim lighting / backlight: subject outlined with light from behind +- Flat diffused light: overcast, even illumination, minimal shadows +- Volumetric lighting: visible light beams, fog, haze + +### Composition + +- Rule of thirds, centered composition, symmetry +- Wide shot, medium shot, close-up, macro +- High angle, low angle, eye level, bird's-eye view +- Tilt-shift for miniature effects + + +## Text rendering + +Rendering text in images is a common task. These techniques improve accuracy across models. + +- Wrap desired text in double quotation marks within the prompt: "Design a poster with the title \"BLUE NOTE SESSIONS\" in bold condensed sans-serif" +- Stick to readable fonts. Highly stylized text may not work as well. 
+- When editing text in an existing image, use the pattern: "Change 'old text' to 'new text'" +- Match text length when possible: big shifts in character count can change layout +- Be explicit about preserving font style if it matters +- For complex typography (posters, editorial layouts), look for models that treat text as part of the composition rather than stamping it on top +- Some models can inpaint text: mask the text region, prompt with new text, and it matches the original font and style + + +## Style transfer + +- Name the exact style: "impressionist painting," "1960s pop art," "Sumi-e ink wash" +- Reference specific artists or movements for clearer guidance +- If a style label doesn't work, describe its key traits: "visible brushstrokes, thick paint texture, rich color depth" +- State what should stay the same: "keep the original composition" +- When a style is hard to describe in words, some models support example-based editing: provide a before/after pair, then a third image. The model infers the transformation and applies it. +- Some models accept style reference images: upload visuals capturing the color palette, texture, composition, and mood you want + + +## Character consistency + +Maintaining the same character across multiple generations is one of the hardest challenges in image generation. + +- Start with a clear reference description: "the woman with short black hair and green eyes wearing a navy blazer" +- Say what's changing (setting, activity, style) and what should stay the same (face, expression, clothing) +- Use reference images when the model supports them. Some models handle multiple reference images simultaneously for stronger consistency. 
+- Break complex character changes into steps: change outfit first, then change scene +- Generate synthetic training data: create many images of a character, pick the best ones, and use them for fine-tuning or as references + + +## Image editing + +### General principles + +- Specify what to keep: explicitly state what should remain unchanged. Use phrases like "keeping the pose and expression unchanged" or "maintain the original composition." +- Choose verbs carefully: "transform" suggests a full rework. Use specific actions like "change the clothes to a blue jacket" or "replace the background with a beach." +- Be precise about scope: "Change the background to a beach while keeping the person in the exact same position, maintain identical subject placement, camera angle, framing, and perspective. Only replace the environment around them." + +### Object removal + +- Describe what should fill the space left behind, not just what to remove +- Some editing models handle removal cleanly; others leave structural artifacts. If one model struggles, try another. + +### Background editing + +- Describe the new background in detail: lighting, time of day, environment +- Specify that the subject should remain in the exact same position with the same lighting + +### Perspective and angle changes + +- These are among the hardest edits. Not all models handle them well. +- Some models restrict themselves to the initial composition and struggle with new angles + +### Inpainting and outpainting + +- For inpainting: mask the region to edit, then prompt with what should fill it +- Some models have a "magic prompt" or auto-rewrite feature. When this is on, you can focus on describing just the edited region. When it's off, describe the whole scene. 
+- Describing only the masked region makes the model emphasize the prompt more, which can produce better results for targeted edits +- ControlNet-style conditioning (edge detection, depth maps) helps preserve structure during generation + + +## Multi-image and storyboard generation + +Some models can generate multiple related images in a single prompt. + +- Ask for "a series," "a set," or specify a grid layout (e.g., "2x2 storyboard grid") +- Describe each panel individually with consistent character descriptions +- Maintain consistent style and character continuity by repeating exact descriptions +- Some models support example-based editing: show a before/after pair for one image, then apply the same transformation to others + + +## Product photography and commercial work + +- Specify materials precisely: "brushed steel," "matte aluminum," "kraft paper," "frosted glass" +- Describe lighting setup: "soft diffused studio lighting, crisp highlights and gentle shadows" +- For brand assets and icons, look for models that produce native SVG output (real editable vector files) +- For layouts with branding and copy placement, look for models with strong typography and design composition + + +## Fine-tuning and LoRAs + +- Use trigger words from your trained model in every prompt +- When combining multiple LoRAs, balance their influence with scale parameters (typically 0.9-1.1) +- Generate synthetic training data: generate many images, pick the best, retrain +- Use consistent-character workflows to generate training data from a single reference image + + +## Common pitfalls + +1. **Keyword-stuffed prompts**: Modern models respond better to natural language sentences than comma-separated keyword lists. Write like you're describing a scene, not tagging a photo. + +2. **Using "transform" when you want a small edit**: "Transform the person into a Viking" may swap the entire identity. 
Use targeted language: "change her outfit to Viking armor, keeping her face and expression unchanged." + +3. **Not specifying what to keep**: When editing, always say what should stay the same. Without explicit instructions, models may change anything. + +4. **Negative prompts on models not trained for them**: Some models were not trained with negative prompts. Using them on these models introduces noise rather than removing unwanted elements. Check the model's documentation. + +5. **Too-high guidance scale (CFG)**: If images look "burnt" with excessive contrast, lower the guidance scale. Each model has a recommended range. + +6. **Expecting real-time knowledge**: No image model has internet access. Some have strong world knowledge baked in from training data, but it's not live. + +7. **Short prompts for complex scenes**: Modern models accept thousands of tokens. For complex compositions with many specific requirements, use that capacity. + +8. **Ignoring aspect ratio**: Most models have specific resolutions they work best at (commonly ~1 megapixel). Going too large produces edge artifacts. Going too small produces harsh crops. Use the model's recommended aspect ratios. + +9. **Wrong model for the task**: Not every model is good at every task. Some excel at text rendering but struggle with object removal. Some are great at style transfer but poor at background editing. If a model struggles with a specific edit type, try a different one rather than fighting the prompt. See the [compare-models](../compare-models/SKILL.md) skill for guidance. + +10. **Not iterating**: The best results come from iterative workflows. Make a small change, evaluate, refine, repeat. Don't try to get everything right in a single generation. 
+ + +## Sources + +All techniques in this skill are sourced from Replicate's blog: + +- [How to prompt Seedream 5.0](https://replicate.com/blog/how-to-prompt-seedream-5) (Feb 2026) +- [Recraft V4](https://replicate.com/blog/recraft-v4) (Feb 2026) +- [Run FLUX.2 on Replicate](https://replicate.com/blog/run-flux-2-on-replicate) (Nov 2025) +- [How to prompt Nano Banana Pro](https://replicate.com/blog/how-to-prompt-nano-banana-pro) (Nov 2025) +- [Which image editing model should I use?](https://replicate.com/blog/compare-image-editing-models) (Sep 2025) +- [Generate consistent characters](https://replicate.com/blog/generate-consistent-characters) (Jul 2025) +- [Use FLUX.1 Kontext to edit images with words](https://replicate.com/blog/flux-kontext) (May 2025) +- [Imagen 4](https://replicate.com/blog/google-imagen-4) (May 2025) +- [Ideogram 3.0 on Replicate](https://replicate.com/blog/ideogram-v3) (May 2025) +- [FLUX.1 Tools](https://replicate.com/blog/flux-tools) (Nov 2024) +- [Ideogram v2 inpainting](https://replicate.com/blog/ideogram-v2-inpainting) (Oct 2024) +- [Using synthetic data to improve Flux finetunes](https://replicate.com/blog/using-synthetic-data-to-improve-flux-finetunes) (Sep 2024) +- [FLUX.1: First Impressions](https://replicate.com/blog/flux-first-impressions) (Aug 2024) +- [How to get the best results from Stable Diffusion 3](https://replicate.com/blog/get-the-best-from-stable-diffusion-3) (Jun 2024) diff --git a/skills/prompt-videos/SKILL.md b/skills/prompt-videos/SKILL.md new file mode 100644 index 0000000..b7f64e2 --- /dev/null +++ b/skills/prompt-videos/SKILL.md @@ -0,0 +1,334 @@ +--- +name: prompt-videos +description: > + Prompting techniques for AI video generation models on Replicate. + Use when writing prompts for video models or building video + generation features. +--- + +# Prompting video models on Replicate + +Distilled from Replicate's blog posts on prompting video models (2025-2026). 
Techniques are model-agnostic and focus on transferable principles. For model selection, pricing, and feature comparison, see the [compare-models](../compare-models/SKILL.md) skill. + + +## Scene description + +A good video prompt is a scene description, not a caption. Write what happens, where, and how it looks. + +### Layer these elements into every prompt + +1. **Subject**: Who or what is in the scene (a person, animal, object, landscape). +2. **Context**: Where the subject is (indoors, a city street, a forest, a spaceship corridor). +3. **Action**: What the subject does (walks, turns, picks up a phone, runs). +4. **Style**: The visual aesthetic (cinematic, animated, stop-motion, documentary). +5. **Camera**: How the camera moves (dolly shot, tracking, static, handheld). +6. **Composition**: How the shot is framed (wide shot, close-up, over-the-shoulder). +7. **Ambiance**: Mood and lighting (warm tones, blue light, golden hour, overcast). + +### Be specific, not vague + +Vague: "A car chase" + +Specific: "A high-speed car chase on a rain-drenched highway at night. Two muscle cars weave through heavy traffic at 140mph, headlights slicing through the downpour. One car clips a semi-truck sending sparks showering across six lanes. Tires hydroplane on standing water. Neon highway signs blur overhead." + +### Overdescribe + +Modern video models handle long, dense prompts well. Don't write "a man on the phone." Write "a desperate man in a weathered green trench coat picks up a rotary phone mounted on a gritty brick wall, bathed in the eerie glow of a green neon sign." Every concrete detail you add gives the model less room to improvise poorly. + +### Name subjects directly + +Use descriptive phrases like "the woman in the red jacket" or "the bearded man in flannel." Avoid pronouns, which are ambiguous to video models just as they are to image models. + + +## Camera and cinematography + +Video models understand filmmaking language. 
Use it to direct the shot rather than hoping for good framing. + +### Shot types + +Use standard shot terminology to control framing: + +- Wide/establishing shot: shows the full scene and environment +- Medium shot: frames the subject from roughly the waist up +- Close-up: fills the frame with the subject's face or a key object +- Extreme close-up: isolates a detail (an eye, a hand gripping a handle, a drop of water) + +### Camera motion + +Describe how the camera moves: + +- Static/tripod: locked-off, no movement +- Pan: horizontal rotation left or right +- Tilt: vertical rotation up or down +- Dolly: camera physically moves toward or away from the subject +- Tracking: camera moves alongside the subject +- Crane: camera rises or descends vertically +- Handheld: shaky, documentary-style movement +- Drone/aerial: overhead or sweeping bird's-eye shots +- Dolly zoom (Hitchcock/vertigo effect): background stretches while subject stays locked + +### Camera position + +Specify the camera's height and angle: + +- Eye level: neutral, natural perspective +- Low angle / worm's eye: looking up at the subject (makes subjects feel powerful or imposing) +- High angle / bird's eye: looking down (makes subjects feel small or vulnerable) +- Over-the-shoulder: frames one subject from behind another +- POV / first-person: camera is the subject's eyes + +### Lens and focus language + +- Shallow depth of field: subject sharp, background blurred +- Deep focus: everything sharp from foreground to background +- Macro lens: extreme close-up with shallow focus +- Wide-angle lens: exaggerated perspective, more environment visible +- Tilt-shift: miniature effect, selective focus band + +### Escalation pattern + +A natural progression for short clips is wide > medium > close-up > extreme close-up. This maps well onto 8-15 second clips and gives the model clear structure. 
For example: + +- 0-3s: wide establishing shot of the location +- 3-7s: medium shot, the subject enters or acts +- 7-12s: close-up on the key moment +- 12-15s: extreme close-up on a detail (a hand, an eye, a drop of rain) + + +## Audio and dialogue + +Many video models generate audio natively alongside the visuals. If you don't prompt for the audio you want, the model will guess, and it often guesses wrong. + +### Prompt all four audio layers + +1. **Dialogue**: What characters say, either exact words or described intent. +2. **Ambient sound**: The background audio of the scene (rain on metal awnings, city traffic, forest birds). +3. **Sound effects**: Specific sounds from actions (a door slamming, glass breaking, a sword being drawn). +4. **Music**: Genre, mood, and instrumentation (a tense cinematic score, soft jazz piano, no music). + +If you skip ambient audio, models may hallucinate inappropriate sounds. A common failure mode is adding a "live studio audience" laughing in the background. Prevent this by describing the soundscape explicitly: "sounds of distant bands, noisy crowd, ambient background of a busy festival field." + +### Dialogue prompting + +There are two approaches: + +- Explicit: "The man says: My name is Ben." This gives you exact control over the words. +- Implicit: "The man introduces himself." This lets the model decide the phrasing. + +Explicit dialogue should be short enough to fit the clip duration. Packing too much dialogue into an 8-second clip produces unnaturally fast speech. Too little dialogue can produce awkward silence or AI gibberish. + +### Syntax that avoids subtitles + +Many video models were trained on videos with baked-in subtitles and will add them to outputs. To prevent this: + +- Use a colon for dialogue: "She says: Hello there" rather than "She says 'Hello there'" +- Add "(no subtitles)" to the prompt +- If subtitles persist, repeat the instruction: "No subtitles. No subtitles!" 
+ +### Pronunciation + +If a model mispronounces a name or word, spell it phonetically in the prompt. For example, write "foh-fur" instead of "fofr" or "Shreedar" instead of "Shridhar." + +### Who says what + +In multi-character scenes, the model can mix up who says what. Tie dialogue to distinctive visual descriptions: "The woman wearing pink says: ..." and "The man with glasses replies: ..." + + +## Multi-shot and time-coded prompting + +Some models support generating multiple shots within a single clip (up to ~15 seconds). You can direct each shot individually using time codes. + +### Time-coded format + +Write timestamps directly into the prompt: + +``` +[0-4s]: Wide establishing shot, static camera, misty bamboo forest at dawn +[4-9s]: Medium shot, slow push-in, the fighter steps forward +[9-15s]: Close-up, orbit shot, the fighter strikes, slow motion +``` + +Each shot should specify: +- Camera position and motion +- Subject action +- Lighting or mood shifts + +### Transition language + +Use explicit transition instructions between shots: + +- "Hard cut to..." for an abrupt switch +- "Seamless morph into..." for a fluid transition +- "Whip pan to..." for a fast, energetic cut +- "Snap cut to..." for a jarring, dramatic shift + +Without explicit transitions, the model improvises, which may or may not match your intent. + +### Example: multi-shot commercial + +``` +(0-3s) Macro shot of a luxury perfume bottle among scattered pink peonies, + shallow depth of field, petals floating in warm afternoon light, + soft ambient music. +(3-7s) Camera glides closer, a feminine hand enters frame from the right, + fingers gently touch the glass bottle, the sound of silk rustling. +(7-12s) Hard cut to slow-motion spray, golden mist diffuses through the air, + particles catching rim light against a dark background, + the hiss of the atomizer. +(12-15s) Seamless pull-out to hero frame, product centered, volumetric + lighting, minimal cream background, elegant silence. 
+``` + + +## Reference inputs + +Many video models accept images, video clips, or audio files as reference inputs alongside a text prompt. This shifts the workflow from "prompting" to something closer to "directing." + +### Image-to-video + +Feed a starting image and describe the motion. The model animates from that frame. + +- The input image becomes the first frame of the video +- Describe what changes (action, camera movement), not the static scene the model can already see +- Style preservation is a strength: animated styles, paintings, photographs, and color grading all carry through +- For maximum style control, generate the starting image with a specialized image model first, then pass it to the video model + +### First and last frame interpolation + +Some models accept both a starting and ending image. The model generates the transition between them. This is useful for: + +- Morphing between subjects (e.g. one animal transforming into another) +- Before/after transformations (room makeover, seasonal change) +- Controlled narrative arcs where you know the start and end state + +### Subject references + +Some models accept reference images of characters, products, or objects and maintain their appearance in the generated video. This is useful for: + +- UGC-style product review videos (reference image of character + reference image of product) +- Brand consistency across multiple video clips +- Placing existing characters in new scenarios + +When referencing input assets, many models use a bracket syntax like `[Image1]` or `[Audio1]` in the prompt to specify which reference maps to which role: "[Image2] is in the interior of [Image1]." + +### Audio-driven generation + +Some models accept audio files and sync the generated video to the audio. 
The model can match: + +- Lip movements to speech +- Cuts and motion to musical beats +- Ambient rhythm to environmental sounds + +When using audio references, it helps to also transcribe the audio content in the text prompt itself, and match the video duration to the audio length. + +### Multi-reference workflows + +The most powerful results come from combining multiple reference types: + +- An image for character appearance +- A video clip for motion style +- An audio track for rhythm and pacing +- A text prompt describing how everything fits together + + +## Style control + +### Name the style explicitly + +Video models understand style labels. Include them directly in your prompt: + +- "In the style of claymation" +- "Pixar animation style" +- "Anime" +- "Stop-motion" +- "8-bit retro" +- "Graphic novel" +- "Documentary footage" +- "Origami" +- "LEGO" +- "Blueprint technical drawing" + +Style labels affect not just the visual look but also how characters move and interact. A claymation style produces jerky, stop-motion movement. An anime style produces fluid, exaggerated motion. + +### Quality anchors + +Phrases like "hyper-realistic, 8k" or "cinematic" push models toward their highest fidelity output. Use them when you want photorealistic results. + +### Film and genre language + +Reference specific genres or filmmaking styles for mood and tone: + +- "Michael Mann cinematography" (neon, night, urban) +- "Wes Anderson" (symmetrical, pastel, quirky) +- "Roger Deakins lighting" (naturalistic, precise) +- "Blade Runner 2049 cinematography" (atmospheric, orange/teal) +- "National Geographic documentary" (nature, steady, observational) + +### Use input images for style + +Rather than describing a style verbally, generate an image with the exact aesthetic you want using an image model, then pass it to the video model. This gives you pixel-level control over the look. The video model preserves the style, color grading, and composition while adding motion. 
+ +### Grain and texture + +Adding "slightly grainy, film-like" or "VHS aesthetic" pushes output away from the too-clean AI look and makes videos feel more organic. + + +## Character consistency + +### Repeat descriptions verbatim + +When generating multiple clips with the same character, use identical character descriptions across prompts. Create a "character sheet" with exact wording: + +"John, a man in his 40s with short brown hair, wearing a blue jacket and glasses, looking thoughtful" + +Paste this description into every prompt where John appears. The more specific and unique the description, the more consistent the results. + +### What to specify + +- Physical appearance: age, hair, skin, build +- Clothing: exact garments, colors, materials +- Accessories: glasses, jewelry, hat +- Expression or demeanor: thoughtful, cheerful, intense + +### Vary the scene, not the character + +When placing a consistent character in different scenarios, change only the action, location, and camera work. Keep the character description word-for-word identical. + +### Reference images for identity + +If the model supports subject reference images, use a clear photo of the character as input. This is more reliable than text descriptions alone, especially for maintaining facial features across clips. + + +## Common pitfalls + +1. **Not describing audio**: If you skip audio prompting, models hallucinate ambient sounds. A common failure is adding inappropriate laughter or a "live studio audience." Always describe the soundscape. + +2. **Too much dialogue for the clip length**: An 8-second clip can hold roughly 2-3 short sentences. Packing in a paragraph produces unnaturally fast speech or truncated output. + +3. **Too little dialogue for the clip length**: If you only provide a few words for a long clip, the model fills silence with gibberish or awkward pauses. Match dialogue length to clip duration. + +4. 
**Not specifying what to keep unchanged**: When using reference images or editing, always state what should stay the same. Without explicit instructions, models may change anything. + +5. **Expecting variation from identical prompts**: Unlike image models, some video models produce very similar outputs for the same prompt (even with different seeds). If you want variety, change the prompt; don't just rerun it. + +6. **Not prompting camera motion**: Without camera direction, you get either static shots or unpredictable movement. Describe the camera explicitly. + +7. **Subtitle contamination**: Many models were trained on videos with baked-in subtitles. Use colons for dialogue (not quotes), add "(no subtitles)", and repeat if necessary. + +8. **Vague prompts for complex scenes**: Modern video models handle long, detailed prompts. A prompt with 12+ specific requirements (camera moves, lighting, sound design, subject actions, environmental details) can work if each requirement is stated clearly. Don't undersell what you want. + +9. **Ignoring aspect ratio and resolution**: Most video models have specific resolutions they support (480p, 720p, 1080p). Check what the model supports and choose the right resolution for your use case. If you need vertical video and the model only outputs landscape, you may need to reframe with a separate tool. + +10. **Forgetting that video models don't have internet access**: No video model has live information. They work from training data. Don't expect them to know about current events or real-time information.
+ + +## Sources + +All techniques in this skill are sourced from Replicate's blog: + +- [How to make remarkable videos with Seedance 2.0](https://replicate.com/blog/seedance-2) (Apr 2026) +- [How to prompt Veo 3.1](https://replicate.com/blog/veo-3-1) (Oct 2025) +- [How to prompt Veo 3 with images](https://replicate.com/blog/veo-3-image) (Aug 2025) +- [Open source video is back (Wan 2.2)](https://replicate.com/blog/wan-22) (Jul 2025) +- [Compare AI video models](https://replicate.com/blog/compare-ai-video-models) (Jul 2025) +- [How to prompt Veo 3 for the best results](https://replicate.com/blog/using-and-prompting-veo-3) (Jun 2025) diff --git a/skills/replicate/SKILL.md b/skills/replicate/SKILL.md deleted file mode 100644 index 0960d34..0000000 --- a/skills/replicate/SKILL.md +++ /dev/null @@ -1,56 +0,0 @@ ---- -name: replicate -description: Discover, compare, and run AI models using Replicate's API ---- - -## Docs - -- Reference docs: https://replicate.com/docs/llms.txt -- HTTP API schema: https://api.replicate.com/openapi.json -- MCP server: https://mcp.replicate.com -- Set an `Accept: text/markdown` header when requesting docs pages to get a Markdown response. - -## Workflow - -Here's a common workflow for using Replicate's API to run a model: - -1. **Choose the right model** - Search with the API or ask the user -2. **Get model metadata** - Fetch model input and output schema via API -3. **Create prediction** - POST to /v1/predictions -4. **Poll for results** - GET prediction until status is "succeeded" -5. **Return output** - Usually URLs to generated content - -## Choosing models - -- Use the search and collections APIs to find and compare the best models. Do not list all the models via API, as it's basically a firehose. -- Collections are curated by Replicate staff, so they're vetted. -- Official models are in the "official" collection. 
-- Use official models because they: - - are always running - - have stable API interfaces - - have predictable output pricing - - are maintained by Replicate staff -- If you must use a community model, be aware that it can take a long time to boot. -- You can create always-on deployments of community models, but you pay for model uptime. - -## Running models - -Models take time to run. There are three ways to run a model via API and get its output: - -1. Create a prediction, store its id from the response, and poll until completion. -2. Set a `Prefer: wait` header when creating a prediction for a blocking synchronous response. Only recommended for very fast models. -3. Set an HTTPS webhook URL when creating a prediction, and Replicate will POST to that URL when the prediction completes. - -Follow these guideliness when running models: - -- Use the "POST /v1/predictions" endpoint, as it supports both official and community models. -- Every model has its own OpenAPI schema. Always fetch and check model schemas to make sure you're setting valid inputs. Even popular models change their schemas. -- Validate input parameters against schema constraints (minimum, maximum, enum values). Don't generate values that violate them. -- When unsure about a parameter value, use the model's default example or omit the optional parameter. -- Don't set optional inputs unless you have a reason to. Stick to the required inputs and let the model's defaults do the work. -- Use HTTPS URLs for file inputs whenever possible. You can also send base64-encoded files, but they should be avoided. -- Fire off multiple predictions concurrently. Don't wait for one to finish before starting the next. -- Output file URLs expire after 1 hour, so back them up if you need to keep them, using a service like Cloudflare R2. -- Webhooks are a good mechanism for receiving and storing prediction output. 
- - diff --git a/skills/run-models/SKILL.md b/skills/run-models/SKILL.md new file mode 100644 index 0000000..4459980 --- /dev/null +++ b/skills/run-models/SKILL.md @@ -0,0 +1,69 @@ +--- +name: run-models +description: Run AI models on Replicate via predictions, webhooks, and streaming. +--- + +## Docs + +- Reference: https://replicate.com/docs/llms.txt +- OpenAPI schema: https://api.replicate.com/openapi.json +- MCP server: https://mcp.replicate.com +- Per-model docs: `https://replicate.com/{owner}/{model}/llms.txt` +- Set `Accept: text/markdown` when requesting docs pages for Markdown responses. + +## Workflow + +1. **Choose the right model** - Search with the API or ask the user. +2. **Get model metadata** - Fetch input and output schema via API. +3. **Create prediction** - POST to /v1/predictions. +4. **Poll for results** - GET prediction until status is terminal ("succeeded", "failed", or "canceled"). +5. **Return output** - Usually URLs to generated content. + +## Three ways to get output + +1. Create a prediction, store its id from the response, and poll until completion. +2. Set a `Prefer: wait` header when creating a prediction for a blocking synchronous response. Only recommended for very fast models. Max 60 seconds. +3. Set an HTTPS webhook URL when creating a prediction, and Replicate will POST to that URL when the prediction completes. + +## Guidelines + +- Use the `POST /v1/predictions` endpoint, as it supports both official and community models. +- Every model has its own OpenAPI schema. Always fetch and check model schemas to make sure you're setting valid inputs. Even popular models change their schemas. +- Validate input parameters against schema constraints (`minimum`, `maximum`, `enum` values). Don't generate values that violate them. +- When unsure about a parameter value, use the model's default example or omit the optional parameter. +- Don't set optional inputs unless you have a reason to. Stick to the required inputs and let the model's defaults do the work. +- Use HTTPS URLs for file inputs whenever possible.
You can also send base64-encoded files, but they should be avoided. +- Fire off multiple predictions concurrently. Don't wait for one to finish before starting the next. +- Output file URLs expire after 1 hour, so back them up if you need to keep them, using a service like Cloudflare R2. +- Webhooks are a good mechanism for receiving and storing prediction output. + +## Predictions + +- A prediction goes through these states: `starting` -> `processing` -> `succeeded` / `failed` / `canceled`. +- Official models use `owner/name` format. Community models require `owner/name:version_id`. +- The `POST /v1/predictions` endpoint handles both. + +## Webhooks + +- Set `webhook` to an HTTPS URL when creating a prediction. Replicate POSTs the full prediction object when it completes. +- Filter events with `webhook_events_filter`: `start`, `output`, `logs`, `completed`. +- Validate webhook signatures using the `Webhook-ID`, `Webhook-Timestamp`, and `Webhook-Signature` headers. Get the signing secret from `GET /v1/webhooks/default/secret`. + +## Prediction lifetime + +- Set `lifetime` to auto-cancel predictions that run too long (e.g. `30s`, `5m`, `1h`). Measured from creation time. + +## Streaming + +- Language models that support streaming include a `stream` URL in the response. Use SSE to receive incremental output. + +## File handling + +- Prefer HTTPS URLs for file inputs. Output URLs from one prediction can be passed directly as file inputs to the next model. +- Output file URLs expire after 1 hour. Download and store them immediately if you need to keep them. + +## Multi-model workflows + +- Chain models by passing output URLs as file inputs to the next model. +- Start all independent predictions in parallel, then collect results. +- Output URLs are valid for 1 hour, which is enough for pipeline steps.