diff --git a/src/multihead/adapters/transformers_adapter.py b/src/multihead/adapters/transformers_adapter.py index 237e95b..8e1d4e4 100644 --- a/src/multihead/adapters/transformers_adapter.py +++ b/src/multihead/adapters/transformers_adapter.py @@ -143,7 +143,17 @@ async def generate(self, prompt: str, **kwargs: Any) -> dict[str, Any]: # Use chat template when available (required for Qwen3, etc.) tokenizer = self._tokenizer or self._processor if not skip_template and tokenizer is not None and hasattr(tokenizer, "apply_chat_template"): - messages = [{"role": "user", "content": prompt}] + # VLM with images: content must be a list including image placeholders + # so the chat template emits <|vision_start|><|image_pad|><|vision_end|> + # tokens that bind to the image features in the processor() call below. + images_for_template = kwargs.get("images") + if images_for_template and self._processor is not None: + content: Any = [{"type": "image"} for _ in images_for_template] + [ + {"type": "text", "text": prompt}, + ] + else: + content = prompt + messages = [{"role": "user", "content": content}] template_kwargs: dict[str, Any] = { "tokenize": False, "add_generation_prompt": True,