From 652c141a4a2e487cd7fe320a3470bbffd3e54c63 Mon Sep 17 00:00:00 2001 From: Alan Kharebov Date: Wed, 3 Jun 2026 17:15:24 -0700 Subject: [PATCH] fix(transformers): include image entries in chat-template content for VLM The VLM (processor) path was applying the chat template with content as a plain prompt string. For Qwen3-VL the template then emits no <|vision_start|><|image_pad|><|vision_end|> placeholder tokens, so when the processor splices in the image features the forward pass raises: ValueError: Image features and image tokens do not match: tokens: 0, features 1333 Fix: when there are images and a processor (VLM mode), pass structured content (image entries + text entry) to apply_chat_template so the placeholder tokens land in the prompt. Closes #5 --- src/multihead/adapters/transformers_adapter.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/multihead/adapters/transformers_adapter.py b/src/multihead/adapters/transformers_adapter.py index 237e95b..8e1d4e4 100644 --- a/src/multihead/adapters/transformers_adapter.py +++ b/src/multihead/adapters/transformers_adapter.py @@ -143,7 +143,17 @@ async def generate(self, prompt: str, **kwargs: Any) -> dict[str, Any]: # Use chat template when available (required for Qwen3, etc.) tokenizer = self._tokenizer or self._processor if not skip_template and tokenizer is not None and hasattr(tokenizer, "apply_chat_template"): - messages = [{"role": "user", "content": prompt}] + # VLM with images: content must be a list including image placeholders + # so the chat template emits <|vision_start|><|image_pad|><|vision_end|> + # tokens that bind to the image features in the processor() call below. + images_for_template = kwargs.get("images") + if images_for_template and self._processor is not None: + content: Any = [{"type": "image"} for _ in images_for_template] + [ + {"type": "text", "text": prompt}, + ] + else: + content = prompt + messages = [{"role": "user", "content": content}] template_kwargs: dict[str, Any] = { "tokenize": False, "add_generation_prompt": True,