From 652c141a4a2e487cd7fe320a3470bbffd3e54c63 Mon Sep 17 00:00:00 2001
From: Alan Kharebov <ak61us@gmail.com>
Date: Wed, 3 Jun 2026 17:15:24 -0700
Subject: [PATCH] fix(transformers): include image entries in chat-template
 content for VLM

The VLM (processor) path was applying the chat template with the message
content as a plain prompt string. For Qwen3-VL the template then emits no
<|vision_start|><|image_pad|><|vision_end|> placeholder tokens, so when
the processor splices in the image features, the forward pass raises:

  ValueError: Image features and image tokens do not match:
    tokens: 0, features 1333

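Fix: when images are passed and a processor is loaded (VLM mode), pass
structured content to apply_chat_template -- one {"type": "image"} entry
per image followed by a {"type": "text"} entry holding the prompt -- so
the placeholder tokens land in the prompt. Without images (or without a
processor) the content stays a plain string, as before.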

Closes #5
---
 src/multihead/adapters/transformers_adapter.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/multihead/adapters/transformers_adapter.py b/src/multihead/adapters/transformers_adapter.py
index 237e95b..8e1d4e4 100644
--- a/src/multihead/adapters/transformers_adapter.py
+++ b/src/multihead/adapters/transformers_adapter.py
@@ -143,7 +143,17 @@ async def generate(self, prompt: str, **kwargs: Any) -> dict[str, Any]:
         # Use chat template when available (required for Qwen3, etc.)
         tokenizer = self._tokenizer or self._processor
         if not skip_template and tokenizer is not None and hasattr(tokenizer, "apply_chat_template"):
-            messages = [{"role": "user", "content": prompt}]
+            # VLM with images: content must be a list including image placeholders
+            # so the chat template emits <|vision_start|><|image_pad|><|vision_end|>
+            # tokens that bind to the image features in the processor() call below.
+            images_for_template = kwargs.get("images")
+            if images_for_template and self._processor is not None:
+                content: Any = [{"type": "image"} for _ in images_for_template] + [
+                    {"type": "text", "text": prompt},
+                ]
+            else:
+                content = prompt
+            messages = [{"role": "user", "content": content}]
             template_kwargs: dict[str, Any] = {
                 "tokenize": False,
                 "add_generation_prompt": True,