From a33690b5643a6450f216d9865dc5d4dad775b89b Mon Sep 17 00:00:00 2001
From: BolinSNLHM
Date: Mon, 4 May 2026 13:13:16 -0700
Subject: [PATCH] fix: support Q3VL multimodal benchmark on vLLM endpoints
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three independent issues blocked the Qwen3-VL-235B-A22B benchmark against a
real vLLM/Dynamo server. They are bundled here because they all surface in the
same flow (Shopify multimodal dataset → OpenAI chat completions on vLLM):

1. **OpenAI response decoding too strict (`openai/types.py`).** The msgspec
   structs that decode `/v1/chat/completions` responses declared `content`,
   `refusal`, `finish_reason`, `created`, `model`, `usage`, and
   `system_fingerprint` as required fields with no defaults. vLLM (and the
   OpenAI spec itself) treats these as nullable/optional — vLLM omits `usage`
   on non-final SSE chunks, rarely sets `system_fingerprint`, and may omit
   `content` on tool-only responses. Decoding any such response raised a
   msgspec validation error before the adapter could return a QueryResult.
   Added safe defaults so the decoder accepts whatever the server
   legitimately sends.

2. **Multimodal prompt crashes metrics tracking (`session.py`).**
   `PhaseIssuer.issue()` constructs a `PromptData(text=..., ...)` for the
   ISSUED event. `PromptData.text` is typed `str | None`, but the Shopify
   multimodal formatter sets `prompt` to OpenAI vision content parts
   (`[{"type": "text", ...}, {"type": "image_url", ...}]`). msgspec rejected
   the list and the issuer raised, killing the run before any sample was
   sent. The HTTP adapter handles list-shaped prompts correctly — only the
   metrics-tracking path needed fixing. Type-guard the assignment so
   non-strings fall through to None and ISL is computed from `token_ids`
   (which the multimodal datasets already pre-tokenize).

3. **Tokenizer cannot be loaded from container-local model paths
   (`schema.py`, `execute.py`).** When the model is served from a local cache
   path (e.g. `/root/.cache/huggingface/hub/models--…`) instead of an HF repo
   ID, `_check_tokenizer_exists(model_name)` fails its `model_info()` probe
   and the aggregator runs without ISL/OSL/TPOT metrics. Added a
   `tokenizer_name` override field to `ModelParams` so the tokenizer source
   can be set explicitly while the served model name remains a path. The
   example YAML configs document the new field with a commented hint.

Validated all three changes interactively with msgspec/pydantic:

- `ChatCompletionResponse` decodes minimal vLLM responses (most fields
  missing) and full responses with usage.
- `PromptData(text=…)` accepts both string prompts and multimodal
  `list[dict]` prompts.
- `ModelParams` round-trips through YAML with and without `tokenizer_name`,
  and both Q3VL example configs load cleanly.
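
For reference, a minimal sketch of the kind of interactive decoder check
described above (illustrative only, not shipped with this patch; the
`inference_endpoint.openai.types` import path is inferred from the src/ layout
and may differ):

```python
# Hypothetical check, not part of the patch: a minimal vLLM-style response
# (no usage, model, created, or system_fingerprint) should decode cleanly
# once the optional fields have defaults.
import msgspec

from inference_endpoint.openai.types import ChatCompletionResponse  # assumed path

minimal = b"""
{
  "id": "chatcmpl-123",
  "choices": [
    {"index": 0, "message": {"role": "assistant", "content": "ok"}}
  ]
}
"""

resp = msgspec.json.decode(minimal, type=ChatCompletionResponse)
assert resp.usage is None
assert resp.system_fingerprint is None
assert resp.choices[0].finish_reason is None
```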

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../offline_qwen3_vl_235b_a22b_shopify.yaml  |  1 +
 .../online_qwen3_vl_235b_a22b_shopify.yaml   |  1 +
 .../commands/benchmark/execute.py            | 11 ++++--
 src/inference_endpoint/config/schema.py      |  9 +++++
 .../load_generator/session.py                |  8 ++++-
 src/inference_endpoint/openai/types.py       | 36 +++++++++++++------
 6 files changed, 53 insertions(+), 13 deletions(-)

diff --git a/examples/08_Qwen3-VL-235B-A22B_Example/offline_qwen3_vl_235b_a22b_shopify.yaml b/examples/08_Qwen3-VL-235B-A22B_Example/offline_qwen3_vl_235b_a22b_shopify.yaml
index 95445781..11b3a516 100644
--- a/examples/08_Qwen3-VL-235B-A22B_Example/offline_qwen3_vl_235b_a22b_shopify.yaml
+++ b/examples/08_Qwen3-VL-235B-A22B_Example/offline_qwen3_vl_235b_a22b_shopify.yaml
@@ -7,6 +7,7 @@ timeout: 14400 # Perf + acc run takes over 3 hours, consider limit n_samples_to_
 
 model_params:
   name: "Qwen/Qwen3-VL-235B-A22B-Instruct"
+  # tokenizer_name: "Qwen/Qwen3-VL-235B-A22B-Instruct" # Set this if model name is a local/container path
   temperature: 0
   top_p: 1
   max_new_tokens: 150
diff --git a/examples/08_Qwen3-VL-235B-A22B_Example/online_qwen3_vl_235b_a22b_shopify.yaml b/examples/08_Qwen3-VL-235B-A22B_Example/online_qwen3_vl_235b_a22b_shopify.yaml
index db23f163..a47f4f98 100644
--- a/examples/08_Qwen3-VL-235B-A22B_Example/online_qwen3_vl_235b_a22b_shopify.yaml
+++ b/examples/08_Qwen3-VL-235B-A22B_Example/online_qwen3_vl_235b_a22b_shopify.yaml
@@ -6,6 +6,7 @@ timeout: 14400
 
 model_params:
   name: "Qwen/Qwen3-VL-235B-A22B-Instruct"
+  # tokenizer_name: "Qwen/Qwen3-VL-235B-A22B-Instruct" # Set this if model name is a local/container path
   temperature: 0
   top_p: 1
   max_new_tokens: 150
diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py
index 73c3427f..b2cb9f8e 100644
--- a/src/inference_endpoint/commands/benchmark/execute.py
+++ b/src/inference_endpoint/commands/benchmark/execute.py
@@ -300,9 +300,16 @@ def setup_benchmark(config: BenchmarkConfig, test_mode: TestMode) -> BenchmarkCo
     report_dir.mkdir(parents=True, exist_ok=True)
     config.to_yaml_file(report_dir / "config.yaml")
 
-    # Tokenizer check (light API call, no download)
+    # Tokenizer check (light API call, no download).
+    # When the serving model name is a local/container path (e.g. an NVFP4
+    # checkpoint cached under /root/.cache/huggingface/hub/...) it is not a
+    # valid HF repo ID and the probe will fail. Allow model_params.tokenizer_name
+    # to override the source so the upstream HF tokenizer can still be used.
     model_name = config.model_params.name
-    tokenizer_name = model_name if _check_tokenizer_exists(model_name) else None
+    tokenizer_source = config.model_params.tokenizer_name or model_name
+    tokenizer_name = (
+        tokenizer_source if _check_tokenizer_exists(tokenizer_source) else None
+    )
 
     # Streaming
     logger.info(
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
index 6a1884b4..9898c62c 100644
--- a/src/inference_endpoint/config/schema.py
+++ b/src/inference_endpoint/config/schema.py
@@ -182,6 +182,15 @@ class ModelParams(BaseModel):
         str,
         cyclopts.Parameter(alias="--model", help="Model name", required=True),
     ] = ""
+    tokenizer_name: Annotated[
+        str | None,
+        cyclopts.Parameter(
+            alias="--tokenizer",
+            help="Tokenizer name or path (overrides model name for tokenizer loading). "
+            "Useful when the serving model path differs from the tokenizer, e.g. "
+            "quantized checkpoints or container-local paths.",
+        ),
+    ] = Field(None, description="Tokenizer name/path override (HF repo ID or local path)")
     temperature: float | None = Field(None, description="Sampling temperature")
     top_k: int | None = Field(None, description="Top-K sampling")
     top_p: float | None = Field(None, description="Top-P (nucleus) sampling")
diff --git a/src/inference_endpoint/load_generator/session.py b/src/inference_endpoint/load_generator/session.py
index 1c8ad992..28a88889 100644
--- a/src/inference_endpoint/load_generator/session.py
+++ b/src/inference_endpoint/load_generator/session.py
@@ -191,8 +191,14 @@ def issue(self, sample_index: int) -> str | None:
         prompt_data: PromptData
         if isinstance(data, dict):
             token_ids = data.get("input_tokens") or data.get("token_ids")
+            # Multimodal datasets store ``prompt`` as a list of OpenAI content
+            # parts (e.g. [{"type": "text", ...}, {"type": "image_url", ...}])
+            # which the HTTP adapter handles directly. PromptData.text is only
+            # meaningful for ISL on text-only prompts, so coerce non-strings
+            # to None and rely on token_ids when the dataset pre-tokenizes.
+            prompt = data.get("prompt")
             prompt_data = PromptData(
-                text=data.get("prompt"),
+                text=prompt if isinstance(prompt, str) else None,
                 token_ids=tuple(token_ids) if token_ids is not None else None,
             )
         else:
diff --git a/src/inference_endpoint/openai/types.py b/src/inference_endpoint/openai/types.py
index 036dd172..70476fb0 100644
--- a/src/inference_endpoint/openai/types.py
+++ b/src/inference_endpoint/openai/types.py
@@ -108,21 +108,30 @@ class ChatCompletionRequest(
 class ChatCompletionResponseMessage(
     msgspec.Struct, frozen=True, kw_only=True, omit_defaults=True, gc=False
 ): # type: ignore[call-arg]
-    """Response message from OpenAI."""
+    """Response message from OpenAI.
+
+    ``content`` and ``refusal`` are nullable per the OpenAI spec and vLLM
+    routinely omits them (e.g. when the model returns no text or no refusal
+    block), so they default to ``None`` to allow successful decoding.
+    """
 
     role: str
-    content: str | None
-    refusal: str | None
+    content: str | None = None
+    refusal: str | None = None
 
 
 class ChatCompletionChoice(
     msgspec.Struct, frozen=True, kw_only=True, omit_defaults=True, gc=False
 ): # type: ignore[call-arg]
-    """A single choice in the completion response."""
+    """A single choice in the completion response.
+
+    ``finish_reason`` may be omitted in non-final SSE chunks; default to
+    ``None`` so decoding intermediate frames does not fail.
+    """
 
     index: int
     message: ChatCompletionResponseMessage
-    finish_reason: str | None
+    finish_reason: str | None = None
 
 
 class CompletionUsage(
@@ -142,12 +151,19 @@ class ChatCompletionResponse(
     msgspec.Struct,
     frozen=True,
     kw_only=True,
     omit_defaults=False,
     gc=False,
 ): # type: ignore[call-arg]
-    """OpenAI chat completion response."""
+    """OpenAI chat completion response.
+
+    Most servers (vLLM, Dynamo, etc.) legitimately omit a number of these
+    fields — e.g. ``usage`` is only emitted on the final SSE chunk,
+    ``system_fingerprint`` is rarely populated, and ``created``/``model``
+    can be missing in some response variants. All of these get safe
+    defaults so the decoder accepts whatever the server sends.
+    """
 
     id: str
     object: str = "chat.completion"
-    created: int
-    model: str
+    created: int = 0
+    model: str = ""
     choices: list[ChatCompletionChoice]
-    usage: CompletionUsage | None
-    system_fingerprint: str | None
+    usage: CompletionUsage | None = None
+    system_fingerprint: str | None = None