@@ -7,6 +7,7 @@ timeout: 14400 # Perf + acc run takes over 3 hours, consider limit n_samples_to_

model_params:
name: "Qwen/Qwen3-VL-235B-A22B-Instruct"
+# tokenizer_name: "Qwen/Qwen3-VL-235B-A22B-Instruct" # Set this if model name is a local/container path
temperature: 0
top_p: 1
max_new_tokens: 150
@@ -6,6 +6,7 @@ timeout: 14400

model_params:
name: "Qwen/Qwen3-VL-235B-A22B-Instruct"
+# tokenizer_name: "Qwen/Qwen3-VL-235B-A22B-Instruct" # Set this if model name is a local/container path
temperature: 0
top_p: 1
max_new_tokens: 150
11 changes: 9 additions & 2 deletions src/inference_endpoint/commands/benchmark/execute.py
@@ -300,9 +300,16 @@ def setup_benchmark(config: BenchmarkConfig, test_mode: TestMode) -> BenchmarkCo
report_dir.mkdir(parents=True, exist_ok=True)
config.to_yaml_file(report_dir / "config.yaml")

-# Tokenizer check (light API call, no download)
+# Tokenizer check (light API call, no download).
+# When the serving model name is a local/container path (e.g. an NVFP4
+# checkpoint cached under /root/.cache/huggingface/hub/...) it is not a
+# valid HF repo ID and the probe will fail. Allow model_params.tokenizer_name
+# to override the source so the upstream HF tokenizer can still be used.
model_name = config.model_params.name
-tokenizer_name = model_name if _check_tokenizer_exists(model_name) else None
+tokenizer_source = config.model_params.tokenizer_name or model_name
+tokenizer_name = (
+    tokenizer_source if _check_tokenizer_exists(tokenizer_source) else None
+)
Comment on lines +309 to +312 (severity: high):

The current implementation of _check_tokenizer_exists relies on huggingface_hub.model_info, which is designed for repository IDs and will raise an exception (and log a warning) when provided with a local file system path. Since tokenizer_name is explicitly documented in the schema to support local paths, the logic should check for local existence first. This avoids incorrect warnings and ensures local tokenizers are correctly identified and passed to the metrics aggregator.

Suggested change:
-tokenizer_source = config.model_params.tokenizer_name or model_name
-tokenizer_name = (
-    tokenizer_source if _check_tokenizer_exists(tokenizer_source) else None
-)
+tokenizer_source = config.model_params.tokenizer_name or model_name
+tokenizer_name = (
+    tokenizer_source
+    if Path(tokenizer_source).exists() or _check_tokenizer_exists(tokenizer_source)
+    else None
+)


# Streaming
logger.info(
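The reviewer's point can also be folded into the probe itself rather than the call site. A minimal sketch, assuming the probe's current body is essentially a huggingface_hub.model_info call as the comment describes; _tokenizer_source_exists is a hypothetical name for illustration, not a function in this PR:

```python
from pathlib import Path

from huggingface_hub import model_info


def _tokenizer_source_exists(source: str) -> bool:
    """Return True if ``source`` is a local path or a resolvable HF repo ID."""
    # Local/container paths (e.g. a cached quantized checkpoint) are never
    # valid repo IDs, so short-circuit before hitting the Hub API.
    if Path(source).exists():
        return True
    try:
        # Light metadata call; does not download any weights.
        model_info(source)
        return True
    except Exception:
        # Covers missing/gated repos, invalid repo IDs, and network errors.
        return False
```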
9 changes: 9 additions & 0 deletions src/inference_endpoint/config/schema.py
@@ -182,6 +182,15 @@ class ModelParams(BaseModel):
str,
cyclopts.Parameter(alias="--model", help="Model name", required=True),
] = ""
+tokenizer_name: Annotated[
+    str | None,
+    cyclopts.Parameter(
+        alias="--tokenizer",
+        help="Tokenizer name or path (overrides model name for tokenizer loading). "
+        "Useful when the serving model path differs from the tokenizer, e.g. "
+        "quantized checkpoints or container-local paths.",
+    ),
+] = Field(None, description="Tokenizer name/path override (HF repo ID or local path)")
temperature: float | None = Field(None, description="Sampling temperature")
top_k: int | None = Field(None, description="Top-K sampling")
top_p: float | None = Field(None, description="Top-P (nucleus) sampling")
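For illustration, a hedged sketch of how the new field is meant to be populated; the checkpoint path below is hypothetical, while the field names come from the schema above:

```python
from inference_endpoint.config.schema import ModelParams

# Serving name is a container-local path (not a valid HF repo ID), so the
# tokenizer is pinned to the upstream HF repo instead.
params = ModelParams(
    name="/models/qwen3-vl-235b-nvfp4",  # hypothetical local checkpoint path
    tokenizer_name="Qwen/Qwen3-VL-235B-A22B-Instruct",
)

# execute.py then resolves the probe source as:
tokenizer_source = params.tokenizer_name or params.name
assert tokenizer_source == "Qwen/Qwen3-VL-235B-A22B-Instruct"
```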
8 changes: 7 additions & 1 deletion src/inference_endpoint/load_generator/session.py
@@ -191,8 +191,14 @@ def issue(self, sample_index: int) -> str | None:
prompt_data: PromptData
if isinstance(data, dict):
token_ids = data.get("input_tokens") or data.get("token_ids")
+# Multimodal datasets store ``prompt`` as a list of OpenAI content
+# parts (e.g. [{"type": "text", ...}, {"type": "image_url", ...}])
+# which the HTTP adapter handles directly. PromptData.text is only
+# meaningful for ISL on text-only prompts, so coerce non-strings
+# to None and rely on token_ids when the dataset pre-tokenizes.
+prompt = data.get("prompt")
prompt_data = PromptData(
-    text=data.get("prompt"),
+    text=prompt if isinstance(prompt, str) else None,
token_ids=tuple(token_ids) if token_ids is not None else None,
)
else:
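The coercion is small enough to check in isolation. A self-contained sketch; this PromptData is redefined with just the two fields the diff touches (the real class lives elsewhere in the package) so the example runs standalone:

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class PromptData:
    text: str | None
    token_ids: tuple[int, ...] | None


def build_prompt_data(data: dict) -> PromptData:
    token_ids = data.get("input_tokens") or data.get("token_ids")
    prompt = data.get("prompt")
    return PromptData(
        # Only plain-string prompts are meaningful for input-sequence-length
        # accounting; multimodal content-part lists are handled by the HTTP
        # adapter elsewhere and are coerced to None here.
        text=prompt if isinstance(prompt, str) else None,
        token_ids=tuple(token_ids) if token_ids is not None else None,
    )


# Text-only sample keeps its text.
assert build_prompt_data({"prompt": "hello"}).text == "hello"
# Multimodal sample (list of OpenAI content parts) coerces to None.
assert build_prompt_data({"prompt": [{"type": "text", "text": "hi"}]}).text is None
```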
36 changes: 26 additions & 10 deletions src/inference_endpoint/openai/types.py
@@ -108,21 +108,30 @@ class ChatCompletionRequest(
class ChatCompletionResponseMessage(
msgspec.Struct, frozen=True, kw_only=True, omit_defaults=True, gc=False
): # type: ignore[call-arg]
"""Response message from OpenAI."""
"""Response message from OpenAI.

``content`` and ``refusal`` are nullable per the OpenAI spec and vLLM
routinely omits them (e.g. when the model returns no text or no refusal
block), so they default to ``None`` to allow successful decoding.
"""

role: str
-content: str | None
-refusal: str | None
+content: str | None = None
+refusal: str | None = None


class ChatCompletionChoice(
msgspec.Struct, frozen=True, kw_only=True, omit_defaults=True, gc=False
): # type: ignore[call-arg]
"""A single choice in the completion response."""
"""A single choice in the completion response.

``finish_reason`` may be omitted in non-final SSE chunks; default to
``None`` so decoding intermediate frames does not fail.
"""

index: int
message: ChatCompletionResponseMessage
-finish_reason: str | None
+finish_reason: str | None = None


class CompletionUsage(
@@ -142,12 +151,19 @@ class ChatCompletionResponse(
omit_defaults=False,
gc=False,
): # type: ignore[call-arg]
"""OpenAI chat completion response."""
"""OpenAI chat completion response.

Most servers (vLLM, Dynamo, etc.) legitimately omit a number of these
fields — e.g. ``usage`` is only emitted on the final SSE chunk,
``system_fingerprint`` is rarely populated, and ``created``/``model``
can be missing in some response variants. All of these get safe
defaults so the decoder accepts whatever the server sends.
"""

id: str
object: str = "chat.completion"
-created: int
-model: str
+created: int = 0
+model: str = ""
choices: list[ChatCompletionChoice]
-usage: CompletionUsage | None
-system_fingerprint: str | None
+usage: CompletionUsage | None = None
+system_fingerprint: str | None = None
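To see the relaxed defaults in action, a minimal decode check; the payload is a fabricated but typical vLLM-style final message that omits usage, system_fingerprint, created, and model, and the import path follows the file header above:

```python
import msgspec

from inference_endpoint.openai.types import ChatCompletionResponse

# A server response missing every field that now carries a default.
payload = (
    b'{"id": "chatcmpl-1", "choices": [{"index": 0,'
    b' "message": {"role": "assistant", "content": "hi"}}]}'
)

resp = msgspec.json.decode(payload, type=ChatCompletionResponse)
assert resp.usage is None  # defaulted, not a decode error
assert resp.created == 0 and resp.model == ""
assert resp.choices[0].finish_reason is None
```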