From a33690b5643a6450f216d9865dc5d4dad775b89b Mon Sep 17 00:00:00 2001
From: BolinSNLHM
Date: Mon, 4 May 2026 13:13:16 -0700
Subject: [PATCH] fix: support Q3VL multimodal benchmark on vLLM endpoints
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three independent issues blocked the Qwen3-VL-235B-A22B benchmark against a
real vLLM/Dynamo server. They are bundled here because they all surface in the
same flow (Shopify multimodal dataset → OpenAI chat completions on vLLM):

1. **OpenAI response decoding too strict (`openai/types.py`).** The msgspec
   structs that decode `/v1/chat/completions` responses declared `content`,
   `refusal`, `finish_reason`, `created`, `model`, `usage`, and
   `system_fingerprint` as required fields with no defaults. vLLM (and the
   OpenAI spec itself) treats these as nullable/optional — vLLM omits `usage`
   on non-final SSE chunks, rarely sets `system_fingerprint`, and may omit
   `content` on tool-only responses. Decoding any such response raised a
   msgspec validation error before the adapter could return a QueryResult.
   Added safe defaults so the decoder accepts whatever the server
   legitimately sends.

2. **Multimodal prompt crashes metrics tracking (`session.py`).**
   `PhaseIssuer.issue()` constructs a `PromptData(text=..., ...)` for the
   ISSUED event. `PromptData.text` is typed `str | None`, but the Shopify
   multimodal formatter sets `prompt` to OpenAI vision content parts
   (`[{"type": "text", ...}, {"type": "image_url", ...}]`). msgspec rejected
   the list and the issuer raised, killing the run before any sample was
   sent. The HTTP adapter handles list-shaped prompts correctly — only the
   metrics-tracking path needed fixing. Type-guard the assignment so
   non-strings fall through to None and ISL is computed from `token_ids`
   (which the multimodal datasets already pre-tokenize).

3. **Tokenizer cannot be loaded from container-local model paths
   (`schema.py`, `execute.py`).** When the model is served from a local cache
   path (e.g. `/root/.cache/huggingface/hub/models--…`) instead of an HF repo
   ID, `_check_tokenizer_exists(model_name)` fails its `model_info()` probe
   and the aggregator runs without ISL/OSL/TPOT metrics. Added a
   `tokenizer_name` override field to `ModelParams` so the tokenizer source
   can be set explicitly while the served model name remains a path. The
   example YAML configs document the new field with a commented hint.

Validated all three changes interactively with msgspec/pydantic:

- `ChatCompletionResponse` decodes minimal vLLM responses (most fields
  missing) and full responses with usage.
- `PromptData(text=…)` accepts both string prompts and multimodal
  `list[dict]` prompts.
- `ModelParams` round-trips through YAML with and without `tokenizer_name`,
  and both Q3VL example configs load cleanly.
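
For reference, a minimal sketch of the kind of interactive decoder check
described above (illustrative only, not shipped with this patch; the
`inference_endpoint.openai.types` import path is inferred from the src/ layout
and may differ):

```python
# Hypothetical check, not part of the patch: a minimal vLLM-style response
# (no usage, model, created, or system_fingerprint) should decode cleanly
# once the optional fields have defaults.
import msgspec

from inference_endpoint.openai.types import ChatCompletionResponse  # assumed path

minimal = b"""
{
  "id": "chatcmpl-123",
  "choices": [
    {"index": 0, "message": {"role": "assistant", "content": "ok"}}
  ]
}
"""

resp = msgspec.json.decode(minimal, type=ChatCompletionResponse)
assert resp.usage is None
assert resp.system_fingerprint is None
assert resp.choices[0].finish_reason is None
```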

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../offline_qwen3_vl_235b_a22b_shopify.yaml  |  1 +
 .../online_qwen3_vl_235b_a22b_shopify.yaml   |  1 +
 .../commands/benchmark/execute.py            | 11 ++++--
 src/inference_endpoint/config/schema.py      |  9 +++++
 .../load_generator/session.py                |  8 ++++-
 src/inference_endpoint/openai/types.py       | 36 +++++++++++++------
 6 files changed, 53 insertions(+), 13 deletions(-)

diff --git a/examples/08_Qwen3-VL-235B-A22B_Example/offline_qwen3_vl_235b_a22b_shopify.yaml b/examples/08_Qwen3-VL-235B-A22B_Example/offline_qwen3_vl_235b_a22b_shopify.yaml
index 95445781..11b3a516 100644
--- a/examples/08_Qwen3-VL-235B-A22B_Example/offline_qwen3_vl_235b_a22b_shopify.yaml
+++ b/examples/08_Qwen3-VL-235B-A22B_Example/offline_qwen3_vl_235b_a22b_shopify.yaml
@@ -7,6 +7,7 @@ timeout: 14400 # Perf + acc run takes over 3 hours, consider limit n_samples_to_
 
 model_params:
   name: "Qwen/Qwen3-VL-235B-A22B-Instruct"
+  # tokenizer_name: "Qwen/Qwen3-VL-235B-A22B-Instruct" # Set this if model name is a local/container path
   temperature: 0
   top_p: 1
   max_new_tokens: 150
diff --git a/examples/08_Qwen3-VL-235B-A22B_Example/online_qwen3_vl_235b_a22b_shopify.yaml b/examples/08_Qwen3-VL-235B-A22B_Example/online_qwen3_vl_235b_a22b_shopify.yaml
index db23f163..a47f4f98 100644
--- a/examples/08_Qwen3-VL-235B-A22B_Example/online_qwen3_vl_235b_a22b_shopify.yaml
+++ b/examples/08_Qwen3-VL-235B-A22B_Example/online_qwen3_vl_235b_a22b_shopify.yaml
@@ -6,6 +6,7 @@ timeout: 14400
 
 model_params:
   name: "Qwen/Qwen3-VL-235B-A22B-Instruct"
+  # tokenizer_name: "Qwen/Qwen3-VL-235B-A22B-Instruct" # Set this if model name is a local/container path
   temperature: 0
   top_p: 1
   max_new_tokens: 150
diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py
index 73c3427f..b2cb9f8e 100644
--- a/src/inference_endpoint/commands/benchmark/execute.py
+++ b/src/inference_endpoint/commands/benchmark/execute.py
@@ -300,9 +300,16 @@ def setup_benchmark(config: BenchmarkConfig, test_mode: TestMode) -> BenchmarkCo
     report_dir.mkdir(parents=True, exist_ok=True)
     config.to_yaml_file(report_dir / "config.yaml")
 
-    # Tokenizer check (light API call, no download)
+    # Tokenizer check (light API call, no download).
+    # When the serving model name is a local/container path (e.g. an NVFP4
+    # checkpoint cached under /root/.cache/huggingface/hub/...) it is not a
+    # valid HF repo ID and the probe will fail. Allow model_params.tokenizer_name
+    # to override the source so the upstream HF tokenizer can still be used.
     model_name = config.model_params.name
-    tokenizer_name = model_name if _check_tokenizer_exists(model_name) else None
+    tokenizer_source = config.model_params.tokenizer_name or model_name
+    tokenizer_name = (
+        tokenizer_source if _check_tokenizer_exists(tokenizer_source) else None
+    )
 
     # Streaming
     logger.info(
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
index 6a1884b4..9898c62c 100644
--- a/src/inference_endpoint/config/schema.py
+++ b/src/inference_endpoint/config/schema.py
@@ -182,6 +182,15 @@ class ModelParams(BaseModel):
         str,
         cyclopts.Parameter(alias="--model", help="Model name", required=True),
     ] = ""
+    tokenizer_name: Annotated[
+        str | None,
+        cyclopts.Parameter(
+            alias="--tokenizer",
+            help="Tokenizer name or path (overrides model name for tokenizer loading). "
+            "Useful when the serving model path differs from the tokenizer, e.g. "
+            "quantized checkpoints or container-local paths.",
+        ),
+    ] = Field(None, description="Tokenizer name/path override (HF repo ID or local path)")
     temperature: float | None = Field(None, description="Sampling temperature")
     top_k: int | None = Field(None, description="Top-K sampling")
     top_p: float | None = Field(None, description="Top-P (nucleus) sampling")
diff --git a/src/inference_endpoint/load_generator/session.py b/src/inference_endpoint/load_generator/session.py
index 1c8ad992..28a88889 100644
--- a/src/inference_endpoint/load_generator/session.py
+++ b/src/inference_endpoint/load_generator/session.py
@@ -191,8 +191,14 @@ def issue(self, sample_index: int) -> str | None:
         prompt_data: PromptData
         if isinstance(data, dict):
             token_ids = data.get("input_tokens") or data.get("token_ids")
+            # Multimodal datasets store ``prompt`` as a list of OpenAI content
+            # parts (e.g. [{"type": "text", ...}, {"type": "image_url", ...}])
+            # which the HTTP adapter handles directly. PromptData.text is only
+            # meaningful for ISL on text-only prompts, so coerce non-strings
+            # to None and rely on token_ids when the dataset pre-tokenizes.
+            prompt = data.get("prompt")
             prompt_data = PromptData(
-                text=data.get("prompt"),
+                text=prompt if isinstance(prompt, str) else None,
                 token_ids=tuple(token_ids) if token_ids is not None else None,
             )
         else:
diff --git a/src/inference_endpoint/openai/types.py b/src/inference_endpoint/openai/types.py
index 036dd172..70476fb0 100644
--- a/src/inference_endpoint/openai/types.py
+++ b/src/inference_endpoint/openai/types.py
@@ -108,21 +108,30 @@ class ChatCompletionRequest(
 class ChatCompletionResponseMessage(
     msgspec.Struct, frozen=True, kw_only=True, omit_defaults=True, gc=False
 ): # type: ignore[call-arg]
-    """Response message from OpenAI."""
+    """Response message from OpenAI.
+
+    ``content`` and ``refusal`` are nullable per the OpenAI spec and vLLM
+    routinely omits them (e.g. when the model returns no text or no refusal
+    block), so they default to ``None`` to allow successful decoding.
+    """
 
     role: str
-    content: str | None
-    refusal: str | None
+    content: str | None = None
+    refusal: str | None = None
 
 
 class ChatCompletionChoice(
     msgspec.Struct, frozen=True, kw_only=True, omit_defaults=True, gc=False
 ): # type: ignore[call-arg]
-    """A single choice in the completion response."""
+    """A single choice in the completion response.
+
+    ``finish_reason`` may be omitted in non-final SSE chunks; default to
+    ``None`` so decoding intermediate frames does not fail.
+    """
 
     index: int
     message: ChatCompletionResponseMessage
-    finish_reason: str | None
+    finish_reason: str | None = None
 
 
 class CompletionUsage(
@@ -142,12 +151,19 @@ class ChatCompletionResponse(
     msgspec.Struct,
     frozen=True,
     kw_only=True,
     omit_defaults=False,
     gc=False,
 ): # type: ignore[call-arg]
-    """OpenAI chat completion response."""
+    """OpenAI chat completion response.
+
+    Most servers (vLLM, Dynamo, etc.) legitimately omit a number of these
+    fields — e.g. ``usage`` is only emitted on the final SSE chunk,
+    ``system_fingerprint`` is rarely populated, and ``created``/``model``
+    can be missing in some response variants. All of these get safe
+    defaults so the decoder accepts whatever the server sends.
+    """
 
     id: str
     object: str = "chat.completion"
-    created: int
-    model: str
+    created: int = 0
+    model: str = ""
     choices: list[ChatCompletionChoice]
-    usage: CompletionUsage | None
-    system_fingerprint: str | None
+    usage: CompletionUsage | None = None
+    system_fingerprint: str | None = None