diff --git a/examples/08_Qwen3-VL-235B-A22B_Example/offline_qwen3_vl_235b_a22b_shopify.yaml b/examples/08_Qwen3-VL-235B-A22B_Example/offline_qwen3_vl_235b_a22b_shopify.yaml
index 95445781..11b3a516 100644
--- a/examples/08_Qwen3-VL-235B-A22B_Example/offline_qwen3_vl_235b_a22b_shopify.yaml
+++ b/examples/08_Qwen3-VL-235B-A22B_Example/offline_qwen3_vl_235b_a22b_shopify.yaml
@@ -7,6 +7,7 @@ timeout: 14400 # Perf + acc run takes over 3 hours, consider limit n_samples_to_
 model_params:
   name: "Qwen/Qwen3-VL-235B-A22B-Instruct"
+  # tokenizer_name: "Qwen/Qwen3-VL-235B-A22B-Instruct" # Set this if model name is a local/container path
   temperature: 0
   top_p: 1
   max_new_tokens: 150
diff --git a/examples/08_Qwen3-VL-235B-A22B_Example/online_qwen3_vl_235b_a22b_shopify.yaml b/examples/08_Qwen3-VL-235B-A22B_Example/online_qwen3_vl_235b_a22b_shopify.yaml
index db23f163..a47f4f98 100644
--- a/examples/08_Qwen3-VL-235B-A22B_Example/online_qwen3_vl_235b_a22b_shopify.yaml
+++ b/examples/08_Qwen3-VL-235B-A22B_Example/online_qwen3_vl_235b_a22b_shopify.yaml
@@ -6,6 +6,7 @@ timeout: 14400
 model_params:
   name: "Qwen/Qwen3-VL-235B-A22B-Instruct"
+  # tokenizer_name: "Qwen/Qwen3-VL-235B-A22B-Instruct" # Set this if model name is a local/container path
   temperature: 0
   top_p: 1
   max_new_tokens: 150
diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py
index 73c3427f..b2cb9f8e 100644
--- a/src/inference_endpoint/commands/benchmark/execute.py
+++ b/src/inference_endpoint/commands/benchmark/execute.py
@@ -300,9 +300,16 @@ def setup_benchmark(config: BenchmarkConfig, test_mode: TestMode) -> BenchmarkCo
     report_dir.mkdir(parents=True, exist_ok=True)
     config.to_yaml_file(report_dir / "config.yaml")
 
-    # Tokenizer check (light API call, no download)
+    # Tokenizer check (light API call, no download).
+    # When the serving model name is a local/container path (e.g. an NVFP4
+    # checkpoint cached under /root/.cache/huggingface/hub/...) it is not a
+    # valid HF repo ID and the probe will fail. Allow model_params.tokenizer_name
+    # to override the source so the upstream HF tokenizer can still be used.
     model_name = config.model_params.name
-    tokenizer_name = model_name if _check_tokenizer_exists(model_name) else None
+    tokenizer_source = config.model_params.tokenizer_name or model_name
+    tokenizer_name = (
+        tokenizer_source if _check_tokenizer_exists(tokenizer_source) else None
+    )
 
     # Streaming
     logger.info(
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
index 6a1884b4..9898c62c 100644
--- a/src/inference_endpoint/config/schema.py
+++ b/src/inference_endpoint/config/schema.py
@@ -182,6 +182,15 @@ class ModelParams(BaseModel):
         str,
         cyclopts.Parameter(alias="--model", help="Model name", required=True),
     ] = ""
+    tokenizer_name: Annotated[
+        str | None,
+        cyclopts.Parameter(
+            alias="--tokenizer",
+            help="Tokenizer name or path (overrides model name for tokenizer loading). "
+            "Useful when the serving model path differs from the tokenizer, e.g. "
+            "quantized checkpoints or container-local paths.",
+        ),
+    ] = Field(None, description="Tokenizer name/path override (HF repo ID or local path)")
     temperature: float | None = Field(None, description="Sampling temperature")
     top_k: int | None = Field(None, description="Top-K sampling")
     top_p: float | None = Field(None, description="Top-P (nucleus) sampling")
diff --git a/src/inference_endpoint/load_generator/session.py b/src/inference_endpoint/load_generator/session.py
index 1c8ad992..28a88889 100644
--- a/src/inference_endpoint/load_generator/session.py
+++ b/src/inference_endpoint/load_generator/session.py
@@ -191,8 +191,14 @@ def issue(self, sample_index: int) -> str | None:
         prompt_data: PromptData
         if isinstance(data, dict):
             token_ids = data.get("input_tokens") or data.get("token_ids")
+            # Multimodal datasets store ``prompt`` as a list of OpenAI content
+            # parts (e.g. [{"type": "text", ...}, {"type": "image_url", ...}])
+            # which the HTTP adapter handles directly. PromptData.text is only
+            # meaningful for ISL on text-only prompts, so coerce non-strings
+            # to None and rely on token_ids when the dataset pre-tokenizes.
+            prompt = data.get("prompt")
             prompt_data = PromptData(
-                text=data.get("prompt"),
+                text=prompt if isinstance(prompt, str) else None,
                 token_ids=tuple(token_ids) if token_ids is not None else None,
             )
         else:
diff --git a/src/inference_endpoint/openai/types.py b/src/inference_endpoint/openai/types.py
index 036dd172..70476fb0 100644
--- a/src/inference_endpoint/openai/types.py
+++ b/src/inference_endpoint/openai/types.py
@@ -108,21 +108,30 @@ class ChatCompletionRequest(
 class ChatCompletionResponseMessage(
     msgspec.Struct, frozen=True, kw_only=True, omit_defaults=True, gc=False
 ):  # type: ignore[call-arg]
-    """Response message from OpenAI."""
+    """Response message from OpenAI.
+
+    ``content`` and ``refusal`` are nullable per the OpenAI spec and vLLM
+    routinely omits them (e.g. when the model returns no text or no refusal
+    block), so they default to ``None`` to allow successful decoding.
+    """
 
     role: str
-    content: str | None
-    refusal: str | None
+    content: str | None = None
+    refusal: str | None = None
 
 
 class ChatCompletionChoice(
     msgspec.Struct, frozen=True, kw_only=True, omit_defaults=True, gc=False
 ):  # type: ignore[call-arg]
-    """A single choice in the completion response."""
+    """A single choice in the completion response.
+
+    ``finish_reason`` may be omitted in non-final SSE chunks; default to
+    ``None`` so decoding intermediate frames does not fail.
+    """
 
     index: int
     message: ChatCompletionResponseMessage
-    finish_reason: str | None
+    finish_reason: str | None = None
 
 
 class CompletionUsage(
@@ -142,12 +151,19 @@ class ChatCompletionResponse(
     omit_defaults=False,
     gc=False,
 ):  # type: ignore[call-arg]
-    """OpenAI chat completion response."""
+    """OpenAI chat completion response.
+
+    Most servers (vLLM, Dynamo, etc.) legitimately omit a number of these
+    fields — e.g. ``usage`` is only emitted on the final SSE chunk,
+    ``system_fingerprint`` is rarely populated, and ``created``/``model``
+    can be missing in some response variants. All of these get safe
+    defaults so the decoder accepts whatever the server sends.
+    """
 
     id: str
     object: str = "chat.completion"
-    created: int
-    model: str
+    created: int = 0
+    model: str = ""
     choices: list[ChatCompletionChoice]
-    usage: CompletionUsage | None
-    system_fingerprint: str | None
+    usage: CompletionUsage | None = None
+    system_fingerprint: str | None = None