@@ -7,6 +7,7 @@ timeout: 14400 # Perf + acc run takes over 3 hours, consider limit n_samples_to_

model_params:
name: "Qwen/Qwen3-VL-235B-A22B-Instruct"
+# tokenizer_name: "Qwen/Qwen3-VL-235B-A22B-Instruct" # Set this if model name is a local/container path
temperature: 0
top_p: 1
max_new_tokens: 150
@@ -6,6 +6,7 @@ timeout: 14400

model_params:
name: "Qwen/Qwen3-VL-235B-A22B-Instruct"
+# tokenizer_name: "Qwen/Qwen3-VL-235B-A22B-Instruct" # Set this if model name is a local/container path
temperature: 0
top_p: 1
max_new_tokens: 150
11 changes: 9 additions & 2 deletions src/inference_endpoint/commands/benchmark/execute.py
@@ -300,9 +300,16 @@ def setup_benchmark(config: BenchmarkConfig, test_mode: TestMode) -> BenchmarkCo
report_dir.mkdir(parents=True, exist_ok=True)
config.to_yaml_file(report_dir / "config.yaml")

-# Tokenizer check (light API call, no download)
+# Tokenizer check (light API call, no download).
+# When the serving model name is a local/container path (e.g. an NVFP4
+# checkpoint cached under /root/.cache/huggingface/hub/...) it is not a
+# valid HF repo ID and the probe will fail. Allow model_params.tokenizer_name
+# to override the source so the upstream HF tokenizer can still be used.
model_name = config.model_params.name
-tokenizer_name = model_name if _check_tokenizer_exists(model_name) else None
+tokenizer_source = config.model_params.tokenizer_name or model_name
+tokenizer_name = (
+    tokenizer_source if _check_tokenizer_exists(tokenizer_source) else None
+)
Comment on lines +309 to +312 (severity: high):

The current implementation of _check_tokenizer_exists relies on huggingface_hub.model_info, which is designed for repository IDs and will raise an exception (and log a warning) when provided with a local file system path. Since tokenizer_name is explicitly documented in the schema to support local paths, the logic should check for local existence first. This avoids incorrect warnings and ensures local tokenizers are correctly identified and passed to the metrics aggregator.

Suggested change:
-tokenizer_source = config.model_params.tokenizer_name or model_name
-tokenizer_name = (
-    tokenizer_source if _check_tokenizer_exists(tokenizer_source) else None
-)
+tokenizer_source = config.model_params.tokenizer_name or model_name
+tokenizer_name = (
+    tokenizer_source
+    if Path(tokenizer_source).exists() or _check_tokenizer_exists(tokenizer_source)
+    else None
+)


# Streaming
logger.info(
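The reviewer's point can also be folded into the probe itself rather than the call site. A minimal sketch, assuming the probe's current body is essentially a huggingface_hub.model_info call as the comment describes; _tokenizer_source_exists is a hypothetical name for illustration, not a function in this PR:

```python
from pathlib import Path

from huggingface_hub import model_info


def _tokenizer_source_exists(source: str) -> bool:
    """Return True if ``source`` is a local path or a resolvable HF repo ID."""
    # Local/container paths (e.g. a cached quantized checkpoint) are never
    # valid repo IDs, so short-circuit before hitting the Hub API.
    if Path(source).exists():
        return True
    try:
        # Light metadata call; does not download any weights.
        model_info(source)
        return True
    except Exception:
        # Covers missing/gated repos, invalid repo IDs, and network errors.
        return False
```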
9 changes: 9 additions & 0 deletions src/inference_endpoint/config/schema.py
@@ -182,6 +182,15 @@ class ModelParams(BaseModel):
str,
cyclopts.Parameter(alias="--model", help="Model name", required=True),
] = ""
+tokenizer_name: Annotated[
+    str | None,
+    cyclopts.Parameter(
+        alias="--tokenizer",
+        help="Tokenizer name or path (overrides model name for tokenizer loading). "
+        "Useful when the serving model path differs from the tokenizer, e.g. "
+        "quantized checkpoints or container-local paths.",
+    ),
+] = Field(None, description="Tokenizer name/path override (HF repo ID or local path)")
temperature: float | None = Field(None, description="Sampling temperature")
top_k: int | None = Field(None, description="Top-K sampling")
top_p: float | None = Field(None, description="Top-P (nucleus) sampling")
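For illustration, a hedged sketch of how the new field is meant to be populated; the checkpoint path below is hypothetical, while the field names come from the schema above:

```python
from inference_endpoint.config.schema import ModelParams

# Serving name is a container-local path (not a valid HF repo ID), so the
# tokenizer is pinned to the upstream HF repo instead.
params = ModelParams(
    name="/models/qwen3-vl-235b-nvfp4",  # hypothetical local checkpoint path
    tokenizer_name="Qwen/Qwen3-VL-235B-A22B-Instruct",
)

# execute.py then resolves the probe source as:
tokenizer_source = params.tokenizer_name or params.name
assert tokenizer_source == "Qwen/Qwen3-VL-235B-A22B-Instruct"
```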
8 changes: 7 additions & 1 deletion src/inference_endpoint/load_generator/session.py
@@ -191,8 +191,14 @@ def issue(self, sample_index: int) -> str | None:
prompt_data: PromptData
if isinstance(data, dict):
token_ids = data.get("input_tokens") or data.get("token_ids")
+# Multimodal datasets store ``prompt`` as a list of OpenAI content
+# parts (e.g. [{"type": "text", ...}, {"type": "image_url", ...}])
+# which the HTTP adapter handles directly. PromptData.text is only
+# meaningful for ISL on text-only prompts, so coerce non-strings
+# to None and rely on token_ids when the dataset pre-tokenizes.
+prompt = data.get("prompt")
prompt_data = PromptData(
-    text=data.get("prompt"),
+    text=prompt if isinstance(prompt, str) else None,
token_ids=tuple(token_ids) if token_ids is not None else None,
)
else:
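The coercion is small enough to check in isolation. A self-contained sketch; this PromptData is redefined with just the two fields the diff touches (the real class lives elsewhere in the package) so the example runs standalone:

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class PromptData:
    text: str | None
    token_ids: tuple[int, ...] | None


def build_prompt_data(data: dict) -> PromptData:
    token_ids = data.get("input_tokens") or data.get("token_ids")
    prompt = data.get("prompt")
    return PromptData(
        # Only plain-string prompts are meaningful for input-sequence-length
        # accounting; multimodal content-part lists are handled by the HTTP
        # adapter elsewhere and are coerced to None here.
        text=prompt if isinstance(prompt, str) else None,
        token_ids=tuple(token_ids) if token_ids is not None else None,
    )


# Text-only sample keeps its text.
assert build_prompt_data({"prompt": "hello"}).text == "hello"
# Multimodal sample (list of OpenAI content parts) coerces to None.
assert build_prompt_data({"prompt": [{"type": "text", "text": "hi"}]}).text is None
```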
36 changes: 26 additions & 10 deletions src/inference_endpoint/openai/types.py
@@ -108,21 +108,30 @@ class ChatCompletionRequest(
class ChatCompletionResponseMessage(
msgspec.Struct, frozen=True, kw_only=True, omit_defaults=True, gc=False
): # type: ignore[call-arg]
"""Response message from OpenAI."""
"""Response message from OpenAI.

``content`` and ``refusal`` are nullable per the OpenAI spec and vLLM
routinely omits them (e.g. when the model returns no text or no refusal
block), so they default to ``None`` to allow successful decoding.
"""

role: str
-content: str | None
-refusal: str | None
+content: str | None = None
+refusal: str | None = None


class ChatCompletionChoice(
msgspec.Struct, frozen=True, kw_only=True, omit_defaults=True, gc=False
): # type: ignore[call-arg]
"""A single choice in the completion response."""
"""A single choice in the completion response.

``finish_reason`` may be omitted in non-final SSE chunks; default to
``None`` so decoding intermediate frames does not fail.
"""

index: int
message: ChatCompletionResponseMessage
-finish_reason: str | None
+finish_reason: str | None = None


class CompletionUsage(
@@ -142,12 +151,19 @@ class ChatCompletionResponse(
omit_defaults=False,
gc=False,
): # type: ignore[call-arg]
"""OpenAI chat completion response."""
"""OpenAI chat completion response.

Most servers (vLLM, Dynamo, etc.) legitimately omit a number of these
fields — e.g. ``usage`` is only emitted on the final SSE chunk,
``system_fingerprint`` is rarely populated, and ``created``/``model``
can be missing in some response variants. All of these get safe
defaults so the decoder accepts whatever the server sends.
"""

id: str
object: str = "chat.completion"
-created: int
-model: str
+created: int = 0
+model: str = ""
choices: list[ChatCompletionChoice]
-usage: CompletionUsage | None
-system_fingerprint: str | None
+usage: CompletionUsage | None = None
+system_fingerprint: str | None = None
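To see the relaxed defaults in action, a minimal decode check; the payload is a fabricated but typical vLLM-style final message that omits usage, system_fingerprint, created, and model, and the import path follows the file header above:

```python
import msgspec

from inference_endpoint.openai.types import ChatCompletionResponse

# A server response missing every field that now carries a default.
payload = (
    b'{"id": "chatcmpl-1", "choices": [{"index": 0,'
    b' "message": {"role": "assistant", "content": "hi"}}]}'
)

resp = msgspec.json.decode(payload, type=ChatCompletionResponse)
assert resp.usage is None  # defaulted, not a decode error
assert resp.created == 0 and resp.model == ""
assert resp.choices[0].finish_reason is None
```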