From 898621da245628e2e573bc75402aac041d484edf Mon Sep 17 00:00:00 2001
From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
Date: Fri, 1 May 2026 08:24:48 -0700
Subject: [PATCH 1/5] Fixes for DSV4 example
Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
---
.../metrics_aggregator/token_metrics.py | 18 ++-
.../commands/benchmark/execute.py | 71 +++++++----
src/inference_endpoint/config/schema.py | 6 +
src/inference_endpoint/core/types.py | 8 +-
.../predefined/aime25/presets.py | 35 ++++++
.../dataset_manager/transforms.py | 7 +-
.../evaluation/extractor.py | 117 +++++++++++++++++-
src/inference_endpoint/openai/accumulator.py | 13 +-
.../openai/openai_msgspec_adapter.py | 2 +
src/inference_endpoint/openai/types.py | 1 +
tests/unit/evaluation/test_extractor.py | 66 +++++++++-
11 files changed, 306 insertions(+), 38 deletions(-)
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
index 56dee33f..546e30c1 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
@@ -22,7 +22,7 @@
from concurrent.futures import ThreadPoolExecutor
from typing import TYPE_CHECKING
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, PreTrainedTokenizerFast
if TYPE_CHECKING:
from transformers import PreTrainedTokenizerBase
@@ -76,9 +76,19 @@ def __init__(self, tokenizer_name: str, n_workers: int) -> None:
def _get_thread_tokenizer(self) -> PreTrainedTokenizerBase:
"""Return the tokenizer for the current thread, loading it if needed."""
if getattr(self._thread_local, "tokenizer", None) is None:
- self._thread_local.tokenizer = AutoTokenizer.from_pretrained(
- self._tokenizer_name
- )
+ try:
+ self._thread_local.tokenizer = AutoTokenizer.from_pretrained(
+ self._tokenizer_name
+ )
+ except Exception:
+ # AutoTokenizer loads config.json to detect the model type; for
+ # models with unknown model_type (e.g. deepseek_v4 in older
+ # transformers) or missing rope config fields, this fails.
+ # Fall back to PreTrainedTokenizerFast which reads only
+ # tokenizer.json / tokenizer_config.json and skips model config.
+ self._thread_local.tokenizer = PreTrainedTokenizerFast.from_pretrained(
+ self._tokenizer_name
+ )
return self._thread_local.tokenizer
def _token_count_worker(self, text: str) -> int:
diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py
index 73c3427f..008e6da7 100644
--- a/src/inference_endpoint/commands/benchmark/execute.py
+++ b/src/inference_endpoint/commands/benchmark/execute.py
@@ -63,8 +63,6 @@
APIType,
BenchmarkConfig,
DatasetType,
- LoadPattern,
- LoadPatternType,
StreamingMode,
TestMode,
TestType,
@@ -302,7 +300,15 @@ def setup_benchmark(config: BenchmarkConfig, test_mode: TestMode) -> BenchmarkCo
# Tokenizer check (light API call, no download)
model_name = config.model_params.name
- tokenizer_name = model_name if _check_tokenizer_exists(model_name) else None
+ tokenizer_override = config.model_params.tokenizer_name
+ tokenizer_name: str | None
+ if tokenizer_override:
+ tokenizer_name = tokenizer_override
+ logger.info(
+ f"Tokenizer available for model: {model_name} (override: {tokenizer_override})"
+ )
+ else:
+ tokenizer_name = model_name if _check_tokenizer_exists(model_name) else None
# Streaming
logger.info(
@@ -368,7 +374,7 @@ def _build_phases(ctx: BenchmarkContext) -> list[PhaseConfig]:
min_sample_count=acc_ds.num_samples() * acc_ds.repeats,
rng_sched=ctx.rt_settings.rng_sched,
rng_sample_index=ctx.rt_settings.rng_sample_index,
- load_pattern=LoadPattern(type=LoadPatternType.MAX_THROUGHPUT),
+ load_pattern=ctx.rt_settings.load_pattern,
)
phases.append(
PhaseConfig(eval_cfg.dataset_name, acc_settings, acc_ds, PhaseType.ACCURACY)
@@ -649,27 +655,48 @@ def finalize_benchmark(ctx: BenchmarkContext, bench: BenchmarkResult) -> None:
# Write scoring artifacts + copy event log from tmpfs to disk
_write_scoring_artifacts(ctx, result, bench.tmpfs_dir)
- # Accuracy scoring
+ # Accuracy scoring — continue past per-scorer failures so partial results are saved
accuracy_scores: dict[str, Any] = {}
+ scoring_failed = False
for eval_cfg in ctx.eval_configs:
- scorer_instance = eval_cfg.scorer(
- eval_cfg.dataset_name,
- eval_cfg.dataset,
- eval_cfg.report_dir,
- extractor=eval_cfg.extractor,
- ground_truth_column=eval_cfg.ground_truth_column,
+ try:
+ scorer_instance = eval_cfg.scorer(
+ eval_cfg.dataset_name,
+ eval_cfg.dataset,
+ eval_cfg.report_dir,
+ extractor=eval_cfg.extractor,
+ ground_truth_column=eval_cfg.ground_truth_column,
+ )
+ score, n_repeats = scorer_instance.score()
+ assert eval_cfg.dataset.data is not None
+ accuracy_scores[eval_cfg.dataset_name] = {
+ "dataset_name": eval_cfg.dataset_name,
+ "num_samples": len(eval_cfg.dataset.data),
+ "extractor": eval_cfg.extractor.__name__,
+ "ground_truth_column": eval_cfg.ground_truth_column,
+ "score": score,
+ "n_repeats": n_repeats,
+ }
+ logger.info(
+ f"Score for {eval_cfg.dataset_name}: {score} ({n_repeats} repeats)"
+ )
+ except Exception as e:
+ scoring_failed = True
+ logger.error(f"Scoring failed for {eval_cfg.dataset_name}: {e}")
+ assert eval_cfg.dataset.data is not None
+ accuracy_scores[eval_cfg.dataset_name] = {
+ "dataset_name": eval_cfg.dataset_name,
+ "num_samples": len(eval_cfg.dataset.data),
+ "extractor": eval_cfg.extractor.__name__,
+ "ground_truth_column": eval_cfg.ground_truth_column,
+ "score": None,
+ "error": str(e),
+ }
+
+ if scoring_failed:
+ logger.warning(
+ "One or more accuracy scorers failed — partial accuracy results saved"
)
- score, n_repeats = scorer_instance.score()
- assert eval_cfg.dataset.data is not None
- accuracy_scores[eval_cfg.dataset_name] = {
- "dataset_name": eval_cfg.dataset_name,
- "num_samples": len(eval_cfg.dataset.data),
- "extractor": eval_cfg.extractor.__name__,
- "ground_truth_column": eval_cfg.ground_truth_column,
- "score": score,
- "n_repeats": n_repeats,
- }
- logger.info(f"Score for {eval_cfg.dataset_name}: {score} ({n_repeats} repeats)")
# Report metrics: prefer Report from KVStore, fall back to SessionResult
if report is not None and report.duration_ns is not None:
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
index 6a1884b4..eba20708 100644
--- a/src/inference_endpoint/config/schema.py
+++ b/src/inference_endpoint/config/schema.py
@@ -196,6 +196,12 @@ class ModelParams(BaseModel):
StreamingMode,
cyclopts.Parameter(alias="--streaming", help="Streaming mode: auto/on/off"),
] = StreamingMode.AUTO
+ tokenizer_name: str | None = Field(
+ None,
+ description="Local tokenizer path override. Use when AutoTokenizer.from_pretrained "
+ "fails for the HF model name (e.g. transformers ≥5.4 rope_theta regression "
+ "for DeepSeek-V4). Defaults to the model name if unset.",
+ )
class SubmissionReference(BaseModel):
diff --git a/src/inference_endpoint/core/types.py b/src/inference_endpoint/core/types.py
index accd2ca8..848ba61a 100644
--- a/src/inference_endpoint/core/types.py
+++ b/src/inference_endpoint/core/types.py
@@ -133,9 +133,13 @@ def text_after_first_chunk(self) -> str:
"""
parts: list[str] = []
if self.reasoning:
- if isinstance(self.reasoning, tuple) and len(self.reasoning) > 1:
+ if isinstance(self.reasoning, str):
+ # str reasoning is the fully joined streaming trace — include it
+ # in the TPOT denominator. Over-counts by one token (the first
+ # token is not excluded), but the error is negligible in practice.
+ parts.append(self.reasoning)
+ elif isinstance(self.reasoning, tuple) and len(self.reasoning) > 1:
parts.extend(self.reasoning[1:])
- # str reasoning: single chunk, skip entirely (it IS the first chunk)
if self.output:
if isinstance(self.output, str):
# Non-streaming: if reasoning was present and was the first chunk,
diff --git a/src/inference_endpoint/dataset_manager/predefined/aime25/presets.py b/src/inference_endpoint/dataset_manager/predefined/aime25/presets.py
index 6d9adf43..75c1ef61 100644
--- a/src/inference_endpoint/dataset_manager/predefined/aime25/presets.py
+++ b/src/inference_endpoint/dataset_manager/predefined/aime25/presets.py
@@ -17,6 +17,7 @@
"""Preset transforms for the AIME25 dataset."""
from inference_endpoint.dataset_manager.transforms import (
+ AddStaticColumns,
Transform,
UserPromptFormatter,
)
@@ -27,4 +28,38 @@ def gptoss() -> list[Transform]:
UserPromptFormatter(
user_prompt_format="{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.",
),
+ # Enable DeepSeek thinking mode so the model uses chain-of-thought reasoning.
+ # vLLM's reasoning_parser strips <think>...</think> tokens into reasoning_content;
+ # the final boxed answer ends up in content where boxed_math_extractor finds it.
+ AddStaticColumns({"chat_template_kwargs": {"thinking": True}}),
+ ]
+
+
+def gptoss_budget() -> list[Transform]:
+ return [
+ UserPromptFormatter(
+ user_prompt_format="{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.",
+ ),
+ # Same as gptoss but caps thinking at 8192 tokens via budget_tokens so the model
+ # is forced to emit a final answer rather than consuming all max_new_tokens in
+ # the reasoning phase (observed issue: 85% of responses had empty answer text).
+ AddStaticColumns({"chat_template_kwargs": {"thinking": True, "budget_tokens": 8192}}),
+ ]
+
+
+def gptoss_budget_20k() -> list[Transform]:
+ return [
+ UserPromptFormatter(
+ user_prompt_format="{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.",
+ ),
+ AddStaticColumns({"chat_template_kwargs": {"thinking": True, "budget_tokens": 20000}}),
+ ]
+
+
+def gptoss_budget_20k_pre() -> list[Transform]:
+ return [
+ UserPromptFormatter(
+ user_prompt_format="Please reason step by step, and put your final answer within \\boxed{{}}.\n\n{question}",
+ ),
+ AddStaticColumns({"chat_template_kwargs": {"thinking": True, "budget_tokens": 20000}}),
]
diff --git a/src/inference_endpoint/dataset_manager/transforms.py b/src/inference_endpoint/dataset_manager/transforms.py
index 79133796..2e1e7683 100644
--- a/src/inference_endpoint/dataset_manager/transforms.py
+++ b/src/inference_endpoint/dataset_manager/transforms.py
@@ -123,7 +123,12 @@ def __init__(self, data: dict[str, Any]):
def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
"""Add the static columns to the row."""
for key, value in self.data.items():
- df[key] = value
+ # Wrap dict/list values in a list so pandas doesn't try to align
+ # on index keys (e.g. {"thinking": True} would produce NaN otherwise).
+ if isinstance(value, (dict, list)):
+ df[key] = [value] * len(df)
+ else:
+ df[key] = value
return df
diff --git a/src/inference_endpoint/evaluation/extractor.py b/src/inference_endpoint/evaluation/extractor.py
index 99d07db4..cbace9cf 100644
--- a/src/inference_endpoint/evaluation/extractor.py
+++ b/src/inference_endpoint/evaluation/extractor.py
@@ -103,7 +103,7 @@ class ABCDExtractor(Extractor, extractor_id="abcd_extractor"):
Returns:
"choice" key (see GQPA dataset columns) or empty string if no answer is found.
Examples:
- >>> ABCDExtractor.extract("The answer is B")
+ >>> ABCDExtractor.extract("Answer: B")
'choice2'
>>> ABCDExtractor.extract("**Answer:** C")
'choice3'
@@ -220,6 +220,121 @@ def extract(cls, text: str, default: str | None = None) -> str | None:
return default if default is not None else ""
+class LetterExtractor(Extractor, extractor_id="letter_extractor"):
+ """Extract MCQ answer letter (A–J) from response text, returning the letter directly.
+
+ Like ABCDExtractor but returns the raw letter ("A", "B", … "J") instead of
+ mapping to "choice1"–"choice4". Supports datasets with up to ten answer
+ options (e.g. MMLU-Pro A–J) where ground-truth labels are stored as the
+ letter itself.
+
+ Examples:
+ >>> LetterExtractor.extract("Answer: B")
+ 'B'
+ >>> LetterExtractor.extract("**Answer:** G")
+ 'G'
+ >>> LetterExtractor.extract("\\\\boxed{E}")
+ 'E'
+ """
+
+ LETTERS = frozenset("ABCDEFGHIJ")
+
+ PATTERNS = [
+ # 0) **Answer:** A or *Answers* – B
+ re.compile(
+ r"""(?ix)
+ (?:\*{1,2}|_{1,2})
+ Answer[s]?
+ \s*[:\-–]?
+ (?:\*{1,2}|_{1,2})
+ \s*
+ ([A-J])\b
+ """,
+ re.X,
+ ),
+ # 0.1) Answer: A (with optional markdown)
+ re.compile(
+ r"""(?ix)
+ ^\s*
+ (?:\*{1,2}|_{1,2})?
+ Answer:?
+ (?:\*{1,2}|_{1,2})?
+ \s*:?\s*
+ (?:\*{1,2}|_{1,2})?
+ ([A-J])
+ (?:\*{1,2}|_{1,2})?
+ \s*
+ """,
+ re.MULTILINE,
+ ),
+ # 1) Answer: (C)
+ re.compile(r"(?ix)\bAnswer[s]?\b\s*[:\-–]?\s*\(\s*([A-J])\s*\)"),
+ # 2) Answer: C
+ re.compile(r"(?ix)\bAnswer[s]?\b\s*[:\-–]?\s*([A-J])\b"),
+ # 3) Option B or Choice: C
+ re.compile(r"(?ix)\b(?:Option|Choice)\b\s*[:\-–]?\s*([A-J])\b"),
+ # 7) \boxed{A}
+ re.compile(r"(?x)\\boxed\{[^}]*?([A-J])[^}]*\}", re.MULTILINE),
+ # 7.5) \boxed{\textbf{C}}
+ re.compile(
+ r"(?x)\\boxed\{[^}]*?\\textbf\{[^}]*?([A-J])[^}]*\}[^}]*\}", re.MULTILINE
+ ),
+ # 7.51) \boxed{\text{C}}
+ re.compile(
+ r"(?x)\\boxed\{[^}]*?\\text\{[^}]*?([A-J])[^}]*\}[^}]*\}", re.MULTILINE
+ ),
+ # 4) bare singletons: (A) [B]
+ re.compile(r"(?x)(? str | None:
+ matches = []
+ for prio, pat in enumerate(cls.PATTERNS):
+ m = pat.search(text)
+ if m:
+ letter = m.group(1).upper()
+ if letter in cls.LETTERS:
+ matches.append((prio, m, letter))
+
+ matches.sort(key=lambda triple: (triple[0], len(triple[1].group(0))))
+
+ for _, _, letter in matches:
+ return letter
+
+ stripped = text.removeprefix("**")
+ if stripped and stripped[0].upper() in cls.LETTERS:
+ return stripped[0].upper()
+
+ return default if default is not None else ""
+
+
class BoxedMathExtractor(Extractor, extractor_id="boxed_math_extractor"):
"""Extract boxed math answer from response text.
Based on OpenAI's extract_boxed_math function from GPT-OSS.
diff --git a/src/inference_endpoint/openai/accumulator.py b/src/inference_endpoint/openai/accumulator.py
index 6cb23ed8..c78fd922 100644
--- a/src/inference_endpoint/openai/accumulator.py
+++ b/src/inference_endpoint/openai/accumulator.py
@@ -68,15 +68,14 @@ def add_chunk(self, delta: OpenAISSEDelta) -> StreamChunk | None:
def get_final_output(self) -> QueryResult:
if self.reasoning_chunks:
- # If there are reasoning chunks, then the first chunk received
- # is the first reasoning chunk. The rest of the reasoning chunks,
- # as well as the output chunks can be joined together.
- resp_reasoning: list[str] = [self.reasoning_chunks[0]]
- if len(self.reasoning_chunks) > 1:
- resp_reasoning.append("".join(self.reasoning_chunks[1:]))
+ # All reasoning chunks are joined into a single string so the full
+ # thinking trace is captured as-is in events.jsonl. TPOT still uses
+ # text_after_first_chunk(), which includes string reasoning in the
+ # denominator (off by one token vs. the true "after first chunk"
+ # count, which is negligible).
text_output = TextModelOutput(
output="".join(self.output_chunks),
- reasoning=resp_reasoning,
+ reasoning="".join(self.reasoning_chunks),
)
elif self.output_chunks:
# If there are only output chunks, the first chunk is used for
diff --git a/src/inference_endpoint/openai/openai_msgspec_adapter.py b/src/inference_endpoint/openai/openai_msgspec_adapter.py
index 6106e1bd..16f4bc3b 100644
--- a/src/inference_endpoint/openai/openai_msgspec_adapter.py
+++ b/src/inference_endpoint/openai/openai_msgspec_adapter.py
@@ -82,6 +82,7 @@ def dataset_transforms(cls, model_params: ModelParams) -> list[Transform]:
"logit_bias",
"user",
"chat_template",
+ "chat_template_kwargs",
]
return [
ColumnFilter(
@@ -164,6 +165,7 @@ def to_endpoint_request(cls, query: Query) -> ChatCompletionRequest:
logit_bias=query.data.get("logit_bias"),
user=query.data.get("user"),
chat_template=query.data.get("chat_template"),
+ chat_template_kwargs=query.data.get("chat_template_kwargs"),
)
@classmethod
diff --git a/src/inference_endpoint/openai/types.py b/src/inference_endpoint/openai/types.py
index 036dd172..09b53e88 100644
--- a/src/inference_endpoint/openai/types.py
+++ b/src/inference_endpoint/openai/types.py
@@ -103,6 +103,7 @@ class ChatCompletionRequest(
logit_bias: dict[str, float] | None = None
user: str | None = None
chat_template: str | None = None
+ chat_template_kwargs: dict[str, Any] | None = None
class ChatCompletionResponseMessage(
diff --git a/tests/unit/evaluation/test_extractor.py b/tests/unit/evaluation/test_extractor.py
index 17ad1654..e2db968a 100644
--- a/tests/unit/evaluation/test_extractor.py
+++ b/tests/unit/evaluation/test_extractor.py
@@ -14,9 +14,15 @@
# limitations under the License.
-from inference_endpoint.evaluation.extractor import PythonCodeExtractor
+import pytest
+from inference_endpoint.evaluation.extractor import (
+ ABCDExtractor,
+ LetterExtractor,
+ PythonCodeExtractor,
+)
+@pytest.mark.unit
class TestPythonCodeExtractor:
"""Test cases for PythonCodeExtractor."""
@@ -155,3 +161,61 @@ def test_extractor_get_method(self):
text = "```python\nprint('test')\n```"
result = extractor_cls.extract(text)
assert result == "print('test')"
+
+
+@pytest.mark.unit
+class TestLetterExtractor:
+ """Tests for LetterExtractor — returns raw letter (A–J) for MCQ datasets
+ where ground truth is stored as a letter (e.g. MLPerf GPQA/MMLU-Pro)."""
+
+ def test_answer_colon(self):
+ assert LetterExtractor.extract("Answer: B") == "B"
+
+ def test_markdown_answer(self):
+ assert LetterExtractor.extract("**Answer:** C") == "C"
+
+ def test_answer_colon_d(self):
+ assert LetterExtractor.extract("Answer: D") == "D"
+
+ # Extended range (E–J) for MMLU-Pro
+ def test_letter_e(self):
+ assert LetterExtractor.extract("Answer: E") == "E"
+
+ def test_letter_g(self):
+ assert LetterExtractor.extract("**Answer:** G") == "G"
+
+ def test_letter_j(self):
+ assert LetterExtractor.extract("Answer: J") == "J"
+
+ def test_boxed_letter(self):
+ assert LetterExtractor.extract(r"\boxed{F}") == "F"
+
+ def test_parenthesised_singleton(self):
+ assert LetterExtractor.extract("The correct choice is (H)") == "H"
+
+ def test_mlperf_style_output(self):
+ # Mirrors the few-shot prompt format: model outputs "Answer: X"
+ text = "Let me think... the ring characteristic is 0.\nAnswer: A"
+ assert LetterExtractor.extract(text) == "A"
+
+ def test_no_match_returns_empty_string(self):
+ assert LetterExtractor.extract("No answer here.") == ""
+
+ def test_default_on_no_match(self):
+ assert LetterExtractor.extract("Nothing", default="X") == "X"
+
+ def test_returns_letter_not_choice_key(self):
+ # Confirm it does NOT return "choice1" etc. (ABCDExtractor behaviour)
+ result = LetterExtractor.extract("Answer: B")
+ assert result == "B"
+ assert result != "choice2"
+
+ def test_abcd_extractor_unchanged(self):
+ # Confirm ABCDExtractor still maps to choiceN (regression guard)
+ assert ABCDExtractor.extract("Answer: B") == "choice2"
+
+ def test_registered(self):
+ from inference_endpoint.evaluation.extractor import Extractor
+
+ assert "letter_extractor" in Extractor.PREDEFINED
+ assert Extractor.get("letter_extractor") is LetterExtractor
From fa3eb9199cc52b279e46a52b39e6baf8bc7abfdc Mon Sep 17 00:00:00 2001
From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
Date: Fri, 1 May 2026 08:32:20 -0700
Subject: [PATCH 2/5] Add example files.
Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
---
examples/09_DeepSeek-V4-Pro_Example/README.md | 174 ++++++++++++++++++
.../extract_mlperf_subsets.py | 57 ++++++
.../launch_server.sh | 94 ++++++++++
.../vllm_dsv4pro_mlperf_accuracy.yaml | 116 ++++++++++++
4 files changed, 441 insertions(+)
create mode 100644 examples/09_DeepSeek-V4-Pro_Example/README.md
create mode 100644 examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py
create mode 100755 examples/09_DeepSeek-V4-Pro_Example/launch_server.sh
create mode 100644 examples/09_DeepSeek-V4-Pro_Example/vllm_dsv4pro_mlperf_accuracy.yaml
diff --git a/examples/09_DeepSeek-V4-Pro_Example/README.md b/examples/09_DeepSeek-V4-Pro_Example/README.md
new file mode 100644
index 00000000..9ec3ca3f
--- /dev/null
+++ b/examples/09_DeepSeek-V4-Pro_Example/README.md
@@ -0,0 +1,174 @@
+# DeepSeek-V4-Pro Benchmark
+
+End-to-end example for benchmarking `deepseek-ai/DeepSeek-V4-Pro` with vLLM on 8×B200 (or 8×B300), covering performance throughput and accuracy evaluation (AIME 2025, GPQA, and the full MLPerf Inference accuracy suite).
+
+## Hardware
+
+| Requirement | Details |
+|-------------|---------|
+| GPUs | 8× NVIDIA B200 or B300 |
+| System RAM | ≥ 256 GB |
+| Docker image | `vllm/vllm-openai:deepseekv4-cu130` |
+| Startup time | ~22 minutes (weight loading + TileLang kernel compilation) |
+
+The recipe is taken from the [vLLM DeepSeek V4 blog post](https://github.com/vllm-project/vllm-project.github.io/blob/main/_posts/2026-04-24-deepseek-v4.md).
+
+## Environment Setup
+
+```bash
+export MODEL_PATH=/path/to/DeepSeek-V4-Pro # local weight directory
+export HF_HOME=~/.cache/huggingface
+export HF_TOKEN=<your_hf_token> # optional; passed through to the container
+```
+
+## Launching the Server
+
+```bash
+bash examples/09_DeepSeek-V4-Pro_Example/launch_server.sh
+```
+
+The script mounts `$MODEL_PATH` into the container at `/model`, sets
+`VLLM_ENGINE_READY_TIMEOUT_S=3600`, and polls `/health` until the server is ready.
+
+### Why `VLLM_ENGINE_READY_TIMEOUT_S=3600` is required
+
+The default value is 600 s (10 min). Loading DeepSeek-V4-Pro's 64 safetensor shards plus
+compiling TileLang kernels (`mhc_pre_big_fuse_tilelang` etc.) across 8 DP workers takes
+~22 min on 8×B200. With the default timeout the `ApiServer_0` process raises a `TimeoutError`
+and exits — even though all 8 engine workers completed successfully — causing the container to
+crash. Setting the timeout to 3600 s avoids this entirely.
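+
+If startup still fails, it is worth confirming the variable actually reached the container. A quick check (the container ID placeholder below is whatever `docker ps` reports for the vLLM image):
+
+```bash
+docker exec <container_id> env | grep VLLM_ENGINE_READY_TIMEOUT_S
+# expected: VLLM_ENGINE_READY_TIMEOUT_S=3600
+```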
+
+### Key launch flags
+
+| Flag | Purpose |
+|------|---------|
+| `--data-parallel-size 8` | Expert parallelism across 8 GPUs (no TP needed for MoE) |
+| `--enable-expert-parallel` | Required alongside `--data-parallel-size` |
+| `--kv-cache-dtype fp8` | Matches DeepSeek V4's hybrid c4a / c128a KV cache design |
+| `--block-size 256` | Unified 256-token logical block across all compression layers |
+| `--attention_config.use_fp4_indexer_cache=True` | FP4 indexer for ~2x additional KV savings |
+| `--tokenizer-mode deepseek_v4` | Required for the V4 chat template |
+| `--reasoning-parser deepseek_v4` | Strips `<think>…</think>` into `reasoning_content` |
+| `--compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'` | Enables TileLang kernel fusions |
+| `VLLM_ENGINE_READY_TIMEOUT_S=3600` | Prevents premature `ApiServer_0` timeout during startup |
+
+## Performance Benchmark
+
+```bash
+uv run inference-endpoint benchmark from-config \
+ -c examples/09_DeepSeek-V4-Pro_Example/vllm_dsv4pro_perf.yaml
+```
+
+Config: [`vllm_dsv4pro_perf.yaml`](vllm_dsv4pro_perf.yaml)
+- 2-minute minimum run at concurrency 32
+- Metrics: throughput, latency, TTFT, TPOT
+
+## Accuracy Benchmark (AIME 2025 + GPQA)
+
+```bash
+uv run inference-endpoint benchmark from-config \
+ -c examples/09_DeepSeek-V4-Pro_Example/vllm_dsv4pro_accuracy.yaml
+```
+
+Config: [`vllm_dsv4pro_accuracy.yaml`](vllm_dsv4pro_accuracy.yaml)
+
+| Dataset | Samples | Repeats | Extractor | Scorer |
+|---------|---------|---------|-----------|--------|
+| AIME 2025 | 30 | 8 | `boxed_math_extractor` | `pass_at_1` |
+| GPQA Diamond | 198 | 5 | `abcd_extractor` | `pass_at_1` |
+
+### Concurrency note
+
+`target_concurrency: 4` is intentional. With `max_model_len=65536` and `max_new_tokens=32768`,
+each in-flight request can occupy up to 32k tokens of KV cache. Four concurrent requests
+fit within the fp8 KV cache budget without preemption on 8×B200.
+
+### Thinking mode and `budget_tokens`
+
+The `aime25::gptoss_budget_20k` preset enables DeepSeek's thinking mode
+(`chat_template_kwargs: {thinking: True, budget_tokens: 20000}`). Without `budget_tokens`,
+the model can spend all 32k tokens in the `<think>` block and return an empty boxed answer —
+observed on ~85% of responses in early testing. Setting `budget_tokens=20000` caps the
+reasoning phase and forces a final answer.
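+
+To sanity-check that the server honours these kwargs outside the harness, a request can be sent directly to the vLLM endpoint. The payload below is an illustrative sketch (prompt and token values are arbitrary), not the exact request the benchmark issues:
+
+```bash
+curl -s http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+        "model": "deepseek-ai/DeepSeek-V4-Pro",
+        "messages": [{"role": "user", "content": "What is 2+2? Put your final answer within \\boxed{}."}],
+        "max_tokens": 2048,
+        "chat_template_kwargs": {"thinking": true, "budget_tokens": 1024}
+      }'
+```
+
+With `--reasoning-parser deepseek_v4` active, the response message should carry the thinking trace in `reasoning_content` and the boxed answer in `content`.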
+
+### Measured results (8×B200, `deepseekv4-cu130`)
+
+| Dataset | Score |
+|---------|-------|
+| AIME 2025 pass@1 | **55.4%** (8 repeats, budget_tokens=20000) |
+
+## MLPerf Inference Accuracy Suite
+
+The MLPerf DeepSeek-R1 accuracy check uses 5 sub-datasets (4388 total samples):
+
+| Sub-dataset | Samples | Metric | File |
+|-------------|---------|--------|------|
+| AIME 1983 | 932 | exact_match | `mlperf_deepseek_r1_math_accuracy.parquet` |
+| MATH-500 | 499 | exact_match | `mlperf_deepseek_r1_math_accuracy.parquet` |
+| GPQA | 198 | exact_match | `mlperf_deepseek_r1_mcq_accuracy.parquet` |
+| MMLU-Pro | 2410 | exact_match | extracted by `extract_mlperf_subsets.py` |
+| LiveCodeBench | 349 | code_execute_verify | extracted by `extract_mlperf_subsets.py` |
+
+**Golden accuracy (fp32):** `exact_match = 81.3582%`, `TOKENS_PER_SAMPLE = 3886.2`
+**MLPerf pass threshold:** ≥ 80.52% exact_match (99% of golden), tokens within ±10%
+
+### Step 1 — Extract missing subsets
+
+```bash
+uv run python examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py
+```
+
+This writes:
+- `datasets/deepseek/mlperf_deepseek_r1_mmlu_pro_accuracy.parquet`
+- `datasets/deepseek/mlperf_deepseek_r1_livecodebench_accuracy.parquet`
+
+### Step 2 — Run math + MCQ accuracy
+
+Once Step 1 has produced the MMLU-Pro parquet, run the math + MCQ datasets (all enabled by default in [`vllm_dsv4pro_mlperf_accuracy.yaml`](vllm_dsv4pro_mlperf_accuracy.yaml)):
+
+```bash
+uv run inference-endpoint benchmark from-config \
+ -c examples/09_DeepSeek-V4-Pro_Example/vllm_dsv4pro_mlperf_accuracy.yaml
+```
+
+### Step 3 — Run LiveCodeBench accuracy
+
+LiveCodeBench requires the `lcb-service` container (executes generated Python code in an
+isolated environment). See the
+[LiveCodeBench README](../../src/inference_endpoint/evaluation/livecodebench/README.md) for
+container setup. Once running on port 13835, uncomment the `mlperf-livecodebench` dataset in
+`vllm_dsv4pro_mlperf_accuracy.yaml` and re-run.
+
+## Troubleshooting
+
+**Container exits immediately or health check never passes**
+
+```bash
+docker logs <container_id> | tail -40
+```
+
+Common causes:
+- `TimeoutError: Timed out waiting for engine core processes to start` — set `VLLM_ENGINE_READY_TIMEOUT_S=3600` (already set in `launch_server.sh`)
+- OOM during weight loading — verify `--max-model-len` is not too large for available GPU memory
+- `MODEL_PATH` not mounted correctly — check that `/model/config.json` exists inside the container
+
+**`At least one performance dataset required`**
+
+Every benchmark config must include at least one `type: performance` dataset entry, even for
+accuracy-only runs. Use the perf-warmup entry with `n_samples_to_issue: 1`.
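+
+For reference, the warmup entry used by [`vllm_dsv4pro_mlperf_accuracy.yaml`](vllm_dsv4pro_mlperf_accuracy.yaml) looks like this:
+
+```yaml
+datasets:
+  - name: perf-warmup
+    type: performance
+    path: datasets/deepseek/mlperf_deepseek_r1_dataset_4388_fp8_eval.parquet
+    parser:
+      prompt: templated_text_input
+
+settings:
+  runtime:
+    n_samples_to_issue: 1   # keeps the performance phase to a single request
+```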
+
+**Empty boxed answers / low AIME accuracy**
+
+The model exhausted `max_new_tokens` in the thinking phase. Add `budget_tokens` to the preset:
+
+```yaml
+- name: aime25::gptoss_budget_20k # uses budget_tokens=20000
+```
+
+**`uv: cannot execute binary file: Exec format error`**
+
+The `uv` binary in `~/.local/bin/uv` has the wrong architecture. Use the venv directly:
+
+```bash
+.venv/bin/inference-endpoint benchmark from-config -c <config.yaml>
+```
diff --git a/examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py b/examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py
new file mode 100644
index 00000000..6e945dfa
--- /dev/null
+++ b/examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py
@@ -0,0 +1,57 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Extract MMLU-Pro and LiveCodeBench subsets from the combined MLPerf accuracy parquet.
+
+The combined parquet (mlperf_deepseek_r1_dataset_4388_fp8_eval_accuracy.parquet) contains
+5 sub-datasets identified by the 'dataset' column. Pre-split files already exist for Math
+and GPQA; this script extracts the remaining two:
+
+ datasets/deepseek/mlperf_deepseek_r1_mmlu_pro_accuracy.parquet (2410 rows)
+ datasets/deepseek/mlperf_deepseek_r1_livecodebench_accuracy.parquet (349 rows)
+
+Usage:
+ uv run python examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py
+"""
+
+from pathlib import Path
+
+import pandas as pd
+
+SRC = Path("datasets/deepseek/mlperf_deepseek_r1_dataset_4388_fp8_eval_accuracy.parquet")
+OUT_DIR = Path("datasets/deepseek")
+
+SUBSETS = {
+ "mmlu_pro": "mlperf_deepseek_r1_mmlu_pro_accuracy.parquet",
+ "livecodebench": "mlperf_deepseek_r1_livecodebench_accuracy.parquet",
+}
+
+
+def main() -> None:
+ df = pd.read_parquet(SRC)
+ print(f"Loaded {len(df)} rows from {SRC}")
+ print("Sub-dataset breakdown:")
+ print(df.groupby(["dataset", "metric"]).size().to_string())
+ print()
+
+ for dataset_name, out_filename in SUBSETS.items():
+ subset = df[df["dataset"] == dataset_name].reset_index(drop=True)
+ out_path = OUT_DIR / out_filename
+ subset.to_parquet(out_path, index=False)
+ print(f"Wrote {len(subset)} rows → {out_path}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/09_DeepSeek-V4-Pro_Example/launch_server.sh b/examples/09_DeepSeek-V4-Pro_Example/launch_server.sh
new file mode 100755
index 00000000..ac141c2a
--- /dev/null
+++ b/examples/09_DeepSeek-V4-Pro_Example/launch_server.sh
@@ -0,0 +1,94 @@
+#!/usr/bin/env bash
+# Launch DeepSeek-V4-Pro with vLLM on 8×B200 / 8×B300.
+#
+# Key flags vs. a standard vLLM launch:
+# --data-parallel-size 8 Expert parallelism across 8 GPUs (no TP)
+# --enable-expert-parallel Required for MoE data-parallel dispatch
+# --kv-cache-dtype fp8 DeepSeek V4's hybrid KV cache (c4a/c128a)
+# --block-size 256 Matches the 256-native-token logical block size
+# --attention_config.use_fp4_indexer_cache=True FP4 indexer for 2x KV savings
+# --tokenizer-mode deepseek_v4 Custom tokenizer for V4 chat template
+# --reasoning-parser deepseek_v4 Strips <think>…</think> into reasoning_content
+# --compilation-config … FULL_AND_PIECEWISE cudagraph + all custom fusions
+#
+# Startup time note:
+# Model weight loading (64 shards) + TileLang kernel compilation takes ~22 min
+# on 8×B200. The default VLLM_ENGINE_READY_TIMEOUT_S=600 (10 min) is too short
+# and will crash the API server with a TimeoutError even though the workers are
+# fine. Always set VLLM_ENGINE_READY_TIMEOUT_S=3600 for this model.
+
+set -euo pipefail
+
+: "${MODEL_PATH:?Set MODEL_PATH to the directory containing the DeepSeek-V4-Pro weights}"
+PORT="${PORT:-8000}"
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-65536}"
+
+if [[ ! -d "${MODEL_PATH}" ]]; then
+ echo "ERROR: MODEL_PATH=${MODEL_PATH} does not exist."
+ echo "Set MODEL_PATH to the directory containing the DeepSeek-V4-Pro weights."
+ exit 1
+fi
+
+echo "Launching DeepSeek-V4-Pro on port ${PORT} (model: ${MODEL_PATH})"
+echo "Startup takes ~22 minutes for weight loading + TileLang kernel compilation."
+echo ""
+
+CONTAINER_ID=$(docker run -d \
+ --gpus all \
+ --shm-size 32g \
+ --net host \
+ --ipc host \
+ -v "${MODEL_PATH}:/model" \
+ -v "${HF_HOME:-${HOME}/.cache/huggingface}:/root/.cache/huggingface" \
+ --env HF_TOKEN="${HF_TOKEN:-}" \
+ --env VLLM_WORKER_MULTIPROC_METHOD=spawn \
+ --env VLLM_ENGINE_READY_TIMEOUT_S=3600 \
+ vllm/vllm-openai:deepseekv4-cu130 \
+ --model /model \
+ --served-model-name deepseek-ai/DeepSeek-V4-Pro \
+ --trust-remote-code \
+ --kv-cache-dtype fp8 \
+ --block-size 256 \
+ --enable-expert-parallel \
+ --data-parallel-size 8 \
+ --max-model-len "${MAX_MODEL_LEN}" \
+ --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \
+ --attention_config.use_fp4_indexer_cache=True \
+ --tokenizer-mode deepseek_v4 \
+ --tool-call-parser deepseek_v4 \
+ --enable-auto-tool-choice \
+ --reasoning-parser deepseek_v4 \
+ --disable-log-stats \
+ --disable-uvicorn-access-log \
+ --port "${PORT}")
+
+echo "Container started: ${CONTAINER_ID:0:12}"
+echo ""
+echo "Polling http://localhost:${PORT}/health ..."
+
+TIMEOUT=2400
+START=$(date +%s)
+while true; do
+ if curl -sf "http://localhost:${PORT}/health" > /dev/null 2>&1; then
+ ELAPSED=$(( $(date +%s) - START ))
+ echo "Server healthy after ${ELAPSED}s. Ready to benchmark."
+ break
+ fi
+ ELAPSED=$(( $(date +%s) - START ))
+ if [[ ${ELAPSED} -ge ${TIMEOUT} ]]; then
+ echo "ERROR: server not healthy after ${TIMEOUT}s"
+ docker logs "${CONTAINER_ID}" | tail -40
+ exit 1
+ fi
+ if [[ "$(docker inspect -f '{{.State.Running}}' "${CONTAINER_ID}" 2>/dev/null)" != "true" ]]; then
+ echo "ERROR: container exited unexpectedly"
+ docker logs "${CONTAINER_ID}" | tail -40
+ exit 1
+ fi
+ echo " Waiting... (${ELAPSED}s)"
+ sleep 15
+done
+
+echo ""
+echo "Container ID : ${CONTAINER_ID:0:12}"
+echo "Stop with : docker stop ${CONTAINER_ID:0:12}"
diff --git a/examples/09_DeepSeek-V4-Pro_Example/vllm_dsv4pro_mlperf_accuracy.yaml b/examples/09_DeepSeek-V4-Pro_Example/vllm_dsv4pro_mlperf_accuracy.yaml
new file mode 100644
index 00000000..f4d29d4e
--- /dev/null
+++ b/examples/09_DeepSeek-V4-Pro_Example/vllm_dsv4pro_mlperf_accuracy.yaml
@@ -0,0 +1,116 @@
+name: dsv4pro-vllm-b200x8-mlperf-accuracy
+version: "1.0"
+type: online
+timeout: 600.0
+
+# MLPerf Inference accuracy evaluation for DeepSeek-R1 / DeepSeek-V4-Pro.
+#
+# Runs the 4 sub-datasets used in the MLPerf submission accuracy check:
+# - Math (AIME1983 + MATH500) 1431 samples boxed_math_extractor
+# - GPQA 198 samples abcd_extractor
+# - MMLU-Pro 2410 samples abcd_extractor
+# - LiveCodeBench (commented) 349 samples python_code_extractor
+#
+# Golden accuracy (fp32 reference): exact_match = 81.3582%
+# MLPerf pass threshold: exact_match ≥ 80.52% (99% of golden)
+#
+# MMLU-Pro and LiveCodeBench must be extracted from the combined parquet first:
+# python examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py
+
+model_params:
+ name: deepseek-ai/DeepSeek-V4-Pro
+ temperature: 1.0
+ max_new_tokens: 32768
+ top_p: 1.0
+ streaming: "on"
+ # tokenizer_name: /path/to/local/weights # optional: set if weights are not in HF cache
+
+datasets:
+ - name: perf-warmup
+ type: performance
+ path: datasets/deepseek/mlperf_deepseek_r1_dataset_4388_fp8_eval.parquet
+ parser:
+ prompt: templated_text_input
+
+ # AIME1983 + MATH500 (1431 samples). Prompts are already formatted as
+ # templated_text_input — no parser transform needed.
+ - name: mlperf-math
+ type: accuracy
+ path: datasets/deepseek/mlperf_deepseek_r1_math_accuracy.parquet
+ parser:
+ prompt: templated_text_input
+ accuracy_config:
+ eval_method: pass_at_1
+ ground_truth: ground_truth
+ extractor: boxed_math_extractor
+ num_repeats: 1
+
+ # GPQA (198 samples). Ground truth is stored as raw letters ("A"–"D") in
+ # this parquet, so letter_extractor is required (abcd_extractor would return
+ # "choice1"–"choice4" and score 0% against letter ground truth).
+ - name: mlperf-gpqa
+ type: accuracy
+ path: datasets/deepseek/mlperf_deepseek_r1_mcq_accuracy.parquet
+ parser:
+ prompt: templated_text_input
+ accuracy_config:
+ eval_method: pass_at_1
+ ground_truth: ground_truth
+ extractor: letter_extractor
+ num_repeats: 1
+
+ # MMLU-Pro (2410 samples) — extracted from the combined accuracy parquet by
+ # extract_mlperf_subsets.py. Ground truth is "A"–"J" (10 choices); use
+ # letter_extractor which handles the full A–J range.
+ - name: mlperf-mmlu-pro
+ type: accuracy
+ path: datasets/deepseek/mlperf_deepseek_r1_mmlu_pro_accuracy.parquet
+ parser:
+ prompt: templated_text_input
+ accuracy_config:
+ eval_method: pass_at_1
+ ground_truth: ground_truth
+ extractor: letter_extractor
+ num_repeats: 1
+
+ # LiveCodeBench (349 samples) — requires lcb-service container on port 13835.
+ # Uncomment after running extract_mlperf_subsets.py and launching the container.
+ # - name: mlperf-livecodebench
+ # type: accuracy
+ # path: datasets/deepseek/mlperf_deepseek_r1_livecodebench_accuracy.parquet
+ # parser:
+ # prompt: templated_text_input
+ # accuracy_config:
+ # eval_method: code_bench_scorer
+ # ground_truth: ground_truth
+ # extractor: python_code_extractor
+ # num_repeats: 1
+
+settings:
+ runtime:
+ min_duration_ms: 0
+ max_duration_ms: 0
+ n_samples_to_issue: 1
+ scheduler_random_seed: 42
+ dataloader_random_seed: 42
+
+ load_pattern:
+ type: concurrency
+ target_concurrency: 1024
+
+ client:
+ num_workers: -1
+ log_level: INFO
+ warmup_connections: false
+ max_connections: 64
+ worker_initialization_timeout: 120.0
+
+endpoint_config:
+ endpoints:
+ - http://localhost:8000
+ api_key: null
+ api_type: openai
+
+report_dir: results/dsv4pro_vllm_b200x8_mlperf_accuracy
+verbose: false
+enable_cpu_affinity: true
From ab25f965813d5af082322b15b769eca7f9151edd Mon Sep 17 00:00:00 2001
From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
Date: Fri, 1 May 2026 09:15:46 -0700
Subject: [PATCH 3/5] Fix pre-commit
Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
---
examples/09_DeepSeek-V4-Pro_Example/README.md | 63 ++++++++++---------
.../extract_mlperf_subsets.py | 4 +-
.../templates/concurrency_template_full.yaml | 1 +
.../templates/offline_template_full.yaml | 1 +
.../templates/online_template_full.yaml | 1 +
.../predefined/aime25/presets.py | 12 +++-
uv.lock | 8 +--
7 files changed, 52 insertions(+), 38 deletions(-)
diff --git a/examples/09_DeepSeek-V4-Pro_Example/README.md b/examples/09_DeepSeek-V4-Pro_Example/README.md
index 9ec3ca3f..2f55b23d 100644
--- a/examples/09_DeepSeek-V4-Pro_Example/README.md
+++ b/examples/09_DeepSeek-V4-Pro_Example/README.md
@@ -4,11 +4,11 @@ End-to-end example for benchmarking `deepseek-ai/DeepSeek-V4-Pro` with vLLM on 8
## Hardware
-| Requirement | Details |
-|-------------|---------|
-| GPUs | 8× NVIDIA B200 or B300 |
-| System RAM | ≥ 256 GB |
-| Docker image | `vllm/vllm-openai:deepseekv4-cu130` |
+| Requirement | Details |
+| ------------ | ---------------------------------------------------------- |
+| GPUs | 8× NVIDIA B200 or B300 |
+| System RAM | ≥ 256 GB |
+| Docker image | `vllm/vllm-openai:deepseekv4-cu130` |
| Startup time | ~22 minutes (weight loading + TileLang kernel compilation) |
The recipe is taken from the [vLLM DeepSeek V4 blog post](https://github.com/vllm-project/vllm-project.github.io/blob/main/_posts/2026-04-24-deepseek-v4.md).
@@ -40,17 +40,17 @@ crash. Setting the timeout to 3600 s avoids this entirely.
### Key launch flags
-| Flag | Purpose |
-|------|---------|
-| `--data-parallel-size 8` | Expert parallelism across 8 GPUs (no TP needed for MoE) |
-| `--enable-expert-parallel` | Required alongside `--data-parallel-size` |
-| `--kv-cache-dtype fp8` | Matches DeepSeek V4's hybrid c4a / c128a KV cache design |
-| `--block-size 256` | Unified 256-token logical block across all compression layers |
-| `--attention_config.use_fp4_indexer_cache=True` | FP4 indexer for ~2x additional KV savings |
-| `--tokenizer-mode deepseek_v4` | Required for the V4 chat template |
-| `--reasoning-parser deepseek_v4` | Strips `<think>…</think>` into `reasoning_content` |
-| `--compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'` | Enables TileLang kernel fusions |
-| `VLLM_ENGINE_READY_TIMEOUT_S=3600` | Prevents premature `ApiServer_0` timeout during startup |
+| Flag | Purpose |
+| ------------------------------------------------------------------------------------- | ------------------------------------------------------------- |
+| `--data-parallel-size 8` | Expert parallelism across 8 GPUs (no TP needed for MoE) |
+| `--enable-expert-parallel` | Required alongside `--data-parallel-size` |
+| `--kv-cache-dtype fp8` | Matches DeepSeek V4's hybrid c4a / c128a KV cache design |
+| `--block-size 256` | Unified 256-token logical block across all compression layers |
+| `--attention_config.use_fp4_indexer_cache=True` | FP4 indexer for ~2x additional KV savings |
+| `--tokenizer-mode deepseek_v4` | Required for the V4 chat template |
+| `--reasoning-parser deepseek_v4` | Strips `<think>…</think>` into `reasoning_content` |
+| `--compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'` | Enables TileLang kernel fusions |
+| `VLLM_ENGINE_READY_TIMEOUT_S=3600` | Prevents premature `ApiServer_0` timeout during startup |
## Performance Benchmark
@@ -60,6 +60,7 @@ uv run inference-endpoint benchmark from-config \
```
Config: [`vllm_dsv4pro_perf.yaml`](vllm_dsv4pro_perf.yaml)
+
- 2-minute minimum run at concurrency 32
- Metrics: throughput, latency, TTFT, TPOT
@@ -72,10 +73,10 @@ uv run inference-endpoint benchmark from-config \
Config: [`vllm_dsv4pro_accuracy.yaml`](vllm_dsv4pro_accuracy.yaml)
-| Dataset | Samples | Repeats | Extractor | Scorer |
-|---------|---------|---------|-----------|--------|
-| AIME 2025 | 30 | 8 | `boxed_math_extractor` | `pass_at_1` |
-| GPQA Diamond | 198 | 5 | `abcd_extractor` | `pass_at_1` |
+| Dataset | Samples | Repeats | Extractor | Scorer |
+| ------------ | ------- | ------- | ---------------------- | ----------- |
+| AIME 2025 | 30 | 8 | `boxed_math_extractor` | `pass_at_1` |
+| GPQA Diamond | 198 | 5 | `abcd_extractor` | `pass_at_1` |
### Concurrency note
@@ -93,21 +94,21 @@ reasoning phase and forces a final answer.
### Measured results (8×B200, `deepseekv4-cu130`)
-| Dataset | Score |
-|---------|-------|
+| Dataset | Score |
+| ---------------- | ------------------------------------------ |
| AIME 2025 pass@1 | **55.4%** (8 repeats, budget_tokens=20000) |
## MLPerf Inference Accuracy Suite
The MLPerf DeepSeek-R1 accuracy check uses 5 sub-datasets (4388 total samples):
-| Sub-dataset | Samples | Metric | File |
-|-------------|---------|--------|------|
-| AIME 1983 | 932 | exact_match | `mlperf_deepseek_r1_math_accuracy.parquet` |
-| MATH-500 | 499 | exact_match | `mlperf_deepseek_r1_math_accuracy.parquet` |
-| GPQA | 198 | exact_match | `mlperf_deepseek_r1_mcq_accuracy.parquet` |
-| MMLU-Pro | 2410 | exact_match | extracted by `extract_mlperf_subsets.py` |
-| LiveCodeBench | 349 | code_execute_verify | extracted by `extract_mlperf_subsets.py` |
+| Sub-dataset | Samples | Metric | File |
+| ------------- | ------- | ------------------- | ------------------------------------------ |
+| AIME 1983 | 932 | exact_match | `mlperf_deepseek_r1_math_accuracy.parquet` |
+| MATH-500 | 499 | exact_match | `mlperf_deepseek_r1_math_accuracy.parquet` |
+| GPQA | 198 | exact_match | `mlperf_deepseek_r1_mcq_accuracy.parquet` |
+| MMLU-Pro | 2410 | exact_match | extracted by `extract_mlperf_subsets.py` |
+| LiveCodeBench | 349 | code_execute_verify | extracted by `extract_mlperf_subsets.py` |
**Golden accuracy (fp32):** `exact_match = 81.3582%`, `TOKENS_PER_SAMPLE = 3886.2`
**MLPerf pass threshold:** ≥ 80.52% exact_match (99% of golden), tokens within ±10%
@@ -119,6 +120,7 @@ uv run python examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py
```
This writes:
+
- `datasets/deepseek/mlperf_deepseek_r1_mmlu_pro_accuracy.parquet`
- `datasets/deepseek/mlperf_deepseek_r1_livecodebench_accuracy.parquet`
@@ -148,6 +150,7 @@ docker logs | tail -40
```
Common causes:
+
- `TimeoutError: Timed out waiting for engine core processes to start` — set `VLLM_ENGINE_READY_TIMEOUT_S=3600` (already set in `launch_server.sh`)
- OOM during weight loading — verify `--max-model-len` is not too large for available GPU memory
- `MODEL_PATH` not mounted correctly — check that `/model/config.json` exists inside the container
@@ -162,7 +165,7 @@ accuracy-only runs. Use the perf-warmup entry with `n_samples_to_issue: 1`.
The model exhausted `max_new_tokens` in the thinking phase. Add `budget_tokens` to the preset:
```yaml
-- name: aime25::gptoss_budget_20k # uses budget_tokens=20000
+- name: aime25::gptoss_budget_20k # uses budget_tokens=20000
```
**`uv: cannot execute binary file: Exec format error`**
diff --git a/examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py b/examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py
index 6e945dfa..77015879 100644
--- a/examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py
+++ b/examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py
@@ -30,7 +30,9 @@
import pandas as pd
-SRC = Path("datasets/deepseek/mlperf_deepseek_r1_dataset_4388_fp8_eval_accuracy.parquet")
+SRC = Path(
+ "datasets/deepseek/mlperf_deepseek_r1_dataset_4388_fp8_eval_accuracy.parquet"
+)
OUT_DIR = Path("datasets/deepseek")
SUBSETS = {
diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
index 3a8e004f..5cccbacf 100644
--- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml
+++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
@@ -12,6 +12,7 @@ model_params:
max_new_tokens: 1024 # Max output tokens
osl_distribution: null # Output sequence length distribution
streaming: 'on' # Streaming mode: auto/on/off | options: auto, on, off
+ tokenizer_name: null # Local tokenizer path override. Use when AutoTokenizer.from_pretrained fails for the HF model name (e.g. transformers ≥5.4 rope_theta regression for DeepSeek-V4). Defaults to the model name if unset.
datasets: # Dataset configs
- name: perf
type: performance # Dataset purpose: performance or accuracy | options: performance, accuracy
diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml
index faabffde..ce1186c8 100644
--- a/src/inference_endpoint/config/templates/offline_template_full.yaml
+++ b/src/inference_endpoint/config/templates/offline_template_full.yaml
@@ -12,6 +12,7 @@ model_params:
max_new_tokens: 1024 # Max output tokens
osl_distribution: null # Output sequence length distribution
streaming: 'off' # Streaming mode: auto/on/off | options: auto, on, off
+ tokenizer_name: null # Local tokenizer path override. Use when AutoTokenizer.from_pretrained fails for the HF model name (e.g. transformers ≥5.4 rope_theta regression for DeepSeek-V4). Defaults to the model name if unset.
datasets: # Dataset configs
- name: perf
type: performance # Dataset purpose: performance or accuracy | options: performance, accuracy
diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml
index e9b7a673..39ac0dd6 100644
--- a/src/inference_endpoint/config/templates/online_template_full.yaml
+++ b/src/inference_endpoint/config/templates/online_template_full.yaml
@@ -12,6 +12,7 @@ model_params:
max_new_tokens: 1024 # Max output tokens
osl_distribution: null # Output sequence length distribution
streaming: 'on' # Streaming mode: auto/on/off | options: auto, on, off
+ tokenizer_name: null # Local tokenizer path override. Use when AutoTokenizer.from_pretrained fails for the HF model name (e.g. transformers ≥5.4 rope_theta regression for DeepSeek-V4). Defaults to the model name if unset.
datasets: # Dataset configs
- name: perf
type: performance # Dataset purpose: performance or accuracy | options: performance, accuracy
diff --git a/src/inference_endpoint/dataset_manager/predefined/aime25/presets.py b/src/inference_endpoint/dataset_manager/predefined/aime25/presets.py
index 75c1ef61..f60a1fe4 100644
--- a/src/inference_endpoint/dataset_manager/predefined/aime25/presets.py
+++ b/src/inference_endpoint/dataset_manager/predefined/aime25/presets.py
@@ -43,7 +43,9 @@ def gptoss_budget() -> list[Transform]:
# Same as gptoss but caps thinking at 8192 tokens via budget_tokens so the model
# is forced to emit a final answer rather than consuming all max_new_tokens in
# the reasoning phase (observed issue: 85% of responses had empty answer text).
- AddStaticColumns({"chat_template_kwargs": {"thinking": True, "budget_tokens": 8192}}),
+ AddStaticColumns(
+ {"chat_template_kwargs": {"thinking": True, "budget_tokens": 8192}}
+ ),
]
@@ -52,7 +54,9 @@ def gptoss_budget_20k() -> list[Transform]:
UserPromptFormatter(
user_prompt_format="{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.",
),
- AddStaticColumns({"chat_template_kwargs": {"thinking": True, "budget_tokens": 20000}}),
+ AddStaticColumns(
+ {"chat_template_kwargs": {"thinking": True, "budget_tokens": 20000}}
+ ),
]
@@ -61,5 +65,7 @@ def gptoss_budget_20k_pre() -> list[Transform]:
UserPromptFormatter(
user_prompt_format="Please reason step by step, and put your final answer within \\boxed{{}}.\n\n{question}",
),
- AddStaticColumns({"chat_template_kwargs": {"thinking": True, "budget_tokens": 20000}}),
+ AddStaticColumns(
+ {"chat_template_kwargs": {"thinking": True, "budget_tokens": 20000}}
+ ),
]
diff --git a/uv.lock b/uv.lock
index ed84bd7e..9017350e 100644
--- a/uv.lock
+++ b/uv.lock
@@ -877,7 +877,7 @@ requires-dist = [
{ name = "sphinx-autodoc-typehints", marker = "extra == 'dev'", specifier = "==3.9.11" },
{ name = "sphinx-rtd-theme", marker = "extra == 'dev'", specifier = "==3.1.0" },
{ name = "sqlalchemy", marker = "extra == 'sql'", specifier = "==2.0.48" },
- { name = "transformers", specifier = "==5.4.0" },
+ { name = "transformers", specifier = "==5.5.0" },
{ name = "typing-extensions", specifier = "==4.15.0" },
{ name = "uvloop", specifier = "==0.22.1" },
{ name = "websocket-client", specifier = "==1.9.0" },
@@ -2403,7 +2403,7 @@ wheels = [
[[package]]
name = "transformers"
-version = "5.4.0"
+version = "5.5.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "huggingface-hub", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
@@ -2416,9 +2416,9 @@ dependencies = [
{ name = "tqdm", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
{ name = "typer", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/0b/4c/42a8e1c7bbe668d8e073941ec3205263afb1cd02683fa5a8a75e615fdfbe/transformers-5.4.0.tar.gz", hash = "sha256:cb34ca89dce345ae3224b290346b9c0fa9694b951d54f3ed16334a4b1bfe3d04", size = 8152836, upload-time = "2026-03-27T00:24:24.692Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/ff/9d/fb46e729b461985f41a5740167688b924a4019141e5c164bea77548d3d9e/transformers-5.5.0.tar.gz", hash = "sha256:c8db656cf51c600cd8c75f06b20ef85c72e8b8ff9abc880c5d3e8bc70e0ddcbd", size = 8237745, upload-time = "2026-04-02T16:13:08.113Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/0b/a0/0a87883e564e364baab32adcacb4bec2e200b28a568423c8cf7fde316461/transformers-5.4.0-py3-none-any.whl", hash = "sha256:9fbe50602d2a4e6d0aa8a35a605433dfac72d595ee2192eae192590a6cc2df86", size = 10105556, upload-time = "2026-03-27T00:24:21.735Z" },
+ { url = "https://files.pythonhosted.org/packages/e7/28/35f7411ff80a3640c1f4fc907dcbb6a65061ebb82f66950e38bfc9f7f740/transformers-5.5.0-py3-none-any.whl", hash = "sha256:821a9ff0961abbb29eb1eb686d78df1c85929fdf213a3fe49dc6bd94f9efa944", size = 10245591, upload-time = "2026-04-02T16:13:03.462Z" },
]
[[package]]
From 4ba9fe0054943894e5d6237598f576f7bc0c666b Mon Sep 17 00:00:00 2001
From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
Date: Fri, 1 May 2026 13:20:13 -0700
Subject: [PATCH 4/5] Reduce changes.
Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
---
src/inference_endpoint/core/types.py | 8 ++---
.../dataset_manager/transforms.py | 2 +-
src/inference_endpoint/openai/accumulator.py | 13 ++++----
tests/unit/evaluation/test_extractor.py | 30 ++-----------------
4 files changed, 13 insertions(+), 40 deletions(-)
diff --git a/src/inference_endpoint/core/types.py b/src/inference_endpoint/core/types.py
index 848ba61a..accd2ca8 100644
--- a/src/inference_endpoint/core/types.py
+++ b/src/inference_endpoint/core/types.py
@@ -133,13 +133,9 @@ def text_after_first_chunk(self) -> str:
"""
parts: list[str] = []
if self.reasoning:
- if isinstance(self.reasoning, str):
- # str reasoning is the fully joined streaming trace — include it
- # in the TPOT denominator. Over-counts by one token (the first
- # token is not excluded), but the error is negligible in practice.
- parts.append(self.reasoning)
- elif isinstance(self.reasoning, tuple) and len(self.reasoning) > 1:
+ if isinstance(self.reasoning, tuple) and len(self.reasoning) > 1:
parts.extend(self.reasoning[1:])
+ # str reasoning: single chunk, skip entirely (it IS the first chunk)
if self.output:
if isinstance(self.output, str):
# Non-streaming: if reasoning was present and was the first chunk,
diff --git a/src/inference_endpoint/dataset_manager/transforms.py b/src/inference_endpoint/dataset_manager/transforms.py
index 2e1e7683..319cb887 100644
--- a/src/inference_endpoint/dataset_manager/transforms.py
+++ b/src/inference_endpoint/dataset_manager/transforms.py
@@ -125,7 +125,7 @@ def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
for key, value in self.data.items():
# Wrap dict/list values in a list so pandas doesn't try to align
# on index keys (e.g. {"thinking": True} would produce NaN otherwise).
- if isinstance(value, (dict, list)):
+ if isinstance(value, dict | list):
df[key] = [value] * len(df)
else:
df[key] = value
diff --git a/src/inference_endpoint/openai/accumulator.py b/src/inference_endpoint/openai/accumulator.py
index c78fd922..6cb23ed8 100644
--- a/src/inference_endpoint/openai/accumulator.py
+++ b/src/inference_endpoint/openai/accumulator.py
@@ -68,14 +68,15 @@ def add_chunk(self, delta: OpenAISSEDelta) -> StreamChunk | None:
def get_final_output(self) -> QueryResult:
if self.reasoning_chunks:
- # All reasoning chunks are joined into a single string so the full
- # thinking trace is captured as-is in events.jsonl. TPOT still uses
- # text_after_first_chunk(), which includes string reasoning in the
- # denominator (off by one token vs. the true "after first chunk"
- # count, which is negligible).
+ # If there are reasoning chunks, then the first chunk received
+ # is the first reasoning chunk. The rest of the reasoning chunks,
+ # as well as the output chunks can be joined together.
+ resp_reasoning: list[str] = [self.reasoning_chunks[0]]
+ if len(self.reasoning_chunks) > 1:
+ resp_reasoning.append("".join(self.reasoning_chunks[1:]))
text_output = TextModelOutput(
output="".join(self.output_chunks),
- reasoning="".join(self.reasoning_chunks),
+ reasoning=resp_reasoning,
)
elif self.output_chunks:
# If there are only output chunks, the first chunk is used for
diff --git a/tests/unit/evaluation/test_extractor.py b/tests/unit/evaluation/test_extractor.py
index e2db968a..0ca5afb8 100644
--- a/tests/unit/evaluation/test_extractor.py
+++ b/tests/unit/evaluation/test_extractor.py
@@ -16,7 +16,7 @@
import pytest
from inference_endpoint.evaluation.extractor import (
- ABCDExtractor,
+ Extractor,
LetterExtractor,
PythonCodeExtractor,
)
@@ -148,15 +148,11 @@ def test_extract_whitespace_handling(self):
def test_registered_in_extractor_registry(self):
"""Test that PythonCodeExtractor is registered."""
- from inference_endpoint.evaluation.extractor import Extractor
-
assert "python_code_extractor" in Extractor.PREDEFINED
assert Extractor.get("python_code_extractor") == PythonCodeExtractor
def test_extractor_get_method(self):
"""Test that we can retrieve PythonCodeExtractor by name."""
- from inference_endpoint.evaluation.extractor import Extractor
-
extractor_cls = Extractor.get("python_code_extractor")
text = "```python\nprint('test')\n```"
result = extractor_cls.extract(text)
@@ -174,17 +170,9 @@ def test_answer_colon(self):
def test_markdown_answer(self):
assert LetterExtractor.extract("**Answer:** C") == "C"
- def test_answer_colon_d(self):
- assert LetterExtractor.extract("Answer: D") == "D"
-
- # Extended range (E–J) for MMLU-Pro
- def test_letter_e(self):
+ def test_extended_range(self):
+ # E–J range needed for MMLU-Pro (10 choices); check boundary letters
assert LetterExtractor.extract("Answer: E") == "E"
-
- def test_letter_g(self):
- assert LetterExtractor.extract("**Answer:** G") == "G"
-
- def test_letter_j(self):
assert LetterExtractor.extract("Answer: J") == "J"
def test_boxed_letter(self):
@@ -204,18 +192,6 @@ def test_no_match_returns_empty_string(self):
def test_default_on_no_match(self):
assert LetterExtractor.extract("Nothing", default="X") == "X"
- def test_returns_letter_not_choice_key(self):
- # Confirm it does NOT return "choice1" etc. (ABCDExtractor behaviour)
- result = LetterExtractor.extract("Answer: B")
- assert result == "B"
- assert result != "choice2"
-
- def test_abcd_extractor_unchanged(self):
- # Confirm ABCDExtractor still maps to choiceN (regression guard)
- assert ABCDExtractor.extract("Answer: B") == "choice2"
-
def test_registered(self):
- from inference_endpoint.evaluation.extractor import Extractor
-
assert "letter_extractor" in Extractor.PREDEFINED
assert Extractor.get("letter_extractor") is LetterExtractor
From 743258278d1c057d4857ce7019e5703317fcdadd Mon Sep 17 00:00:00 2001
From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
Date: Mon, 4 May 2026 11:00:35 -0700
Subject: [PATCH 5/5] Sync.
Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
---
.../vllm_dsv4pro_mlperf_accuracy.yaml | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/examples/09_DeepSeek-V4-Pro_Example/vllm_dsv4pro_mlperf_accuracy.yaml b/examples/09_DeepSeek-V4-Pro_Example/vllm_dsv4pro_mlperf_accuracy.yaml
index f4d29d4e..89c76f83 100644
--- a/examples/09_DeepSeek-V4-Pro_Example/vllm_dsv4pro_mlperf_accuracy.yaml
+++ b/examples/09_DeepSeek-V4-Pro_Example/vllm_dsv4pro_mlperf_accuracy.yaml
@@ -1,6 +1,6 @@
name: dsv4pro-vllm-b200x8-mlperf-accuracy
version: "1.0"
-type: online
+type: offline
timeout: 600.0
# MLPerf Inference accuracy evaluation for DeepSeek-R1 / DeepSeek-V4-Pro.
@@ -95,8 +95,7 @@ settings:
dataloader_random_seed: 42
load_pattern:
- type: concurrency
- target_concurrency: 1024
+ type: max_throughput
client:
num_workers: -1