From 898621da245628e2e573bc75402aac041d484edf Mon Sep 17 00:00:00 2001
From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
Date: Fri, 1 May 2026 08:24:48 -0700
Subject: [PATCH 1/5] Fixes for DSV4 example
Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
---
.../metrics_aggregator/token_metrics.py | 18 ++-
.../commands/benchmark/execute.py | 71 +++++++----
src/inference_endpoint/config/schema.py | 6 +
src/inference_endpoint/core/types.py | 8 +-
.../predefined/aime25/presets.py | 35 ++++++
.../dataset_manager/transforms.py | 7 +-
.../evaluation/extractor.py | 117 +++++++++++++++++-
src/inference_endpoint/openai/accumulator.py | 13 +-
.../openai/openai_msgspec_adapter.py | 2 +
src/inference_endpoint/openai/types.py | 1 +
tests/unit/evaluation/test_extractor.py | 66 +++++++++-
11 files changed, 306 insertions(+), 38 deletions(-)
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
index 56dee33f..546e30c1 100644
--- a/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
+++ b/src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py
@@ -22,7 +22,7 @@
from concurrent.futures import ThreadPoolExecutor
from typing import TYPE_CHECKING
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, PreTrainedTokenizerFast
if TYPE_CHECKING:
from transformers import PreTrainedTokenizerBase
@@ -76,9 +76,19 @@ def __init__(self, tokenizer_name: str, n_workers: int) -> None:
def _get_thread_tokenizer(self) -> PreTrainedTokenizerBase:
"""Return the tokenizer for the current thread, loading it if needed."""
if getattr(self._thread_local, "tokenizer", None) is None:
- self._thread_local.tokenizer = AutoTokenizer.from_pretrained(
- self._tokenizer_name
- )
+ try:
+ self._thread_local.tokenizer = AutoTokenizer.from_pretrained(
+ self._tokenizer_name
+ )
+ except Exception:
+ # AutoTokenizer loads config.json to detect the model type; for
+ # models with unknown model_type (e.g. deepseek_v4 in older
+ # transformers) or missing rope config fields, this fails.
+ # Fall back to PreTrainedTokenizerFast which reads only
+ # tokenizer.json / tokenizer_config.json and skips model config.
+ self._thread_local.tokenizer = PreTrainedTokenizerFast.from_pretrained(
+ self._tokenizer_name
+ )
return self._thread_local.tokenizer
def _token_count_worker(self, text: str) -> int:
diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py
index 73c3427f..008e6da7 100644
--- a/src/inference_endpoint/commands/benchmark/execute.py
+++ b/src/inference_endpoint/commands/benchmark/execute.py
@@ -63,8 +63,6 @@
APIType,
BenchmarkConfig,
DatasetType,
- LoadPattern,
- LoadPatternType,
StreamingMode,
TestMode,
TestType,
@@ -302,7 +300,15 @@ def setup_benchmark(config: BenchmarkConfig, test_mode: TestMode) -> BenchmarkCo
# Tokenizer check (light API call, no download)
model_name = config.model_params.name
- tokenizer_name = model_name if _check_tokenizer_exists(model_name) else None
+ tokenizer_override = config.model_params.tokenizer_name
+ tokenizer_name: str | None
+ if tokenizer_override:
+ tokenizer_name = tokenizer_override
+ logger.info(
+ f"Tokenizer available for model: {model_name} (override: {tokenizer_override})"
+ )
+ else:
+ tokenizer_name = model_name if _check_tokenizer_exists(model_name) else None
# Streaming
logger.info(
@@ -368,7 +374,7 @@ def _build_phases(ctx: BenchmarkContext) -> list[PhaseConfig]:
min_sample_count=acc_ds.num_samples() * acc_ds.repeats,
rng_sched=ctx.rt_settings.rng_sched,
rng_sample_index=ctx.rt_settings.rng_sample_index,
- load_pattern=LoadPattern(type=LoadPatternType.MAX_THROUGHPUT),
+ load_pattern=ctx.rt_settings.load_pattern,
)
phases.append(
PhaseConfig(eval_cfg.dataset_name, acc_settings, acc_ds, PhaseType.ACCURACY)
@@ -649,27 +655,48 @@ def finalize_benchmark(ctx: BenchmarkContext, bench: BenchmarkResult) -> None:
# Write scoring artifacts + copy event log from tmpfs to disk
_write_scoring_artifacts(ctx, result, bench.tmpfs_dir)
- # Accuracy scoring
+ # Accuracy scoring — continue past per-scorer failures so partial results are saved
accuracy_scores: dict[str, Any] = {}
+ scoring_failed = False
for eval_cfg in ctx.eval_configs:
- scorer_instance = eval_cfg.scorer(
- eval_cfg.dataset_name,
- eval_cfg.dataset,
- eval_cfg.report_dir,
- extractor=eval_cfg.extractor,
- ground_truth_column=eval_cfg.ground_truth_column,
+ try:
+ scorer_instance = eval_cfg.scorer(
+ eval_cfg.dataset_name,
+ eval_cfg.dataset,
+ eval_cfg.report_dir,
+ extractor=eval_cfg.extractor,
+ ground_truth_column=eval_cfg.ground_truth_column,
+ )
+ score, n_repeats = scorer_instance.score()
+ assert eval_cfg.dataset.data is not None
+ accuracy_scores[eval_cfg.dataset_name] = {
+ "dataset_name": eval_cfg.dataset_name,
+ "num_samples": len(eval_cfg.dataset.data),
+ "extractor": eval_cfg.extractor.__name__,
+ "ground_truth_column": eval_cfg.ground_truth_column,
+ "score": score,
+ "n_repeats": n_repeats,
+ }
+ logger.info(
+ f"Score for {eval_cfg.dataset_name}: {score} ({n_repeats} repeats)"
+ )
+ except Exception as e:
+ scoring_failed = True
+ logger.error(f"Scoring failed for {eval_cfg.dataset_name}: {e}")
+ assert eval_cfg.dataset.data is not None
+ accuracy_scores[eval_cfg.dataset_name] = {
+ "dataset_name": eval_cfg.dataset_name,
+ "num_samples": len(eval_cfg.dataset.data),
+ "extractor": eval_cfg.extractor.__name__,
+ "ground_truth_column": eval_cfg.ground_truth_column,
+ "score": None,
+ "error": str(e),
+ }
+
+ if scoring_failed:
+ logger.warning(
+ "One or more accuracy scorers failed — partial accuracy results saved"
)
- score, n_repeats = scorer_instance.score()
- assert eval_cfg.dataset.data is not None
- accuracy_scores[eval_cfg.dataset_name] = {
- "dataset_name": eval_cfg.dataset_name,
- "num_samples": len(eval_cfg.dataset.data),
- "extractor": eval_cfg.extractor.__name__,
- "ground_truth_column": eval_cfg.ground_truth_column,
- "score": score,
- "n_repeats": n_repeats,
- }
- logger.info(f"Score for {eval_cfg.dataset_name}: {score} ({n_repeats} repeats)")
# Report metrics: prefer Report from KVStore, fall back to SessionResult
if report is not None and report.duration_ns is not None:
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
index 6a1884b4..eba20708 100644
--- a/src/inference_endpoint/config/schema.py
+++ b/src/inference_endpoint/config/schema.py
@@ -196,6 +196,12 @@ class ModelParams(BaseModel):
StreamingMode,
cyclopts.Parameter(alias="--streaming", help="Streaming mode: auto/on/off"),
] = StreamingMode.AUTO
+ tokenizer_name: str | None = Field(
+ None,
+ description="Local tokenizer path override. Use when AutoTokenizer.from_pretrained "
+ "fails for the HF model name (e.g. transformers ≥5.4 rope_theta regression "
+ "for DeepSeek-V4). Defaults to the model name if unset.",
+ )
class SubmissionReference(BaseModel):
diff --git a/src/inference_endpoint/core/types.py b/src/inference_endpoint/core/types.py
index accd2ca8..848ba61a 100644
--- a/src/inference_endpoint/core/types.py
+++ b/src/inference_endpoint/core/types.py
@@ -133,9 +133,13 @@ def text_after_first_chunk(self) -> str:
"""
parts: list[str] = []
if self.reasoning:
- if isinstance(self.reasoning, tuple) and len(self.reasoning) > 1:
+ if isinstance(self.reasoning, str):
+ # str reasoning is the fully joined streaming trace — include it
+ # in the TPOT denominator. Over-counts by one token (the first
+ # token is not excluded), but the error is negligible in practice.
+ parts.append(self.reasoning)
+ elif isinstance(self.reasoning, tuple) and len(self.reasoning) > 1:
parts.extend(self.reasoning[1:])
- # str reasoning: single chunk, skip entirely (it IS the first chunk)
if self.output:
if isinstance(self.output, str):
# Non-streaming: if reasoning was present and was the first chunk,
diff --git a/src/inference_endpoint/dataset_manager/predefined/aime25/presets.py b/src/inference_endpoint/dataset_manager/predefined/aime25/presets.py
index 6d9adf43..75c1ef61 100644
--- a/src/inference_endpoint/dataset_manager/predefined/aime25/presets.py
+++ b/src/inference_endpoint/dataset_manager/predefined/aime25/presets.py
@@ -17,6 +17,7 @@
"""Preset transforms for the AIME25 dataset."""
from inference_endpoint.dataset_manager.transforms import (
+ AddStaticColumns,
Transform,
UserPromptFormatter,
)
@@ -27,4 +28,38 @@ def gptoss() -> list[Transform]:
UserPromptFormatter(
user_prompt_format="{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.",
),
+ # Enable DeepSeek thinking mode so the model uses chain-of-thought reasoning.
+ # vLLM's reasoning_parser strips <think>...</think> tokens into reasoning_content;
+ # the final boxed answer ends up in content where boxed_math_extractor finds it.
+ AddStaticColumns({"chat_template_kwargs": {"thinking": True}}),
+ ]
+
+
+def gptoss_budget() -> list[Transform]:
+ return [
+ UserPromptFormatter(
+ user_prompt_format="{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.",
+ ),
+ # Same as gptoss but caps thinking at 8192 tokens via budget_tokens so the model
+ # is forced to emit a final answer rather than consuming all max_new_tokens in
+ # the reasoning phase (observed issue: 85% of responses had empty answer text).
+ AddStaticColumns({"chat_template_kwargs": {"thinking": True, "budget_tokens": 8192}}),
+ ]
+
+
+def gptoss_budget_20k() -> list[Transform]:
+ return [
+ UserPromptFormatter(
+ user_prompt_format="{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.",
+ ),
+ AddStaticColumns({"chat_template_kwargs": {"thinking": True, "budget_tokens": 20000}}),
+ ]
+
+
+def gptoss_budget_20k_pre() -> list[Transform]:
+ return [
+ UserPromptFormatter(
+ user_prompt_format="Please reason step by step, and put your final answer within \\boxed{{}}.\n\n{question}",
+ ),
+ AddStaticColumns({"chat_template_kwargs": {"thinking": True, "budget_tokens": 20000}}),
]
diff --git a/src/inference_endpoint/dataset_manager/transforms.py b/src/inference_endpoint/dataset_manager/transforms.py
index 79133796..2e1e7683 100644
--- a/src/inference_endpoint/dataset_manager/transforms.py
+++ b/src/inference_endpoint/dataset_manager/transforms.py
@@ -123,7 +123,12 @@ def __init__(self, data: dict[str, Any]):
def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
"""Add the static columns to the row."""
for key, value in self.data.items():
- df[key] = value
+ # Wrap dict/list values in a list so pandas doesn't try to align
+ # on index keys (e.g. {"thinking": True} would produce NaN otherwise).
+ if isinstance(value, (dict, list)):
+ df[key] = [value] * len(df)
+ else:
+ df[key] = value
return df
diff --git a/src/inference_endpoint/evaluation/extractor.py b/src/inference_endpoint/evaluation/extractor.py
index 99d07db4..cbace9cf 100644
--- a/src/inference_endpoint/evaluation/extractor.py
+++ b/src/inference_endpoint/evaluation/extractor.py
@@ -103,7 +103,7 @@ class ABCDExtractor(Extractor, extractor_id="abcd_extractor"):
Returns:
"choice" key (see GQPA dataset columns) or empty string if no answer is found.
Examples:
- >>> ABCDExtractor.extract("The answer is B")
+ >>> ABCDExtractor.extract("Answer: B")
'choice2'
>>> ABCDExtractor.extract("**Answer:** C")
'choice3'
@@ -220,6 +220,121 @@ def extract(cls, text: str, default: str | None = None) -> str | None:
return default if default is not None else ""
+class LetterExtractor(Extractor, extractor_id="letter_extractor"):
+ """Extract MCQ answer letter (A–J) from response text, returning the letter directly.
+
+ Like ABCDExtractor but returns the raw letter ("A", "B", … "J") instead of
+ mapping to "choice1"–"choice4". Supports datasets with up to ten answer
+ options (e.g. MMLU-Pro A–J) where ground-truth labels are stored as the
+ letter itself.
+
+ Examples:
+ >>> LetterExtractor.extract("Answer: B")
+ 'B'
+ >>> LetterExtractor.extract("**Answer:** G")
+ 'G'
+ >>> LetterExtractor.extract("\\\\boxed{E}")
+ 'E'
+ """
+
+ LETTERS = frozenset("ABCDEFGHIJ")
+
+ PATTERNS = [
+ # 0) **Answer:** A or *Answers* – B
+ re.compile(
+ r"""(?ix)
+ (?:\*{1,2}|_{1,2})
+ Answer[s]?
+ \s*[:\-–]?
+ (?:\*{1,2}|_{1,2})
+ \s*
+ ([A-J])\b
+ """,
+ re.X,
+ ),
+ # 0.1) Answer: A (with optional markdown)
+ re.compile(
+ r"""(?ix)
+ ^\s*
+ (?:\*{1,2}|_{1,2})?
+ Answer:?
+ (?:\*{1,2}|_{1,2})?
+ \s*:?\s*
+ (?:\*{1,2}|_{1,2})?
+ ([A-J])
+ (?:\*{1,2}|_{1,2})?
+ \s*
+ """,
+ re.MULTILINE,
+ ),
+ # 1) Answer: (C)
+ re.compile(r"(?ix)\bAnswer[s]?\b\s*[:\-–]?\s*\(\s*([A-J])\s*\)"),
+ # 2) Answer: C
+ re.compile(r"(?ix)\bAnswer[s]?\b\s*[:\-–]?\s*([A-J])\b"),
+ # 3) Option B or Choice: C
+ re.compile(r"(?ix)\b(?:Option|Choice)\b\s*[:\-–]?\s*([A-J])\b"),
+ # 7) \boxed{A}
+ re.compile(r"(?x)\\boxed\{[^}]*?([A-J])[^}]*\}", re.MULTILINE),
+ # 7.5) \boxed{\textbf{C}}
+ re.compile(
+ r"(?x)\\boxed\{[^}]*?\\textbf\{[^}]*?([A-J])[^}]*\}[^}]*\}", re.MULTILINE
+ ),
+ # 7.51) \boxed{\text{C}}
+ re.compile(
+ r"(?x)\\boxed\{[^}]*?\\text\{[^}]*?([A-J])[^}]*\}[^}]*\}", re.MULTILINE
+ ),
+ # 4) bare singletons: (A) [B]
+ re.compile(r"(?x)(? str | None:
+ matches = []
+ for prio, pat in enumerate(cls.PATTERNS):
+ m = pat.search(text)
+ if m:
+ letter = m.group(1).upper()
+ if letter in cls.LETTERS:
+ matches.append((prio, m, letter))
+
+ matches.sort(key=lambda triple: (triple[0], len(triple[1].group(0))))
+
+ for _, _, letter in matches:
+ return letter
+
+ stripped = text.removeprefix("**")
+ if stripped and stripped[0].upper() in cls.LETTERS:
+ return stripped[0].upper()
+
+ return default if default is not None else ""
+
+
class BoxedMathExtractor(Extractor, extractor_id="boxed_math_extractor"):
"""Extract boxed math answer from response text.
Based on OpenAI's extract_boxed_math function from GPT-OSS.
diff --git a/src/inference_endpoint/openai/accumulator.py b/src/inference_endpoint/openai/accumulator.py
index 6cb23ed8..c78fd922 100644
--- a/src/inference_endpoint/openai/accumulator.py
+++ b/src/inference_endpoint/openai/accumulator.py
@@ -68,15 +68,14 @@ def add_chunk(self, delta: OpenAISSEDelta) -> StreamChunk | None:
def get_final_output(self) -> QueryResult:
if self.reasoning_chunks:
- # If there are reasoning chunks, then the first chunk received
- # is the first reasoning chunk. The rest of the reasoning chunks,
- # as well as the output chunks can be joined together.
- resp_reasoning: list[str] = [self.reasoning_chunks[0]]
- if len(self.reasoning_chunks) > 1:
- resp_reasoning.append("".join(self.reasoning_chunks[1:]))
+ # All reasoning chunks are joined into a single string so the full
+ # thinking trace is captured as-is in events.jsonl. TPOT still uses
+ # text_after_first_chunk(), which includes string reasoning in the
+ # denominator (off by one token vs. the true "after first chunk"
+ # count, which is negligible).
text_output = TextModelOutput(
output="".join(self.output_chunks),
- reasoning=resp_reasoning,
+ reasoning="".join(self.reasoning_chunks),
)
elif self.output_chunks:
# If there are only output chunks, the first chunk is used for
diff --git a/src/inference_endpoint/openai/openai_msgspec_adapter.py b/src/inference_endpoint/openai/openai_msgspec_adapter.py
index 6106e1bd..16f4bc3b 100644
--- a/src/inference_endpoint/openai/openai_msgspec_adapter.py
+++ b/src/inference_endpoint/openai/openai_msgspec_adapter.py
@@ -82,6 +82,7 @@ def dataset_transforms(cls, model_params: ModelParams) -> list[Transform]:
"logit_bias",
"user",
"chat_template",
+ "chat_template_kwargs",
]
return [
ColumnFilter(
@@ -164,6 +165,7 @@ def to_endpoint_request(cls, query: Query) -> ChatCompletionRequest:
logit_bias=query.data.get("logit_bias"),
user=query.data.get("user"),
chat_template=query.data.get("chat_template"),
+ chat_template_kwargs=query.data.get("chat_template_kwargs"),
)
@classmethod
diff --git a/src/inference_endpoint/openai/types.py b/src/inference_endpoint/openai/types.py
index 036dd172..09b53e88 100644
--- a/src/inference_endpoint/openai/types.py
+++ b/src/inference_endpoint/openai/types.py
@@ -103,6 +103,7 @@ class ChatCompletionRequest(
logit_bias: dict[str, float] | None = None
user: str | None = None
chat_template: str | None = None
+ chat_template_kwargs: dict[str, Any] | None = None
class ChatCompletionResponseMessage(
diff --git a/tests/unit/evaluation/test_extractor.py b/tests/unit/evaluation/test_extractor.py
index 17ad1654..e2db968a 100644
--- a/tests/unit/evaluation/test_extractor.py
+++ b/tests/unit/evaluation/test_extractor.py
@@ -14,9 +14,15 @@
# limitations under the License.
-from inference_endpoint.evaluation.extractor import PythonCodeExtractor
+import pytest
+from inference_endpoint.evaluation.extractor import (
+ ABCDExtractor,
+ LetterExtractor,
+ PythonCodeExtractor,
+)
+@pytest.mark.unit
class TestPythonCodeExtractor:
"""Test cases for PythonCodeExtractor."""
@@ -155,3 +161,61 @@ def test_extractor_get_method(self):
text = "```python\nprint('test')\n```"
result = extractor_cls.extract(text)
assert result == "print('test')"
+
+
+@pytest.mark.unit
+class TestLetterExtractor:
+ """Tests for LetterExtractor — returns raw letter (A–J) for MCQ datasets
+ where ground truth is stored as a letter (e.g. MLPerf GPQA/MMLU-Pro)."""
+
+ def test_answer_colon(self):
+ assert LetterExtractor.extract("Answer: B") == "B"
+
+ def test_markdown_answer(self):
+ assert LetterExtractor.extract("**Answer:** C") == "C"
+
+ def test_answer_colon_d(self):
+ assert LetterExtractor.extract("Answer: D") == "D"
+
+ # Extended range (E–J) for MMLU-Pro
+ def test_letter_e(self):
+ assert LetterExtractor.extract("Answer: E") == "E"
+
+ def test_letter_g(self):
+ assert LetterExtractor.extract("**Answer:** G") == "G"
+
+ def test_letter_j(self):
+ assert LetterExtractor.extract("Answer: J") == "J"
+
+ def test_boxed_letter(self):
+ assert LetterExtractor.extract(r"\boxed{F}") == "F"
+
+ def test_parenthesised_singleton(self):
+ assert LetterExtractor.extract("The correct choice is (H)") == "H"
+
+ def test_mlperf_style_output(self):
+ # Mirrors the few-shot prompt format: model outputs "Answer: X"
+ text = "Let me think... the ring characteristic is 0.\nAnswer: A"
+ assert LetterExtractor.extract(text) == "A"
+
+ def test_no_match_returns_empty_string(self):
+ assert LetterExtractor.extract("No answer here.") == ""
+
+ def test_default_on_no_match(self):
+ assert LetterExtractor.extract("Nothing", default="X") == "X"
+
+ def test_returns_letter_not_choice_key(self):
+ # Confirm it does NOT return "choice1" etc. (ABCDExtractor behaviour)
+ result = LetterExtractor.extract("Answer: B")
+ assert result == "B"
+ assert result != "choice2"
+
+ def test_abcd_extractor_unchanged(self):
+ # Confirm ABCDExtractor still maps to choiceN (regression guard)
+ assert ABCDExtractor.extract("Answer: B") == "choice2"
+
+ def test_registered(self):
+ from inference_endpoint.evaluation.extractor import Extractor
+
+ assert "letter_extractor" in Extractor.PREDEFINED
+ assert Extractor.get("letter_extractor") is LetterExtractor
From fa3eb9199cc52b279e46a52b39e6baf8bc7abfdc Mon Sep 17 00:00:00 2001
From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
Date: Fri, 1 May 2026 08:32:20 -0700
Subject: [PATCH 2/5] Add example files.
Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
---
examples/09_DeepSeek-V4-Pro_Example/README.md | 174 ++++++++++++++++++
.../extract_mlperf_subsets.py | 57 ++++++
.../launch_server.sh | 94 ++++++++++
.../vllm_dsv4pro_mlperf_accuracy.yaml | 116 ++++++++++++
4 files changed, 441 insertions(+)
create mode 100644 examples/09_DeepSeek-V4-Pro_Example/README.md
create mode 100644 examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py
create mode 100755 examples/09_DeepSeek-V4-Pro_Example/launch_server.sh
create mode 100644 examples/09_DeepSeek-V4-Pro_Example/vllm_dsv4pro_mlperf_accuracy.yaml
diff --git a/examples/09_DeepSeek-V4-Pro_Example/README.md b/examples/09_DeepSeek-V4-Pro_Example/README.md
new file mode 100644
index 00000000..9ec3ca3f
--- /dev/null
+++ b/examples/09_DeepSeek-V4-Pro_Example/README.md
@@ -0,0 +1,174 @@
+# DeepSeek-V4-Pro Benchmark
+
+End-to-end example for benchmarking `deepseek-ai/DeepSeek-V4-Pro` with vLLM on 8×B200 (or 8×B300), covering performance throughput and accuracy evaluation (AIME 2025, GPQA, and the full MLPerf Inference accuracy suite).
+
+## Hardware
+
+| Requirement | Details |
+|-------------|---------|
+| GPUs | 8× NVIDIA B200 or B300 |
+| System RAM | ≥ 256 GB |
+| Docker image | `vllm/vllm-openai:deepseekv4-cu130` |
+| Startup time | ~22 minutes (weight loading + TileLang kernel compilation) |
+
+The recipe is taken from the [vLLM DeepSeek V4 blog post](https://github.com/vllm-project/vllm-project.github.io/blob/main/_posts/2026-04-24-deepseek-v4.md).
+
+## Environment Setup
+
+```bash
+export MODEL_PATH=/path/to/DeepSeek-V4-Pro # local weight directory
+export HF_HOME=~/.cache/huggingface
+export HF_TOKEN=<your_hf_token> # optional; passed through to the container
+```
+
+## Launching the Server
+
+```bash
+bash examples/09_DeepSeek-V4-Pro_Example/launch_server.sh
+```
+
+The script mounts `$MODEL_PATH` into the container at `/model`, sets
+`VLLM_ENGINE_READY_TIMEOUT_S=3600`, and polls `/health` until the server is ready.
+
+### Why `VLLM_ENGINE_READY_TIMEOUT_S=3600` is required
+
+The default value is 600 s (10 min). Loading DeepSeek-V4-Pro's 64 safetensor shards plus
+compiling TileLang kernels (`mhc_pre_big_fuse_tilelang` etc.) across 8 DP workers takes
+~22 min on 8×B200. With the default timeout the `ApiServer_0` process raises a `TimeoutError`
+and exits — even though all 8 engine workers completed successfully — causing the container to
+crash. Setting the timeout to 3600 s avoids this entirely.
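+
+If startup still fails, it is worth confirming the variable actually reached the container. A quick check (the container ID placeholder below is whatever `docker ps` reports for the vLLM image):
+
+```bash
+docker exec <container_id> env | grep VLLM_ENGINE_READY_TIMEOUT_S
+# expected: VLLM_ENGINE_READY_TIMEOUT_S=3600
+```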
+
+### Key launch flags
+
+| Flag | Purpose |
+|------|---------|
+| `--data-parallel-size 8` | Expert parallelism across 8 GPUs (no TP needed for MoE) |
+| `--enable-expert-parallel` | Required alongside `--data-parallel-size` |
+| `--kv-cache-dtype fp8` | Matches DeepSeek V4's hybrid c4a / c128a KV cache design |
+| `--block-size 256` | Unified 256-token logical block across all compression layers |
+| `--attention_config.use_fp4_indexer_cache=True` | FP4 indexer for ~2x additional KV savings |
+| `--tokenizer-mode deepseek_v4` | Required for the V4 chat template |
+| `--reasoning-parser deepseek_v4` | Strips `<think>…</think>` into `reasoning_content` |
+| `--compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'` | Enables TileLang kernel fusions |
+| `VLLM_ENGINE_READY_TIMEOUT_S=3600` | Prevents premature `ApiServer_0` timeout during startup |
+
+## Performance Benchmark
+
+```bash
+uv run inference-endpoint benchmark from-config \
+ -c examples/09_DeepSeek-V4-Pro_Example/vllm_dsv4pro_perf.yaml
+```
+
+Config: [`vllm_dsv4pro_perf.yaml`](vllm_dsv4pro_perf.yaml)
+- 2-minute minimum run at concurrency 32
+- Metrics: throughput, latency, TTFT, TPOT
+
+## Accuracy Benchmark (AIME 2025 + GPQA)
+
+```bash
+uv run inference-endpoint benchmark from-config \
+ -c examples/09_DeepSeek-V4-Pro_Example/vllm_dsv4pro_accuracy.yaml
+```
+
+Config: [`vllm_dsv4pro_accuracy.yaml`](vllm_dsv4pro_accuracy.yaml)
+
+| Dataset | Samples | Repeats | Extractor | Scorer |
+|---------|---------|---------|-----------|--------|
+| AIME 2025 | 30 | 8 | `boxed_math_extractor` | `pass_at_1` |
+| GPQA Diamond | 198 | 5 | `abcd_extractor` | `pass_at_1` |
+
+### Concurrency note
+
+`target_concurrency: 4` is intentional. With `max_model_len=65536` and `max_new_tokens=32768`,
+each in-flight request can occupy up to 32k tokens of KV cache. Four concurrent requests
+fit within the fp8 KV cache budget without preemption on 8×B200.
+
+### Thinking mode and `budget_tokens`
+
+The `aime25::gptoss_budget_20k` preset enables DeepSeek's thinking mode
+(`chat_template_kwargs: {thinking: True, budget_tokens: 20000}`). Without `budget_tokens`,
+the model can spend all 32k tokens in the `<think>` block and return an empty boxed answer —
+observed on ~85% of responses in early testing. Setting `budget_tokens=20000` caps the
+reasoning phase and forces a final answer.
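+
+To sanity-check that the server honours these kwargs outside the harness, a request can be sent directly to the vLLM endpoint. The payload below is an illustrative sketch (prompt and token values are arbitrary), not the exact request the benchmark issues:
+
+```bash
+curl -s http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+        "model": "deepseek-ai/DeepSeek-V4-Pro",
+        "messages": [{"role": "user", "content": "What is 2+2? Put your final answer within \\boxed{}."}],
+        "max_tokens": 2048,
+        "chat_template_kwargs": {"thinking": true, "budget_tokens": 1024}
+      }'
+```
+
+With `--reasoning-parser deepseek_v4` active, the response message should carry the thinking trace in `reasoning_content` and the boxed answer in `content`.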
+
+### Measured results (8×B200, `deepseekv4-cu130`)
+
+| Dataset | Score |
+|---------|-------|
+| AIME 2025 pass@1 | **55.4%** (8 repeats, budget_tokens=20000) |
+
+## MLPerf Inference Accuracy Suite
+
+The MLPerf DeepSeek-R1 accuracy check uses 5 sub-datasets (4388 total samples):
+
+| Sub-dataset | Samples | Metric | File |
+|-------------|---------|--------|------|
+| AIME 1983 | 932 | exact_match | `mlperf_deepseek_r1_math_accuracy.parquet` |
+| MATH-500 | 499 | exact_match | `mlperf_deepseek_r1_math_accuracy.parquet` |
+| GPQA | 198 | exact_match | `mlperf_deepseek_r1_mcq_accuracy.parquet` |
+| MMLU-Pro | 2410 | exact_match | extracted by `extract_mlperf_subsets.py` |
+| LiveCodeBench | 349 | code_execute_verify | extracted by `extract_mlperf_subsets.py` |
+
+**Golden accuracy (fp32):** `exact_match = 81.3582%`, `TOKENS_PER_SAMPLE = 3886.2`
+**MLPerf pass threshold:** ≥ 80.52% exact_match (99% of golden), tokens within ±10%
+
+### Step 1 — Extract missing subsets
+
+```bash
+uv run python examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py
+```
+
+This writes:
+- `datasets/deepseek/mlperf_deepseek_r1_mmlu_pro_accuracy.parquet`
+- `datasets/deepseek/mlperf_deepseek_r1_livecodebench_accuracy.parquet`
+
+### Step 2 — Run math + MCQ accuracy
+
+Once Step 1 has produced the MMLU-Pro parquet, run the math + MCQ datasets (all enabled by default in [`vllm_dsv4pro_mlperf_accuracy.yaml`](vllm_dsv4pro_mlperf_accuracy.yaml)):
+
+```bash
+uv run inference-endpoint benchmark from-config \
+ -c examples/09_DeepSeek-V4-Pro_Example/vllm_dsv4pro_mlperf_accuracy.yaml
+```
+
+### Step 3 — Run LiveCodeBench accuracy
+
+LiveCodeBench requires the `lcb-service` container (executes generated Python code in an
+isolated environment). See the
+[LiveCodeBench README](../../src/inference_endpoint/evaluation/livecodebench/README.md) for
+container setup. Once running on port 13835, uncomment the `mlperf-livecodebench` dataset in
+`vllm_dsv4pro_mlperf_accuracy.yaml` and re-run.
+
+## Troubleshooting
+
+**Container exits immediately or health check never passes**
+
+```bash
+docker logs <container_id> | tail -40
+```
+
+Common causes:
+- `TimeoutError: Timed out waiting for engine core processes to start` — set `VLLM_ENGINE_READY_TIMEOUT_S=3600` (already set in `launch_server.sh`)
+- OOM during weight loading — verify `--max-model-len` is not too large for available GPU memory
+- `MODEL_PATH` not mounted correctly — check that `/model/config.json` exists inside the container
+
+**`At least one performance dataset required`**
+
+Every benchmark config must include at least one `type: performance` dataset entry, even for
+accuracy-only runs. Use the perf-warmup entry with `n_samples_to_issue: 1`.
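+
+For reference, the warmup entry used by [`vllm_dsv4pro_mlperf_accuracy.yaml`](vllm_dsv4pro_mlperf_accuracy.yaml) looks like this:
+
+```yaml
+datasets:
+  - name: perf-warmup
+    type: performance
+    path: datasets/deepseek/mlperf_deepseek_r1_dataset_4388_fp8_eval.parquet
+    parser:
+      prompt: templated_text_input
+
+settings:
+  runtime:
+    n_samples_to_issue: 1   # keeps the performance phase to a single request
+```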
+
+**Empty boxed answers / low AIME accuracy**
+
+The model exhausted `max_new_tokens` in the thinking phase. Add `budget_tokens` to the preset:
+
+```yaml
+- name: aime25::gptoss_budget_20k # uses budget_tokens=20000
+```
+
+**`uv: cannot execute binary file: Exec format error`**
+
+The `uv` binary in `~/.local/bin/uv` has the wrong architecture. Use the venv directly:
+
+```bash
+.venv/bin/inference-endpoint benchmark from-config -c <config.yaml>
+```
diff --git a/examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py b/examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py
new file mode 100644
index 00000000..6e945dfa
--- /dev/null
+++ b/examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py
@@ -0,0 +1,57 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Extract MMLU-Pro and LiveCodeBench subsets from the combined MLPerf accuracy parquet.
+
+The combined parquet (mlperf_deepseek_r1_dataset_4388_fp8_eval_accuracy.parquet) contains
+5 sub-datasets identified by the 'dataset' column. Pre-split files already exist for Math
+and GPQA; this script extracts the remaining two:
+
+ datasets/deepseek/mlperf_deepseek_r1_mmlu_pro_accuracy.parquet (2410 rows)
+ datasets/deepseek/mlperf_deepseek_r1_livecodebench_accuracy.parquet (349 rows)
+
+Usage:
+ uv run python examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py
+"""
+
+from pathlib import Path
+
+import pandas as pd
+
+SRC = Path("datasets/deepseek/mlperf_deepseek_r1_dataset_4388_fp8_eval_accuracy.parquet")
+OUT_DIR = Path("datasets/deepseek")
+
+SUBSETS = {
+ "mmlu_pro": "mlperf_deepseek_r1_mmlu_pro_accuracy.parquet",
+ "livecodebench": "mlperf_deepseek_r1_livecodebench_accuracy.parquet",
+}
+
+
+def main() -> None:
+ df = pd.read_parquet(SRC)
+ print(f"Loaded {len(df)} rows from {SRC}")
+ print("Sub-dataset breakdown:")
+ print(df.groupby(["dataset", "metric"]).size().to_string())
+ print()
+
+ for dataset_name, out_filename in SUBSETS.items():
+ subset = df[df["dataset"] == dataset_name].reset_index(drop=True)
+ out_path = OUT_DIR / out_filename
+ subset.to_parquet(out_path, index=False)
+ print(f"Wrote {len(subset)} rows → {out_path}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/09_DeepSeek-V4-Pro_Example/launch_server.sh b/examples/09_DeepSeek-V4-Pro_Example/launch_server.sh
new file mode 100755
index 00000000..ac141c2a
--- /dev/null
+++ b/examples/09_DeepSeek-V4-Pro_Example/launch_server.sh
@@ -0,0 +1,94 @@
+#!/usr/bin/env bash
+# Launch DeepSeek-V4-Pro with vLLM on 8×B200 / 8×B300.
+#
+# Key flags vs. a standard vLLM launch:
+# --data-parallel-size 8 Expert parallelism across 8 GPUs (no TP)
+# --enable-expert-parallel Required for MoE data-parallel dispatch
+# --kv-cache-dtype fp8 DeepSeek V4's hybrid KV cache (c4a/c128a)
+# --block-size 256 Matches the 256-native-token logical block size
+# --attention_config.use_fp4_indexer_cache=True FP4 indexer for 2x KV savings
+# --tokenizer-mode deepseek_v4 Custom tokenizer for V4 chat template
+# --reasoning-parser deepseek_v4 Strips <think>…</think> into reasoning_content
+# --compilation-config … FULL_AND_PIECEWISE cudagraph + all custom fusions
+#
+# Startup time note:
+# Model weight loading (64 shards) + TileLang kernel compilation takes ~22 min
+# on 8×B200. The default VLLM_ENGINE_READY_TIMEOUT_S=600 (10 min) is too short
+# and will crash the API server with a TimeoutError even though the workers are
+# fine. Always set VLLM_ENGINE_READY_TIMEOUT_S=3600 for this model.
+
+set -euo pipefail
+
+: "${MODEL_PATH:?Set MODEL_PATH to the directory containing the DeepSeek-V4-Pro weights}"
+PORT="${PORT:-8000}"
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-65536}"
+
+if [[ ! -d "${MODEL_PATH}" ]]; then
+ echo "ERROR: MODEL_PATH=${MODEL_PATH} does not exist."
+ echo "Set MODEL_PATH to the directory containing the DeepSeek-V4-Pro weights."
+ exit 1
+fi
+
+echo "Launching DeepSeek-V4-Pro on port ${PORT} (model: ${MODEL_PATH})"
+echo "Startup takes ~22 minutes for weight loading + TileLang kernel compilation."
+echo ""
+
+CONTAINER_ID=$(docker run -d \
+ --gpus all \
+ --shm-size 32g \
+ --net host \
+ --ipc host \
+ -v "${MODEL_PATH}:/model" \
+ -v "${HF_HOME:-${HOME}/.cache/huggingface}:/root/.cache/huggingface" \
+ --env HF_TOKEN="${HF_TOKEN:-}" \
+ --env VLLM_WORKER_MULTIPROC_METHOD=spawn \
+ --env VLLM_ENGINE_READY_TIMEOUT_S=3600 \
+ vllm/vllm-openai:deepseekv4-cu130 \
+ --model /model \
+ --served-model-name deepseek-ai/DeepSeek-V4-Pro \
+ --trust-remote-code \
+ --kv-cache-dtype fp8 \
+ --block-size 256 \
+ --enable-expert-parallel \
+ --data-parallel-size 8 \
+ --max-model-len "${MAX_MODEL_LEN}" \
+ --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \
+ --attention_config.use_fp4_indexer_cache=True \
+ --tokenizer-mode deepseek_v4 \
+ --tool-call-parser deepseek_v4 \
+ --enable-auto-tool-choice \
+ --reasoning-parser deepseek_v4 \
+ --disable-log-stats \
+ --disable-uvicorn-access-log \
+ --port "${PORT}")
+
+echo "Container started: ${CONTAINER_ID:0:12}"
+echo ""
+echo "Polling http://localhost:${PORT}/health ..."
+
+TIMEOUT=2400
+START=$(date +%s)
+while true; do
+ if curl -sf "http://localhost:${PORT}/health" > /dev/null 2>&1; then
+ ELAPSED=$(( $(date +%s) - START ))
+ echo "Server healthy after ${ELAPSED}s. Ready to benchmark."
+ break
+ fi
+ ELAPSED=$(( $(date +%s) - START ))
+ if [[ ${ELAPSED} -ge ${TIMEOUT} ]]; then
+ echo "ERROR: server not healthy after ${TIMEOUT}s"
+ docker logs "${CONTAINER_ID}" | tail -40
+ exit 1
+ fi
+ if [[ "$(docker inspect -f '{{.State.Running}}' "${CONTAINER_ID}" 2>/dev/null)" != "true" ]]; then
+ echo "ERROR: container exited unexpectedly"
+ docker logs "${CONTAINER_ID}" | tail -40
+ exit 1
+ fi
+ echo " Waiting... (${ELAPSED}s)"
+ sleep 15
+done
+
+echo ""
+echo "Container ID : ${CONTAINER_ID:0:12}"
+echo "Stop with : docker stop ${CONTAINER_ID:0:12}"
diff --git a/examples/09_DeepSeek-V4-Pro_Example/vllm_dsv4pro_mlperf_accuracy.yaml b/examples/09_DeepSeek-V4-Pro_Example/vllm_dsv4pro_mlperf_accuracy.yaml
new file mode 100644
index 00000000..f4d29d4e
--- /dev/null
+++ b/examples/09_DeepSeek-V4-Pro_Example/vllm_dsv4pro_mlperf_accuracy.yaml
@@ -0,0 +1,116 @@
+name: dsv4pro-vllm-b200x8-mlperf-accuracy
+version: "1.0"
+type: online
+timeout: 600.0
+
+# MLPerf Inference accuracy evaluation for DeepSeek-R1 / DeepSeek-V4-Pro.
+#
+# Runs the 4 sub-datasets used in the MLPerf submission accuracy check:
+# - Math (AIME1983 + MATH500) 1431 samples boxed_math_extractor
+# - GPQA 198 samples abcd_extractor
+# - MMLU-Pro 2410 samples abcd_extractor
+# - LiveCodeBench (commented) 349 samples python_code_extractor
+#
+# Golden accuracy (fp32 reference): exact_match = 81.3582%
+# MLPerf pass threshold: exact_match ≥ 80.52% (99% of golden)
+#
+# MMLU-Pro and LiveCodeBench must be extracted from the combined parquet first:
+# python examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py
+
+model_params:
+ name: deepseek-ai/DeepSeek-V4-Pro
+ temperature: 1.0
+ max_new_tokens: 32768
+ top_p: 1.0
+ streaming: "on"
+ # tokenizer_name: /path/to/local/weights # optional: set if weights are not in HF cache
+
+datasets:
+ - name: perf-warmup
+ type: performance
+ path: datasets/deepseek/mlperf_deepseek_r1_dataset_4388_fp8_eval.parquet
+ parser:
+ prompt: templated_text_input
+
+ # AIME1983 + MATH500 (1431 samples). Prompts are already formatted as
+ # templated_text_input — no parser transform needed.
+ - name: mlperf-math
+ type: accuracy
+ path: datasets/deepseek/mlperf_deepseek_r1_math_accuracy.parquet
+ parser:
+ prompt: templated_text_input
+ accuracy_config:
+ eval_method: pass_at_1
+ ground_truth: ground_truth
+ extractor: boxed_math_extractor
+ num_repeats: 1
+
+ # GPQA (198 samples). Ground truth is stored as raw letters ("A"–"D") in
+ # this parquet, so letter_extractor is required (abcd_extractor would return
+ # "choice1"–"choice4" and score 0% against letter ground truth).
+ - name: mlperf-gpqa
+ type: accuracy
+ path: datasets/deepseek/mlperf_deepseek_r1_mcq_accuracy.parquet
+ parser:
+ prompt: templated_text_input
+ accuracy_config:
+ eval_method: pass_at_1
+ ground_truth: ground_truth
+ extractor: letter_extractor
+ num_repeats: 1
+
+ # MMLU-Pro (2410 samples) — extracted from the combined accuracy parquet by
+ # extract_mlperf_subsets.py. Ground truth is "A"–"J" (10 choices); use
+ # letter_extractor which handles the full A–J range.
+ - name: mlperf-mmlu-pro
+ type: accuracy
+ path: datasets/deepseek/mlperf_deepseek_r1_mmlu_pro_accuracy.parquet
+ parser:
+ prompt: templated_text_input
+ accuracy_config:
+ eval_method: pass_at_1
+ ground_truth: ground_truth
+ extractor: letter_extractor
+ num_repeats: 1
+
+ # LiveCodeBench (349 samples) — requires lcb-service container on port 13835.
+ # Uncomment after running extract_mlperf_subsets.py and launching the container.
+ # - name: mlperf-livecodebench
+ # type: accuracy
+ # path: datasets/deepseek/mlperf_deepseek_r1_livecodebench_accuracy.parquet
+ # parser:
+ # prompt: templated_text_input
+ # accuracy_config:
+ # eval_method: code_bench_scorer
+ # ground_truth: ground_truth
+ # extractor: python_code_extractor
+ # num_repeats: 1
+
+settings:
+ runtime:
+ min_duration_ms: 0
+ max_duration_ms: 0
+ n_samples_to_issue: 1
+ scheduler_random_seed: 42
+ dataloader_random_seed: 42
+
+ load_pattern:
+ type: concurrency
+ target_concurrency: 1024
+
+ client:
+ num_workers: -1
+ log_level: INFO
+ warmup_connections: false
+ max_connections: 64
+ worker_initialization_timeout: 120.0
+
+endpoint_config:
+ endpoints:
+ - http://localhost:8000
+ api_key: null
+ api_type: openai
+
+report_dir: results/dsv4pro_vllm_b200x8_mlperf_accuracy
+verbose: false
+enable_cpu_affinity: true
From ab25f965813d5af082322b15b769eca7f9151edd Mon Sep 17 00:00:00 2001
From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
Date: Fri, 1 May 2026 09:15:46 -0700
Subject: [PATCH 3/5] Fix pre-commit
Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
---
examples/09_DeepSeek-V4-Pro_Example/README.md | 63 ++++++++++---------
.../extract_mlperf_subsets.py | 4 +-
.../templates/concurrency_template_full.yaml | 1 +
.../templates/offline_template_full.yaml | 1 +
.../templates/online_template_full.yaml | 1 +
.../predefined/aime25/presets.py | 12 +++-
uv.lock | 8 +--
7 files changed, 52 insertions(+), 38 deletions(-)
diff --git a/examples/09_DeepSeek-V4-Pro_Example/README.md b/examples/09_DeepSeek-V4-Pro_Example/README.md
index 9ec3ca3f..2f55b23d 100644
--- a/examples/09_DeepSeek-V4-Pro_Example/README.md
+++ b/examples/09_DeepSeek-V4-Pro_Example/README.md
@@ -4,11 +4,11 @@ End-to-end example for benchmarking `deepseek-ai/DeepSeek-V4-Pro` with vLLM on 8
## Hardware
-| Requirement | Details |
-|-------------|---------|
-| GPUs | 8× NVIDIA B200 or B300 |
-| System RAM | ≥ 256 GB |
-| Docker image | `vllm/vllm-openai:deepseekv4-cu130` |
+| Requirement | Details |
+| ------------ | ---------------------------------------------------------- |
+| GPUs | 8× NVIDIA B200 or B300 |
+| System RAM | ≥ 256 GB |
+| Docker image | `vllm/vllm-openai:deepseekv4-cu130` |
| Startup time | ~22 minutes (weight loading + TileLang kernel compilation) |
The recipe is taken from the [vLLM DeepSeek V4 blog post](https://github.com/vllm-project/vllm-project.github.io/blob/main/_posts/2026-04-24-deepseek-v4.md).
@@ -40,17 +40,17 @@ crash. Setting the timeout to 3600 s avoids this entirely.
### Key launch flags
-| Flag | Purpose |
-|------|---------|
-| `--data-parallel-size 8` | Expert parallelism across 8 GPUs (no TP needed for MoE) |
-| `--enable-expert-parallel` | Required alongside `--data-parallel-size` |
-| `--kv-cache-dtype fp8` | Matches DeepSeek V4's hybrid c4a / c128a KV cache design |
-| `--block-size 256` | Unified 256-token logical block across all compression layers |
-| `--attention_config.use_fp4_indexer_cache=True` | FP4 indexer for ~2x additional KV savings |
-| `--tokenizer-mode deepseek_v4` | Required for the V4 chat template |
-| `--reasoning-parser deepseek_v4` | Strips `<think>…</think>` into `reasoning_content` |
-| `--compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'` | Enables TileLang kernel fusions |
-| `VLLM_ENGINE_READY_TIMEOUT_S=3600` | Prevents premature `ApiServer_0` timeout during startup |
+| Flag | Purpose |
+| ------------------------------------------------------------------------------------- | ------------------------------------------------------------- |
+| `--data-parallel-size 8` | Expert parallelism across 8 GPUs (no TP needed for MoE) |
+| `--enable-expert-parallel` | Required alongside `--data-parallel-size` |
+| `--kv-cache-dtype fp8` | Matches DeepSeek V4's hybrid c4a / c128a KV cache design |
+| `--block-size 256` | Unified 256-token logical block across all compression layers |
+| `--attention_config.use_fp4_indexer_cache=True` | FP4 indexer for ~2x additional KV savings |
+| `--tokenizer-mode deepseek_v4` | Required for the V4 chat template |
+| `--reasoning-parser deepseek_v4` | Strips `<think>…</think>` into `reasoning_content` |
+| `--compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'` | Enables TileLang kernel fusions |
+| `VLLM_ENGINE_READY_TIMEOUT_S=3600` | Prevents premature `ApiServer_0` timeout during startup |
## Performance Benchmark
@@ -60,6 +60,7 @@ uv run inference-endpoint benchmark from-config \
```
Config: [`vllm_dsv4pro_perf.yaml`](vllm_dsv4pro_perf.yaml)
+
- 2-minute minimum run at concurrency 32
- Metrics: throughput, latency, TTFT, TPOT
@@ -72,10 +73,10 @@ uv run inference-endpoint benchmark from-config \
Config: [`vllm_dsv4pro_accuracy.yaml`](vllm_dsv4pro_accuracy.yaml)
-| Dataset | Samples | Repeats | Extractor | Scorer |
-|---------|---------|---------|-----------|--------|
-| AIME 2025 | 30 | 8 | `boxed_math_extractor` | `pass_at_1` |
-| GPQA Diamond | 198 | 5 | `abcd_extractor` | `pass_at_1` |
+| Dataset | Samples | Repeats | Extractor | Scorer |
+| ------------ | ------- | ------- | ---------------------- | ----------- |
+| AIME 2025 | 30 | 8 | `boxed_math_extractor` | `pass_at_1` |
+| GPQA Diamond | 198 | 5 | `abcd_extractor` | `pass_at_1` |
### Concurrency note
@@ -93,21 +94,21 @@ reasoning phase and forces a final answer.
### Measured results (8×B200, `deepseekv4-cu130`)
-| Dataset | Score |
-|---------|-------|
+| Dataset | Score |
+| ---------------- | ------------------------------------------ |
| AIME 2025 pass@1 | **55.4%** (8 repeats, budget_tokens=20000) |
## MLPerf Inference Accuracy Suite
The MLPerf DeepSeek-R1 accuracy check uses 5 sub-datasets (4388 total samples):
-| Sub-dataset | Samples | Metric | File |
-|-------------|---------|--------|------|
-| AIME 1983 | 932 | exact_match | `mlperf_deepseek_r1_math_accuracy.parquet` |
-| MATH-500 | 499 | exact_match | `mlperf_deepseek_r1_math_accuracy.parquet` |
-| GPQA | 198 | exact_match | `mlperf_deepseek_r1_mcq_accuracy.parquet` |
-| MMLU-Pro | 2410 | exact_match | extracted by `extract_mlperf_subsets.py` |
-| LiveCodeBench | 349 | code_execute_verify | extracted by `extract_mlperf_subsets.py` |
+| Sub-dataset | Samples | Metric | File |
+| ------------- | ------- | ------------------- | ------------------------------------------ |
+| AIME 1983 | 932 | exact_match | `mlperf_deepseek_r1_math_accuracy.parquet` |
+| MATH-500 | 499 | exact_match | `mlperf_deepseek_r1_math_accuracy.parquet` |
+| GPQA | 198 | exact_match | `mlperf_deepseek_r1_mcq_accuracy.parquet` |
+| MMLU-Pro | 2410 | exact_match | extracted by `extract_mlperf_subsets.py` |
+| LiveCodeBench | 349 | code_execute_verify | extracted by `extract_mlperf_subsets.py` |
**Golden accuracy (fp32):** `exact_match = 81.3582%`, `TOKENS_PER_SAMPLE = 3886.2`
**MLPerf pass threshold:** ≥ 80.52% exact_match (99% of golden), tokens within ±10%
@@ -119,6 +120,7 @@ uv run python examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py
```
This writes:
+
- `datasets/deepseek/mlperf_deepseek_r1_mmlu_pro_accuracy.parquet`
- `datasets/deepseek/mlperf_deepseek_r1_livecodebench_accuracy.parquet`
@@ -148,6 +150,7 @@ docker logs | tail -40
```
Common causes:
+
- `TimeoutError: Timed out waiting for engine core processes to start` — set `VLLM_ENGINE_READY_TIMEOUT_S=3600` (already set in `launch_server.sh`)
- OOM during weight loading — verify `--max-model-len` is not too large for available GPU memory
- `MODEL_PATH` not mounted correctly — check that `/model/config.json` exists inside the container
@@ -162,7 +165,7 @@ accuracy-only runs. Use the perf-warmup entry with `n_samples_to_issue: 1`.
The model exhausted `max_new_tokens` in the thinking phase. Add `budget_tokens` to the preset:
```yaml
-- name: aime25::gptoss_budget_20k # uses budget_tokens=20000
+- name: aime25::gptoss_budget_20k # uses budget_tokens=20000
```
**`uv: cannot execute binary file: Exec format error`**
diff --git a/examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py b/examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py
index 6e945dfa..77015879 100644
--- a/examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py
+++ b/examples/09_DeepSeek-V4-Pro_Example/extract_mlperf_subsets.py
@@ -30,7 +30,9 @@
import pandas as pd
-SRC = Path("datasets/deepseek/mlperf_deepseek_r1_dataset_4388_fp8_eval_accuracy.parquet")
+SRC = Path(
+ "datasets/deepseek/mlperf_deepseek_r1_dataset_4388_fp8_eval_accuracy.parquet"
+)
OUT_DIR = Path("datasets/deepseek")
SUBSETS = {
diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
index 3a8e004f..5cccbacf 100644
--- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml
+++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
@@ -12,6 +12,7 @@ model_params:
max_new_tokens: 1024 # Max output tokens
osl_distribution: null # Output sequence length distribution
streaming: 'on' # Streaming mode: auto/on/off | options: auto, on, off
+ tokenizer_name: null # Local tokenizer path override. Use when AutoTokenizer.from_pretrained fails for the HF model name (e.g. transformers ≥5.4 rope_theta regression for DeepSeek-V4). Defaults to the model name if unset.
datasets: # Dataset configs
- name: perf
type: performance # Dataset purpose: performance or accuracy | options: performance, accuracy
diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml
index faabffde..ce1186c8 100644
--- a/src/inference_endpoint/config/templates/offline_template_full.yaml
+++ b/src/inference_endpoint/config/templates/offline_template_full.yaml
@@ -12,6 +12,7 @@ model_params:
max_new_tokens: 1024 # Max output tokens
osl_distribution: null # Output sequence length distribution
streaming: 'off' # Streaming mode: auto/on/off | options: auto, on, off
+ tokenizer_name: null # Local tokenizer path override. Use when AutoTokenizer.from_pretrained fails for the HF model name (e.g. transformers ≥5.4 rope_theta regression for DeepSeek-V4). Defaults to the model name if unset.
datasets: # Dataset configs
- name: perf
type: performance # Dataset purpose: performance or accuracy | options: performance, accuracy
diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml
index e9b7a673..39ac0dd6 100644
--- a/src/inference_endpoint/config/templates/online_template_full.yaml
+++ b/src/inference_endpoint/config/templates/online_template_full.yaml
@@ -12,6 +12,7 @@ model_params:
max_new_tokens: 1024 # Max output tokens
osl_distribution: null # Output sequence length distribution
streaming: 'on' # Streaming mode: auto/on/off | options: auto, on, off
+ tokenizer_name: null # Local tokenizer path override. Use when AutoTokenizer.from_pretrained fails for the HF model name (e.g. transformers ≥5.4 rope_theta regression for DeepSeek-V4). Defaults to the model name if unset.
datasets: # Dataset configs
- name: perf
type: performance # Dataset purpose: performance or accuracy | options: performance, accuracy
diff --git a/src/inference_endpoint/dataset_manager/predefined/aime25/presets.py b/src/inference_endpoint/dataset_manager/predefined/aime25/presets.py
index 75c1ef61..f60a1fe4 100644
--- a/src/inference_endpoint/dataset_manager/predefined/aime25/presets.py
+++ b/src/inference_endpoint/dataset_manager/predefined/aime25/presets.py
@@ -43,7 +43,9 @@ def gptoss_budget() -> list[Transform]:
# Same as gptoss but caps thinking at 8192 tokens via budget_tokens so the model
# is forced to emit a final answer rather than consuming all max_new_tokens in
# the reasoning phase (observed issue: 85% of responses had empty answer text).
- AddStaticColumns({"chat_template_kwargs": {"thinking": True, "budget_tokens": 8192}}),
+ AddStaticColumns(
+ {"chat_template_kwargs": {"thinking": True, "budget_tokens": 8192}}
+ ),
]
@@ -52,7 +54,9 @@ def gptoss_budget_20k() -> list[Transform]:
UserPromptFormatter(
user_prompt_format="{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.",
),
- AddStaticColumns({"chat_template_kwargs": {"thinking": True, "budget_tokens": 20000}}),
+ AddStaticColumns(
+ {"chat_template_kwargs": {"thinking": True, "budget_tokens": 20000}}
+ ),
]
@@ -61,5 +65,7 @@ def gptoss_budget_20k_pre() -> list[Transform]:
UserPromptFormatter(
user_prompt_format="Please reason step by step, and put your final answer within \\boxed{{}}.\n\n{question}",
),
- AddStaticColumns({"chat_template_kwargs": {"thinking": True, "budget_tokens": 20000}}),
+ AddStaticColumns(
+ {"chat_template_kwargs": {"thinking": True, "budget_tokens": 20000}}
+ ),
]
diff --git a/uv.lock b/uv.lock
index ed84bd7e..9017350e 100644
--- a/uv.lock
+++ b/uv.lock
@@ -877,7 +877,7 @@ requires-dist = [
{ name = "sphinx-autodoc-typehints", marker = "extra == 'dev'", specifier = "==3.9.11" },
{ name = "sphinx-rtd-theme", marker = "extra == 'dev'", specifier = "==3.1.0" },
{ name = "sqlalchemy", marker = "extra == 'sql'", specifier = "==2.0.48" },
- { name = "transformers", specifier = "==5.4.0" },
+ { name = "transformers", specifier = "==5.5.0" },
{ name = "typing-extensions", specifier = "==4.15.0" },
{ name = "uvloop", specifier = "==0.22.1" },
{ name = "websocket-client", specifier = "==1.9.0" },
@@ -2403,7 +2403,7 @@ wheels = [
[[package]]
name = "transformers"
-version = "5.4.0"
+version = "5.5.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "huggingface-hub", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
@@ -2416,9 +2416,9 @@ dependencies = [
{ name = "tqdm", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
{ name = "typer", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'x86_64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/0b/4c/42a8e1c7bbe668d8e073941ec3205263afb1cd02683fa5a8a75e615fdfbe/transformers-5.4.0.tar.gz", hash = "sha256:cb34ca89dce345ae3224b290346b9c0fa9694b951d54f3ed16334a4b1bfe3d04", size = 8152836, upload-time = "2026-03-27T00:24:24.692Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/ff/9d/fb46e729b461985f41a5740167688b924a4019141e5c164bea77548d3d9e/transformers-5.5.0.tar.gz", hash = "sha256:c8db656cf51c600cd8c75f06b20ef85c72e8b8ff9abc880c5d3e8bc70e0ddcbd", size = 8237745, upload-time = "2026-04-02T16:13:08.113Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/0b/a0/0a87883e564e364baab32adcacb4bec2e200b28a568423c8cf7fde316461/transformers-5.4.0-py3-none-any.whl", hash = "sha256:9fbe50602d2a4e6d0aa8a35a605433dfac72d595ee2192eae192590a6cc2df86", size = 10105556, upload-time = "2026-03-27T00:24:21.735Z" },
+ { url = "https://files.pythonhosted.org/packages/e7/28/35f7411ff80a3640c1f4fc907dcbb6a65061ebb82f66950e38bfc9f7f740/transformers-5.5.0-py3-none-any.whl", hash = "sha256:821a9ff0961abbb29eb1eb686d78df1c85929fdf213a3fe49dc6bd94f9efa944", size = 10245591, upload-time = "2026-04-02T16:13:03.462Z" },
]
[[package]]
From 4ba9fe0054943894e5d6237598f576f7bc0c666b Mon Sep 17 00:00:00 2001
From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
Date: Fri, 1 May 2026 13:20:13 -0700
Subject: [PATCH 4/5] Reduce changes.
Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
---
src/inference_endpoint/core/types.py | 8 ++---
.../dataset_manager/transforms.py | 2 +-
src/inference_endpoint/openai/accumulator.py | 13 ++++----
tests/unit/evaluation/test_extractor.py | 30 ++-----------------
4 files changed, 13 insertions(+), 40 deletions(-)
diff --git a/src/inference_endpoint/core/types.py b/src/inference_endpoint/core/types.py
index 848ba61a..accd2ca8 100644
--- a/src/inference_endpoint/core/types.py
+++ b/src/inference_endpoint/core/types.py
@@ -133,13 +133,9 @@ def text_after_first_chunk(self) -> str:
"""
parts: list[str] = []
if self.reasoning:
- if isinstance(self.reasoning, str):
- # str reasoning is the fully joined streaming trace — include it
- # in the TPOT denominator. Over-counts by one token (the first
- # token is not excluded), but the error is negligible in practice.
- parts.append(self.reasoning)
- elif isinstance(self.reasoning, tuple) and len(self.reasoning) > 1:
+ if isinstance(self.reasoning, tuple) and len(self.reasoning) > 1:
parts.extend(self.reasoning[1:])
+ # str reasoning: single chunk, skip entirely (it IS the first chunk)
if self.output:
if isinstance(self.output, str):
# Non-streaming: if reasoning was present and was the first chunk,
diff --git a/src/inference_endpoint/dataset_manager/transforms.py b/src/inference_endpoint/dataset_manager/transforms.py
index 2e1e7683..319cb887 100644
--- a/src/inference_endpoint/dataset_manager/transforms.py
+++ b/src/inference_endpoint/dataset_manager/transforms.py
@@ -125,7 +125,7 @@ def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
for key, value in self.data.items():
# Wrap dict/list values in a list so pandas doesn't try to align
# on index keys (e.g. {"thinking": True} would produce NaN otherwise).
- if isinstance(value, (dict, list)):
+ if isinstance(value, dict | list):
df[key] = [value] * len(df)
else:
df[key] = value
diff --git a/src/inference_endpoint/openai/accumulator.py b/src/inference_endpoint/openai/accumulator.py
index c78fd922..6cb23ed8 100644
--- a/src/inference_endpoint/openai/accumulator.py
+++ b/src/inference_endpoint/openai/accumulator.py
@@ -68,14 +68,15 @@ def add_chunk(self, delta: OpenAISSEDelta) -> StreamChunk | None:
def get_final_output(self) -> QueryResult:
if self.reasoning_chunks:
- # All reasoning chunks are joined into a single string so the full
- # thinking trace is captured as-is in events.jsonl. TPOT still uses
- # text_after_first_chunk(), which includes string reasoning in the
- # denominator (off by one token vs. the true "after first chunk"
- # count, which is negligible).
+ # If there are reasoning chunks, then the first chunk received
+ # is the first reasoning chunk. The rest of the reasoning chunks,
+ # as well as the output chunks can be joined together.
+ resp_reasoning: list[str] = [self.reasoning_chunks[0]]
+ if len(self.reasoning_chunks) > 1:
+ resp_reasoning.append("".join(self.reasoning_chunks[1:]))
text_output = TextModelOutput(
output="".join(self.output_chunks),
- reasoning="".join(self.reasoning_chunks),
+ reasoning=resp_reasoning,
)
elif self.output_chunks:
# If there are only output chunks, the first chunk is used for
diff --git a/tests/unit/evaluation/test_extractor.py b/tests/unit/evaluation/test_extractor.py
index e2db968a..0ca5afb8 100644
--- a/tests/unit/evaluation/test_extractor.py
+++ b/tests/unit/evaluation/test_extractor.py
@@ -16,7 +16,7 @@
import pytest
from inference_endpoint.evaluation.extractor import (
- ABCDExtractor,
+ Extractor,
LetterExtractor,
PythonCodeExtractor,
)
@@ -148,15 +148,11 @@ def test_extract_whitespace_handling(self):
def test_registered_in_extractor_registry(self):
"""Test that PythonCodeExtractor is registered."""
- from inference_endpoint.evaluation.extractor import Extractor
-
assert "python_code_extractor" in Extractor.PREDEFINED
assert Extractor.get("python_code_extractor") == PythonCodeExtractor
def test_extractor_get_method(self):
"""Test that we can retrieve PythonCodeExtractor by name."""
- from inference_endpoint.evaluation.extractor import Extractor
-
extractor_cls = Extractor.get("python_code_extractor")
text = "```python\nprint('test')\n```"
result = extractor_cls.extract(text)
@@ -174,17 +170,9 @@ def test_answer_colon(self):
def test_markdown_answer(self):
assert LetterExtractor.extract("**Answer:** C") == "C"
- def test_answer_colon_d(self):
- assert LetterExtractor.extract("Answer: D") == "D"
-
- # Extended range (E–J) for MMLU-Pro
- def test_letter_e(self):
+ def test_extended_range(self):
+ # E–J range needed for MMLU-Pro (10 choices); check boundary letters
assert LetterExtractor.extract("Answer: E") == "E"
-
- def test_letter_g(self):
- assert LetterExtractor.extract("**Answer:** G") == "G"
-
- def test_letter_j(self):
assert LetterExtractor.extract("Answer: J") == "J"
def test_boxed_letter(self):
@@ -204,18 +192,6 @@ def test_no_match_returns_empty_string(self):
def test_default_on_no_match(self):
assert LetterExtractor.extract("Nothing", default="X") == "X"
- def test_returns_letter_not_choice_key(self):
- # Confirm it does NOT return "choice1" etc. (ABCDExtractor behaviour)
- result = LetterExtractor.extract("Answer: B")
- assert result == "B"
- assert result != "choice2"
-
- def test_abcd_extractor_unchanged(self):
- # Confirm ABCDExtractor still maps to choiceN (regression guard)
- assert ABCDExtractor.extract("Answer: B") == "choice2"
-
def test_registered(self):
- from inference_endpoint.evaluation.extractor import Extractor
-
assert "letter_extractor" in Extractor.PREDEFINED
assert Extractor.get("letter_extractor") is LetterExtractor
From 743258278d1c057d4857ce7019e5703317fcdadd Mon Sep 17 00:00:00 2001
From: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
Date: Mon, 4 May 2026 11:00:35 -0700
Subject: [PATCH 5/5] Sync.
Signed-off-by: Rashid Kaleem <230885705+arekay-nv@users.noreply.github.com>
---
.../vllm_dsv4pro_mlperf_accuracy.yaml | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/examples/09_DeepSeek-V4-Pro_Example/vllm_dsv4pro_mlperf_accuracy.yaml b/examples/09_DeepSeek-V4-Pro_Example/vllm_dsv4pro_mlperf_accuracy.yaml
index f4d29d4e..89c76f83 100644
--- a/examples/09_DeepSeek-V4-Pro_Example/vllm_dsv4pro_mlperf_accuracy.yaml
+++ b/examples/09_DeepSeek-V4-Pro_Example/vllm_dsv4pro_mlperf_accuracy.yaml
@@ -1,6 +1,6 @@
name: dsv4pro-vllm-b200x8-mlperf-accuracy
version: "1.0"
-type: online
+type: offline
timeout: 600.0
# MLPerf Inference accuracy evaluation for DeepSeek-R1 / DeepSeek-V4-Pro.
@@ -95,8 +95,7 @@ settings:
dataloader_random_seed: 42
load_pattern:
- type: concurrency
- target_concurrency: 1024
+ type: max_throughput
client:
num_workers: -1