3 changes: 3 additions & 0 deletions .gitignore
@@ -197,3 +197,6 @@ docs/superpowers/

# User-specific local dev configs; do not commit
CLAUDE.local.md

# Generated dataset cache (created by Dataset.get_dataloader())
dataset_cache/
@@ -17,6 +17,7 @@

import argparse
import asyncio
import logging
from contextlib import AbstractContextManager, nullcontext
from pathlib import Path

@@ -91,10 +92,16 @@ async def main() -> None:

    # Using ternary operator causes errors in MyPy object type coalescing
    # (coalesces to 'object' not 'AbstractContextManager[TokenizePool | None]')
+    pool_cm: AbstractContextManager[TokenizePool | None]
    if args.tokenizer:
-        pool_cm: AbstractContextManager[TokenizePool | None] = TokenizePool(
-            args.tokenizer, n_workers=args.tokenizer_workers
-        )
+        try:
+            pool_cm = TokenizePool(args.tokenizer, n_workers=args.tokenizer_workers)
+        except Exception as e:
+            logging.warning(
[Review Council — Claude] low · error-handling

Tokenizer load failure is silently downgraded to "no token metrics":

except Exception as e:
    logging.warning(
        f"Failed to load tokenizer '{args.tokenizer}': {e}. "
        "ISL/OSL/TPOT token metrics will be unavailable."
    )
    pool_cm = nullcontext()

A user who explicitly passes --tokenizer X and thus expects token metrics will instead see a single WARNING line buried in the log, then a silent benchmark run with empty ISL/OSL/TPOT columns. By the time they check the report, the run is over.

Fix: re-raise unless an explicit --allow-tokenizer-fallback flag is set, OR at least log at ERROR level and surface the missing-tokenizer state in the final report. Also use the module logger (logger = logging.getLogger(__name__)) rather than the root logging module.
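A minimal sketch of the flag-gated variant (the --allow-tokenizer-fallback flag and the logger placement are assumptions, not existing code):

logger = logging.getLogger(__name__)  # module logger instead of the root logger

pool_cm: AbstractContextManager[TokenizePool | None]
if args.tokenizer:
    try:
        pool_cm = TokenizePool(args.tokenizer, n_workers=args.tokenizer_workers)
    except Exception as e:
        if not getattr(args, "allow_tokenizer_fallback", False):  # hypothetical flag
            raise  # the user asked for token metrics; fail loudly
        logger.error(
            f"Failed to load tokenizer '{args.tokenizer}': {e}. "
            "ISL/OSL/TPOT token metrics will be unavailable."
        )
        pool_cm = nullcontext()
else:
    pool_cm = nullcontext()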

f"Failed to load tokenizer '{args.tokenizer}': {e}. "
"ISL/OSL/TPOT token metrics will be unavailable."
)
pool_cm = nullcontext()
else:
pool_cm = nullcontext()

50 changes: 49 additions & 1 deletion src/inference_endpoint/commands/benchmark/execute.py
@@ -27,11 +27,13 @@
import json
import logging
import platform
import random
import shutil
import signal
import tempfile
import uuid
from dataclasses import dataclass, field
from dataclasses import replace as dataclass_replace
from datetime import datetime
from pathlib import Path
from typing import Any
@@ -70,7 +72,7 @@
    TestType,
)
from inference_endpoint.core.types import QueryResult
-from inference_endpoint.dataset_manager.dataset import Dataset
+from inference_endpoint.dataset_manager.dataset import Dataset, SaltedDataset
from inference_endpoint.dataset_manager.factory import DataLoaderFactory
from inference_endpoint.endpoint_client.cpu_affinity import AffinityPlan, pin_loadgen
from inference_endpoint.endpoint_client.http_client import HTTPEndpointClient
@@ -347,6 +349,33 @@ def _build_phases(ctx: BenchmarkContext) -> list[PhaseConfig]:
"""Build the phase list from BenchmarkContext."""
phases: list[PhaseConfig] = []

# Warmup phase (optional, before performance)
warmup_cfg = ctx.config.settings.warmup
if warmup_cfg.enabled:
warmup_dataset: Dataset = (
SaltedDataset(ctx.dataloader) if warmup_cfg.salt else ctx.dataloader
)
warmup_rt = dataclass_replace(
ctx.rt_settings,
min_duration_ms=0,
max_duration_ms=None,
n_samples_from_dataset=ctx.dataloader.num_samples(),
n_samples_to_issue=warmup_cfg.n_requests,
min_sample_count=1,
rng_sched=random.Random(warmup_cfg.warmup_random_seed),
rng_sample_index=random.Random(warmup_cfg.warmup_random_seed + 1),
load_pattern=LoadPattern(type=LoadPatternType.MAX_THROUGHPUT),
)
phases.append(
PhaseConfig(
"warmup",
warmup_rt,
warmup_dataset,
PhaseType.WARMUP,
drain_after=warmup_cfg.drain,
)
)

# Performance phase
phases.append(
PhaseConfig(
@@ -525,12 +554,31 @@ async def _run_benchmark_async(
    phases = _build_phases(ctx)
    report: Report | None = None

    # Global wall-clock timeout covers warmup + performance + accuracy phases
    # combined, and bounds the warmup drain so a dropped request can't hang forever.
    global_timeout_handle = None
    max_duration_ms = ctx.rt_settings.max_duration_ms
    if max_duration_ms is not None:

        def _on_global_timeout() -> None:
            logger.warning(
                "Global experiment timeout reached (%d ms); stopping session.",
                max_duration_ms,
            )
            session.stop()
[Review Council — Claude] low · concurrency

The global timer's callback calls session.stop() directly, with no guard against firing after cleanup has begun:

def _on_global_timeout() -> None:
    logger.warning(
        "Global experiment timeout reached (%d ms); stopping session.",
        max_duration_ms,
    )
    session.stop()

global_timeout_handle.cancel() only runs after entering the finally: block at line 580, so the timer can fire during the gap between await session.run(phases) returning and the cancel() call running — or worse, during the slow cleanup that follows (http_client.shutdown_async(), publisher close, etc.). When that happens, session.stop() flips _stop_requested=True and _drain_event.set() on a session that has already finished. The downstream effect is mostly harmless today (the strategy task is None), but the race is real and easy to make harmful with future refactors.

Suggested fix:

def _on_global_timeout() -> None:
    if session._done:  # or expose a public flag
        return
    logger.warning(...)
    session.stop()

Also consider moving global_timeout_handle.cancel() to immediately after session.run(phases) returns (success path), and only run it in finally to handle the exception path.
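A sketch of the cancel-early variant; it only reorders calls that already exist in this diff:

try:
    result = await session.run(phases)
    if global_timeout_handle is not None:
        global_timeout_handle.cancel()  # success path: cancel before slow cleanup starts
        global_timeout_handle = None
except Exception as e:
    raise ExecutionError(f"Benchmark execution failed: {e}") from e
finally:
    if global_timeout_handle is not None:
        global_timeout_handle.cancel()  # now covers only the exception path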


        global_timeout_handle = loop.call_later(
[Review Council — Both (Codex P1 + Claude)] high · api-contract

The new global timer redefines what max_duration_ms means, but only when warmup is enabled, and the schema description was not updated.

max_duration_ms = ctx.rt_settings.max_duration_ms
if max_duration_ms is not None:
    ...
    global_timeout_handle = loop.call_later(
        max_duration_ms / 1000.0, _on_global_timeout
    )

session.py:_make_stop_check still enforces max_duration_ms per-phase against phase_start_ns, so we now have two clocks running on the same configured value:

  • per-phase, from each phase's start (the original semantic — and still what _build_phases clears to None for warmup/accuracy).
  • global, from loop.call_later scheduling time (warmup + perf + accuracy combined).

For a user with warmup.enabled=True and max_duration_ms=600_000 (10 min), if warmup takes 90s, the perf phase's effective wall-clock budget collapses from 10 min to ~8.5 min — silently. Worse, an accuracy pass in TestMode.BOTH (which sets max_duration_ms=None per-phase, expecting unbounded) can now be truncated mid-run by the global timer.

The schema description still reads "Maximum test duration in ms (0 for no limit)", which is the original per-phase meaning. Existing YAML configs will get shorter perf/accuracy phases once they enable warmup, with no documentation indicating why.

Suggested fixes (any one):

  1. Schedule the timer at the start of the performance phase (e.g., on START_PERFORMANCE_TRACKING) so warmup time doesn't eat the perf budget — preserves the per-phase semantic.
  2. Expose a separate warmup.max_duration_ms to bound only warmup, leave perf-phase max_duration_ms untouched.
  3. If a global cap is genuinely the goal, rename it (experiment_timeout_ms?), keep max_duration_ms per-phase, and document the change in WarmupConfig and release notes.
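A sketch of option 1; the on_performance_start hook is hypothetical (the real wiring would live wherever START_PERFORMANCE_TRACKING is emitted):

def _start_perf_timer() -> None:
    nonlocal global_timeout_handle
    if max_duration_ms is not None:
        # Budget starts when the performance phase starts, not at scheduling time,
        # so warmup no longer eats into it.
        global_timeout_handle = loop.call_later(
            max_duration_ms / 1000.0, _on_global_timeout
        )

session.on_performance_start = _start_perf_timer  # hypothetical callback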

            max_duration_ms / 1000.0, _on_global_timeout
        )

    loop.add_signal_handler(signal.SIGINT, session.stop)
    try:
        result = await session.run(phases)
    except Exception as e:
        raise ExecutionError(f"Benchmark execution failed: {e}") from e
    finally:
        if global_timeout_handle is not None:
            global_timeout_handle.cancel()
        loop.remove_signal_handler(signal.SIGINT)
        logger.info("Cleaning up...")
        try:
25 changes: 25 additions & 0 deletions src/inference_endpoint/config/schema.py
@@ -392,6 +392,30 @@ def _validate_completeness(self) -> Self:
        return self


class WarmupConfig(BaseModel):
    """Warmup phase configuration. Runs before the performance phase; results are not recorded."""

    model_config = ConfigDict(extra="forbid", frozen=True)

    enabled: bool = Field(
        False, description="Enable warmup phase before performance run"
    )
    n_requests: int | None = Field(
        None, gt=0, description="Warmup request count (None = full dataset once)"
    )
    salt: bool = Field(
        False, description="Prepend a unique random hex salt to each warmup prompt"
    )
    drain: bool = Field(
        False,
        description="Drain in-flight warmup requests before starting the performance phase",
    )
    warmup_random_seed: int = Field(
        42,
        description="RNG seed for warmup scheduling and sample ordering",
    )

@cyclopts.Parameter(name="*")
class Settings(BaseModel):
    """Test settings."""
@@ -401,6 +425,7 @@ class Settings(BaseModel):
    runtime: RuntimeConfig = Field(default_factory=RuntimeConfig)
    load_pattern: LoadPattern = Field(default_factory=LoadPattern)
    client: HTTPClientConfig = Field(default_factory=HTTPClientConfig)
    warmup: WarmupConfig = Field(default_factory=WarmupConfig)


class OfflineSettings(Settings):
@@ -68,6 +68,12 @@ settings:
    max_idle_time: 4.0 # Discard connections idle longer than this (seconds)
    min_required_connections: -1 # Min connections to initialize (-1=auto, 0=disabled)
    worker_gc_mode: relaxed # Worker GC strategy | options: disabled, relaxed, system
  warmup:
    enabled: false # Enable warmup phase before performance run
    n_requests: null # Warmup request count (None = full dataset once)
    salt: false # Prepend a unique random hex salt to each warmup prompt
    drain: false # Drain in-flight warmup requests before starting the performance phase
    warmup_random_seed: 42 # RNG seed for warmup scheduling and sample ordering
endpoint_config:
  endpoints: # Endpoint URL(s). Must include scheme, e.g. 'http://host:port'.
    - http://localhost:8000
Two more example config files receive the identical warmup hunk.
73 changes: 71 additions & 2 deletions src/inference_endpoint/dataset_manager/dataset.py
@@ -276,11 +276,12 @@ class Dataset:
    def __init_subclass__(
        cls,
        dataset_id: str | None = None,
+        register: bool = True,
        **kwargs,
    ):
        super().__init_subclass__(**kwargs)

-        if not inspect.isabstract(cls):
+        if register and not inspect.isabstract(cls):
            if dataset_id is None:
                dataset_id = cls.__name__
            cls.DATASET_ID = dataset_id
@@ -411,7 +412,7 @@ def num_samples(self) -> int:
    @classmethod
    def get_dataloader(
        cls,
-        datasets_dir: Path = Path("datasets"),
+        datasets_dir: Path = Path("dataset_cache"),
[Review Council — Claude] high · api-contract

Default datasets_dir changed from Path("datasets") to Path("dataset_cache"):

datasets_dir: Path = Path("dataset_cache"),

This is a silent breaking change. Any existing user who relied on the previous default will, after upgrade, fail to find the cached dataframe under datasets/<DATASET_ID>/<variant> and will silently re-download/re-generate (potentially many GB and minutes of work). The .gitignore change only ignores dataset_cache/, leaving any pre-existing datasets/ cache uncovered.

Suggest: keep Path("datasets") as the default, OR add a fallback that reads from datasets/ if dataset_cache/ is empty, OR document the migration prominently. At minimum, decouple this rename from the warmup PR — it's an orthogonal change.
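A sketch of the fallback option (the shim itself is hypothetical; only the two directory names come from the diff):

from pathlib import Path
import logging

def _resolve_datasets_dir(dataset_id: str, preferred: Path = Path("dataset_cache")) -> Path:
    """Prefer the new cache dir, but fall back to a pre-existing legacy cache."""
    legacy = Path("datasets")
    if not (preferred / dataset_id).exists() and (legacy / dataset_id).exists():
        logging.getLogger(__name__).warning(
            "Using legacy dataset cache at %s; move it to %s to migrate.",
            legacy, preferred,
        )
        return legacy
    return preferred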

        num_repeats: int = 1,
        transforms: list[Transform] | None = None,
        force_regenerate: bool = False,
@@ -431,6 +432,74 @@ def get_dataloader(
        return cls(df, transforms=transforms, repeats=num_repeats)


class SaltedDataset(Dataset, register=False):
    """Wraps a loaded Dataset, prepending a unique random salt to each prompt on load_sample().

    Each call to load_sample() generates a fresh salt, so reused samples (when
    n_requests > dataset size) each receive a distinct salt.
    """

    def __init__(self, inner: Dataset) -> None:
        # Skip Dataset.__init__ — all state is delegated to inner
        self._inner = inner
        self.dataframe = None
        self.transforms = None
        self.repeats = inner.repeats
        self.logger = getLogger(__name__)
Comment on lines +442 to +448
medium

SaltedDataset inherits from Dataset but its __init__ method signature is incompatible with the base class. While it works as a wrapper for an existing instance in the current implementation, it breaks the contract for any code that expects to instantiate a Dataset (or subclass) using the standard signature (e.g., df, transforms, repeats).

Specifically, calling the inherited @classmethod get_dataloader on SaltedDataset will fail with a TypeError because it attempts to call the constructor with arguments that SaltedDataset.__init__ does not accept. Additionally, setting self.dataframe and self.transforms to None might break other inherited methods that expect these attributes to be populated.


    @property  # type: ignore[override]
    def data(self) -> list[Any] | None:
        return self._inner.data

    @data.setter
    def data(self, value: list[Any] | None) -> None:
        self._inner.data = value

    def load(
        self,
        adapter: "HttpRequestAdapter | None" = None,
        api_type: APIType | None = None,
        model_params: ModelParams | None = None,
        force: bool = False,
    ) -> None:
        pass  # Inner dataset already loaded

    def load_sample(self, index: int) -> Any:
[Review Council — Claude] high · bug

SaltedDataset.load_sample() only mutates data["prompt"], but several adapters bypass the prompt entirely. In src/inference_endpoint/sglang/adapter.py:90:

input_tokens = query.data["input_tokens"]
return cls._request_encoder.encode(
    SGLangGenerateRequest(input_ids=input_tokens, ...))

When salt=True is used with a tokenized dataset (any pipeline that pre-tokenizes — common in MLPerf submissions), the salt prefix is added to the (unused) text but input_tokens are unchanged, so the server receives the same token-ID sequence for every warmup request. KV-cache reuse is NOT prevented. The class docstring's claim that "each call to load_sample() generates a fresh salt, so reused samples … each receive a distinct salt" is silently violated.

Fix options: (a) detect input_tokens/token_ids keys and either drop them or re-tokenize after salting; (b) raise an error at warmup setup if salt=True and the inner dataset has tokenized fields; (c) explicitly document the restriction.
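A sketch of option (b) as a guard in _build_phases (probing the first sample and the exact error type are assumptions):

if warmup_cfg.salt:
    probe = ctx.dataloader.load_sample(0)
    if isinstance(probe, dict) and ("input_tokens" in probe or "token_ids" in probe):
        raise ValueError(
            "warmup.salt=True cannot defeat KV-cache reuse for pre-tokenized "
            "datasets: adapters send input_tokens unchanged. Re-tokenize after "
            "salting or disable salt."
        )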

        data = self._inner.load_sample(index)
        if not isinstance(data, dict):
            return data
        if "input_tokens" in data and "prompt" not in data:
            self.logger.warning(
                "SaltedDataset: sample has 'input_tokens' but no 'prompt' — "
                "salt cannot be applied to pre-tokenized input; KV-cache reuse may not be prevented"
            )
            return data
        if "prompt" not in data:
            return data
        prompt = data["prompt"]
        salt = os.urandom(8).hex()
[Review Council — Claude] low · design

salt = os.urandom(8).hex() on every load_sample() ignores the seeded rng_sample_index in RuntimeSettings. Reproducibility-conscious users who set dataloader_random_seed to compare runs will see different salted prompts on each invocation. Warmup metrics aren't reported, but the actual prompts sent to the server (and thus server-side caches/stats) are nondeterministic.

Fix: thread the rng_sample_index (or a derived random.Random) through SaltedDataset.__init__ and use f'{rng.getrandbits(64):016x}' to keep determinism with a non-zero seed.
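A sketch of the seeded variant as a patch to the class above (the rng constructor parameter is an assumption):

def __init__(self, inner: Dataset, rng: random.Random | None = None) -> None:
    self._inner = inner
    self._rng = rng  # None preserves today's os.urandom behavior

def _make_salt(self) -> str:
    if self._rng is not None:
        return f"{self._rng.getrandbits(64):016x}"  # deterministic per seed
    return os.urandom(8).hex()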

        if isinstance(prompt, str):
            return {**data, "prompt": f"[{salt}] {prompt}"}
        if isinstance(prompt, list) and prompt:
            # Find the first text part at any index (image-first prompts place text at index 1+)
            for i, part in enumerate(prompt):
                if isinstance(part, dict) and part.get("type") == "text":
                    salted_parts = [
                        *prompt[:i],
                        {**part, "text": f"[{salt}] {part['text']}"},
                        *prompt[i + 1 :],
                    ]
                    return {**data, "prompt": salted_parts}
            self.logger.warning(
                "SaltedDataset: multimodal prompt has no text part — "
                "salt cannot be applied; KV-cache reuse may not be prevented"
            )
        return data  # unsupported prompt type — skip salting
[Review Council — Claude] medium · bug

When the prompt is a multimodal list where ALL parts are non-text (e.g., all image_url), the loop finds no text part and falls through to return data with no warning:

if isinstance(prompt, list) and prompt:
    for i, part in enumerate(prompt):
        if isinstance(part, dict) and part.get("type") == "text":
            ...
            return {**data, "prompt": salted_parts}
return data  # unsupported prompt type — skip salting (silent!)

Unlike the input_tokens case (which emits a logger.warning), this fallback is completely silent. Users who enable salt=True for image-only multimodal datasets will not see any indication that KV-cache bypass is not functioning. Add a logger.warning consistent with the input_tokens path.



    def num_samples(self) -> int:
        return self._inner.num_samples()


class EmptyDataset(Dataset):
    """Empty dataset to be used as performance dataset when running only accuracy tests."""

26 changes: 11 additions & 15 deletions src/inference_endpoint/load_generator/session.py
@@ -22,7 +22,6 @@

import asyncio
import logging
-import os
import time
import uuid
from collections.abc import Callable
@@ -44,8 +43,6 @@

logger = logging.getLogger(__name__)

-_WARMUP_ENABLED = os.environ.get("ENABLE_WARMUP") == "1"


# ---------------------------------------------------------------------------
# Phase configuration
@@ -68,6 +65,7 @@ class PhaseConfig:
    runtime_settings: RuntimeSettings
    dataset: Dataset
    phase_type: PhaseType = PhaseType.PERFORMANCE
+    drain_after: bool = True
[Review Council — Claude] low · api-contract

drain_after: bool = True was added to PhaseConfig with default True, but for PhaseType.WARMUP the previous behavior was effectively drain_after=False (the old _run_phase skipped drain for warmup unconditionally). Existing callers that construct PhaseConfig(..., PhaseType.WARMUP) directly (e.g. via the Python API or in tests) will now get the opposite drain semantics from before unless they pass drain_after=False explicitly. The PR's _build_phases correctly passes warmup_cfg.drain through, but third-party code paths get a silent behavior flip.

Fix: either default drain_after based on phase_type via a __post_init__ (warmup → False, others → True), OR make drain_after a required keyword for warmup phases.
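A sketch of the __post_init__ approach, using a None sentinel so "unspecified" is distinguishable (field names match the dataclass above; a frozen dataclass would need object.__setattr__ instead):

@dataclass
class PhaseConfig:
    name: str
    runtime_settings: RuntimeSettings
    dataset: Dataset
    phase_type: PhaseType = PhaseType.PERFORMANCE
    drain_after: bool | None = None  # None = derive from phase_type

    def __post_init__(self) -> None:
        if self.drain_after is None:
            # Preserve pre-PR semantics: warmup phases are never drained by default.
            self.drain_after = self.phase_type != PhaseType.WARMUP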



# ---------------------------------------------------------------------------
@@ -242,6 +240,7 @@ def __init__(
        self._stop_requested = False
        self._done = False
        self._current_phase_issuer: PhaseIssuer | None = None
+        self._current_phase_type: PhaseType | None = None
        self._current_strategy: LoadStrategy | None = None
        self._recv_task: asyncio.Task | None = None
        self._strategy_task: asyncio.Task | None = None
@@ -274,12 +273,6 @@ async def run(self, phases: list[PhaseConfig]) -> SessionResult:
        for phase in phases:
[Review Council — Claude] medium · api-contract

The _WARMUP_ENABLED = os.environ.get("ENABLE_WARMUP") == "1" gate (and the ENABLE_WARMUP=1 to enable log message) was removed. Anyone who scripted ENABLE_WARMUP=1 inference-endpoint ... to opt in to warmup will now silently see no behavior change — the env var is ignored, and warmup runs only when the YAML enables it.

Suggested fix: log a one-time warning at session start if ENABLE_WARMUP is set in the environment but the YAML doesn't enable warmup; OR document the rename in release notes. (Either way, the silent no-op is the worst case.)
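A sketch of the one-time warning (placement at session start is an assumption):

import os

if os.environ.get("ENABLE_WARMUP") == "1" and not ctx.config.settings.warmup.enabled:
    logger.warning(
        "ENABLE_WARMUP=1 no longer has any effect; enable warmup via "
        "settings.warmup.enabled in the YAML config instead."
    )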

            if self._stop_requested:
                break
-            if phase.phase_type == PhaseType.WARMUP and not _WARMUP_ENABLED:
-                logger.info(
-                    "Skipping warmup phase %s (set ENABLE_WARMUP=1 to enable)",
-                    phase.name,
-                )
-                continue
            result = await self._run_phase(phase)
            if result is not None:
                phase_results.append(result)
@@ -318,6 +311,7 @@ async def _run_phase(self, phase: PhaseConfig) -> PhaseResult | None:
        )

        self._current_phase_issuer = phase_issuer
+        self._current_phase_type = phase.phase_type
        self._current_strategy = strategy

        # Performance phases get tracking events
@@ -333,8 +327,7 @@ async def _run_phase(self, phase: PhaseConfig) -> PhaseResult | None:
        finally:
            self._strategy_task = None

-        # Drain in-flight (skip for warmup — keep concurrency hot)
-        if phase.phase_type != PhaseType.WARMUP:
+        if phase.drain_after:
            await self._drain_inflight(phase_issuer)

        if phase.phase_type == PhaseType.PERFORMANCE:
@@ -363,9 +356,9 @@ async def _run_phase(self, phase: PhaseConfig) -> PhaseResult | None:
    async def _drain_inflight(self, phase_issuer: PhaseIssuer) -> None:
        """Wait for all in-flight responses from this phase to complete.

-        Currently, there is no timeout for the drain step. In the future,
-        we can possibly add a dynamic timeout based on the rate of completion
-        throughout the current phase."""
+        Bounded by the global experiment timeout: if the caller schedules a
+        loop.call_later that calls stop(), stop() sets _drain_event, unblocking
+        this wait without leaving it hung indefinitely."""
[Review Council — Claude] medium · error-handling

The new docstring on _drain_inflight claims a safety guarantee that only holds in some configurations:

async def _drain_inflight(self, phase_issuer: PhaseIssuer) -> None:
    """Wait for all in-flight responses from this phase to complete.

    Bounded by the global experiment timeout: if the caller schedules a
    loop.call_later that calls stop(), stop() sets _drain_event, unblocking
    this wait without leaving it hung indefinitely."""
    ...
    self._drain_event.clear()
    await self._drain_event.wait()

The loop.call_later is only scheduled in execute.py:561 if ctx.rt_settings.max_duration_ms is not None. runtime_settings.from_config maps max_duration_ms == 0 to None, and the schema default is 0 — so the default offline/online CLI run has no global timer, and _drain_inflight retains its old unbounded await self._drain_event.wait(). A single dropped or hanging request will hang the benchmark forever.

The docstring suggests this case is solved; it isn't.

Suggested fixes:

  • Make the docstring conditional ("if a global timeout is configured…") and add a note that the drain is otherwise unbounded, OR
  • Add an intrinsic drain timeout (a fixed cap like 30–60s, or a multiple of observed completion latency) inside _drain_inflight itself so the safety property is a property of the function, not a contract on the caller.
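A sketch of the intrinsic-timeout option (the 60 s cap is an arbitrary illustration, not a tuned value):

self._drain_event.clear()
try:
    await asyncio.wait_for(self._drain_event.wait(), timeout=60.0)
except asyncio.TimeoutError:
    logger.warning(
        "Drain timed out with %d responses still in flight; continuing.",
        phase_issuer.inflight,
    )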

        if phase_issuer.inflight <= 0 or self._stop_requested:
            return
        logger.info("Draining %d in-flight responses...", phase_issuer.inflight)
@@ -425,7 +418,10 @@ def _handle_response(self, resp: QueryResult | StreamChunk) -> None:
                self._drain_event.set()
            if self._current_strategy:
                self._current_strategy.on_query_complete(query_id)
-            if self._on_sample_complete:
+            if (
+                self._on_sample_complete
+                and self._current_phase_type != PhaseType.WARMUP
[Review Council — Claude] low · data-integrity

The _on_sample_complete callback is filtered for warmup, but the SampleEventType.COMPLETE event publish at line 396 (just above) is NOT filtered:

self._publisher.publish(EventRecord(
    event_type=SampleEventType.COMPLETE, ..., sample_uuid=query_id, ...
))

When drain_after=False (warmup default), late-arriving warmup QueryResults race the start of the perf phase. MetricsTable.update_sample discards events for unknown sample IDs so this doesn't corrupt counters today — but the events still travel over ZMQ, are decoded, and (if event_logger is enabled) get persisted to the JSONL/SQLite event log. Downstream tooling reading the raw event log will see warmup-named samples interleaved with perf samples without any phase tag.

Fix: tag warmup queries with a phase marker on the EventRecord (or the Query) so the event logger can filter or annotate, OR add the _current_phase_type != PhaseType.WARMUP check around the publish at line 396 as well.
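A sketch of the narrow fix, guarding the publish the same way the callback is guarded (EventRecord fields abbreviated as in the quote above):

if self._current_phase_type != PhaseType.WARMUP:
    self._publisher.publish(EventRecord(
        event_type=SampleEventType.COMPLETE,
        sample_uuid=query_id,
        # ... remaining fields unchanged
    ))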

+            ):
                self._on_sample_complete(resp)

        elif isinstance(resp, StreamChunk):