22 commits
- `8129aee` chore: ignore .worktrees/ directory (wu6u3tw, Apr 30, 2026)
- `9ded0e6` feat(wan22): add WAN 2.2 text-to-video adapter, dataset, wire types, … (wu6u3tw, Apr 30, 2026)
- `ddac990` refactor: rename wan22 module to videogen, Wan22Adapter→VideoGenAdapt… (wu6u3tw, Apr 30, 2026)
- `fe068e4` fix: make response_format optional in VideoGenAdapter (default video_… (wu6u3tw, Apr 30, 2026)
- `a9d1703` chore(wan22): add MLPerf WAN2.2 prompt dataset (248 samples) (wu6u3tw, Apr 30, 2026)
- `cb1fe67` chore: apply post-rebase pre-commit autofixes (wu6u3tw, Apr 30, 2026)
- `62b8dfb` fix(videogen): negative_prompt None default + exclude_none, add laten… (wu6u3tw, Apr 30, 2026)
- `8061668` remove path in offline_wan22_lyris.yaml (wu6u3tw, Apr 30, 2026)
- `32d7943` Remove name of the datacenter setup_and_test.sh (wu6u3tw, Apr 30, 2026)
- `e345c5b` Update and rename offline_wan22_lyris.yaml to offline_wan22.yaml (wu6u3tw, Apr 30, 2026)
- `e0ae775` fix(videogen): default response_format to video_path; align docs (wu6u3tw, Apr 30, 2026)
- `8295afc` chore(wan22): bundle prompts dataset under example folder; fix stale … (wu6u3tw, Apr 30, 2026)
- `9b6b70a` refactor(videogen): drop VideoGenDataset, ingest JSONL via generic lo… (wu6u3tw, May 1, 2026)
- `837397d` fix(wan22): remove invalid metrics block from offline_wan22.yaml (wu6u3tw, May 1, 2026)
- `763f65a` refactor(videogen): review polish — revert latent factory regression,… (wu6u3tw, May 1, 2026)
- `e960fa0` docs(videogen): align references with current module layout (wu6u3tw, May 5, 2026)
- `7f645e3` refactor(videogen): drop hardcoded WAN 2.2 strings, trim example dataset (wu6u3tw, May 5, 2026)
- `0a7d314` test(videogen): tmp_path mock video, drop unused videogen extra and s… (wu6u3tw, May 5, 2026)
- `d0b9a00` refactor(testing): expose route hook on EchoServer; videogen mocks re… (wu6u3tw, May 5, 2026)
- `7b6f128` fix(videogen): tighten request/response wiring and fail loud on strea… (wu6u3tw, May 6, 2026)
- `8ce9006` refactor(videogen): video_id belongs in metadata, not response_output (wu6u3tw, May 6, 2026)
- `63e259a` fix(probe): reject api_type=videogen with a clear error (wu6u3tw, May 6, 2026)
1 change: 1 addition & 0 deletions .gitignore
@@ -194,6 +194,7 @@ examples/03_BenchmarkComparison/vllm_venv/
.cursor/
docs/superpowers/
.claude/agent-memory/
.worktrees/

# User-specific local dev configs; do not commit
CLAUDE.local.md
25 changes: 15 additions & 10 deletions AGENTS.md
@@ -74,16 +74,17 @@ Dataset Manager --> Load Generator --> Endpoint Client --> External Endpoint

### Key Components

| Component           | Location                                                      | Purpose |
| ------------------- | ------------------------------------------------------------- | ------- |
| **Load Generator**  | `src/inference_endpoint/load_generator/`                      | Central orchestrator: `BenchmarkSession` owns the lifecycle, `Scheduler` controls timing, `LoadGenerator` issues queries |
| **Endpoint Client** | `src/inference_endpoint/endpoint_client/`                     | Multi-process HTTP workers communicating via ZMQ IPC. `HTTPEndpointClient` is the main entry point |
| **Dataset Manager** | `src/inference_endpoint/dataset_manager/`                     | Loads JSONL, HuggingFace, CSV, JSON, Parquet datasets. `Dataset` base class with `load_sample()`/`num_samples()` interface |
| **Metrics**         | `src/inference_endpoint/metrics/`                             | `EventRecorder` writes to SQLite, `MetricsReporter` reads and aggregates (QPS, latency, TTFT, TPOT) |
| **Config**          | `src/inference_endpoint/config/`, `endpoint_client/config.py` | Pydantic-based YAML schema (`schema.py`), `HTTPClientConfig` (single Pydantic model for CLI/YAML/runtime), `RuntimeSettings` |
| **CLI**             | `src/inference_endpoint/main.py`, `commands/benchmark/cli.py` | cyclopts-based, auto-generated from `schema.py` and `HTTPClientConfig` Pydantic models. Flat shorthands via `cyclopts.Parameter(alias=...)` |
| **Async Utils**     | `src/inference_endpoint/async_utils/`                         | `LoopManager` (uvloop + eager_task_factory), ZMQ transport layer, event publisher |
| **OpenAI/SGLang**   | `src/inference_endpoint/openai/`, `sglang/`                   | Protocol adapters and response accumulators for different API formats |
| **VideoGen**        | `src/inference_endpoint/videogen/`                            | Adapter for video-generation endpoints (e.g. trtllm-serve `POST /v1/videos/generations`, used by MLPerf WAN2.2-T2V-A14B). Defaults to `response_format=video_path` (server saves video to shared storage and returns path) to avoid large byte payloads; switch to `video_bytes` for accuracy mode. Dataset is ingested via the generic JSONL loader. |

### Hot-Path Architecture

@@ -199,6 +200,10 @@ src/inference_endpoint/
│ ├── accumulator.py # Streaming response accumulator
│ └── harmony.py # openai_harmony integration
├── sglang/ # SGLang API adapter
├── videogen/ # Video generation adapter (e.g. WAN2.2 T2V workload)
│ ├── __init__.py
│ ├── types.py # Pydantic: VideoPathRequest, VideoPathResponse, VideoPayloadResponse
│ └── adapter.py # VideoGenAdapter (HttpRequestAdapter) + VideoGenAccumulator (no-op)
├── evaluation/ # Accuracy evaluation (extractor, scoring, livecodebench)
├── plugins/ # Plugin system
├── profiling/ # line_profiler integration, pytest plugin
54 changes: 54 additions & 0 deletions examples/09_Wan22_VideoGen_Example/offline_wan22.yaml
@@ -0,0 +1,54 @@
# Offline Video Generation Benchmark for WAN 2.2 (GB200/GB300)
#
# Targets trtllm-serve POST /v1/videos/generations directly (no proxy).
# Uses response_format=video_path: server saves video to Lustre and returns
# the file path, avoiding large video byte payloads over HTTP/ZMQ.
#
# MLPerf inference parameters (text_to_video task):
# Resolution: 720x1280 (portrait)
# Duration: 81 frames = 5 s
# Steps: 20 denoising steps
# Guidance: 4.0 (primary CFG) / 3.0 (null-text secondary)
# Seed: 42 (fixed for reproducibility; combine with fixed_latent.pt)
# Dataset: 248 prompts from shopify_product_catalogue::q3vl
#
# Resolution / duration / steps / guidance / seed are defaulted on
# `VideoPathRequest`. Each JSONL row carries `prompt` plus the canonical
# MLPerf `negative_prompt`; both flow into `query.data` and serialise into
# the request body, while unset fields fall back to the request defaults.

name: "offline-wan22-video-generation-benchmark"
version: "1.0"
type: "offline"

model_params:
name: "wan22"
max_new_tokens: 1 # Ignored by VideoGenAdapter; kept >0 so swapping api_type to openai/sglang for debugging doesn't yield a 400.
streaming: "off" # WAN 2.2 uses non-streaming HTTP POST/response

datasets:
- name: wan22_prompts
path: examples/09_Wan22_VideoGen_Example/wan22_prompts.jsonl
type: "performance"
samples: 248

settings:
runtime:
max_duration_ms: 600000 # 10 minute cap
scheduler_random_seed: 42
dataloader_random_seed: 42
n_samples_to_issue: 248

load_pattern:
type: "max_throughput"

client:
num_workers: 4

endpoint_config:
endpoints:
- "http://localhost:8000"
api_type: "videogen"
api_key: null

report_dir: logs/wan22_video_generation_benchmark
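The comment block at the top of the YAML says each JSONL row carries `prompt` plus the canonical `negative_prompt`, with unset fields falling back to request defaults. A minimal sketch of that merge, where the dict keys are assumptions based on those comments rather than the loader's actual field names:

```python
import json

# One JSONL row per sample, as bundled in wan22_prompts.jsonl (values invented).
row = json.loads(
    '{"prompt": "a ceramic mug on a wooden desk", '
    '"negative_prompt": "blurry, low quality"}'
)

# Request-level defaults (the MLPerf parameters listed in the YAML header).
defaults = {
    "size": "720x1280",
    "num_frames": 81,
    "num_inference_steps": 20,
    "guidance_scale": 4.0,
    "seed": 42,
    "response_format": "video_path",
}

# Row values win; None values are excluded, matching exclude_none serialisation,
# so a row without a negative_prompt simply omits the field from the body.
body = {**defaults, **{k: v for k, v in row.items() if v is not None}}
```

Every one of the 248 rows thus produces a request body with identical resolution, step count, guidance, and seed, varying only in its prompts.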
33 changes: 33 additions & 0 deletions examples/09_Wan22_VideoGen_Example/setup_and_test.sh
@@ -0,0 +1,33 @@
#!/usr/bin/env bash
# setup_and_test.sh — End-to-end runbook for the WAN 2.2 video-generation example.
#
# Steps:
# 1. Download the WAN 2.2 weights from HuggingFace.
# 2. Launch trtllm-serve in a separate shell.
# 3. Run the offline benchmark from this script.
#
# Prerequisites: Python 3.12, a GPU host with trtllm-serve installed,
# and HuggingFace credentials (`huggingface-cli login`) — the WAN 2.2
# weights are gated.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

MODEL_REPO="Wan-AI/Wan2.2-T2V-A14B" # https://huggingface.co/Wan-AI/Wan2.2-T2V-A14B
MODEL_DIR="${MODEL_DIR:-${HOME}/models/wan2.2-t2v-a14b}"

cd "${REPO_ROOT}"

# 1. Download model weights (~28 GB).
huggingface-cli download "${MODEL_REPO}" --local-dir "${MODEL_DIR}"

# 2. Launch the server in a separate shell, then re-run this script:
#
# trtllm-serve "${MODEL_DIR}" --host 0.0.0.0 --port 8000 \
# --backend pytorch --task text_to_video
#
# 3. Run the offline benchmark.
inference-endpoint benchmark from-config \
--config "${SCRIPT_DIR}/offline_wan22.yaml"
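In `video_path` mode the server writes the video to shared storage and replies with the file path, and per commit `8ce9006` the adapter keeps `video_id` in metadata rather than in the response output. A sketch of that split, where the reply's JSON shape is a guess for illustration, not taken from the trtllm-serve API:

```python
# Hypothetical server reply for response_format=video_path; the exact field
# names are assumptions, not confirmed against trtllm-serve.
reply = {
    "video_id": "vid-0001",
    "video_path": "/lustre/outputs/vid-0001.mp4",
}

# video_id is bookkeeping, so it lands in metadata; the payload the benchmark
# records as the response output is just the path to the generated video.
metadata = {"video_id": reply["video_id"]}
response_output = reply["video_path"]
```

Keeping only the path in the output is what lets the offline run avoid moving multi-megabyte video payloads through HTTP and the ZMQ IPC layer.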