diff --git a/examples/08_Qwen2.5-0.5B_Example/.gitignore b/examples/08_Qwen2.5-0.5B_Example/.gitignore new file mode 100644 index 00000000..a125fc2d --- /dev/null +++ b/examples/08_Qwen2.5-0.5B_Example/.gitignore @@ -0,0 +1,14 @@ +# Benchmark results +results/ + +# Generated data +data/*.pkl + +# Logs +*.log +benchmark_output.log + +# Python cache +__pycache__/ +*.pyc +*.pyo diff --git a/examples/08_Qwen2.5-0.5B_Example/QUICKSTART.md b/examples/08_Qwen2.5-0.5B_Example/QUICKSTART.md new file mode 100644 index 00000000..1671a1ac --- /dev/null +++ b/examples/08_Qwen2.5-0.5B_Example/QUICKSTART.md @@ -0,0 +1,76 @@ +# Quick Start — Qwen2.5-0.5B + +All commands run from the **repository root**. + +## Setup + +```bash +python3.12 -m venv .venv && source .venv/bin/activate +pip install -e ".[test]" +python examples/08_Qwen2.5-0.5B_Example/prepare_dataset.py +``` + +## Option A — Automated (vLLM or SGLang) + +```bash +bash examples/08_Qwen2.5-0.5B_Example/run_benchmark.sh vllm offline +bash examples/08_Qwen2.5-0.5B_Example/run_benchmark.sh vllm online +bash examples/08_Qwen2.5-0.5B_Example/run_benchmark.sh sglang online +``` + +## Option B — Manual step-by-step + +**1. Start server** (pick one): + +```bash +# vLLM +docker run --runtime nvidia --gpus all -v ~/.cache/huggingface:/root/.cache/huggingface \ + -e PYTORCH_ALLOC_CONF=expandable_segments:True -p 8000:8000 --ipc=host \ + --name vllm-qwen -d vllm/vllm-openai:latest \ + --model Qwen/Qwen2.5-0.5B-Instruct --gpu-memory-utilization 0.85 + +# SGLang +docker run --runtime nvidia --gpus all --net host \ + -v ~/.cache/huggingface:/root/.cache/huggingface --ipc=host \ + --name sglang-qwen -d lmsysorg/sglang:latest \ + python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-0.5B-Instruct \ + --host 0.0.0.0 --port 30000 --mem-fraction-static 0.9 --attention-backend flashinfer +``` + +**2. Wait for ready:** + +```bash +until curl -sf http://localhost:8000/v1/models > /dev/null; do sleep 5; done # vLLM +until curl -sf http://localhost:30000/health > /dev/null; do sleep 5; done # SGLang +``` + +**3. Run concurrency sweep:** + +```bash +python scripts/concurrency_sweep/run.py \ + --config examples/08_Qwen2.5-0.5B_Example/online_qwen_benchmark.yaml # vLLM + # or: --config examples/08_Qwen2.5-0.5B_Example/sglang_online_qwen_benchmark.yaml + +# Add --verbose to stream output live; add --concurrency / --duration-ms to customize +``` + +**4. Summarize and plot:** + +```bash +python scripts/concurrency_sweep/summarize.py \ + results/qwen_online_benchmark/concurrency_sweep/ # vLLM + # or: results/qwen_sglang_online_benchmark/concurrency_sweep/ +``` + +Writes `metrics_summary.csv`, `metrics_summary.md`, and `metrics_summary.png`. + +**5. Stop server:** + +```bash +docker stop vllm-qwen && docker rm vllm-qwen +# or: docker stop sglang-qwen && docker rm sglang-qwen +``` + +--- + +For TRT-LLM setup, config customization, and output file locations, see [README.md](README.md). diff --git a/examples/08_Qwen2.5-0.5B_Example/README.md b/examples/08_Qwen2.5-0.5B_Example/README.md new file mode 100644 index 00000000..58c2e688 --- /dev/null +++ b/examples/08_Qwen2.5-0.5B_Example/README.md @@ -0,0 +1,242 @@ +# Qwen2.5-0.5B-Instruct Benchmark Example + +Benchmarks `Qwen/Qwen2.5-0.5B-Instruct` with offline (max-throughput) and online +(concurrency sweep) load patterns. Designed for small GPUs (8–16 GB VRAM). + +Supported inference servers: **vLLM**, **SGLang**, **TRT-LLM**. 
+ +--- + +## Requirements + +- Python 3.12+ +- Docker with NVIDIA GPU support (`--runtime nvidia`) +- NVIDIA GPU with at least 8 GB VRAM + +--- + +## Step 1 — Install and prepare dataset + +Run all commands from the **repository root**. + +```bash +python3.12 -m venv .venv +source .venv/bin/activate +pip install -e ".[test]" + +python examples/08_Qwen2.5-0.5B_Example/prepare_dataset.py +``` + +This converts `tests/datasets/dummy_1k.pkl` into +`examples/08_Qwen2.5-0.5B_Example/data/test_dataset.pkl`. + +--- + +## Step 2 — Start the inference server + +Pick one backend. The server must be fully ready before running benchmarks. + +### vLLM (port 8000) + +```bash +docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + -e PYTORCH_ALLOC_CONF=expandable_segments:True \ + -p 8000:8000 \ + --ipc=host \ + --name vllm-qwen \ + -d \ + vllm/vllm-openai:latest \ + --model Qwen/Qwen2.5-0.5B-Instruct \ + --gpu-memory-utilization 0.85 +``` + +### SGLang (port 30000) + +```bash +docker run --runtime nvidia --gpus all --net host \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --ipc=host \ + --name sglang-qwen \ + -d \ + lmsysorg/sglang:latest \ + python3 -m sglang.launch_server \ + --model-path Qwen/Qwen2.5-0.5B-Instruct \ + --host 0.0.0.0 \ + --port 30000 \ + --mem-fraction-static 0.9 \ + --attention-backend flashinfer +``` + +### TRT-LLM (port 8000) + +```bash +docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + -p 8000:8000 \ + --ipc=host \ + --name trtllm-qwen \ + -d \ + nvcr.io/nvidia/tritonserver:latest \ + # Add your TRT-LLM engine launch arguments here +``` + +> **Note:** No pre-built TRT-LLM config is provided. Use +> `examples/08_Qwen2.5-0.5B_Example/online_qwen_benchmark.yaml` as a template and +> point `endpoint_config.endpoints` at `http://localhost:8000`. + +--- + +## Step 3 — Wait for the server to be ready + +Poll until the health endpoint responds: + +```bash +# vLLM / TRT-LLM (port 8000) +until curl -sf http://localhost:8000/v1/models > /dev/null; do + echo "Waiting for server..."; sleep 5 +done +echo "Server ready." + +# SGLang (port 30000) +until curl -sf http://localhost:30000/health > /dev/null; do + echo "Waiting for server..."; sleep 5 +done +echo "Server ready." +``` + +--- + +## Step 4 — Run the concurrency sweep + +Choose the config that matches your server. The sweep script overrides +`load_pattern` and `report_dir` for each concurrency level, leaving all other +settings (model, dataset, endpoint) from the config file. 
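+
+For reference, pinning a single concurrency level by hand is roughly equivalent
+to the sketch below, which mirrors what `scripts/concurrency_sweep/run.py` does
+per level (the file name `single_run.yaml`, the 3-minute duration, and the
+concurrency of 16 are illustrative):
+
+```python
+import copy
+import subprocess
+from pathlib import Path
+
+import yaml
+
+# Load the template config and apply the same overrides the sweep script applies.
+base = yaml.safe_load(
+    Path("examples/08_Qwen2.5-0.5B_Example/online_qwen_benchmark.yaml").read_text()
+)
+cfg = copy.deepcopy(base)
+cfg["settings"]["runtime"]["min_duration_ms"] = 180_000
+cfg["settings"]["runtime"]["max_duration_ms"] = 180_000
+cfg["settings"]["load_pattern"] = {"type": "concurrency", "target_concurrency": 16}
+cfg["report_dir"] = "results/qwen_online_benchmark/concurrency_sweep/concurrency_16"
+
+Path("single_run.yaml").write_text(yaml.safe_dump(cfg, sort_keys=False))
+subprocess.run(
+    ["inference-endpoint", "benchmark", "from-config", "-c", "single_run.yaml"],
+    check=True,
+)
+```
+
+The sweep commands below apply these overrides for every level automatically: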
+ +```bash +# vLLM +python scripts/concurrency_sweep/run.py \ + --config examples/08_Qwen2.5-0.5B_Example/online_qwen_benchmark.yaml + +# SGLang +python scripts/concurrency_sweep/run.py \ + --config examples/08_Qwen2.5-0.5B_Example/sglang_online_qwen_benchmark.yaml + +# TRT-LLM (use the vLLM config or a custom one pointing at port 8000) +python scripts/concurrency_sweep/run.py \ + --config examples/08_Qwen2.5-0.5B_Example/online_qwen_benchmark.yaml +``` + +**Common options:** + +| Flag | Default | Description | +|---|---|---| +| `--concurrency N [N ...]` | `1 2 4 8 16 32 64 128 256 512 1024` | Concurrency levels to test | +| `--duration-ms MS` | `600000` (10 min) | Duration per run | +| `--output-dir DIR` | from `report_dir` in config | Root directory for sweep output | +| `--timeout-seconds S` | `720` (12 min) | Per-run subprocess timeout | +| `--verbose` | off | Stream output live to the terminal (useful for debugging) | + +Example — quick 3-minute sweep at a few concurrency levels: + +```bash +python scripts/concurrency_sweep/run.py \ + --config examples/08_Qwen2.5-0.5B_Example/online_qwen_benchmark.yaml \ + --concurrency 1 4 16 64 \ + --duration-ms 180000 \ + --verbose +``` + +Results land in subdirectories under the config's `report_dir`: + +``` +results/qwen_online_benchmark/concurrency_sweep/ + concurrency_1/ benchmark.log result_summary.json + concurrency_4/ benchmark.log result_summary.json + ... + summary.json summary.csv +``` + +If a run fails, check the per-run log: + +```bash +cat results/qwen_online_benchmark/concurrency_sweep/concurrency_64/benchmark.log +``` + +--- + +## Step 5 — Summarize results and generate plots + +```bash +# vLLM +python scripts/concurrency_sweep/summarize.py \ + results/qwen_online_benchmark/concurrency_sweep/ + +# SGLang +python scripts/concurrency_sweep/summarize.py \ + results/qwen_sglang_online_benchmark/concurrency_sweep/ +``` + +This prints formatted tables to stdout and writes three files into the sweep +directory: + +| File | Contents | +|---|---| +| `metrics_summary.csv` | All metrics in CSV form | +| `metrics_summary.md` | Markdown tables with throughput, latency, TTFT, TPOT | +| `metrics_summary.png` | Line plots of TPS, TTFT P99, and TPOT P50 vs concurrency | + +Pass `--no-save` to print tables only without writing files. + +--- + +## Step 6 — Stop the server + +```bash +docker stop vllm-qwen # or sglang-qwen / trtllm-qwen +docker rm vllm-qwen +``` + +--- + +## Offline (max-throughput) benchmark + +For a single offline run (no sweep): + +```bash +# vLLM +inference-endpoint benchmark from-config \ + -c examples/08_Qwen2.5-0.5B_Example/offline_qwen_benchmark.yaml + +# SGLang +inference-endpoint benchmark from-config \ + -c examples/08_Qwen2.5-0.5B_Example/sglang_offline_qwen_benchmark.yaml +``` + +Results: `results/qwen_offline_benchmark/` or `results/qwen_sglang_offline_benchmark/`. 
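+
+---
+
+## Comparing vLLM and SGLang sweeps
+
+After summarizing both online sweeps (Step 5), the two `metrics_summary.csv`
+files can be placed side by side. A minimal sketch, assuming pandas is
+installed (the dataset tooling in this example already relies on it):
+
+```python
+import pandas as pd
+
+# Columns written by scripts/concurrency_sweep/summarize.py
+cols = ["concurrency", "qps", "tps", "ttft_p99_ms", "tpot_p50_ms"]
+
+vllm = pd.read_csv("results/qwen_online_benchmark/concurrency_sweep/metrics_summary.csv")
+sglang = pd.read_csv("results/qwen_sglang_online_benchmark/concurrency_sweep/metrics_summary.csv")
+
+side_by_side = vllm[cols].merge(
+    sglang[cols], on="concurrency", suffixes=("_vllm", "_sglang")
+)
+print(side_by_side.to_string(index=False))
+```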
+ +--- + +## Automated wrapper + +`run_benchmark.sh` automates Steps 2–4 (dataset prep, container start, benchmark): + +```bash +bash examples/08_Qwen2.5-0.5B_Example/run_benchmark.sh vllm offline +bash examples/08_Qwen2.5-0.5B_Example/run_benchmark.sh vllm online +bash examples/08_Qwen2.5-0.5B_Example/run_benchmark.sh sglang offline +bash examples/08_Qwen2.5-0.5B_Example/run_benchmark.sh sglang online +``` + +--- + +## Config files + +| File | Server | Mode | +|---|---|---| +| `offline_qwen_benchmark.yaml` | vLLM (`:8000`) | Offline | +| `online_qwen_benchmark.yaml` | vLLM (`:8000`) | Online sweep | +| `sglang_offline_qwen_benchmark.yaml` | SGLang (`:30000`) | Offline | +| `sglang_online_qwen_benchmark.yaml` | SGLang (`:30000`) | Online sweep | +| `prepare_dataset.py` | — | Converts `dummy_1k.pkl` to example dataset | +| `run_benchmark.sh` | vLLM / SGLang | Automated wrapper | diff --git a/examples/08_Qwen2.5-0.5B_Example/offline_qwen_benchmark.yaml b/examples/08_Qwen2.5-0.5B_Example/offline_qwen_benchmark.yaml new file mode 100644 index 00000000..35b6a509 --- /dev/null +++ b/examples/08_Qwen2.5-0.5B_Example/offline_qwen_benchmark.yaml @@ -0,0 +1,43 @@ +name: "qwen-0.5b-offline-benchmark" +version: "1.0" +type: "offline" + +model_params: + name: "Qwen/Qwen2.5-0.5B-Instruct" + temperature: 1.0 + max_new_tokens: 100 + top_p: 1.0 + streaming: "on" + +datasets: + - name: "qwen-perf-test" + type: "performance" + path: "examples/08_Qwen2.5-0.5B_Example/data/test_dataset.pkl" + samples: 1000 + +settings: + runtime: + min_duration_ms: 100 + max_duration_ms: 60000 + scheduler_random_seed: 42 + dataloader_random_seed: 42 + + client: + workers: 1 + max_connections: 100 + warmup_connections: 0 + record_worker_events: false + +metrics: + collect: + - "throughput" + - "latency" + - "ttft" + - "tpot" + +endpoint_config: + endpoints: + - "http://localhost:8000" + api_key: null + +report_dir: "results/qwen_offline_benchmark/" diff --git a/examples/08_Qwen2.5-0.5B_Example/online_qwen_benchmark.yaml b/examples/08_Qwen2.5-0.5B_Example/online_qwen_benchmark.yaml new file mode 100644 index 00000000..8773f1cb --- /dev/null +++ b/examples/08_Qwen2.5-0.5B_Example/online_qwen_benchmark.yaml @@ -0,0 +1,47 @@ +name: "qwen-0.5b-online-benchmark" +version: "1.0" +type: "online" + +model_params: + name: "Qwen/Qwen2.5-0.5B-Instruct" + temperature: 0.7 + max_new_tokens: 128 + top_p: 0.95 + streaming: "on" + +datasets: + - name: "qwen-perf-test" + type: "performance" + path: "examples/08_Qwen2.5-0.5B_Example/data/test_dataset.pkl" + samples: 500 + +settings: + runtime: + min_duration_ms: 600000 + max_duration_ms: 600000 + scheduler_random_seed: 42 + dataloader_random_seed: 42 + + load_pattern: + type: "concurrency" + target_concurrency: 1 + + client: + workers: 1 + max_connections: 2048 + warmup_connections: 0 + record_worker_events: false + +metrics: + collect: + - "throughput" + - "latency" + - "ttft" + - "tpot" + +endpoint_config: + endpoints: + - "http://localhost:8000" + api_key: null + +report_dir: "results/qwen_online_benchmark/" diff --git a/examples/08_Qwen2.5-0.5B_Example/prepare_dataset.py b/examples/08_Qwen2.5-0.5B_Example/prepare_dataset.py new file mode 100755 index 00000000..34faf0e5 --- /dev/null +++ b/examples/08_Qwen2.5-0.5B_Example/prepare_dataset.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Prepare test dataset for Qwen benchmark. + +This script creates a test dataset with the 'prompt' column required by +the inference-endpoint benchmarking tool. +""" + +import pickle +import sys +from pathlib import Path + + +def prepare_dataset( + input_path: str = "tests/datasets/dummy_1k.pkl", + output_dir: str = "examples/08_Qwen2.5-0.5B_Example/data", + output_filename: str = "test_dataset.pkl", +) -> None: + """ + Prepare the test dataset by renaming columns to match expected format. + + Args: + input_path: Path to the input dataset + output_dir: Directory to save the output dataset + output_filename: Name of the output file + """ + print(f"Loading dataset from: {input_path}") + + # Load the original dataset + try: + with open(input_path, "rb") as f: + data = pickle.load(f) + except FileNotFoundError: + print(f"ERROR: Input dataset not found at {input_path}") + print("Make sure you're running from the repository root directory") + sys.exit(1) + + print(f"Loaded dataset with {len(data)} samples") + print(f"Original columns: {data.columns.tolist()}") + + # Rename text_input to prompt + if "text_input" in data.columns: + data = data.rename(columns={"text_input": "prompt"}) + print("Renamed 'text_input' to 'prompt'") + elif "prompt" not in data.columns: + print("ERROR: Dataset must have 'text_input' or 'prompt' column") + sys.exit(1) + + print(f"Final columns: {data.columns.tolist()}") + + # Create output directory + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + # Save the dataset + full_output_path = output_path / output_filename + with open(full_output_path, "wb") as f: + pickle.dump(data, f) + + print(f"✅ Dataset saved to: {full_output_path}") + print(f" Samples: {len(data)}") + print(f" Columns: {data.columns.tolist()}") + + +if __name__ == "__main__": + # Allow custom input path as command-line argument + input_path = sys.argv[1] if len(sys.argv) > 1 else "tests/datasets/dummy_1k.pkl" + prepare_dataset(input_path=input_path) diff --git a/examples/08_Qwen2.5-0.5B_Example/run_benchmark.sh b/examples/08_Qwen2.5-0.5B_Example/run_benchmark.sh new file mode 100755 index 00000000..1f62bd18 --- /dev/null +++ b/examples/08_Qwen2.5-0.5B_Example/run_benchmark.sh @@ -0,0 +1,233 @@ +#!/bin/bash +# Complete benchmark workflow for Qwen2.5-0.5B-Instruct +# Supports both vLLM and SGLang inference servers + +set -eo pipefail # Exit on error, including failures in piped benchmark commands + +echo "========================================" +echo "Qwen2.5-0.5B Benchmark Runner" +echo "========================================" +echo "" + +# Parse arguments +SERVER_TYPE="${1:-vllm}" # vllm or sglang +BENCHMARK_TYPE="${2:-offline}" # offline or online + +# Validate server type +if [[ "$SERVER_TYPE" != "vllm" && "$SERVER_TYPE" != "sglang" ]]; then + echo "ERROR: Invalid server type: $SERVER_TYPE" + echo "Usage: bash examples/08_Qwen2.5-0.5B_Example/run_benchmark.sh [vllm|sglang] 
[offline|online]" + exit 1 +fi + +# Validate benchmark type +if [[ "$BENCHMARK_TYPE" != "offline" && "$BENCHMARK_TYPE" != "online" ]]; then + echo "ERROR: Invalid benchmark type: $BENCHMARK_TYPE" + echo "Usage: bash examples/08_Qwen2.5-0.5B_Example/run_benchmark.sh [vllm|sglang] [offline|online]" + exit 1 +fi + +# Configuration +MODEL_NAME="Qwen/Qwen2.5-0.5B-Instruct" +HF_HOME="${HF_HOME:-$HOME/.cache/huggingface}" + +# Set server-specific configuration +if [[ "$SERVER_TYPE" == "vllm" ]]; then + CONTAINER_NAME="vllm-qwen" + SERVER_PORT=8000 + CONFIG_PREFIX="" + DOCKER_IMAGE="vllm/vllm-openai:latest" +else + CONTAINER_NAME="sglang-qwen" + SERVER_PORT=30000 + CONFIG_PREFIX="sglang_" + DOCKER_IMAGE="lmsysorg/sglang:latest" +fi + +echo "Configuration:" +echo " Server: $SERVER_TYPE" +echo " Benchmark: $BENCHMARK_TYPE" +echo " Container: $CONTAINER_NAME" +echo " Port: $SERVER_PORT" +echo "" + +# Check if running from repo root +if [ ! -f "pyproject.toml" ]; then + echo "ERROR: Please run this script from the repository root" + echo "Usage: bash examples/08_Qwen2.5-0.5B_Example/run_benchmark.sh [vllm|sglang] [offline|online]" + exit 1 +fi + +# Step 1: Prepare dataset +echo "Step 1: Preparing dataset..." +if [ ! -f ".venv/bin/activate" ]; then + echo "ERROR: Virtual environment not found at .venv/" + echo "Please create it first: python3.12 -m venv .venv && source .venv/bin/activate && pip install -e ." + exit 1 +fi + +source .venv/bin/activate +python examples/08_Qwen2.5-0.5B_Example/prepare_dataset.py +echo "✅ Dataset prepared" +echo "" + +# Step 2: Check if container is already running +echo "Step 2: Checking for existing $SERVER_TYPE container..." +if docker ps -a --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + echo "Found existing container: ${CONTAINER_NAME}" + if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + echo "Container is running. Skipping server launch." + else + echo "Container exists but not running. Starting..." + docker start ${CONTAINER_NAME} + sleep 15 + fi +else + echo "No existing container found. Launching $SERVER_TYPE server..." + + # Step 3: Launch server (vLLM or SGLang) + echo "Step 3: Launching $SERVER_TYPE server..." + + if [[ "$SERVER_TYPE" == "vllm" ]]; then + # Launch vLLM + docker run --runtime nvidia --gpus all \ + -v ${HF_HOME}:/root/.cache/huggingface \ + -e PYTORCH_ALLOC_CONF=expandable_segments:True \ + -p ${SERVER_PORT}:8000 \ + --ipc=host \ + --name ${CONTAINER_NAME} \ + -d \ + ${DOCKER_IMAGE} \ + --model ${MODEL_NAME} \ + --gpu-memory-utilization 0.85 + else + # Launch SGLang + docker run --runtime nvidia --gpus all \ + --net host \ + -v ${HF_HOME}:/root/.cache/huggingface \ + --ipc=host \ + --name ${CONTAINER_NAME} \ + -d \ + ${DOCKER_IMAGE} \ + python3 -m sglang.launch_server \ + --model-path ${MODEL_NAME} \ + --host 0.0.0.0 \ + --port ${SERVER_PORT} \ + --mem-fraction-static 0.9 \ + --attention-backend flashinfer + fi + + echo "Waiting for server to start..." + sleep 20 +fi +echo "" + +# Step 4: Wait for server to be ready +echo "Step 4: Waiting for server to be ready..." +MAX_RETRIES=40 +RETRY_COUNT=0 + +# Different ready indicators for vLLM vs SGLang +if [[ "$SERVER_TYPE" == "vllm" ]]; then + READY_PATTERN="Uvicorn running|Application startup complete" +else + READY_PATTERN="Uvicorn running|Server is ready" +fi + +while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do + if docker logs ${CONTAINER_NAME} 2>&1 | grep -qE "$READY_PATTERN"; then + echo "✅ Server is ready!" 
+ break + fi + if docker logs ${CONTAINER_NAME} 2>&1 | grep -qE "ERROR.*failed|CUDA out of memory|RuntimeError"; then + echo "❌ Server failed to start. Check logs:" + docker logs ${CONTAINER_NAME} 2>&1 | tail -20 + exit 1 + fi + RETRY_COUNT=$((RETRY_COUNT + 1)) + echo "Waiting... ($RETRY_COUNT/$MAX_RETRIES)" + sleep 5 +done + +if [ $RETRY_COUNT -eq $MAX_RETRIES ]; then + echo "❌ Server did not start within expected time" + docker logs ${CONTAINER_NAME} 2>&1 | tail -30 + exit 1 +fi +echo "" + +# Step 5: Verify server +echo "Step 5: Verifying server..." +sleep 5 # Give it a moment to fully initialize + +if curl -s http://localhost:${SERVER_PORT}/v1/models 2>/dev/null | grep -q "${MODEL_NAME}"; then + echo "✅ Server is responding correctly" +elif curl -s http://localhost:${SERVER_PORT}/health 2>/dev/null | grep -q "ok\|healthy"; then + echo "✅ Server health check passed" +else + echo "⚠️ Warning: Server may not be fully ready, but proceeding..." +fi +echo "" + +# Step 6: Run benchmark +echo "Step 6: Running ${SERVER_TYPE} ${BENCHMARK_TYPE} benchmark..." +CONFIG_FILE="examples/08_Qwen2.5-0.5B_Example/${CONFIG_PREFIX}${BENCHMARK_TYPE}_qwen_benchmark.yaml" + +if [ ! -f "$CONFIG_FILE" ]; then + echo "ERROR: Config file not found: $CONFIG_FILE" + echo "Available configs:" + ls examples/08_Qwen2.5-0.5B_Example/*.yaml + exit 1 +fi + +source .venv/bin/activate +if [[ "$BENCHMARK_TYPE" == "online" ]]; then + python scripts/concurrency_sweep/run.py \ + --config "$CONFIG_FILE" 2>&1 | tee benchmark_${SERVER_TYPE}_${BENCHMARK_TYPE}.log +else + inference-endpoint benchmark from-config -c "$CONFIG_FILE" 2>&1 | tee benchmark_${SERVER_TYPE}_${BENCHMARK_TYPE}.log +fi + +echo "" +echo "========================================" +echo "Benchmark Complete!" +echo "========================================" +echo "" +echo "Server: $SERVER_TYPE" +echo "Benchmark Type: $BENCHMARK_TYPE" +echo "" +echo "Results saved to:" +if [[ "$SERVER_TYPE" == "vllm" ]]; then + if [ "$BENCHMARK_TYPE" = "offline" ]; then + RESULT_DIR="results/qwen_offline_benchmark/" + SWEEP_DIR="" + else + RESULT_DIR="results/qwen_online_benchmark/" + SWEEP_DIR="${RESULT_DIR}concurrency_sweep/" + fi +else + if [ "$BENCHMARK_TYPE" = "offline" ]; then + RESULT_DIR="results/qwen_sglang_offline_benchmark/" + SWEEP_DIR="" + else + RESULT_DIR="results/qwen_sglang_online_benchmark/" + SWEEP_DIR="${RESULT_DIR}concurrency_sweep/" + fi +fi + +echo " ${RESULT_DIR}" +echo "" +if [[ "$BENCHMARK_TYPE" == "online" ]]; then + echo "Summarize sweep results (tables + CSV + Markdown + plot):" + echo " python scripts/concurrency_sweep/summarize.py ${SWEEP_DIR}" +else + echo "View summary:" + echo " cat ${RESULT_DIR}report.txt" +fi +echo "" +echo "Benchmark log:" +echo " cat benchmark_${SERVER_TYPE}_${BENCHMARK_TYPE}.log" +echo "" +echo "To stop the server:" +echo " docker stop ${CONTAINER_NAME}" +echo "" diff --git a/examples/08_Qwen2.5-0.5B_Example/sglang_offline_qwen_benchmark.yaml b/examples/08_Qwen2.5-0.5B_Example/sglang_offline_qwen_benchmark.yaml new file mode 100644 index 00000000..a1c8c277 --- /dev/null +++ b/examples/08_Qwen2.5-0.5B_Example/sglang_offline_qwen_benchmark.yaml @@ -0,0 +1,44 @@ +name: "qwen-0.5b-sglang-offline-benchmark" +version: "1.0" +type: "offline" + +model_params: + name: "Qwen/Qwen2.5-0.5B-Instruct" + temperature: 1.0 + max_new_tokens: 100 + top_p: 1.0 + streaming: "on" + +datasets: + - name: "qwen-perf-test" + type: "performance" + path: "examples/08_Qwen2.5-0.5B_Example/data/test_dataset.pkl" + samples: 1000 + +settings: + runtime: + 
min_duration_ms: 100 + max_duration_ms: 60000 + scheduler_random_seed: 42 + dataloader_random_seed: 42 + + client: + workers: 1 + max_connections: 100 + warmup_connections: 0 + record_worker_events: false + +metrics: + collect: + - "throughput" + - "latency" + - "ttft" + - "tpot" + +endpoint_config: + endpoints: + - "http://localhost:30000" + api_key: null + api_type: "sglang" + +report_dir: "results/qwen_sglang_offline_benchmark/" diff --git a/examples/08_Qwen2.5-0.5B_Example/sglang_online_qwen_benchmark.yaml b/examples/08_Qwen2.5-0.5B_Example/sglang_online_qwen_benchmark.yaml new file mode 100644 index 00000000..576d6985 --- /dev/null +++ b/examples/08_Qwen2.5-0.5B_Example/sglang_online_qwen_benchmark.yaml @@ -0,0 +1,48 @@ +name: "qwen-0.5b-sglang-online-benchmark" +version: "1.0" +type: "online" + +model_params: + name: "Qwen/Qwen2.5-0.5B-Instruct" + temperature: 0.7 + max_new_tokens: 128 + top_p: 0.95 + streaming: "on" + +datasets: + - name: "qwen-perf-test" + type: "performance" + path: "examples/08_Qwen2.5-0.5B_Example/data/test_dataset.pkl" + samples: 500 + +settings: + runtime: + min_duration_ms: 600000 + max_duration_ms: 600000 + scheduler_random_seed: 42 + dataloader_random_seed: 42 + + load_pattern: + type: "concurrency" + target_concurrency: 1 + + client: + workers: 1 + max_connections: 2048 + warmup_connections: 0 + record_worker_events: false + +metrics: + collect: + - "throughput" + - "latency" + - "ttft" + - "tpot" + +endpoint_config: + endpoints: + - "http://localhost:30000" + api_key: null + api_type: "sglang" + +report_dir: "results/qwen_sglang_online_benchmark/" diff --git a/scripts/concurrency_sweep/run.py b/scripts/concurrency_sweep/run.py new file mode 100644 index 00000000..20c375c9 --- /dev/null +++ b/scripts/concurrency_sweep/run.py @@ -0,0 +1,270 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Run a concurrency sweep benchmark using a template YAML config. + +The provided config is used as a template: for each concurrency value the +load_pattern is overridden to type=concurrency with the given target, and +the report_dir is set to a per-run subdirectory under the sweep root. 
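+For example, with --output-dir results/my_sweep, run N writes its benchmark.log
+and the benchmark tool's own report files to
+results/my_sweep/concurrency_sweep/concurrency_<N>/.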
+ +Usage: + python scripts/concurrency_sweep/run.py --config path/to/config.yaml + python scripts/concurrency_sweep/run.py \\ + --config examples/08_Qwen2.5-0.5B_Example/online_qwen_benchmark.yaml \\ + --concurrency 1 2 4 8 16 32 64 \\ + --duration-ms 120000 \\ + --output-dir results/my_sweep + +After the sweep completes, run the summarize script: + python scripts/concurrency_sweep/summarize.py /concurrency_sweep/ +""" + +from __future__ import annotations + +import argparse +import copy +import csv +import json +import subprocess +import sys +import tempfile +from pathlib import Path + +import yaml + +DEFAULT_CONCURRENCY_VALUES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024] +DEFAULT_DURATION_MS = 600_000 +DEFAULT_TIMEOUT_S = 12 * 60 + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Run a concurrency sweep using a template YAML config.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--config", + type=Path, + required=True, + help="Path to the base benchmark YAML config to use as a template.", + ) + parser.add_argument( + "--concurrency", + type=int, + nargs="+", + default=DEFAULT_CONCURRENCY_VALUES, + metavar="N", + help="Concurrency values to sweep over.", + ) + parser.add_argument( + "--duration-ms", + type=int, + default=DEFAULT_DURATION_MS, + help="Per-run benchmark duration in milliseconds.", + ) + parser.add_argument( + "--output-dir", + type=Path, + default=None, + help=( + "Root directory for sweep output. Defaults to the report_dir " + "defined in the config file." + ), + ) + parser.add_argument( + "--timeout-seconds", + type=int, + default=DEFAULT_TIMEOUT_S, + help="Per-run subprocess timeout in seconds (includes setup and teardown).", + ) + parser.add_argument( + "--verbose", + action="store_true", + help=( + "Stream benchmark output to the terminal in real time in addition " + "to saving it to the per-run log file. Useful for debugging failures." + ), + ) + return parser.parse_args() + + +def load_config(config_path: Path) -> dict: + with config_path.open() as f: + return yaml.safe_load(f) + + +def render_config( + base_config: dict, concurrency: int, report_dir: Path, duration_ms: int +) -> dict: + config = copy.deepcopy(base_config) + config["name"] = f"{config.get('name', 'benchmark')}-c{concurrency}" + config["report_dir"] = str(report_dir) + + runtime = config.setdefault("settings", {}).setdefault("runtime", {}) + runtime["min_duration_ms"] = duration_ms + runtime["max_duration_ms"] = duration_ms + + load_pattern = config["settings"].setdefault("load_pattern", {}) + load_pattern["type"] = "concurrency" + load_pattern["target_concurrency"] = concurrency + load_pattern.pop("target_qps", None) + + return config + + +def run_single_benchmark( + config: dict, timeout_seconds: int, log_path: Path, verbose: bool = False +) -> tuple[str, str]: + with tempfile.NamedTemporaryFile( + mode="w", suffix=".yaml", delete=False, dir="." 
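+        # The rendered config is written to the current working directory and
+        # must outlive this context manager (delete=False) so the benchmark
+        # subprocess can read it; it is removed in the finally block below.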
+ ) as tmp: + yaml.safe_dump(config, tmp, sort_keys=False) + temp_config_path = Path(tmp.name) + + cmd = [ + "inference-endpoint", + "benchmark", + "from-config", + "-c", + str(temp_config_path), + ] + + try: + if verbose: + with log_path.open("w") as log_file: + proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + assert proc.stdout is not None + for line in proc.stdout: + print(line, end="", flush=True) + log_file.write(line) + try: + proc.wait(timeout=timeout_seconds) + except subprocess.TimeoutExpired: + proc.kill() + raise + returncode = proc.returncode + else: + with log_path.open("w") as log_file: + result = subprocess.run( + cmd, + stdout=log_file, + stderr=subprocess.STDOUT, + text=True, + timeout=timeout_seconds, + check=False, + ) + returncode = result.returncode + + if returncode == 0: + return "success", "" + return "failed", f"exit code {returncode}, see {log_path}" + except subprocess.TimeoutExpired: + return "timeout", f"exceeded {timeout_seconds} seconds" + finally: + temp_config_path.unlink(missing_ok=True) + + +def write_summary(sweep_root: Path, rows: list[dict]) -> None: + summary_path = sweep_root / "summary.json" + with summary_path.open("w") as f: + json.dump(rows, f, indent=2) + + csv_path = summary_path.with_suffix(".csv") + with csv_path.open("w", newline="") as f: + writer = csv.DictWriter( + f, + fieldnames=["concurrency", "status", "report_dir", "log_file", "detail"], + ) + writer.writeheader() + writer.writerows(rows) + + print(f"Wrote run summary to {summary_path}") + + +def main() -> int: + args = parse_args() + + if not args.config.is_file(): + print(f"Error: config file not found: {args.config}", file=sys.stderr) + return 1 + + base_config = load_config(args.config) + + if args.output_dir is not None: + base_report_dir = args.output_dir + elif "report_dir" in base_config: + base_report_dir = Path(base_config["report_dir"]) + else: + print( + "Error: no --output-dir given and config has no report_dir field.", + file=sys.stderr, + ) + return 1 + + concurrency_values = sorted(set(args.concurrency)) + sweep_root = base_report_dir / "concurrency_sweep" + sweep_root.mkdir(parents=True, exist_ok=True) + + print(f"Config : {args.config}") + print(f"Concurrency : {concurrency_values}") + print(f"Duration/run : {args.duration_ms / 60_000:.1f} minutes") + print(f"Sweep root : {sweep_root}") + + summary_rows: list[dict] = [] + + for concurrency in concurrency_values: + run_dir = sweep_root / f"concurrency_{concurrency}" + run_dir.mkdir(parents=True, exist_ok=True) + log_path = run_dir / "benchmark.log" + config = render_config(base_config, concurrency, run_dir, args.duration_ms) + + print(f"\nRunning concurrency={concurrency} ...") + status, detail = run_single_benchmark( + config=config, + timeout_seconds=args.timeout_seconds, + log_path=log_path, + verbose=args.verbose, + ) + print(f" status: {status}" + (f" ({detail})" if detail else "")) + if status != "success" and not args.verbose: + print(f" Re-run with --verbose to stream output, or inspect: {log_path}") + + summary_rows.append( + { + "concurrency": concurrency, + "status": status, + "report_dir": str(run_dir), + "log_file": str(log_path), + "detail": detail, + } + ) + + write_summary(sweep_root, summary_rows) + + n_ok = sum(1 for r in summary_rows if r["status"] == "success") + print(f"\nCompleted {n_ok}/{len(summary_rows)} runs successfully.") + print("To summarize results run:") + print(f" python scripts/concurrency_sweep/summarize.py {sweep_root}") + + 
return 0 if n_ok == len(summary_rows) else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/concurrency_sweep/summarize.py b/scripts/concurrency_sweep/summarize.py new file mode 100644 index 00000000..021c83a2 --- /dev/null +++ b/scripts/concurrency_sweep/summarize.py @@ -0,0 +1,420 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Summarize and tabulate metrics from a concurrency sweep results folder. + +Usage: + python scripts/concurrency_sweep/summarize.py + python scripts/concurrency_sweep/summarize.py results/my_sweep/concurrency_sweep/ + +Outputs: + - Formatted tables to stdout + - /metrics_summary.csv + - /metrics_summary.md + - /metrics_summary.png +""" + +from __future__ import annotations + +import argparse +import csv +import json +import sys +from pathlib import Path + +# --------------------------------------------------------------------------- +# Parsing +# --------------------------------------------------------------------------- + + +def ns_to_ms(value: float) -> float: + return round(value / 1e6, 2) + + +def parse_result_summary(result_file: Path) -> dict | None: + try: + data = json.loads(result_file.read_text()) + except Exception as e: + print(f"Error reading {result_file}: {e}", file=sys.stderr) + return None + + lat = data.get("latency") or {} + ttft = data.get("ttft") or {} + tpot = data.get("tpot") or {} + osl = data.get("output_sequence_lengths") or {} + + def f(v: object) -> float: + """Return v as float, falling back to 0.0 for None/missing.""" + return float(v) if v is not None else 0.0 + + pct = lambda d, k: f(d.get("percentiles", {}).get(k)) # noqa: E731 + + return { + "qps": round(f(data.get("qps")), 2), + "tps": round(f(data.get("tps")), 2), + "latency_mean_ms": ns_to_ms(f(lat.get("avg"))), + "latency_p50_ms": ns_to_ms(f(lat.get("median"))), + "latency_p90_ms": ns_to_ms(pct(lat, "90")), + "latency_p95_ms": ns_to_ms(pct(lat, "95")), + "latency_p99_ms": ns_to_ms(pct(lat, "99")), + "ttft_mean_ms": ns_to_ms(f(ttft.get("avg"))), + "ttft_p50_ms": ns_to_ms(f(ttft.get("median"))), + "ttft_p90_ms": ns_to_ms(pct(ttft, "90")), + "ttft_p99_ms": ns_to_ms(pct(ttft, "99")), + "tpot_mean_ms": ns_to_ms(f(tpot.get("avg"))), + "tpot_p50_ms": ns_to_ms(f(tpot.get("median"))), + "tpot_p90_ms": ns_to_ms(pct(tpot, "90")), + "tpot_p99_ms": ns_to_ms(pct(tpot, "99")), + "n_completed": data.get("n_samples_completed", 0), + "duration_s": round(f(data.get("duration_ns")) / 1e9, 1), + "avg_output_tokens": round(f(osl.get("avg")), 1), + } + + +def collect_results(sweep_dir: Path) -> list[dict]: + def _concurrency_key(p: Path) -> int: + try: + return int(p.name.split("_")[1]) + except (IndexError, ValueError): + return -1 + + rows = [] + for concurrency_dir in sorted( + sweep_dir.glob("concurrency_*"), key=_concurrency_key + ): + try: + concurrency = int(concurrency_dir.name.split("_")[1]) + except (IndexError, ValueError): 
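+            # Skip directories that don't follow the concurrency_<N> naming scheme.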
+ continue + + result_file = concurrency_dir / "result_summary.json" + if result_file.exists(): + metrics = parse_result_summary(result_file) + if metrics: + rows.append({"concurrency": concurrency, "status": "ok", **metrics}) + else: + rows.append({"concurrency": concurrency, "status": "parse_error"}) + else: + rows.append({"concurrency": concurrency, "status": "no_results"}) + + return rows + + +# --------------------------------------------------------------------------- +# Terminal output +# --------------------------------------------------------------------------- + + +def print_table(rows: list[dict]) -> None: + successful = [r for r in rows if r["status"] == "ok"] + + def section(title: str, columns: list[tuple[str, str]]) -> None: + print(f"\n{title}") + header_parts = [f"{'Conc':>6}"] + for label, _ in columns: + header_parts.append(f"{label:>12}") + print(" ".join(header_parts)) + print(" ".join("-" * w for w in [6] + [12] * len(columns))) + for r in rows: + if r["status"] != "ok": + vals = [f"{r['concurrency']:>6}", f" {'-- ' + r['status']:>12}"] + print(" ".join(vals)) + continue + parts = [f"{r['concurrency']:>6}"] + for _, key in columns: + parts.append(f"{r[key]:>12}") + print(" ".join(parts)) + + section( + "Throughput", + [ + ("QPS", "qps"), + ("TPS", "tps"), + ("Completed", "n_completed"), + ("Duration(s)", "duration_s"), + ("AvgOutTok", "avg_output_tokens"), + ], + ) + section( + "End-to-End Latency (ms)", + [ + ("Mean", "latency_mean_ms"), + ("P50", "latency_p50_ms"), + ("P90", "latency_p90_ms"), + ("P95", "latency_p95_ms"), + ("P99", "latency_p99_ms"), + ], + ) + section( + "Time to First Token / TTFT (ms)", + [ + ("Mean", "ttft_mean_ms"), + ("P50", "ttft_p50_ms"), + ("P90", "ttft_p90_ms"), + ("P99", "ttft_p99_ms"), + ], + ) + section( + "Time Per Output Token / TPOT (ms)", + [ + ("Mean", "tpot_mean_ms"), + ("P50", "tpot_p50_ms"), + ("P90", "tpot_p90_ms"), + ("P99", "tpot_p99_ms"), + ], + ) + + if successful: + best_qps = max(successful, key=lambda r: r["qps"]) + best_lat = min(successful, key=lambda r: r["latency_p50_ms"]) + print( + f"\nPeak throughput : {best_qps['qps']} QPS (concurrency={best_qps['concurrency']})" + ) + print( + f"Best P50 latency: {best_lat['latency_p50_ms']} ms (concurrency={best_lat['concurrency']})" + ) + print(f"Successful runs : {len(successful)}/{len(rows)}") + + +# --------------------------------------------------------------------------- +# File outputs +# --------------------------------------------------------------------------- + +CSV_FIELDS = [ + "concurrency", + "status", + "qps", + "tps", + "latency_mean_ms", + "latency_p50_ms", + "latency_p90_ms", + "latency_p95_ms", + "latency_p99_ms", + "ttft_mean_ms", + "ttft_p50_ms", + "ttft_p90_ms", + "ttft_p99_ms", + "tpot_mean_ms", + "tpot_p50_ms", + "tpot_p90_ms", + "tpot_p99_ms", + "n_completed", + "duration_s", + "avg_output_tokens", +] + + +def write_csv(rows: list[dict], path: Path) -> None: + with path.open("w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=CSV_FIELDS, extrasaction="ignore") + writer.writeheader() + for row in rows: + writer.writerow({k: row.get(k, "") for k in CSV_FIELDS}) + + +def write_markdown(rows: list[dict], path: Path) -> None: + successful = [r for r in rows if r["status"] == "ok"] + + def md_table(headers: list[str], col_keys: list[str], data: list[dict]) -> str: + sep = "|" + "|".join("---" for _ in headers) + "|" + hdr = "|" + "|".join(headers) + "|" + lines = [hdr, sep] + for r in data: + cells = [str(r.get(k, r.get("status", "-"))) for k in 
col_keys] + lines.append("|" + "|".join(cells) + "|") + return "\n".join(lines) + + with path.open("w") as f: + f.write("# Concurrency Sweep Results\n\n") + + f.write("## Throughput\n\n") + f.write( + md_table( + [ + "Concurrency", + "Status", + "QPS", + "TPS", + "Completed", + "Duration (s)", + "Avg Out Tokens", + ], + [ + "concurrency", + "status", + "qps", + "tps", + "n_completed", + "duration_s", + "avg_output_tokens", + ], + rows, + ) + ) + + f.write("\n\n## End-to-End Latency (ms)\n\n") + f.write( + md_table( + ["Concurrency", "Mean", "P50", "P90", "P95", "P99"], + [ + "concurrency", + "latency_mean_ms", + "latency_p50_ms", + "latency_p90_ms", + "latency_p95_ms", + "latency_p99_ms", + ], + [r for r in rows if r["status"] == "ok"], + ) + ) + + f.write("\n\n## Time to First Token / TTFT (ms)\n\n") + f.write( + md_table( + ["Concurrency", "Mean", "P50", "P90", "P99"], + [ + "concurrency", + "ttft_mean_ms", + "ttft_p50_ms", + "ttft_p90_ms", + "ttft_p99_ms", + ], + [r for r in rows if r["status"] == "ok"], + ) + ) + + f.write("\n\n## Time Per Output Token / TPOT (ms)\n\n") + f.write( + md_table( + ["Concurrency", "Mean", "P50", "P90", "P99"], + [ + "concurrency", + "tpot_mean_ms", + "tpot_p50_ms", + "tpot_p90_ms", + "tpot_p99_ms", + ], + [r for r in rows if r["status"] == "ok"], + ) + ) + + if successful: + best_qps = max(successful, key=lambda r: r["qps"]) + best_lat = min(successful, key=lambda r: r["latency_p50_ms"]) + f.write("\n\n## Analysis\n\n") + f.write( + f"- **Peak throughput:** {best_qps['qps']} QPS at concurrency={best_qps['concurrency']}\n" + ) + f.write( + f"- **Best P50 latency:** {best_lat['latency_p50_ms']} ms at concurrency={best_lat['concurrency']}\n" + ) + f.write(f"- **Successful runs:** {len(successful)}/{len(rows)}\n") + + +def write_plots(rows: list[dict], path: Path) -> None: + try: + import matplotlib + import matplotlib.ticker + + matplotlib.use("Agg") + import matplotlib.pyplot as plt + except ImportError: + print("matplotlib not installed; skipping plot generation.", file=sys.stderr) + return + + successful = [r for r in rows if r["status"] == "ok"] + if not successful: + print("No successful runs to plot.", file=sys.stderr) + return + + x = [r["concurrency"] for r in successful] + tps = [r["tps"] for r in successful] + ttft_p99 = [r["ttft_p99_ms"] for r in successful] + tpot_p50 = [r["tpot_p50_ms"] for r in successful] + + fig, axes = plt.subplots(3, 1, figsize=(9, 11), sharex=True) + fig.suptitle("Concurrency Sweep Performance", fontsize=14, fontweight="bold") + + metrics = [ + (axes[0], tps, "TPS", "Tokens / s", "tab:blue"), + (axes[1], ttft_p99, "TTFT P99 (ms)", "Latency (ms)", "tab:orange"), + (axes[2], tpot_p50, "TPOT P50 (ms)", "Latency (ms)", "tab:green"), + ] + for ax, values, label, ylabel, color in metrics: + ax.plot(x, values, marker="o", linewidth=2, color=color, label=label) + ax.set_ylabel(ylabel) + ax.set_xscale("log", base=2) + ax.set_xticks(x) + ax.get_xaxis().set_major_formatter( + matplotlib.ticker.FuncFormatter(lambda v, _: str(int(v))) + ) + ax.legend(loc="upper left") + ax.grid(True, which="both", linestyle="--", alpha=0.5) + + axes[-1].set_xlabel("Concurrency") + fig.tight_layout() + fig.savefig(path, dpi=150, bbox_inches="tight") + plt.close(fig) + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Summarize metrics from a concurrency 
sweep results folder." + ) + parser.add_argument( + "sweep_dir", + type=Path, + help="Path to the concurrency sweep directory (contains concurrency_N/ subdirs).", + ) + parser.add_argument( + "--no-save", + action="store_true", + help="Print to stdout only; do not write CSV, Markdown, or PNG files.", + ) + args = parser.parse_args() + + sweep_dir: Path = args.sweep_dir + if not sweep_dir.is_dir(): + print(f"Error: {sweep_dir} is not a directory", file=sys.stderr) + return 1 + + rows = collect_results(sweep_dir) + if not rows: + print(f"No concurrency_* subdirectories found in {sweep_dir}", file=sys.stderr) + return 1 + + print_table(rows) + + if not args.no_save: + csv_path = sweep_dir / "metrics_summary.csv" + md_path = sweep_dir / "metrics_summary.md" + png_path = sweep_dir / "metrics_summary.png" + write_csv(rows, csv_path) + write_markdown(rows, md_path) + write_plots(rows, png_path) + print(f"\nSaved: {csv_path}") + print(f"Saved: {md_path}") + print(f"Saved: {png_path}") + + return 0 + + +if __name__ == "__main__": + sys.exit(main())