From 4c84e1785bc40e7a0bc7d74d395debebae9ef0fc Mon Sep 17 00:00:00 2001 From: Zhenhua Wang <4936589+zhenhuaw-me@users.noreply.github.com> Date: Sun, 26 Apr 2026 21:45:08 +0800 Subject: [PATCH 1/2] [TRTLLM-11373][refactor] Embed VisualGenParams in DiffusionRequest and simplify generate() inputs (#13313) Signed-off-by: Zhenhua Wang --- .gitignore | 1 + examples/visual_gen/visual_gen_ltx2.py | 4 +- examples/visual_gen/visual_gen_wan_i2v.py | 2 +- examples/visual_gen/visual_gen_wan_t2v.py | 2 +- tensorrt_llm/_torch/visual_gen/executor.py | 78 ++++++------- .../visual_gen/models/flux/pipeline_flux.py | 12 +- .../visual_gen/models/flux/pipeline_flux2.py | 12 +- .../visual_gen/models/ltx2/pipeline_ltx2.py | 24 ++-- .../models/ltx2/pipeline_ltx2_two_stages.py | 24 ++-- .../visual_gen/models/wan/pipeline_wan.py | 18 +-- .../visual_gen/models/wan/pipeline_wan_i2v.py | 22 ++-- tensorrt_llm/_torch/visual_gen/pipeline.py | 4 +- tensorrt_llm/inputs/data.py | 79 +------------ tensorrt_llm/serve/openai_server.py | 56 +++------- tensorrt_llm/serve/visual_gen_utils.py | 53 +++++---- tensorrt_llm/visual_gen/visual_gen.py | 64 +++-------- .../visual_gen/test_trtllm_serve_endpoints.py | 104 +++++++++++++++++- .../_torch/visual_gen/test_visual_gen_args.py | 93 +++------------- .../visual_gen/test_visual_gen_params.py | 77 ++++++++++--- 19 files changed, 338 insertions(+), 391 deletions(-) diff --git a/.gitignore b/.gitignore index 7cdbea45dfb6..bf32438a0f41 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ __pycache__/ +.mypy_cache/ .vscode .cursor *.engine diff --git a/examples/visual_gen/visual_gen_ltx2.py b/examples/visual_gen/visual_gen_ltx2.py index bf3815391ef7..1b753217c520 100755 --- a/examples/visual_gen/visual_gen_ltx2.py +++ b/examples/visual_gen/visual_gen_ltx2.py @@ -383,8 +383,6 @@ def main(): start_time = time.time() - inputs = {"prompt": args.prompt} - extra_params = { "guidance_rescale": args.guidance_rescale, "stg_scale": args.stg_scale, @@ -411,7 +409,7 @@ def 
main(): extra_params=extra_params, ) - output = visual_gen.generate(inputs=inputs, params=params) + output = visual_gen.generate(inputs=args.prompt, params=params) end_time = time.time() logger.info(f"Generation completed in {end_time - start_time:.2f}s") diff --git a/examples/visual_gen/visual_gen_wan_i2v.py b/examples/visual_gen/visual_gen_wan_i2v.py index 0433bd46a0b1..f2ac5d76c4ce 100644 --- a/examples/visual_gen/visual_gen_wan_i2v.py +++ b/examples/visual_gen/visual_gen_wan_i2v.py @@ -336,7 +336,7 @@ def main(): extra_params["boundary_ratio"] = args.boundary_ratio output = visual_gen.generate( - inputs={"prompt": args.prompt}, + inputs=args.prompt, params=VisualGenParams( height=args.height, width=args.width, diff --git a/examples/visual_gen/visual_gen_wan_t2v.py b/examples/visual_gen/visual_gen_wan_t2v.py index 3c89277d5f7a..22c62f608838 100755 --- a/examples/visual_gen/visual_gen_wan_t2v.py +++ b/examples/visual_gen/visual_gen_wan_t2v.py @@ -340,7 +340,7 @@ def main(): extra_params["boundary_ratio"] = args.boundary_ratio output = visual_gen.generate( - inputs={"prompt": args.prompt}, + inputs=args.prompt, params=VisualGenParams( height=args.height, width=args.width, diff --git a/tensorrt_llm/_torch/visual_gen/executor.py b/tensorrt_llm/_torch/visual_gen/executor.py index 7c8bf053276c..753e20312204 100644 --- a/tensorrt_llm/_torch/visual_gen/executor.py +++ b/tensorrt_llm/_torch/visual_gen/executor.py @@ -3,7 +3,7 @@ import threading import traceback from dataclasses import dataclass -from typing import List, Optional, Union +from typing import TYPE_CHECKING, List, Optional import torch import torch.distributed as dist @@ -15,42 +15,24 @@ from tensorrt_llm.executor.ipc import ZeroMqQueue from tensorrt_llm.logger import logger +if TYPE_CHECKING: + from tensorrt_llm.visual_gen.params import VisualGenParams + @dataclass class DiffusionRequest: """Request for diffusion inference. 
- Universal parameters are top-level fields with ``None`` meaning - "use model default" (resolved by the executor before calling - ``pipeline.infer()``). Model-specific parameters live in - ``extra_params`` and are passed through to the pipeline. + Generation parameters live in the optional ``params`` object + (a :class:`~tensorrt_llm.visual_gen.params.VisualGenParams` instance). + When ``params`` is ``None`` (the default), the executor creates a + ``VisualGenParams()`` and fills it with pipeline-specific defaults + before calling ``pipeline.infer()``. """ request_id: int prompt: List[str] - negative_prompt: Optional[str] = None - - # Core — None means "use model default" (resolved by executor) - height: Optional[int] = None - width: Optional[int] = None - num_inference_steps: Optional[int] = None - guidance_scale: Optional[float] = None - max_sequence_length: Optional[int] = None - seed: int = 42 - - # Video - num_frames: Optional[int] = None - frame_rate: Optional[float] = None - - # Image - num_images_per_prompt: int = 1 - - # Conditioning inputs - image: Optional[Union[str, bytes, List[Union[str, bytes]]]] = None - image_cond_strength: Optional[float] = None - - # Model-specific overflow (from VisualGenParams.extra_params) - extra_params: Optional[dict] = None + params: Optional["VisualGenParams"] = None @dataclass @@ -216,29 +198,40 @@ def serve_forever(self): self.process_request(req) def _merge_defaults(self, req: DiffusionRequest): - """Fill ``None`` fields in *req* with pipeline-specific defaults. + """Fill ``None`` fields in *req.params* with pipeline-specific defaults. Merges both universal defaults (from ``default_generation_params``) and extra_param defaults (from ``extra_param_specs``). 
""" + if req.params is None: + from tensorrt_llm.visual_gen.params import VisualGenParams + + kwargs = dict(self.pipeline.default_generation_params) + specs = self.pipeline.extra_param_specs + if specs: + kwargs["extra_params"] = {key: spec.default for key, spec in specs.items()} + req.params = VisualGenParams(**kwargs) + return + + params = req.params # Universal field defaults for field_name, default_value in self.pipeline.default_generation_params.items(): - if hasattr(req, field_name) and getattr(req, field_name) is None: - setattr(req, field_name, default_value) + if hasattr(params, field_name) and getattr(params, field_name) is None: + setattr(params, field_name, default_value) # Extra param defaults — fill all declared keys so infer() can use direct access specs = self.pipeline.extra_param_specs if specs: - if req.extra_params is None: - req.extra_params = {} + if params.extra_params is None: + params.extra_params = {} for key, spec in specs.items(): - if key not in req.extra_params: - req.extra_params[key] = spec.default + if key not in params.extra_params: + params.extra_params[key] = spec.default self._validate_request(req) def _validate_request(self, req: DiffusionRequest): - """Validate *req* against the loaded pipeline's declared parameters. + """Validate *req.params* against the loaded pipeline's declared parameters. 
Raises ``VisualGenParamsError`` on: - Unknown ``extra_params`` keys @@ -251,14 +244,15 @@ def _validate_request(self, req: DiffusionRequest): # (executor → visual_gen.visual_gen → _torch.visual_gen → executor) from tensorrt_llm.visual_gen.visual_gen import VisualGenParamsError + params = req.params errors: list[str] = [] pipeline_name = self.pipeline.__class__.__name__ declared_defaults = self.pipeline.default_generation_params specs = self.pipeline.extra_param_specs # --- unknown extra_params keys --- - if req.extra_params: - unknown = set(req.extra_params.keys()) - set(specs.keys()) + if params.extra_params: + unknown = set(params.extra_params.keys()) - set(specs.keys()) if unknown: errors.append( f"Unknown extra_params {sorted(unknown)} for {pipeline_name}. " @@ -271,7 +265,7 @@ def _validate_request(self, req: DiffusionRequest): # Conditioning inputs (image, negative_prompt, mask) are excluded — # they are validated at runtime by the pipeline's infer(). for field_name in _GENERATION_CONFIG_FIELDS: - value = getattr(req, field_name, None) + value = getattr(params, field_name, None) if value is not None and field_name not in declared_defaults: errors.append( f"Parameter '{field_name}' is set but {pipeline_name} does " @@ -280,8 +274,8 @@ def _validate_request(self, req: DiffusionRequest): ) # --- extra_params type and range checks --- - if req.extra_params: - for key, value in req.extra_params.items(): + if params.extra_params: + for key, value in params.extra_params.items(): if key not in specs: continue # already reported as unknown above spec = specs[key] @@ -315,7 +309,7 @@ def process_request(self, req: DiffusionRequest): try: self._merge_defaults(req) cache_key = self.pipeline.warmup_cache_key( - req.height, req.width, num_frames=req.num_frames + req.params.height, req.params.width, num_frames=req.params.num_frames ) if self.pipeline._warmed_up_shapes and cache_key not in self.pipeline._warmed_up_shapes: logger.warning( diff --git 
a/tensorrt_llm/_torch/visual_gen/models/flux/pipeline_flux.py b/tensorrt_llm/_torch/visual_gen/models/flux/pipeline_flux.py index 8e496cb4e40e..23dbe4620886 100644 --- a/tensorrt_llm/_torch/visual_gen/models/flux/pipeline_flux.py +++ b/tensorrt_llm/_torch/visual_gen/models/flux/pipeline_flux.py @@ -246,12 +246,12 @@ def infer(self, req): """Run inference from DiffusionRequest.""" return self.forward( prompt=req.prompt, - height=req.height, - width=req.width, - num_inference_steps=req.num_inference_steps, - guidance_scale=req.guidance_scale, - seed=req.seed, - max_sequence_length=req.max_sequence_length, + height=req.params.height, + width=req.params.width, + num_inference_steps=req.params.num_inference_steps, + guidance_scale=req.params.guidance_scale, + seed=req.params.seed, + max_sequence_length=req.params.max_sequence_length, ) @torch.inference_mode() diff --git a/tensorrt_llm/_torch/visual_gen/models/flux/pipeline_flux2.py b/tensorrt_llm/_torch/visual_gen/models/flux/pipeline_flux2.py index a1fbb19f18fc..ef9445ba7fe6 100644 --- a/tensorrt_llm/_torch/visual_gen/models/flux/pipeline_flux2.py +++ b/tensorrt_llm/_torch/visual_gen/models/flux/pipeline_flux2.py @@ -339,12 +339,12 @@ def infer(self, req): """Run inference from DiffusionRequest.""" return self.forward( prompt=req.prompt, - height=req.height, - width=req.width, - num_inference_steps=req.num_inference_steps, - guidance_scale=req.guidance_scale, - seed=req.seed, - max_sequence_length=req.max_sequence_length, + height=req.params.height, + width=req.params.width, + num_inference_steps=req.params.num_inference_steps, + guidance_scale=req.params.guidance_scale, + seed=req.params.seed, + max_sequence_length=req.params.max_sequence_length, ) @torch.inference_mode() diff --git a/tensorrt_llm/_torch/visual_gen/models/ltx2/pipeline_ltx2.py b/tensorrt_llm/_torch/visual_gen/models/ltx2/pipeline_ltx2.py index 5fa495481e93..aaf37afed602 100644 --- a/tensorrt_llm/_torch/visual_gen/models/ltx2/pipeline_ltx2.py +++ 
b/tensorrt_llm/_torch/visual_gen/models/ltx2/pipeline_ltx2.py @@ -1170,22 +1170,22 @@ def extra_param_specs(self): def infer(self, req): """Run inference with request parameters.""" - extra = req.extra_params or {} + extra = req.params.extra_params or {} return self.forward( prompt=req.prompt, - negative_prompt=req.negative_prompt, - height=req.height, - width=req.width, - num_frames=req.num_frames, - frame_rate=req.frame_rate, - num_inference_steps=req.num_inference_steps, - guidance_scale=req.guidance_scale, - seed=req.seed, + negative_prompt=req.params.negative_prompt, + height=req.params.height, + width=req.params.width, + num_frames=req.params.num_frames, + frame_rate=req.params.frame_rate, + num_inference_steps=req.params.num_inference_steps, + guidance_scale=req.params.guidance_scale, + seed=req.params.seed, output_type=extra["output_type"], guidance_rescale=extra["guidance_rescale"], - max_sequence_length=req.max_sequence_length, - image=req.image, - image_cond_strength=req.image_cond_strength, + max_sequence_length=req.params.max_sequence_length, + image=req.params.image, + image_cond_strength=req.params.image_cond_strength, stg_scale=extra["stg_scale"], stg_blocks=extra["stg_blocks"], modality_scale=extra["modality_scale"], diff --git a/tensorrt_llm/_torch/visual_gen/models/ltx2/pipeline_ltx2_two_stages.py b/tensorrt_llm/_torch/visual_gen/models/ltx2/pipeline_ltx2_two_stages.py index 40887e77515a..c92080d2a70c 100644 --- a/tensorrt_llm/_torch/visual_gen/models/ltx2/pipeline_ltx2_two_stages.py +++ b/tensorrt_llm/_torch/visual_gen/models/ltx2/pipeline_ltx2_two_stages.py @@ -565,22 +565,22 @@ def load_standard_components( # ------------------------------------------------------------------ def infer(self, req): - extra = req.extra_params or {} + extra = req.params.extra_params or {} return self.forward( prompt=req.prompt, - negative_prompt=req.negative_prompt, - height=req.height, - width=req.width, - num_frames=req.num_frames, - frame_rate=req.frame_rate, - 
num_inference_steps=req.num_inference_steps, - guidance_scale=req.guidance_scale, - seed=req.seed, + negative_prompt=req.params.negative_prompt, + height=req.params.height, + width=req.params.width, + num_frames=req.params.num_frames, + frame_rate=req.params.frame_rate, + num_inference_steps=req.params.num_inference_steps, + guidance_scale=req.params.guidance_scale, + seed=req.params.seed, output_type=extra["output_type"], guidance_rescale=extra["guidance_rescale"], - max_sequence_length=req.max_sequence_length, - image=req.image, - image_cond_strength=req.image_cond_strength, + max_sequence_length=req.params.max_sequence_length, + image=req.params.image, + image_cond_strength=req.params.image_cond_strength, stg_scale=extra["stg_scale"], stg_blocks=extra["stg_blocks"], modality_scale=extra["modality_scale"], diff --git a/tensorrt_llm/_torch/visual_gen/models/wan/pipeline_wan.py b/tensorrt_llm/_torch/visual_gen/models/wan/pipeline_wan.py index afa1510d3413..b851f142e1de 100644 --- a/tensorrt_llm/_torch/visual_gen/models/wan/pipeline_wan.py +++ b/tensorrt_llm/_torch/visual_gen/models/wan/pipeline_wan.py @@ -325,19 +325,19 @@ def extra_param_specs(self): def infer(self, req): """Run inference with request parameters.""" - extra = req.extra_params or {} + extra = req.params.extra_params or {} return self.forward( prompt=req.prompt, - negative_prompt=req.negative_prompt, - height=req.height, - width=req.width, - num_frames=req.num_frames, - num_inference_steps=req.num_inference_steps, - guidance_scale=req.guidance_scale, + negative_prompt=req.params.negative_prompt, + height=req.params.height, + width=req.params.width, + num_frames=req.params.num_frames, + num_inference_steps=req.params.num_inference_steps, + guidance_scale=req.params.guidance_scale, guidance_scale_2=extra.get("guidance_scale_2"), boundary_ratio=extra.get("boundary_ratio"), - seed=req.seed, - max_sequence_length=req.max_sequence_length, + seed=req.params.seed, + 
max_sequence_length=req.params.max_sequence_length, ) @nvtx_range("WanPipeline.forward") diff --git a/tensorrt_llm/_torch/visual_gen/models/wan/pipeline_wan_i2v.py b/tensorrt_llm/_torch/visual_gen/models/wan/pipeline_wan_i2v.py index edc74783721b..920c94f0faba 100644 --- a/tensorrt_llm/_torch/visual_gen/models/wan/pipeline_wan_i2v.py +++ b/tensorrt_llm/_torch/visual_gen/models/wan/pipeline_wan_i2v.py @@ -393,11 +393,11 @@ def extra_param_specs(self): def infer(self, req): """Run inference with request parameters.""" # Extract image from request (can be path, PIL Image, or torch.Tensor) - if req.image is None: + if req.params.image is None: raise ValueError("I2V pipeline requires 'image' parameter") - image = req.image[0] if isinstance(req.image, list) else req.image - extra = req.extra_params or {} + image = req.params.image[0] if isinstance(req.params.image, list) else req.params.image + extra = req.params.extra_params or {} last_image = extra.get("last_image") if last_image is not None and isinstance(last_image, list): @@ -406,16 +406,16 @@ def infer(self, req): return self.forward( image=image, prompt=req.prompt, - negative_prompt=req.negative_prompt, - height=req.height, - width=req.width, - num_frames=req.num_frames, - num_inference_steps=req.num_inference_steps, - guidance_scale=req.guidance_scale, + negative_prompt=req.params.negative_prompt, + height=req.params.height, + width=req.params.width, + num_frames=req.params.num_frames, + num_inference_steps=req.params.num_inference_steps, + guidance_scale=req.params.guidance_scale, guidance_scale_2=extra.get("guidance_scale_2"), boundary_ratio=extra.get("boundary_ratio"), - seed=req.seed, - max_sequence_length=req.max_sequence_length, + seed=req.params.seed, + max_sequence_length=req.params.max_sequence_length, last_image=last_image, ) diff --git a/tensorrt_llm/_torch/visual_gen/pipeline.py b/tensorrt_llm/_torch/visual_gen/pipeline.py index 947b29efb547..cd9d59b547f0 100644 --- 
a/tensorrt_llm/_torch/visual_gen/pipeline.py +++ b/tensorrt_llm/_torch/visual_gen/pipeline.py @@ -256,8 +256,8 @@ def extra_param_specs(self) -> Dict[str, ExtraParamSchema]: def default_generation_params(self) -> dict: """Model-specific defaults for ``None`` fields in ``VisualGenParams``. - Keys should match ``DiffusionRequest`` field names. The executor - merges these into the request before calling ``infer()``. + Keys should match ``VisualGenParams`` field names. The executor + merges these into ``request.params`` before calling ``infer()``. """ return {} diff --git a/tensorrt_llm/inputs/data.py b/tensorrt_llm/inputs/data.py index 48e3441df665..615043fe4878 100644 --- a/tensorrt_llm/inputs/data.py +++ b/tensorrt_llm/inputs/data.py @@ -1,6 +1,6 @@ # Adapt from # https://github.com/vllm-project/vllm/blob/2e33fe419186c65a18da6668972d61d7bbc31564/vllm/inputs/data.py -from typing import Any, Dict, List, Sequence, Union +from typing import Any, Dict, List, Union from typing_extensions import NotRequired, TypedDict @@ -85,80 +85,3 @@ def prompt_inputs(inputs: PromptInputs, ) -> Union[TextPrompt, TokensPrompt]: f"Invalid type of inputs for llm.generate: {type(inputs)}") return prompt_inputs - - -class VisualGenTextPrompt(TypedDict): - prompt: str - negative_prompt: NotRequired[str] - - -class VisualGenTokensPrompt(TypedDict): - prompt_token_ids: List[int] - negative_prompt_token_ids: NotRequired[List[int]] - - -VisualGenPromptInputs = Union[ - str, - List[int], - VisualGenTextPrompt, - VisualGenTokensPrompt, -] - -VisualGenInputs = Union[ - VisualGenPromptInputs, - Sequence[VisualGenPromptInputs], -] - - -def visual_gen_inputs( - inputs: "VisualGenPromptInputs", -) -> Union["VisualGenTextPrompt", "VisualGenTokensPrompt"]: - # str -> text prompt - if isinstance(inputs, str): - return VisualGenTextPrompt(prompt=inputs) - - # list[int] -> token prompt - if isinstance(inputs, list): - if len(inputs) == 0: - raise ValueError("`inputs` token list cannot be empty.") - if not 
all(isinstance(t, int) for t in inputs): - raise TypeError( - "`inputs` list must contain only ints when used as token IDs.") - return VisualGenTokensPrompt(prompt_token_ids=inputs) - - # dict form - if isinstance(inputs, dict): - has_prompt = "prompt" in inputs - has_prompt_token_ids = "prompt_token_ids" in inputs - - if has_prompt == has_prompt_token_ids: - raise ValueError( - "VisualGen prompt dict must contain exactly one of " - "`prompt` or `prompt_token_ids`.") - - if has_prompt: - prompt = inputs.get("prompt") - if not isinstance(prompt, str) or prompt == "": - raise TypeError("`prompt` must be a non-empty string.") - if "negative_prompt" in inputs and not isinstance( - inputs["negative_prompt"], str): - raise TypeError("`negative_prompt` must be a string.") - return inputs # VisualGenTextPrompt - - token_ids = inputs.get("prompt_token_ids") - if not isinstance(token_ids, list) or len(token_ids) == 0: - raise TypeError("`prompt_token_ids` must be a non-empty list[int].") - if not all(isinstance(t, int) for t in token_ids): - raise TypeError("`prompt_token_ids` must contain only ints.") - if "negative_prompt_token_ids" in inputs: - neg_ids = inputs["negative_prompt_token_ids"] - if not isinstance(neg_ids, list) or not all( - isinstance(t, int) for t in neg_ids): - raise TypeError( - "`negative_prompt_token_ids` must be a list[int].") - return inputs # VisualGenTokensPrompt - - raise TypeError( - "Invalid `inputs` for VisualGen.generate. " - "Expected one of: str, list[int], VisualGenTextPrompt, VisualGenTokensPrompt." 
- ) diff --git a/tensorrt_llm/serve/openai_server.py b/tensorrt_llm/serve/openai_server.py index 58cb6a857021..5fbe0d73e9fb 100644 --- a/tensorrt_llm/serve/openai_server.py +++ b/tensorrt_llm/serve/openai_server.py @@ -33,7 +33,7 @@ from tensorrt_llm.executor import CppExecutorError from tensorrt_llm.executor.postproc_worker import PostprocParams from tensorrt_llm.inputs import prompt_inputs -from tensorrt_llm.inputs.data import TokensPrompt, visual_gen_inputs +from tensorrt_llm.inputs.data import TokensPrompt from tensorrt_llm.inputs.multimodal import MultimodalServerConfig from tensorrt_llm.inputs.utils import ConversationMessage, apply_chat_template from tensorrt_llm.llmapi import DisaggregatedParams as LlmDisaggregatedParams @@ -1775,21 +1775,13 @@ async def openai_image_generation(self, request: ImageGenerationRequest, """ try: image_id = f"image_{uuid.uuid4().hex}" - params = parse_visual_gen_params(request, image_id) + params = parse_visual_gen_params(request, image_id, self.generator) logger.info( f"Generating image: {image_id} with params: {params} and prompt: {request.prompt}" ) - if request.negative_prompt is not None: - inputs = visual_gen_inputs({ - "prompt": - request.prompt, - "negative_prompt": - request.negative_prompt - }) - else: - inputs = visual_gen_inputs(request.prompt) - output = self.generator.generate(inputs=inputs, params=params) + output = self.generator.generate(inputs=request.prompt, + params=params) if output.image is None: return self.create_error_response( message="Image generation failed", @@ -1840,21 +1832,13 @@ async def openai_image_edit(self, request: ImageEditRequest, """ try: image_id = f"image_{uuid.uuid4().hex}" - params = parse_visual_gen_params(request, image_id) + params = parse_visual_gen_params(request, image_id, self.generator) logger.info( f"Editing image: {image_id} with params: {params} and prompt: {request.prompt}" ) - if request.negative_prompt is not None: - inputs = visual_gen_inputs({ - "prompt": - 
request.prompt, - "negative_prompt": - request.negative_prompt - }) - else: - inputs = visual_gen_inputs(request.prompt) - output = self.generator.generate(inputs=inputs, params=params) + output = self.generator.generate(inputs=request.prompt, + params=params) if output.image is None: return self.create_error_response( message="Image editing failed", @@ -1909,22 +1893,15 @@ async def openai_video_generation_sync(self, video_id = f"video_{uuid.uuid4().hex}" params = parse_visual_gen_params(request, video_id, + self.generator, media_storage_path=str( self.media_storage_path)) logger.info( f"Generating video: {video_id} with params: {params} and prompt: {request.prompt}" ) - if request.negative_prompt is not None: - inputs = visual_gen_inputs({ - "prompt": - request.prompt, - "negative_prompt": - request.negative_prompt - }) - else: - inputs = visual_gen_inputs(request.prompt) - output = self.generator.generate(inputs=inputs, params=params) + output = self.generator.generate(inputs=request.prompt, + params=params) if output.video is None: return self.create_error_response( message="Video generation failed", @@ -2043,6 +2020,7 @@ async def openai_video_generation_async( video_id = f"video_{uuid.uuid4().hex}" params = parse_visual_gen_params(request, video_id, + self.generator, media_storage_path=str( self.media_storage_path)) logger.info( @@ -2091,16 +2069,8 @@ async def _generate_video_background( resolved_fmt, resolved_ext = resolve_video_format( request.output_format) - if request.negative_prompt is not None: - inputs = visual_gen_inputs({ - "prompt": - request.prompt, - "negative_prompt": - request.negative_prompt - }) - else: - inputs = visual_gen_inputs(request.prompt) - future = self.generator.generate_async(inputs=inputs, params=params) + future = self.generator.generate_async(inputs=request.prompt, + params=params) output = await future.result() if output.video is None: diff --git a/tensorrt_llm/serve/visual_gen_utils.py 
b/tensorrt_llm/serve/visual_gen_utils.py index 27aacf33766a..f231156170fe 100644 --- a/tensorrt_llm/serve/visual_gen_utils.py +++ b/tensorrt_llm/serve/visual_gen_utils.py @@ -9,47 +9,52 @@ ImageGenerationRequest, VideoGenerationRequest, ) -from tensorrt_llm.visual_gen import VisualGenParams +from tensorrt_llm.visual_gen import VisualGen, VisualGenParams def parse_visual_gen_params( request: ImageGenerationRequest | VideoGenerationRequest | ImageEditRequest, id: str, + generator: VisualGen, media_storage_path: Optional[str] = None, ) -> VisualGenParams: - kwargs: Dict[str, Any] = {} - extra: Dict[str, Any] = {} - - kwargs["negative_prompt"] = request.negative_prompt + # Start from the pipeline's resolved defaults so unspecified request + # fields keep the model's defaults instead of being overwritten with None. + params = generator.default_params + if params.extra_params is None: + params.extra_params = {} + + if request.negative_prompt is not None: + params.negative_prompt = request.negative_prompt if request.size is not None and request.size != "auto": - kwargs["width"], kwargs["height"] = map(int, request.size.split("x")) + params.width, params.height = map(int, request.size.split("x")) if request.guidance_scale is not None: - kwargs["guidance_scale"] = request.guidance_scale + params.guidance_scale = request.guidance_scale if request.guidance_rescale is not None: - extra["guidance_rescale"] = request.guidance_rescale + params.extra_params["guidance_rescale"] = request.guidance_rescale if isinstance(request, (ImageGenerationRequest, ImageEditRequest)): if request.num_inference_steps is not None: - kwargs["num_inference_steps"] = request.num_inference_steps + params.num_inference_steps = request.num_inference_steps elif isinstance(request, ImageGenerationRequest) and request.quality == "hd": - kwargs["num_inference_steps"] = 30 + params.num_inference_steps = 30 if request.n is not None: - kwargs["num_images_per_prompt"] = request.n + params.num_images_per_prompt = 
request.n if isinstance(request, ImageEditRequest): if request.image is not None: if isinstance(request.image, list): - kwargs["image"] = [base64.b64decode(image) for image in request.image] + params.image = [base64.b64decode(image) for image in request.image] else: - kwargs["image"] = [base64.b64decode(request.image)] + params.image = [base64.b64decode(request.image)] if request.mask is not None: if isinstance(request.mask, list): - kwargs["mask"] = [base64.b64decode(mask) for mask in request.mask] + params.mask = [base64.b64decode(mask) for mask in request.mask] else: - kwargs["mask"] = base64.b64decode(request.mask) + params.mask = base64.b64decode(request.mask) elif isinstance(request, VideoGenerationRequest): if request.num_inference_steps is not None: - kwargs["num_inference_steps"] = request.num_inference_steps + params.num_inference_steps = request.num_inference_steps if request.input_reference is not None: if media_storage_path is None: raise ValueError("media_storage_path is required when input_reference is provided") @@ -60,18 +65,20 @@ def parse_visual_gen_params( else: with open(ref_path, "wb") as f: shutil.copyfileobj(request.input_reference.file, f) - kwargs["image"] = ref_path + params.image = ref_path - kwargs["frame_rate"] = request.fps - kwargs["num_frames"] = int(request.seconds * request.fps) + params.frame_rate = request.fps + params.num_frames = int(request.seconds * request.fps) if request.seed is not None: - kwargs["seed"] = int(request.seed) + params.seed = int(request.seed) - if extra: - kwargs["extra_params"] = extra + # Drop extra_params if we didn't end up with any — matches VisualGenParams + # convention where None means "no extras" for pipelines that declare none. 
+ if not params.extra_params: + params.extra_params = None - return VisualGenParams(**kwargs) + return params class AsyncDictStore: diff --git a/tensorrt_llm/visual_gen/visual_gen.py b/tensorrt_llm/visual_gen/visual_gen.py index 80d5e5974d0d..aac89e0e77fb 100644 --- a/tensorrt_llm/visual_gen/visual_gen.py +++ b/tensorrt_llm/visual_gen/visual_gen.py @@ -45,7 +45,6 @@ "VisualGenResult", ] from tensorrt_llm.executor.ipc import ZeroMqQueue -from tensorrt_llm.inputs.data import VisualGenInputs from tensorrt_llm.llmapi.utils import set_api_status from tensorrt_llm.logger import logger @@ -538,13 +537,14 @@ def default_params(self) -> "VisualGenParams": @set_api_status("prototype") def generate( self, - inputs: VisualGenInputs, - params: VisualGenParams, + inputs: Union[str, List[str]], + params: Optional[VisualGenParams] = None, ) -> MediaOutput: """Synchronous generation. Blocks until complete. Args: - params: Generation parameters. + inputs: Text prompt string or list of prompt strings. + params: Generation parameters (optional; uses model defaults when None). Returns: MediaOutput: Generated media with model-specific fields populated: @@ -566,70 +566,38 @@ def generate( @set_api_status("prototype") def generate_async( self, - inputs: VisualGenInputs, - params: VisualGenParams, + inputs: Union[str, List[str]], + params: Optional[VisualGenParams] = None, ) -> VisualGenResult: """Async generation. Returns immediately with future-like object. Args: - params: Generation parameters. + inputs: Text prompt string or list of prompt strings. + params: Generation parameters (optional; uses model defaults when None). Returns: VisualGenResult: Call result() to get output dict. """ req_id = next(self._req_counter) - # Normalize inputs to (prompt: List[str], negative_prompt: Optional[str]) - # so DiffusionRequest.prompt is always a list. 
- if isinstance(inputs, dict): - prompt = [inputs.get("prompt")] - negative_prompt = inputs.get("negative_prompt", None) - elif isinstance(inputs, str): + # Normalize to List[str] for DiffusionRequest.prompt + if isinstance(inputs, str): prompt = [inputs] - negative_prompt = None elif isinstance(inputs, (list, tuple)): - # Batch generation: list of prompts if not inputs: raise ValueError("Batch inputs must contain at least one item") - - prompt = [] - negative_prompts = [] - for idx, inp in enumerate(inputs): - if isinstance(inp, str): - prompt.append(inp) - negative_prompts.append(None) - elif isinstance(inp, dict): - item_prompt = inp.get("prompt") - if item_prompt is None: - raise ValueError(f"Batch input at index {idx} is missing 'prompt'") - prompt.append(item_prompt) - negative_prompts.append(inp.get("negative_prompt")) - else: - raise ValueError(f"Invalid batch item type at index {idx}: {type(inp)}") - - unique_negatives = {p for p in negative_prompts if p is not None} - if len(unique_negatives) > 1: - raise ValueError("Per-item negative_prompt is not supported for batch inputs") - negative_prompt = next(iter(unique_negatives), None) + if not all(isinstance(item, str) for item in inputs): + raise ValueError("Batch inputs must contain only strings (prompt text)") + prompt = list(inputs) else: raise ValueError(f"Invalid inputs type: {type(inputs)}") + # Snapshot caller-provided params so later mutations don't affect + # the queued request (the dispatcher thread serializes it lazily). 
request = DiffusionRequest( request_id=req_id, prompt=prompt, - negative_prompt=negative_prompt, - height=params.height, - width=params.width, - num_inference_steps=params.num_inference_steps, - guidance_scale=params.guidance_scale, - max_sequence_length=params.max_sequence_length, - seed=params.seed, - num_frames=params.num_frames, - frame_rate=params.frame_rate, - num_images_per_prompt=params.num_images_per_prompt, - image=params.image, - image_cond_strength=params.image_cond_strength, - extra_params=params.extra_params, + params=params.model_copy(deep=True) if params is not None else None, ) self.executor.enqueue_requests([request]) diff --git a/tests/unittest/_torch/visual_gen/test_trtllm_serve_endpoints.py b/tests/unittest/_torch/visual_gen/test_trtllm_serve_endpoints.py index cd680b93d77b..b0081f87334b 100644 --- a/tests/unittest/_torch/visual_gen/test_trtllm_serve_endpoints.py +++ b/tests/unittest/_torch/visual_gen/test_trtllm_serve_endpoints.py @@ -90,10 +90,16 @@ def __init__( self._should_fail = should_fail self._healthy = True self._req_counter = 0 + # Captured arguments of the most recent generate / generate_async call, + # used by tests to assert forwarded VisualGenParams fields. 
+ self.last_inputs = None + self.last_params = None # --- VisualGen interface --- def generate(self, inputs=None, params=None) -> MediaOutput: + self.last_inputs = inputs + self.last_params = params if self._should_fail: raise RuntimeError("Generation intentionally failed") return MediaOutput( @@ -103,6 +109,8 @@ def generate(self, inputs=None, params=None) -> MediaOutput: ) def generate_async(self, inputs=None, params=None) -> "MockVisualGenResult": + self.last_inputs = inputs + self.last_params = params return MockVisualGenResult( image=self._image, video=self._video, @@ -110,6 +118,14 @@ def generate_async(self, inputs=None, params=None) -> "MockVisualGenResult": should_fail=self._should_fail, ) + @property + def default_params(self): + """Stand-in for VisualGen.default_params — parse_visual_gen_params + seeds request params from this, so it must return a fresh instance.""" + from tensorrt_llm.visual_gen import VisualGenParams + + return VisualGenParams() + def _check_health(self) -> bool: return self._healthy @@ -167,7 +183,10 @@ def _create_server(generator: MockVisualGen, model_name: str = "test-model") -> server_role=ServerRole.VISUAL_GEN, metadata_server_cfg=None, ) - return TestClient(server.app) + client = TestClient(server.app) + # Expose the mock so tests can assert captured generate() arguments. + client.mock_gen = generator + return client # --------------------------------------------------------------------------- @@ -291,6 +310,15 @@ def test_image_generation_with_optional_params(self, image_client): data = resp.json() assert data["size"] == "128x64" + # Verify openai_server/parse_visual_gen_params forwarded every field. 
+ params = image_client.mock_gen.last_params + assert image_client.mock_gen.last_inputs == "Sunset over ocean" + assert params.width == 128 + assert params.height == 64 + assert params.num_inference_steps == 20 + assert params.guidance_scale == 7.5 + assert params.negative_prompt == "blurry" + def test_image_generation_url_format_not_supported(self, image_client): resp = image_client.post( "/v1/images/generations", @@ -497,6 +525,13 @@ def test_image_edit_with_mask(self, image_client): ) assert resp.status_code == 200 + # Verify image + mask were base64-decoded and forwarded. + params = image_client.mock_gen.last_params + expected_bytes = base64.b64decode(b64_img) + assert params.image == [expected_bytes] + assert params.mask == base64.b64decode(b64_mask) + assert params.num_inference_steps == 10 + def test_image_edit_with_optional_params(self, image_client): b64_img = _b64_white_png_1x1() resp = image_client.post( @@ -515,6 +550,14 @@ def test_image_edit_with_optional_params(self, image_client): data = resp.json() assert data["size"] == "128x128" + params = image_client.mock_gen.last_params + assert params.width == 128 + assert params.height == 128 + assert params.guidance_scale == 8.0 + assert params.num_inference_steps == 15 + assert params.negative_prompt == "dark" + assert params.image == [base64.b64decode(b64_img)] + def test_image_edit_failure(self, failing_client): b64_img = _b64_white_png_1x1() resp = failing_client.post( @@ -626,6 +669,17 @@ def test_sync_video_generation_with_params(self, video_client): assert resp.status_code == 200 assert len(resp.content) > 0 + params = video_client.mock_gen.last_params + assert video_client.mock_gen.last_inputs == "Ocean waves" + assert params.width == 64 + assert params.height == 64 + assert params.num_inference_steps == 10 + assert params.guidance_scale == 5.0 + assert params.seed == 42 + assert params.negative_prompt == "blurry" + assert params.frame_rate == 8 + assert params.num_frames == int(2.0 * 8) + def 
test_sync_video_generation_multipart(self, video_client): # Use files={} with a dummy file to ensure multipart/form-data dummy_file = BytesIO(b"") @@ -662,6 +716,13 @@ def test_sync_video_generation_multipart_with_reference(self, video_client, tmp_ assert resp.status_code == 200 assert len(resp.content) > 0 + # input_reference should have been written to media storage and passed + # through as params.image (a filesystem path). + params = video_client.mock_gen.last_params + assert isinstance(params.image, str) + assert params.image.endswith("_reference.png") + assert os.path.exists(params.image) + def test_sync_video_failure(self, failing_client): resp = failing_client.post( "/v1/videos/generations", @@ -800,6 +861,47 @@ def test_async_video_invalid_fps(self, video_client): ) assert resp.status_code == 400 + def test_async_video_forwards_params(self, video_client): + """Ensure async video endpoint forwards VisualGenParams to generate_async.""" + resp = video_client.post( + "/v1/videos", + json={ + "prompt": "Rainy street", + "size": "128x64", + "seconds": 2.0, + "fps": 10, + "num_inference_steps": 12, + "guidance_scale": 6.0, + "seed": 7, + "negative_prompt": "noise", + }, + headers={"content-type": "application/json"}, + ) + assert resp.status_code == 202 + video_id = resp.json()["id"] + + # The background task calls generate_async lazily — drive the event + # loop via status polling until the job completes. 
+ import time as _time + + deadline = _time.time() + 5 + while _time.time() < deadline: + meta = video_client.get(f"/v1/videos/{video_id}").json() + if meta.get("status") in ("completed", "failed"): + break + _time.sleep(0.05) + + params = video_client.mock_gen.last_params + assert video_client.mock_gen.last_inputs == "Rainy street" + assert params.width == 128 + assert params.height == 64 + assert params.num_inference_steps == 12 + assert params.guidance_scale == 6.0 + assert params.seed == 7 + assert params.negative_prompt == "noise" + assert params.frame_rate == 10 + assert params.num_frames == int(2.0 * 10) + # ========================================================================= # GET /v1/videos (list) diff --git a/tests/unittest/_torch/visual_gen/test_visual_gen_args.py b/tests/unittest/_torch/visual_gen/test_visual_gen_args.py index 5a3f83a3b7c0..551fe39a2a93 100644 --- a/tests/unittest/_torch/visual_gen/test_visual_gen_args.py +++ b/tests/unittest/_torch/visual_gen/test_visual_gen_args.py @@ -239,8 +239,8 @@ def test_list_prompt(self): assert len(req.prompt) == 2 -class TestVisualGenBatchInputParsing: - """Test that VisualGen.generate_async() correctly parses batch inputs. +class TestVisualGenInputParsing: + """Test that VisualGen.generate_async() correctly parses inputs. Uses mocking to avoid spawning GPU worker processes. 
""" @@ -258,116 +258,59 @@ def _make_visual_gen_with_mock_executor(self): vg.executor = MagicMock() return vg - def _make_params(self): - from tensorrt_llm.visual_gen import VisualGenParams - - return VisualGenParams() - def test_string_input(self): """String input → single-element list in DiffusionRequest.""" vg = self._make_visual_gen_with_mock_executor() - params = self._make_params() - vg.generate_async(inputs="a cat", params=params) + vg.generate_async(inputs="a cat") - # Check the DiffusionRequest passed to enqueue_requests call_args = vg.executor.enqueue_requests.call_args[0][0] assert len(call_args) == 1 assert call_args[0].prompt == ["a cat"] - def test_dict_input(self): - """Dict input → prompt wrapped in list + negative_prompt extracted.""" - vg = self._make_visual_gen_with_mock_executor() - params = self._make_params() - - vg.generate_async( - inputs={"prompt": "a cat", "negative_prompt": "blurry"}, - params=params, - ) - - call_args = vg.executor.enqueue_requests.call_args[0][0] - assert call_args[0].prompt == ["a cat"] - assert call_args[0].negative_prompt == "blurry" - def test_list_of_strings_input(self): """List of strings → batch prompt in single DiffusionRequest.""" vg = self._make_visual_gen_with_mock_executor() - params = self._make_params() - vg.generate_async(inputs=["a sunset", "a city"], params=params) + vg.generate_async(inputs=["a sunset", "a city"]) call_args = vg.executor.enqueue_requests.call_args[0][0] assert len(call_args) == 1 req = call_args[0] assert req.prompt == ["a sunset", "a city"] - assert req.negative_prompt is None - def test_list_of_dicts_input(self): - """List of dicts → batch prompts with negative_prompt from first dict.""" + def test_params_default_none(self): + """Omitting params passes None; executor materializes defaults later.""" vg = self._make_visual_gen_with_mock_executor() - params = self._make_params() - - vg.generate_async( - inputs=[ - {"prompt": "a sunset", "negative_prompt": "dark"}, - {"prompt": "a 
city"}, - ], - params=params, - ) + + vg.generate_async(inputs="a cat") call_args = vg.executor.enqueue_requests.call_args[0][0] req = call_args[0] - assert req.prompt == ["a sunset", "a city"] - assert req.negative_prompt == "dark" + assert req.params is None + + def test_negative_prompt_via_params(self): + """negative_prompt is passed through params, not inputs.""" + from tensorrt_llm.visual_gen import VisualGenParams - def test_mixed_list_input(self): - """Mixed list of strings and dicts → batch prompts extracted.""" vg = self._make_visual_gen_with_mock_executor() - params = self._make_params() + params = VisualGenParams(negative_prompt="blurry") - vg.generate_async(inputs=["a sunset", {"prompt": "a city"}], params=params) + vg.generate_async(inputs="a cat", params=params) call_args = vg.executor.enqueue_requests.call_args[0][0] - req = call_args[0] - assert req.prompt == ["a sunset", "a city"] - - def test_conflicting_negative_prompt_raises(self): - """Conflicting per-item negative_prompt raises ValueError.""" - vg = self._make_visual_gen_with_mock_executor() - params = self._make_params() - - with pytest.raises(ValueError, match="Per-item negative_prompt is not supported"): - vg.generate_async( - inputs=[ - {"prompt": "a sunset", "negative_prompt": "dark"}, - {"prompt": "a city", "negative_prompt": "light"}, - ], - params=params, - ) + assert call_args[0].params.negative_prompt == "blurry" def test_empty_batch_raises(self): """Empty batch input raises ValueError.""" vg = self._make_visual_gen_with_mock_executor() - params = self._make_params() with pytest.raises(ValueError, match="at least one item"): - vg.generate_async(inputs=[], params=params) - - def test_missing_prompt_in_dict_raises(self): - """Dict without 'prompt' key raises ValueError.""" - vg = self._make_visual_gen_with_mock_executor() - params = self._make_params() - - with pytest.raises(ValueError, match="missing 'prompt'"): - vg.generate_async( - inputs=[{"negative_prompt": "dark"}], - 
params=params, - ) + vg.generate_async(inputs=[]) def test_invalid_input_raises(self): """Invalid input type raises ValueError.""" vg = self._make_visual_gen_with_mock_executor() - params = self._make_params() with pytest.raises(ValueError, match="Invalid inputs type"): - vg.generate_async(inputs=12345, params=params) + vg.generate_async(inputs=12345) diff --git a/tests/unittest/_torch/visual_gen/test_visual_gen_params.py b/tests/unittest/_torch/visual_gen/test_visual_gen_params.py index 8521592a26af..4eb70522226e 100644 --- a/tests/unittest/_torch/visual_gen/test_visual_gen_params.py +++ b/tests/unittest/_torch/visual_gen/test_visual_gen_params.py @@ -300,8 +300,9 @@ def _make_mock_executor(self, pipeline_cls, mock_self=None): def _make_request(self, **kwargs): from tensorrt_llm._torch.visual_gen.executor import DiffusionRequest + from tensorrt_llm.visual_gen.params import VisualGenParams - return DiffusionRequest(request_id=0, prompt=["test"], **kwargs) + return DiffusionRequest(request_id=0, prompt=["test"], params=VisualGenParams(**kwargs)) def _merge(self, executor, req): from tensorrt_llm._torch.visual_gen.executor import DiffusionExecutor @@ -313,12 +314,12 @@ def test_universal_defaults_merged(self): executor = self._make_mock_executor(WanPipeline, _wan_mock(is_wan22=False, num_heads=12)) req = self._make_request() - assert req.height is None + assert req.params.height is None self._merge(executor, req) - assert req.height == 480 - assert req.width == 832 - assert req.num_inference_steps == 50 + assert req.params.height == 480 + assert req.params.width == 832 + assert req.params.num_inference_steps == 50 def test_user_values_not_overwritten(self): from tensorrt_llm._torch.visual_gen.models.wan.pipeline_wan import WanPipeline @@ -327,9 +328,9 @@ def test_user_values_not_overwritten(self): req = self._make_request(height=1080, width=1920) self._merge(executor, req) - assert req.height == 1080 # User value preserved - assert req.width == 1920 - assert 
req.num_inference_steps == 50 # Default filled + assert req.params.height == 1080 # User value preserved + assert req.params.width == 1920 + assert req.params.num_inference_steps == 50 # Default filled def test_extra_params_defaults_merged(self): from tensorrt_llm._torch.visual_gen.models.ltx2.pipeline_ltx2 import LTX2Pipeline @@ -338,12 +339,12 @@ def test_extra_params_defaults_merged(self): req = self._make_request() self._merge(executor, req) - assert req.extra_params is not None - assert req.extra_params["stg_scale"] == 0.0 - assert req.extra_params["output_type"] == "pt" - assert req.extra_params["enhance_prompt"] is False + assert req.params.extra_params is not None + assert req.params.extra_params["stg_scale"] == 0.0 + assert req.params.extra_params["output_type"] == "pt" + assert req.params.extra_params["enhance_prompt"] is False # None defaults are also filled - assert req.extra_params["stg_blocks"] is None + assert req.params.extra_params["stg_blocks"] is None def test_user_extra_params_not_overwritten(self): from tensorrt_llm._torch.visual_gen.models.ltx2.pipeline_ltx2 import LTX2Pipeline @@ -352,8 +353,8 @@ def test_user_extra_params_not_overwritten(self): req = self._make_request(extra_params={"stg_scale": 0.5}) self._merge(executor, req) - assert req.extra_params["stg_scale"] == 0.5 # User value preserved - assert req.extra_params["output_type"] == "pt" # Default filled + assert req.params.extra_params["stg_scale"] == 0.5 # User value preserved + assert req.params.extra_params["output_type"] == "pt" # Default filled def test_no_extra_params_for_flux(self): from tensorrt_llm._torch.visual_gen.models.flux.pipeline_flux import FluxPipeline @@ -362,7 +363,7 @@ def test_no_extra_params_for_flux(self): req = self._make_request() self._merge(executor, req) - assert req.extra_params is None # Flux has no extra specs + assert req.params.extra_params is None # Flux has no extra specs def test_all_declared_keys_present_after_merge(self): """After merge, all 
extra_param_specs keys are in extra_params.""" @@ -374,7 +375,30 @@ def test_all_declared_keys_present_after_merge(self): self._merge(executor, req) ltx2_specs = LTX2Pipeline.extra_param_specs.fget(None) for key in ltx2_specs: - assert key in req.extra_params, f"Missing key: {key}" + assert key in req.params.extra_params, f"Missing key: {key}" + + def test_params_none_materializes_defaults(self): + """req.params=None is the default path from generate_async(params=None); + _merge_defaults should materialize a VisualGenParams from pipeline defaults.""" + from tensorrt_llm._torch.visual_gen.executor import DiffusionRequest + from tensorrt_llm._torch.visual_gen.models.ltx2.pipeline_ltx2 import LTX2Pipeline + from tensorrt_llm.visual_gen.params import VisualGenParams + + executor = self._make_mock_executor(LTX2Pipeline) + req = DiffusionRequest(request_id=0, prompt=["test"], params=None) + + self._merge(executor, req) + + assert isinstance(req.params, VisualGenParams) + # Universal defaults are filled from the pipeline + assert req.params.height == 512 + assert req.params.width == 768 + assert req.params.num_inference_steps == 40 + # Extra-param defaults are filled for all declared keys + assert req.params.extra_params is not None + assert req.params.extra_params["stg_scale"] == 0.0 + assert req.params.extra_params["output_type"] == "pt" + assert "stg_blocks" in req.params.extra_params # ============================================================================= @@ -658,8 +682,9 @@ def _make_mock_executor(self, pipeline_cls, mock_self=None): def _make_request(self, **kwargs): from tensorrt_llm._torch.visual_gen.executor import DiffusionRequest + from tensorrt_llm.visual_gen.params import VisualGenParams - return DiffusionRequest(request_id=0, prompt=["test"], **kwargs) + return DiffusionRequest(request_id=0, prompt=["test"], params=VisualGenParams(**kwargs)) def _validate(self, executor, req): from tensorrt_llm._torch.visual_gen.executor import DiffusionExecutor @@ 
-757,6 +782,22 @@ def test_none_fields_not_flagged(self): req = self._make_request() # all None self._merge_and_validate(executor, req) + def test_params_none_merge_and_validate_ok(self): + """req.params=None must merge + validate cleanly (VisualGen.generate_async + defaults to params=None, so this is the canonical call path).""" + from tensorrt_llm._torch.visual_gen.executor import DiffusionRequest + from tensorrt_llm._torch.visual_gen.models.ltx2.pipeline_ltx2 import LTX2Pipeline + from tensorrt_llm.visual_gen.params import VisualGenParams + + executor = self._make_mock_executor(LTX2Pipeline) + req = DiffusionRequest(request_id=0, prompt=["test"], params=None) + + self._merge_and_validate(executor, req) # should not raise + + assert isinstance(req.params, VisualGenParams) + assert req.params.height == 512 + assert req.params.extra_params["stg_scale"] == 0.0 + # --- type validation on extra_params --- def test_wrong_type_extra_param_raises(self): From dd907c045ecb31b064cce6e0ca0ac8161830807f Mon Sep 17 00:00:00 2001 From: chenfeiz0326 Date: Sun, 26 Apr 2026 22:31:38 +0800 Subject: [PATCH 2/2] [None][test] Update CI Post-Merge Disagg Perf Tests (#13343) Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> Signed-off-by: Chenfei Zhang Co-authored-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> --- .../perf-test-sync/build_prompt.py | 30 + .../perf-test-sync/promptfooconfig.yaml | 379 +++++++++++ .../perf-test-sync/render_report.py | 328 ++++++++++ .claude/agent-tests/perf-test-sync/run.sh | 41 ++ .claude/agents/perf-test-sync.md | 594 ++++++++++++++++++ .gitignore | 5 + jenkins/L0_Test.groovy | 66 +- jenkins/scripts/perf/disaggregated/submit.py | 7 +- jenkins/scripts/perf/local/submit.py | 24 +- .../integration/defs/perf/test_perf_sanity.py | 9 +- .../test_lists/qa/llm_perf_multinode.txt | 294 +++++---- .../l0_b200_multi_gpus_perf_sanity.yml | 12 +- ...sanity_ctx1_node1_gpu4_gen1_node1_gpu8.yml | 24 +- 
.../l0_gb200_multi_gpus_perf_sanity.yml | 61 +- ...sanity_ctx1_node1_gpu1_gen1_node1_gpu2.yml | 10 +- ...sanity_ctx1_node1_gpu1_gen1_node1_gpu4.yml | 17 +- ...sanity_ctx1_node1_gpu1_gen1_node2_gpu8.yml | 4 +- ...sanity_ctx1_node1_gpu4_gen1_node1_gpu4.yml | 11 +- ...sanity_ctx1_node1_gpu4_gen1_node2_gpu8.yml | 25 +- ...anity_ctx1_node1_gpu4_gen1_node4_gpu16.yml | 8 +- ...anity_ctx1_node1_gpu4_gen1_node8_gpu32.yml | 32 +- ...sanity_ctx1_node2_gpu8_gen1_node2_gpu8.yml | 6 +- ...anity_ctx1_node2_gpu8_gen1_node4_gpu16.yml | 4 +- ...anity_ctx1_node2_gpu8_gen1_node8_gpu32.yml | 3 +- ...anity_ctx2_node1_gpu4_gen1_node4_gpu16.yml | 6 +- ...200_multi_nodes_perf_sanity_node2_gpu8.yml | 6 +- .../l0_gb300_multi_gpus_perf_sanity.yml | 28 + ...sanity_ctx1_node1_gpu4_gen1_node1_gpu4.yml | 22 + ...sanity_ctx1_node1_gpu4_gen1_node2_gpu8.yml | 22 + ...anity_ctx1_node1_gpu4_gen1_node4_gpu16.yml | 22 + ...anity_ctx1_node1_gpu4_gen1_node8_gpu32.yml | 22 + ...300_multi_nodes_perf_sanity_node2_gpu8.yml | 20 + tests/integration/test_lists/waives.txt | 16 +- ...1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} | 4 +- ...1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml} | 4 +- ...1_dep4_gen1_dep8_eplb0_mtp3_ccb-NIXL.yaml} | 4 +- ...1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml} | 4 +- ...1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} | 4 +- ...1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml} | 4 +- ...x1_pp8_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml | 98 +++ ...x1_pp8_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} | 4 +- ...1_pp8_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml} | 4 +- ..._dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml} | 6 +- ..._dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml} | 4 +- ...1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml} | 8 +- ...1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} | 4 +- ...ep4_gen1_dep16_eplb288_mtp3_ccb-NIXL.yaml} | 4 +- ...x1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml | 99 +++ ..._dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml} | 4 +- ...1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} | 4 +- ...1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml 
| 99 +++ ...ep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml} | 4 +- ...1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} | 4 +- ...x1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml | 107 ++++ ...1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} | 4 +- ...dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL.yaml | 114 ++++ ..._dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml} | 4 +- ...1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml} | 6 +- ...ep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml} | 4 +- ...1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} | 4 +- ...dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL.yaml | 106 ++++ ...1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} | 4 +- ...1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml} | 4 +- ..._dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml} | 4 +- ...1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml} | 4 +- ...1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} | 4 +- ..._dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml} | 4 +- ...tx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml | 96 +++ ...x1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml} | 4 +- ...tx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml} | 4 +- ...ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml | 106 ++++ ...tx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml} | 4 +- ...tx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml} | 4 +- ...x1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml} | 4 +- ..._dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml} | 4 +- ...x1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml | 97 +++ ...1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml} | 4 +- ..._dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml} | 4 +- ...1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL.yaml | 97 +++ ...1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} | 4 +- ...tx1_tp1_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml | 94 +++ ...tx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml | 93 +++ ...x1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml} | 4 +- ...tx1_pp4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml | 98 +++ ...tx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml | 99 +++ ...x1_pp4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml | 99 +++ ..._dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml} | 4 +- ...x1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml | 98 
+++ ...x1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml | 99 +++ ...1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml | 99 +++ ...x1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml | 98 +++ ...1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml | 99 +++ ...dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml | 110 ++++ ...x1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml | 106 ++++ ...x1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml | 107 ++++ ...x1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml | 111 ++++ ...dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL.yaml | 114 ++++ ...1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml | 111 ++++ ...x1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml | 101 +++ ...dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml | 110 ++++ ...x1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml | 106 ++++ ...dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL.yaml | 106 ++++ ...1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml | 97 +++ ...x1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml | 97 +++ ...x1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml | 95 +++ ...1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml | 103 +++ ...1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL.yaml | 97 +++ ...x1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml | 102 +++ ...ep16_bs64_eplb0_mtp3_con1024_ccb-NIXL.yaml | 105 ---- ...dep16_bs64_eplb0_mtp3_con1024_ccb-UCX.yaml | 105 ---- ...dep16_bs64_eplb0_mtp3_con512_ccb-NIXL.yaml | 105 ---- ..._dep16_bs64_eplb0_mtp3_con512_ccb-UCX.yaml | 105 ---- ...dep32_bs16_eplb0_mtp3_con512_ccb-NIXL.yaml | 105 ---- ..._dep32_bs16_eplb0_mtp3_con512_ccb-UCX.yaml | 105 ---- ...4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml | 100 --- ...n4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml | 100 --- ...en4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml | 100 --- ...n4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml | 100 --- ...en4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml | 100 --- ...4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml | 100 --- ...n4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml | 100 --- ...n4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml | 100 --- ...en4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml | 100 --- 
...n4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml | 100 --- ...en4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml | 100 --- ...1_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml | 90 --- ...n1_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml | 90 --- ...n1_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml | 90 --- ...en1_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml | 90 --- ...n1_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml | 90 --- ...en1_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml | 90 --- ...1_tep8_bs32_eplb0_mtp0_con36_ccb-NIXL.yaml | 90 --- ...n1_tep8_bs32_eplb0_mtp0_con36_ccb-UCX.yaml | 90 --- ...n1_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml | 90 --- ...en1_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml | 90 --- ...n1_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml | 90 --- ...en1_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml | 90 --- ...en13_tep4_bs1_eplb0_mtp0_con1-Default.yaml | 98 --- ...gen5_tep4_bs4_eplb0_mtp0_con4-Default.yaml | 98 --- ...gen6_tep8_bs1_eplb0_mtp3_con1-Default.yaml | 104 --- ...gen7_tep8_bs1_eplb0_mtp0_con1-Default.yaml | 98 --- ...gen8_tep8_bs1_eplb0_mtp0_con1-Default.yaml | 98 --- ...en11_tep4_bs2_eplb0_mtp0_con2-Default.yaml | 98 --- ...en14_tep4_bs1_eplb0_mtp0_con1-Default.yaml | 98 --- ...en1_dep16_bs1_eplb0_mtp3_con1-Default.yaml | 101 --- ...gen1_dep8_bs4_eplb0_mtp2_con4-Default.yaml | 101 --- ...gen1_tep8_bs1_eplb0_mtp0_con1-Default.yaml | 98 --- ...gen1_tep8_bs1_eplb0_mtp3_con1-Default.yaml | 102 --- ...gen1_tep8_bs2_eplb0_mtp3_con2-Default.yaml | 102 --- ...gen5_tep8_bs2_eplb0_mtp3_con2-Default.yaml | 102 --- ...gen7_tep4_bs2_eplb0_mtp2_con2-Default.yaml | 102 --- ...gen7_tep8_bs1_eplb0_mtp0_con1-Default.yaml | 98 --- ...gen8_tep4_bs4_eplb0_mtp0_con4-Default.yaml | 98 --- ...gen7_tep8_bs2_eplb0_mtp3_con2-Default.yaml | 104 --- ...en1_dep16_bs8_eplb0_mtp0_con8-Default.yaml | 97 --- ...en1_dep32_bs2_eplb0_mtp0_con2-Default.yaml | 97 --- ...1_dep8_bs16_eplb0_mtp1_con128-Default.yaml | 103 --- ...1_dep16_bs16_eplb0_mtp0_con16-Default.yaml | 97 --- ...en1_dep16_bs8_eplb0_mtp2_con8-Default.yaml | 101 --- 
...en1_dep32_bs2_eplb0_mtp3_con2-Default.yaml | 101 --- ...en1_dep32_bs4_eplb0_mtp0_con4-Default.yaml | 97 --- ..._dep16_bs16_eplb0_mtp0_con256-Default.yaml | 97 --- ...1_dep16_bs8_eplb0_mtp3_con128-Default.yaml | 103 --- ...n1_dep32_bs2_eplb0_mtp3_con64-Default.yaml | 103 --- ...1_dep32_bs4_eplb0_mtp0_con128-Default.yaml | 97 --- ..._dep16_bs16_eplb0_mtp1_con256-Default.yaml | 103 --- ..._dep16_bs32_eplb0_mtp0_con512-Default.yaml | 97 --- ..._dep16_bs32_eplb0_mtp1_con512-Default.yaml | 103 --- ...1_dep32_bs4_eplb0_mtp3_con128-Default.yaml | 103 --- ...1_dep32_bs8_eplb0_mtp0_con256-Default.yaml | 97 --- ...1_dep32_bs8_eplb0_mtp3_con256-Default.yaml | 103 --- ...dep32_bs32_eplb0_mtp0_con1024_ccb-UCX.yaml | 99 --- ...4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml | 100 --- ...n4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml | 100 --- ...n4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml | 100 --- ...en4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml | 100 --- ...n4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml | 100 --- ...en4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml | 100 --- ...4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml | 100 --- ...n4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml | 100 --- ...n4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml | 100 --- ...en4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml | 100 --- ...n4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml | 100 --- ...en4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml | 100 --- ...4_tep8_bs32_eplb0_mtp3_con16_ccb-NIXL.yaml | 106 ---- ...n4_tep8_bs32_eplb0_mtp3_con16_ccb-UCX.yaml | 106 ---- ...en4_tep8_bs32_eplb0_mtp3_con1_ccb-UCX.yaml | 106 ---- ...n4_tep8_bs32_eplb0_mtp3_con2_ccb-NIXL.yaml | 106 ---- ...en4_tep8_bs32_eplb0_mtp3_con2_ccb-UCX.yaml | 106 ---- ...4_tep8_bs32_eplb0_mtp3_con32_ccb-NIXL.yaml | 106 ---- ...n4_tep8_bs32_eplb0_mtp3_con32_ccb-UCX.yaml | 106 ---- ...n4_tep8_bs32_eplb0_mtp3_con4_ccb-NIXL.yaml | 106 ---- ...en4_tep8_bs32_eplb0_mtp3_con4_ccb-UCX.yaml | 106 ---- ...n4_tep8_bs32_eplb0_mtp3_con8_ccb-NIXL.yaml | 106 ---- 
...en4_tep8_bs32_eplb0_mtp3_con8_ccb-UCX.yaml | 106 ---- ...ep16_bs128_eplb0_mtp3_con2048_ccb-UCX.yaml | 105 ---- ...ep32_bs128_eplb0_mtp3_con1024_ccb-UCX.yaml | 119 ---- ...3_tep8_bs16_eplb0_mtp3_con16_ccb-NIXL.yaml | 106 ---- ...n3_tep8_bs16_eplb0_mtp3_con16_ccb-UCX.yaml | 106 ---- ...n3_tep8_bs16_eplb0_mtp3_con1_ccb-NIXL.yaml | 106 ---- ...en3_tep8_bs16_eplb0_mtp3_con1_ccb-UCX.yaml | 106 ---- ...n3_tep8_bs16_eplb0_mtp3_con2_ccb-NIXL.yaml | 106 ---- ...en3_tep8_bs16_eplb0_mtp3_con2_ccb-UCX.yaml | 106 ---- ...n3_tep8_bs16_eplb0_mtp3_con4_ccb-NIXL.yaml | 106 ---- ...en3_tep8_bs16_eplb0_mtp3_con4_ccb-UCX.yaml | 106 ---- ...n3_tep8_bs16_eplb0_mtp3_con8_ccb-NIXL.yaml | 106 ---- ...en3_tep8_bs16_eplb0_mtp3_con8_ccb-UCX.yaml | 106 ---- ...3_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml | 100 --- ...n3_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml | 100 --- ...en3_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml | 100 --- ...n3_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml | 100 --- ...en3_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml | 100 --- ...3_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml | 100 --- ...n3_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml | 100 --- ...n3_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml | 100 --- ...en3_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml | 100 --- ...n3_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml | 100 --- ...en3_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml | 100 --- ...ep16_bs64_eplb0_mtp0_con1024_ccb-NIXL.yaml | 99 --- ...dep16_bs64_eplb0_mtp0_con1024_ccb-UCX.yaml | 99 --- ...dep32_bs16_eplb0_mtp3_con512_ccb-NIXL.yaml | 105 ---- ..._dep32_bs16_eplb0_mtp3_con512_ccb-UCX.yaml | 105 ---- ...x1_pp8_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml | 98 +++ ...x1_pp8_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml} | 2 +- ...tx1_pp8_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml | 97 +++ ...x1_pp8_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml | 98 +++ ...4_gen8_tep4_bs2_eplb0_mtp0_con2-NIXL.yaml} | 10 +- ..._dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml} | 39 +- ...1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml | 99 +++ 
...x1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml | 94 +++ ...x1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml | 98 +++ ..._dep4_gen1_dep16_eplb0_mtp3_ccb-NIXL.yaml} | 4 +- ...dep4_gen1_dep16_eplb288_mtp3_ccb-NIXL.yaml | 102 +++ ...x1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml | 99 +++ ...tx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml | 0 ...4_tep8_bs32_eplb0_mtp3_con1_ccb-NIXL.yaml} | 5 +- ...16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml} | 14 +- ...1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml | 99 +++ ...x1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml | 98 +++ ...1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml | 99 +++ ...x1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml | 0 ...3_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml} | 5 +- ...dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml | 110 ++++ ...x1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml | 106 ++++ ...x1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml | 107 ++++ ...tx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml | 0 ...x1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml | 111 ++++ ...dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL.yaml | 114 ++++ ..._dep4_gen1_dep32_eplb288_mtp1_ccb-UCX.yaml | 0 ...1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml | 111 ++++ ...x1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml | 101 +++ ...dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml | 110 ++++ ...x1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml | 106 ++++ ...dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL.yaml | 106 ++++ ..._dep4_gen1_dep32_eplb256_mtp0_ccb-UCX.yaml | 0 ...tx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml | 96 +++ ...ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX.yaml | 0 ...tx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml | 96 +++ ...ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml | 92 +++ ...ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml | 106 ++++ ..._ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX.yaml | 0 ...ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml | 92 +++ ...ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml | 92 +++ ...tx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml | 96 +++ ..._dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml} | 10 +- 
...x1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml | 97 +++ ...tx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml | 0 ...1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml} | 10 +- ..._dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml} | 12 +- ...1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL.yaml | 97 +++ ...x1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX.yaml | 0 ...1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} | 12 +- ...4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml} | 5 +- ...16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml} | 19 +- ...tx1_tp1_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml | 94 +++ ...ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml | 0 ...tx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml | 93 +++ ...tx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml | 92 +++ ...n1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml} | 5 +- ...n1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml} | 5 +- ...2_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml} | 5 +- ...lb288_mtp0_con1024_ccb-NIXL_kv-reuse.yaml} | 5 +- ..._bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml} | 5 +- ...6_bs128_eplb288_mtp3_con2048_ccb-UCX.yaml} | 5 +- ..._bs16_eplb288_mtp3_con12288_ccb-NIXL.yaml} | 9 +- ..._bs128_eplb288_mtp3_con1024_ccb-NIXL.yaml} | 9 +- ...2_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml} | 5 +- ..._bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml} | 5 +- ..._bs16_eplb288_mtp3_con12288_ccb-NIXL.yaml} | 9 +- ..._bs128_eplb288_mtp3_con1024_ccb-NIXL.yaml} | 9 +- ...n1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml} | 5 +- ...tx1_pp4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml | 98 +++ ...tx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml | 99 +++ ...ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml | 99 +++ ...x1_pp4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml | 99 +++ ...1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml | 98 +++ ...x1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml | 98 +++ ...x1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml | 99 +++ ...tx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml | 99 +++ ...1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml | 99 +++ ...x1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml | 98 +++ ...1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml | 99 +++ 
...x1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml | 99 +++ ...dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml | 110 ++++ ...x1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml | 106 ++++ ...x1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml | 107 ++++ ...tx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml | 107 ++++ ...x1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml | 111 ++++ ...dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL.yaml | 114 ++++ ..._dep4_gen1_dep32_eplb288_mtp1_ccb-UCX.yaml | 114 ++++ ...1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml | 111 ++++ ...x1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml | 101 +++ ...dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml | 110 ++++ ...x1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml | 106 ++++ ...dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL.yaml | 106 ++++ ..._dep4_gen1_dep32_eplb256_mtp0_ccb-UCX.yaml | 106 ++++ ...1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml | 97 +++ ...x1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml | 97 +++ ...x1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml} | 8 +- ...x1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml | 95 +++ ...1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml | 103 +++ ...1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL.yaml | 97 +++ ...1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX.yaml} | 8 +- ...x1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml | 102 +++ ...16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml} | 23 +- ...6_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml | 110 ++++ ...6_bs128_eplb288_mtp3_con2048_ccb-UCX.yaml} | 23 +- ..._bs128_eplb288_mtp3_con1024_ccb-NIXL.yaml} | 32 +- ...6_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml | 118 ++++ ..._bs128_eplb288_mtp3_con1024_ccb-NIXL.yaml} | 34 +- ...n1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml | 112 ---- ...16_bs64_eplb288_mtp0_con1024_ccb-NIXL.yaml | 105 ---- ...p16_bs64_eplb288_mtp0_con1024_ccb-UCX.yaml | 105 ---- ...ep32_bs16_eplb288_mtp3_con512_ccb-UCX.yaml | 111 ---- ...16_bs64_eplb288_mtp0_con1024_ccb-NIXL.yaml | 113 ---- ...bs1024_eplb384_mtp0_con16384_ccb-NIXL.yaml | 108 ---- ...2_bs256_eplb416_mtp0_con8192_ccb-NIXL.yaml | 108 ---- 337 files changed, 12504 
insertions(+), 12788 deletions(-) create mode 100644 .claude/agent-tests/perf-test-sync/build_prompt.py create mode 100644 .claude/agent-tests/perf-test-sync/promptfooconfig.yaml create mode 100644 .claude/agent-tests/perf-test-sync/render_report.py create mode 100755 .claude/agent-tests/perf-test-sync/run.sh create mode 100644 .claude/agents/perf-test-sync.md create mode 100644 tests/integration/test_lists/test-db/l0_gb300_multi_gpus_perf_sanity.yml create mode 100644 tests/integration/test_lists/test-db/l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu4.yml create mode 100644 tests/integration/test_lists/test-db/l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node2_gpu8.yml create mode 100644 tests/integration/test_lists/test-db/l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node4_gpu16.yml create mode 100644 tests/integration/test_lists/test-db/l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node8_gpu32.yml create mode 100644 tests/integration/test_lists/test-db/l0_gb300_multi_nodes_perf_sanity_node2_gpu8.yml rename tests/scripts/perf-sanity/disaggregated/{b200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml => b200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} (98%) rename tests/scripts/perf-sanity/disaggregated/{b200_deepseek-r1-fp4_1k1k_con2048_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml => b200_deepseek-r1-fp4_1k1k_con2048_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml} (98%) rename tests/scripts/perf-sanity/disaggregated/{b200_deepseek-r1-fp4_1k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp3_ccb-UCX.yaml => b200_deepseek-r1-fp4_1k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp3_ccb-NIXL.yaml} (98%) rename tests/scripts/perf-sanity/disaggregated/{b200_deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml => b200_deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml} (98%) rename 
tests/scripts/perf-sanity/disaggregated/{b200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml => b200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} (98%) rename tests/scripts/perf-sanity/disaggregated/{b200_deepseek-r1-fp4_8k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml => b200_deepseek-r1-fp4_8k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml} (98%) create mode 100644 tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml rename tests/scripts/perf-sanity/disaggregated/{gb200_deepseek-r1-fp4_128k8k_con1_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml => gb200_deepseek-r1-fp4_128k8k_con1_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} (98%) rename tests/scripts/perf-sanity/disaggregated/{gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml => gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml} (98%) rename tests/scripts/perf-sanity/disaggregated/{gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-DEFAULT.yaml => gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml} (97%) rename tests/scripts/perf-sanity/disaggregated/{gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml => gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml} (98%) rename tests/scripts/perf-sanity/disaggregated/{gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml => gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml} (96%) rename tests/scripts/perf-sanity/disaggregated/{gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml => gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} (98%) rename tests/scripts/perf-sanity/disaggregated/{gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-UCX.yaml => 
gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-NIXL.yaml} (98%) create mode 100644 tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml rename tests/scripts/perf-sanity/disaggregated/{gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml => gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml} (98%) rename tests/scripts/perf-sanity/disaggregated/{gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml => gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} (98%) create mode 100644 tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml rename tests/scripts/perf-sanity/disaggregated/{gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX.yaml => gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml} (98%) rename tests/scripts/perf-sanity/disaggregated/{gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml => gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} (98%) create mode 100644 tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml rename tests/scripts/perf-sanity/disaggregated/{gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml => gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} (98%) create mode 100644 tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL.yaml rename tests/scripts/perf-sanity/disaggregated/{gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml => gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml} (98%) rename 
tests/scripts/perf-sanity/disaggregated/{gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml => gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml} (98%) rename tests/scripts/perf-sanity/disaggregated/{gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX.yaml => gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml} (98%) rename tests/scripts/perf-sanity/disaggregated/{gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml => gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} (98%) create mode 100644 tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL.yaml rename tests/scripts/perf-sanity/disaggregated/{gb200_glm-5-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml => gb200_glm-5-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} (98%) rename tests/scripts/perf-sanity/disaggregated/{gb200_glm-5-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml => gb200_glm-5-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml} (98%) rename tests/scripts/perf-sanity/disaggregated/{gb200_glm-5-fp4_1k1k_con512_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml => gb200_glm-5-fp4_1k1k_con512_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml} (98%) rename tests/scripts/perf-sanity/disaggregated/{gb200_glm-5-fp4_8k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml => gb200_glm-5-fp4_8k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml} (98%) rename tests/scripts/perf-sanity/disaggregated/{gb200_glm-5-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml => gb200_glm-5-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} (98%) rename tests/scripts/perf-sanity/disaggregated/{gb200_glm-5-fp4_8k1k_con512_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml => 
gb200_glm-5-fp4_8k1k_con512_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml} (98%) create mode 100644 tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml rename tests/scripts/perf-sanity/disaggregated/{gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX.yaml => gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml} (98%) rename tests/scripts/perf-sanity/disaggregated/{gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX.yaml => gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml} (98%) create mode 100644 tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml rename tests/scripts/perf-sanity/disaggregated/{gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX.yaml => gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml} (98%) rename tests/scripts/perf-sanity/disaggregated/{gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX.yaml => gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml} (98%) rename tests/scripts/perf-sanity/disaggregated/{gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX.yaml => gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml} (98%) rename tests/scripts/perf-sanity/disaggregated/{gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-UCX.yaml => gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml} (98%) create mode 100644 tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml rename tests/scripts/perf-sanity/disaggregated/{gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-UCX.yaml => 
gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml} (98%) rename tests/scripts/perf-sanity/disaggregated/{gb200_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml => gb200_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml} (98%) create mode 100644 tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL.yaml rename tests/scripts/perf-sanity/disaggregated/{gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml => gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} (98%) create mode 100644 tests/scripts/perf-sanity/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml create mode 100644 tests/scripts/perf-sanity/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml rename tests/scripts/perf-sanity/disaggregated/{gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-UCX.yaml => gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml} (98%) create mode 100644 tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_128k8k_con1_ctx1_pp4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml create mode 100644 tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_128k8k_con64_ctx1_pp4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml rename tests/scripts/perf-sanity/disaggregated/{gb300_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml => gb300_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml} (98%) create mode 100644 tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 
tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml create mode 100644 tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml create mode 100644 tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml create mode 100644 tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL.yaml create mode 100644 tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml create mode 100644 tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL.yaml create mode 100644 
tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml create mode 100644 tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml create mode 100644 tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml create mode 100644 tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL.yaml create mode 100644 tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con1024_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con1024_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con512_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con512_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml delete mode 100644 
tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml delete mode 100644 
tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con36_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con36_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0_con1-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0_con4-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3_con1-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0_con1-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0_con1-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0_con2-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0_con1-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3_con1-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2_con4-Default.yaml delete mode 100644 
tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0_con1-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3_con1-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3_con2-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3_con2-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2_con2-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0_con1-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0_con4-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3_con2-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep16_bs8_eplb0_mtp0_con8-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0_con2-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1_con128-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0_con16-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2_con8-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3_con2-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0_con4-Default.yaml delete mode 100644 
tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0_con256-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3_con128-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3_con64-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0_con128-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1_con256-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0_con512-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1_con512-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3_con128-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0_con256-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3_con256-Default.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_con1024_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml delete mode 100644 
tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con16_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con16_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con2_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con2_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con32_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con32_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con4_ccb-NIXL.yaml delete mode 100644 
tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con4_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con8_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con8_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_con1024_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con16_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con16_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con1_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con1_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con2_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con2_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con4_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con4_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con8_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con8_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml delete mode 100644 
tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_con1024_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_con1024_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-UCX.yaml create mode 100644 tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml rename tests/scripts/{perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp2_ccb-UCX.yaml => 
perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml} (98%) create mode 100644 tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con1_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml rename tests/scripts/perf/disaggregated/{deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0_con2-Default.yaml => gb200_deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0_con2-NIXL.yaml} (95%) rename tests/scripts/perf/disaggregated/{deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_con1024_ccb-NIXL.yaml => gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml} (73%) create mode 100644 tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml rename tests/scripts/{perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb0_mtp3_ccb-UCX.yaml => perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb0_mtp3_ccb-NIXL.yaml} (98%) create mode 100644 tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml rename tests/scripts/{perf-sanity => perf}/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml (100%) rename tests/scripts/perf/disaggregated/{deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-NIXL.yaml => 
gb200_deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-NIXL.yaml} (97%) rename tests/scripts/perf/disaggregated/{deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml => gb200_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml} (87%) create mode 100644 tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml rename tests/scripts/{perf-sanity => perf}/disaggregated/gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml (100%) rename tests/scripts/perf/disaggregated/{deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml => gb200_deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml} (97%) create mode 100644 tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml rename tests/scripts/{perf-sanity => perf}/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml (100%) create mode 100644 tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL.yaml rename tests/scripts/{perf-sanity => perf}/disaggregated/gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX.yaml (100%) create 
mode 100644 tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL.yaml rename tests/scripts/{perf-sanity => perf}/disaggregated/gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX.yaml (100%) create mode 100644 tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml rename tests/scripts/{perf-sanity => perf}/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX.yaml (100%) create mode 100644 tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml rename tests/scripts/{perf-sanity => perf}/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX.yaml (100%) create mode 100644 tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml create mode 100644 
tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml rename tests/scripts/{perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-UCX.yaml => perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml} (92%) create mode 100644 tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml rename tests/scripts/{perf-sanity => perf}/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml (100%) rename tests/scripts/{perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-UCX.yaml => perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml} (92%) rename tests/scripts/{perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml => perf/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml} (91%) create mode 100644 tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL.yaml rename tests/scripts/{perf-sanity => perf}/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX.yaml (100%) rename tests/scripts/{perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml => perf/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml} (91%) rename tests/scripts/perf/disaggregated/{Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml => gb200_qwen3-235b-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml} (97%) rename tests/scripts/perf/disaggregated/{Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_con2048_ccb-NIXL.yaml => 
gb200_qwen3-235b-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml} (87%) create mode 100644 tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml rename tests/scripts/{perf-sanity => perf}/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml (100%) create mode 100644 tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml rename tests/scripts/perf/disaggregated/{wideep_accuracy-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml => gb200_wideep_accuracy-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml} (98%) rename tests/scripts/perf/disaggregated/{wideep_accuracy-deepseek-r1-fp4_gpqa_diamond_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml => gb200_wideep_accuracy-deepseek-r1-fp4_gpqa_diamond_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml} (98%) rename tests/scripts/perf/disaggregated/{wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml => gb200_wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml} (97%) rename tests/scripts/perf/disaggregated/{wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL_kv-reuse.yaml => gb200_wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL_kv-reuse.yaml} (97%) rename tests/scripts/perf/disaggregated/{wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml => gb200_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml} (98%) rename tests/scripts/perf/disaggregated/{wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-UCX.yaml => 
gb200_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-UCX.yaml} (97%) rename tests/scripts/perf/disaggregated/{wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-DEFAULT.yaml => gb200_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-NIXL.yaml} (96%) rename tests/scripts/perf/disaggregated/{wideep_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-DEFAULT.yaml => gb200_wideep_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-NIXL.yaml} (96%) rename tests/scripts/perf/disaggregated/{wideep_deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml => gb200_wideep_deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml} (97%) rename tests/scripts/perf/disaggregated/{wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml => gb200_wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml} (98%) rename tests/scripts/perf/disaggregated/{wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-DEFAULT.yaml => gb200_wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-NIXL.yaml} (96%) rename tests/scripts/perf/disaggregated/{wideep_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-DEFAULT.yaml => gb200_wideep_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-NIXL.yaml} (96%) rename tests/scripts/perf/disaggregated/{wideep_stress-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml => gb200_wideep_stress-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml} (98%) create mode 100644 tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_128k8k_con1_ctx1_pp4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml create mode 100644 
tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_128k8k_con64_ctx1_pp4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 
tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml rename tests/scripts/{perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml => perf/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml} (94%) create mode 100644 tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml create mode 100644 tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL.yaml 
rename tests/scripts/{perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX.yaml => perf/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX.yaml} (94%) create mode 100644 tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml rename tests/scripts/perf/disaggregated/{Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_con2048_ccb-UCX.yaml => gb300_qwen3-235b-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml} (85%) create mode 100644 tests/scripts/perf/disaggregated/gb300_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml rename tests/scripts/perf/disaggregated/{wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-UCX.yaml => gb300_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-UCX.yaml} (86%) rename tests/scripts/perf/disaggregated/{wideep_deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-NIXL.yaml => gb300_wideep_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-NIXL.yaml} (89%) create mode 100644 tests/scripts/perf/disaggregated/gb300_wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml rename tests/scripts/perf/disaggregated/{wideep_deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-NIXL.yaml => gb300_wideep_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-NIXL.yaml} (89%) delete mode 100644 tests/scripts/perf/disaggregated/wideep_accuracy-kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-UCX.yaml delete mode 100644 
tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-UCX.yaml delete mode 100644 tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/wideep_kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_con16384_ccb-NIXL.yaml delete mode 100644 tests/scripts/perf/disaggregated/wideep_kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_con8192_ccb-NIXL.yaml diff --git a/.claude/agent-tests/perf-test-sync/build_prompt.py b/.claude/agent-tests/perf-test-sync/build_prompt.py new file mode 100644 index 000000000000..71c93a5cd697 --- /dev/null +++ b/.claude/agent-tests/perf-test-sync/build_prompt.py @@ -0,0 +1,30 @@ +"""Promptfoo prompt builder. + +Loads perf-test-sync.md, strips Claude Code specific sections, and appends the +user request. + +Stripped sections: + - YAML frontmatter between leading `---` markers + - `# Persistent Agent Memory` section and everything after it (Claude Code + memory infrastructure, not relevant to prompt-quality evaluation) +""" + +import os +import re + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +_AGENT_MD = os.path.normpath(os.path.join(_SCRIPT_DIR, "..", "..", "agents", "perf-test-sync.md")) + + +def _load_agent_body() -> str: + with open(_AGENT_MD, "r", encoding="utf-8") as f: + text = f.read() + text = re.sub(r"\A---\n.*?\n---\n", "", text, count=1, flags=re.DOTALL) + text = text.split("# Persistent Agent Memory", 1)[0].rstrip() + return text + + +def build(context: dict) -> str: + user_prompt = context["vars"]["prompt"] + agent_body = _load_agent_body() + return f"{agent_body}\n\n## User request\n\n{user_prompt}\n" diff --git a/.claude/agent-tests/perf-test-sync/promptfooconfig.yaml b/.claude/agent-tests/perf-test-sync/promptfooconfig.yaml new file mode 100644 index 000000000000..a79abb2b7e48 --- /dev/null +++ 
b/.claude/agent-tests/perf-test-sync/promptfooconfig.yaml @@ -0,0 +1,379 @@ +# Promptfoo test config for perf-test-sync agent +# Run: npx promptfoo eval -c .claude/agent-tests/perf-test-sync/promptfooconfig.yaml + +description: "perf-test-sync agent prompt quality tests" + +prompts: + - file://build_prompt.py:build + +providers: + - id: openai:chat:openai/openai/gpt-5.1 + config: + max_tokens: 1024 + temperature: 0 + +defaultTest: + options: + provider: + id: openai:chat:openai/openai/gpt-5.1 + config: + max_tokens: 1024 + temperature: 0 + +tests: + # --- Routing: should the agent be invoked? --- + - vars: + prompt: "Sync the dev perf sanity tests into the QA perf test directory" + assert: + - type: llm-rubric + value: "Response acknowledges the sync task and describes a plan to compare dev vs QA disagg cases" + - type: contains-any + value: + - "disagg" + - "perf-sanity" + - "perf/disaggregated" + + - vars: + prompt: "Compare the disagg test case differences between dev and QA" + assert: + - type: llm-rubric + value: "Response describes comparing disaggregated test cases between dev and QA directories using the strict canonical identity key (not filenames, not tuning knobs)" + + - vars: + prompt: "Update the QA perf test list with newly added dev cases" + assert: + - type: llm-rubric + value: "Response mentions updating llm_perf_multinode.txt or the QA test list" + + # --- Exclusion rules --- + - vars: + prompt: "Should I sync a DeepSeek R1 1k1k disagg case from dev to QA?" + assert: + - type: llm-rubric + value: "Response clearly states that DeepSeek R1 1k1k cases should NOT be synced and should be skipped" + - type: contains-any + value: + - "skip" + - "exclude" + - "not sync" + - "do not sync" + + - vars: + prompt: "Should I sync a DeepSeek R1 8k1k disagg case from dev to QA?" 
+ assert: + - type: llm-rubric + value: "Response clearly states that DeepSeek R1 8k1k cases should NOT be synced and should be skipped" + + # --- Platform mapping --- + - vars: + prompt: "A dev disagg case targets B200. Which QA platforms should it be registered under?" + assert: + - type: llm-rubric + value: "Response states the case should be mapped to BOTH GB200 and GB300 on the QA side" + - type: contains-all + value: + - "GB200" + - "GB300" + + - vars: + prompt: "A DeepSeek 128k8k case has pp4 in the ctx config. Which platform?" + assert: + - type: llm-rubric + value: "Response states pp4 128k8k cases are GB300-only, NOT both platforms" + - type: contains + value: "GB300" + + - vars: + prompt: "A DeepSeek 128k8k case has pp8 in the ctx config. Which platform?" + assert: + - type: llm-rubric + value: "Response states pp8 128k8k cases are GB200-only, NOT both platforms" + - type: contains + value: "GB200" + + # --- Agg generation --- + - vars: + prompt: "How are QA aggregated test cases created? Are they copied from dev?" + assert: + - type: llm-rubric + value: "Response clearly states agg cases are NOT copied from dev, but generated from QA's own disagg YAML ctx configs" + - type: contains-any + value: + - "not cop" + - "generated" + - "extract" + + # --- Test list format --- + - vars: + prompt: "What format should aggregated test list entries use?" + assert: + - type: llm-rubric + value: "Response states the QA form is aggr-ctx_only- (no _upload suffix in the QA entry). It is acceptable for the response to mention that dev uses aggr_upload-ctx_only-... as contrast." + - type: contains + value: "ctx_only" + - type: contains + value: "aggr-ctx_only-" + + # --- GPU-hours formula --- + - vars: + prompt: "How do you compute GPU-hours for a disagg e2e case?" 
+ assert: + - type: llm-rubric + value: "Response includes the formula: num_ctx_servers * (ctx.TP * ctx.PP * ctx.CP) + num_gen_servers * (gen.TP * gen.PP * gen.CP)" + - type: contains-any + value: + - "num_ctx_servers" + - "ctx" + - "gen" + + # --- Anti-duplication: strict identity keys --- + - vars: + prompt: "A dev disagg YAML differs from an existing QA YAML ONLY in max_batch_size (dev=64, QA=32). Everything else — model, precision, ISL/OSL, parallelism, MTP — is identical. Should I sync it as a new QA case?" + assert: + - type: llm-rubric + value: "Response clearly states NO — max_batch_size is a tuning knob not in the identity key, so this is EXISTING and must be skipped. Response must NOT create a new QA case." + - type: contains-any + value: + - "skip" + - "EXISTING" + - "not a new" + - "do not sync" + - "tuning" + + - vars: + prompt: "A dev disagg YAML has identical config to a QA YAML except the filename ends with -UCX while QA's does not. The cache_transceiver_config.backend is UCX on dev and NIXL on QA. Should I copy dev's version as a new case?" + assert: + - type: llm-rubric + value: "Response clearly states NO — transport backend (UCX vs NIXL) and filename suffix are IGNORED fields, not part of the identity key. This is a tuning-only delta; skip and preserve QA's version." + - type: contains-any + value: + - "skip" + - "cosmetic" + - "transport" + - "UCX" + - "ignored" + - "do not" + + - vars: + prompt: "List the fields that are EXPLICITLY IGNORED when deciding whether a dev disagg YAML is a new test vs a duplicate of an existing QA case." + assert: + - type: llm-rubric + value: "Response lists tuning knobs such as max_batch_size, max_num_tokens, max_seq_len, kv_cache_config fields, cache_transceiver_config.backend, concurrency, logging/profiling flags, and filename/comments as IGNORED. It does NOT list model_name, precision, ISL/OSL, or parallelism fields as ignored." 
+ - type: contains-any + value: + - "max_batch_size" + - "tuning" + - "concurrency" + - "transceiver" + + - vars: + prompt: "Name at least 5 fields that MUST be part of the strict canonical identity key used to compare disagg YAMLs between dev and QA." + assert: + - type: llm-rubric + value: "Response lists identity-key fields such as model_name, precision, benchmark.input_length, benchmark.output_length, num_ctx_servers, ctx.tensor_parallel_size, ctx.pipeline_parallel_size, num_gen_servers, gen.tensor_parallel_size, enable_attention_dp, MTP / speculative_config, or moe_config.backend. At least 5 distinct identity-key fields must be named." + - type: contains-any + value: + - "tensor_parallel_size" + - "input_length" + - "num_ctx_servers" + - "num_gen_servers" + - "model_name" + + # --- Phase 0 no-op detection & idempotency --- + - vars: + prompt: "I just ran the sync 5 minutes ago. The dev perf-sanity directory has not changed since. I run the sync again now. What should the agent do?" + assert: + - type: llm-rubric + value: "Response states the agent should do a fast no-op for YAMLs and test-list (Phase 0 identity-key deltas empty, report 'No changes required', no YAML writes, no test-list edits) BUT still regenerate both HTML reports at the repo root with a fresh timestamp on every run." + - type: contains-any + value: + - "no-op" + - "No changes" + - "exit" + - "idempotent" + - "skip" + + - vars: + prompt: "If I run the sync twice back-to-back with no upstream changes between the two runs, how many YAML files and test-list lines should change on the second run?" + assert: + - type: llm-rubric + value: "Response states that ZERO YAMLs and ZERO test-list lines change on the second run. The response may additionally note that the two HTML reports at the repo root are still regenerated on every run (that is expected, not a no-op violation)." 
+ - type: icontains-any + value: + - "zero" + - "no yaml" + - "no test-list" + - "none" + - "idempotent" + - "0 yaml" + - "nothing" + - "**0**" + - ": 0" + - "are 0" + - "is 0" + + - vars: + prompt: "On a no-op run (Phase 0 found no delta), should the agent still regenerate llm_multi_node_gpu_hours.html and perf_test_sync_report.html?" + assert: + - type: llm-rubric + value: "Response states YES — both HTML reports are regenerated on EVERY run (no-op and full-run alike) because they are the user-visible output with a fresh timestamp. The previous 'skip on no-op' rule has been removed." + - type: icontains-any + value: + - "yes" + - "always" + - "every run" + - "regenerate" + - "regenerated" + + # --- HTML reports go to repo root --- + - vars: + prompt: "Where should perf_test_sync_report.html and llm_multi_node_gpu_hours.html be written?" + assert: + - type: llm-rubric + value: "Response states both HTML reports are written to the repo root (e.g., /perf_test_sync_report.html and /llm_multi_node_gpu_hours.html). The old location tests/integration/test_lists/qa/ for the GPU-hours HTML is obsolete." + - type: icontains-any + value: + - "repo root" + - "root" + - "top level" + - "top-level" + + - vars: + prompt: "Should the GPU-hours HTML be written to tests/integration/test_lists/qa/llm_multi_node_gpu_hours.html like before?" + assert: + - type: llm-rubric + value: "Response states NO — the correct path is now the repo root. The tests/integration/test_lists/qa/ path is obsolete/legacy. Old stale copies can be left alone, but new runs write to repo root." + - type: icontains-any + value: + - "no" + - "obsolete" + - "legacy" + - "root" + - "do not" + + # --- Platform classification driven by test-list section headers --- + - vars: + prompt: "The llm_perf_multinode.txt file has a subsection header '# Qwen backend-compare cases (GB300 only)' followed by several test_e2e[disagg-...] lines. When computing GPU-hours, what platform should those cases be assigned?" 
+ assert: + - type: llm-rubric + value: "Response states the cases under that header are GB300-only (not BOTH). The section-header annotation '(GB300 only)' drives platform classification and overrides the default BOTH. Those cases count ONLY toward the GB300 GPU-hours column." + - type: contains + value: "GB300" + - type: icontains-any + value: + - "only" + - "not both" + - "single" + + - vars: + prompt: "Under a subsection '# Deepseek-r1 backend-compare cases (GB200 only)', there are many deepseek-r1-fp4 1k1k and 8k1k disagg cases. Should those cases be counted under both GB200 and GB300 GPU-hours columns in the report?" + assert: + - type: llm-rubric + value: "Response states NO — the '(GB200 only)' header annotation restricts these cases to the GB200 column only. They must NOT double-count into GB300." + - type: contains + value: "GB200" + - type: icontains-any + value: + - "no" + - "only" + - "not both" + - "do not" + + - vars: + prompt: "How is platform (GB200 / GB300 / BOTH) determined for each test in llm_multi_node_gpu_hours.html?" + assert: + - type: llm-rubric + value: "Response states the platform is derived by parsing llm_perf_multinode.txt section headers top-to-bottom (e.g., '# ... (GB300 only)', '# GB200 supported cases', '# GB200 + GB300 supported cases') and propagating that state to each test_e2e line that follows. The DeepSeek 128k8k pp4/pp8 exception overrides a BOTH assignment. Default is BOTH only when no explicit annotation applies." + - type: icontains-any + value: + - "section header" + - "subsection" + - "header" + - "comment" + - "annotation" + + # --- No auto-upgrade on tuning deltas --- + - vars: + prompt: "Dev bumped an existing disagg case's max_batch_size from 32 to 64 (same identity key, just tuning). QA's YAML for that case still has max_batch_size=32. Should QA's YAML be auto-upgraded to 64 to match dev?" + assert: + - type: llm-rubric + value: "Response clearly states NO — QA's manual tuning is authoritative. 
The agent does NOT auto-upgrade on tuning deltas. This is EXISTING, skip, leave QA's tuning intact. The previous 'MODIFIED — maybe upgrade' behavior has been removed." + - type: contains-any + value: + - "no" + - "do not" + - "preserve" + - "authoritative" + - "skip" + - "leave" + + # --- AGG_CONFIG_FOLDER (not the old DISAGG_CONFIG_FOLDER warning) --- + - vars: + prompt: "Where does the QA runtime look for the ctx_only aggregated YAMLs? Do I need to add a DISAGG_CONFIG_FOLDER contract warning to the sync report?" + assert: + - type: llm-rubric + value: "Response states QA uses AGG_CONFIG_FOLDER pointing at tests/scripts/perf/aggregated/. The old DISAGG_CONFIG_FOLDER / submit.py contract warning is obsolete and should NOT be added to reports or commit messages." + - type: contains-any + value: + - "AGG_CONFIG_FOLDER" + - "perf/aggregated" + - type: llm-rubric + value: "Response does NOT recommend adding a DISAGG_CONFIG_FOLDER warning to the report." + + # --- Performance: one-pass parsing & parallel discovery --- + - vars: + prompt: "The sync takes 20+ minutes when there are ~100 YAMLs to compare. How should the agent parse them efficiently without sacrificing accuracy?" + assert: + - type: llm-rubric + value: "Response recommends parsing all YAMLs in ONE pass (also phrased as 'scan once, reuse everywhere', 'parse once and cache', 'single Python script', 'centralize parsing', 'batch parse'). The core idea is: one invocation that walks every YAML, parses via yaml.safe_load, and emits a reusable cache/JSON so downstream phases do not re-open files. Any wording that conveys this idea passes — the rubric does NOT require the specific phrase 'one-pass'. Extra points for mentioning parallel tool calls / batched reads, but mentioning it is NOT required to pass." 
+ - type: icontains-any + value: + - "one-pass" + - "one pass" + - "single" + - "once" + - "json" + - "cache" + - "batch" + - "parallel" + - "centralize" + - "script" + + - vars: + prompt: "Is it OK to issue a separate Read tool call for every YAML in the three perf directories during Phase 1?" + assert: + - type: llm-rubric + value: "Response states NO — per-file Read calls are the main source of the 20+ minute runtime. The correct approach is a single Bash invocation that parses every YAML via yaml.safe_load in one pass and emits JSON." + - type: contains-any + value: + - "no" + - "do not" + - "one-pass" + - "single" + - "batch" + + - vars: + prompt: "What is the target wall-clock time for a no-op sync run (Phase 0 delta empty)?" + assert: + - type: llm-rubric + value: "Response states the target is under 60 seconds (roughly one minute) — a Bash one-pass parse plus Phase 0 diff plus exit." + - type: contains-any + value: + - "60" + - "minute" + - "seconds" + - "fast" + + # --- Output language --- + - vars: + prompt: "用中文回答:如何同步dev的perf测试到QA?" + assert: + - type: llm-rubric + value: "Despite the Chinese input, the response is entirely in English — no Chinese characters in the output" + + # --- Negative: unrelated request --- + - vars: + prompt: "Help me write a unit test for the LLM API" + assert: + - type: llm-rubric + value: "Response does NOT proceed with any perf test sync tasks (no mention of dev/QA sync, llm_perf_multinode.txt, aggr-ctx_only, GB200/GB300 accounting, identity keys, or Phase 0/2/3/4/5). It is acceptable for the response to help with a generic unit-test question, ask for more context, or clarify out-of-scope — any of these pass as long as it does not invoke the perf sync workflow." 
diff --git a/.claude/agent-tests/perf-test-sync/render_report.py b/.claude/agent-tests/perf-test-sync/render_report.py new file mode 100644 index 000000000000..d151152adc6b --- /dev/null +++ b/.claude/agent-tests/perf-test-sync/render_report.py @@ -0,0 +1,328 @@ +"""Render a promptfoo JSON eval result into a self-contained, styled HTML report. + +Usage: + python render_report.py +""" + +import html +import json +import sys +from datetime import datetime +from pathlib import Path + +CSS = """ +* { box-sizing: border-box; } +body { + font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; + background: #0f172a; + color: #e2e8f0; + margin: 0; + padding: 24px; + line-height: 1.5; +} +.container { max-width: 1200px; margin: 0 auto; } +h1 { font-size: 24px; margin: 0 0 4px 0; color: #f8fafc; } +.subtitle { color: #94a3b8; font-size: 13px; margin-bottom: 24px; } +.summary { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(140px, 1fr)); + gap: 12px; + margin-bottom: 24px; +} +.card { + background: #1e293b; + border: 1px solid #334155; + border-radius: 8px; + padding: 16px; +} +.card .label { color: #94a3b8; font-size: 12px; text-transform: uppercase; letter-spacing: 0.5px; } +.card .value { font-size: 28px; font-weight: 600; margin-top: 4px; color: #f8fafc; } +.card.pass .value { color: #4ade80; } +.card.fail .value { color: #f87171; } +.card.rate .value { color: #60a5fa; } +.filters { + display: flex; + gap: 8px; + margin-bottom: 16px; + flex-wrap: wrap; +} +.filter-btn { + background: #1e293b; + border: 1px solid #334155; + color: #e2e8f0; + padding: 8px 16px; + border-radius: 6px; + cursor: pointer; + font-size: 13px; + font-family: inherit; +} +.filter-btn:hover { background: #334155; } +.filter-btn.active { background: #3b82f6; border-color: #3b82f6; } +.test { + background: #1e293b; + border: 1px solid #334155; + border-radius: 8px; + margin-bottom: 12px; + overflow: hidden; +} +.test.pass { border-left: 4px 
solid #4ade80; } +.test.fail { border-left: 4px solid #f87171; } +.test-header { + padding: 14px 16px; + display: flex; + align-items: center; + gap: 12px; + cursor: pointer; + user-select: none; +} +.test-header:hover { background: #273449; } +.badge { + font-size: 11px; + font-weight: 600; + padding: 3px 8px; + border-radius: 4px; + text-transform: uppercase; + letter-spacing: 0.5px; + flex-shrink: 0; +} +.badge.pass { background: #14532d; color: #4ade80; } +.badge.fail { background: #7f1d1d; color: #fca5a5; } +.test-title { + flex: 1; + font-size: 14px; + color: #f1f5f9; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} +.test-meta { color: #64748b; font-size: 12px; flex-shrink: 0; } +.test-body { + display: none; + padding: 16px; + border-top: 1px solid #334155; + background: #0f172a; +} +.test.open .test-body { display: block; } +.test.open .test-header .arrow { transform: rotate(90deg); } +.arrow { + color: #64748b; + transition: transform 0.15s; + font-family: monospace; + flex-shrink: 0; +} +.section { margin-bottom: 16px; } +.section-label { + color: #94a3b8; + font-size: 11px; + font-weight: 600; + text-transform: uppercase; + letter-spacing: 0.5px; + margin-bottom: 6px; +} +pre { + background: #020617; + border: 1px solid #1e293b; + border-radius: 6px; + padding: 12px; + overflow-x: auto; + font-family: "SF Mono", Monaco, "Cascadia Code", "Roboto Mono", Consolas, monospace; + font-size: 12.5px; + line-height: 1.6; + color: #cbd5e1; + margin: 0; + white-space: pre-wrap; + word-wrap: break-word; + max-height: 400px; + overflow-y: auto; +} +.assert { + background: #0f172a; + border: 1px solid #334155; + border-radius: 6px; + padding: 10px 12px; + margin-bottom: 8px; + font-size: 13px; +} +.assert .assert-head { + display: flex; + gap: 8px; + align-items: center; + margin-bottom: 4px; +} +.assert .assert-type { + background: #334155; + color: #cbd5e1; + font-size: 10px; + padding: 2px 6px; + border-radius: 3px; + font-family: 
monospace; +} +.assert .assert-value { color: #94a3b8; font-size: 12px; } +.assert .reason { color: #cbd5e1; font-size: 12px; margin-top: 6px; font-style: italic; } +.footer { margin-top: 32px; text-align: center; color: #64748b; font-size: 12px; } +""" + + +def extract_text(output) -> str: + """Promptfoo output may be a string or a structured object.""" + if isinstance(output, str): + return output + if isinstance(output, dict): + return output.get("output") or output.get("text") or json.dumps(output, indent=2) + return str(output) + + +def render_assert(a: dict) -> str: + passed = a.get("pass", False) + badge = "pass" if passed else "fail" + badge_text = "PASS" if passed else "FAIL" + a_type = html.escape(str(a.get("type", "?"))) + a_value = a.get("value", "") + if isinstance(a_value, list): + a_value = ", ".join(str(v) for v in a_value) + a_value = html.escape(str(a_value))[:300] + reason = html.escape(str(a.get("reason", "") or "")) + reason_html = f'
{reason}
' if reason else "" + return f""" +
+
+ {badge_text} + {a_type} + {a_value} +
+ {reason_html} +
+ """ + + +def render_test(idx: int, result: dict) -> str: + passed = result.get("success", False) + status = "pass" if passed else "fail" + badge_text = "PASS" if passed else "FAIL" + + vars_dict = result.get("vars") or result.get("testCase", {}).get("vars", {}) or {} + user_prompt = vars_dict.get("prompt", "(no prompt)") + title = html.escape(str(user_prompt)) + + response = extract_text(result.get("response", {}).get("output", result.get("response", ""))) + response_html = html.escape(response) + + grading = result.get("gradingResult") or {} + component_results = grading.get("componentResults") or [] + asserts_html = ( + "".join(render_assert(a) for a in component_results) + or '
(no assertion details)
' + ) + + tokens = result.get("response", {}).get("tokenUsage", {}) or {} + total_tokens = tokens.get("total", 0) + latency = result.get("latencyMs", 0) + meta = f"{total_tokens} tok · {latency} ms" if total_tokens or latency else "" + + return f""" +
+
+ + {badge_text} + #{idx} · {title} + {meta} +
+
+
+ +
{html.escape(str(user_prompt))}
+
+
+ +
{response_html}
+
+
+ + {asserts_html} +
+
+
+ """ + + +def main() -> int: + if len(sys.argv) != 3: + print(__doc__, file=sys.stderr) + return 1 + + in_path = Path(sys.argv[1]) + out_path = Path(sys.argv[2]) + + data = json.loads(in_path.read_text(encoding="utf-8")) + results_root = data.get("results") or data + results = results_root.get("results") or results_root.get("table", {}).get("body") or [] + + if not results and "results" in data and isinstance(data["results"], dict): + results = data["results"].get("results", []) + + total = len(results) + passed = sum(1 for r in results if r.get("success")) + failed = total - passed + rate = f"{(passed / total * 100):.0f}%" if total else "—" + + provider_ids = sorted({(r.get("provider") or {}).get("id", "?") for r in results}) + provider_label = ", ".join(provider_ids) if provider_ids else "—" + ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + tests_html = "\n".join(render_test(i + 1, r) for i, r in enumerate(results)) + + html_doc = f""" + + + +perf-test-sync · promptfoo report + + + +
+

perf-test-sync · promptfoo eval

+
Generated {ts} · Provider: {html.escape(provider_label)}
+ +
+
Total
{total}
+
Passed
{passed}
+
Failed
{failed}
+
Pass rate
{rate}
+
+ +
+ + + +
+ +
+ {tests_html} +
+ + +
+ + + + +""" + out_path.write_text(html_doc, encoding="utf-8") + print(f"Report written: {out_path}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.claude/agent-tests/perf-test-sync/run.sh b/.claude/agent-tests/perf-test-sync/run.sh new file mode 100755 index 000000000000..62483611b38c --- /dev/null +++ b/.claude/agent-tests/perf-test-sync/run.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash +# Run promptfoo tests for perf-test-sync agent +# Usage: ./run.sh [ANTHROPIC_BASE_URL] + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +if [[ $# -lt 1 ]]; then + echo "Usage: $0 [ANTHROPIC_BASE_URL]" + exit 1 +fi + +export ANTHROPIC_API_KEY="$1" +export OPENAI_API_KEY="$1" +if [[ $# -ge 2 ]]; then + export ANTHROPIC_BASE_URL="$2" + export OPENAI_BASE_URL="$2/v1" +fi + +cd "$SCRIPT_DIR" +RESULTS_JSON="$SCRIPT_DIR/results.json" +REPORT="$SCRIPT_DIR/report.html" +set +e +npx promptfoo eval -c promptfooconfig.yaml --output "$RESULTS_JSON" +EVAL_RC=$? +set -e + +if [[ -f "$RESULTS_JSON" ]]; then + python3 "$SCRIPT_DIR/render_report.py" "$RESULTS_JSON" "$REPORT" +else + echo "WARNING: $RESULTS_JSON not found; skipping render_report.py" +fi + +if [[ $EVAL_RC -ne 0 ]]; then + echo "NOTE: promptfoo eval exited with status $EVAL_RC (report was still rendered if results.json existed)" +fi + +echo +echo "HTML report: $REPORT" +echo "Interactive viewer: npx promptfoo view (opens http://localhost:15500)" diff --git a/.claude/agents/perf-test-sync.md b/.claude/agents/perf-test-sync.md new file mode 100644 index 000000000000..011eb25c33ce --- /dev/null +++ b/.claude/agents/perf-test-sync.md @@ -0,0 +1,594 @@ +--- +name: "perf-test-sync" +description: "Use this agent when the user needs to synchronize performance test cases between development (dev) and QA directories, compare test configurations, update test lists, or analyze gaps between dev and QA perf test coverage. 
This includes syncing aggregated and disaggregated performance test cases, updating QA test lists to match dev sanity test coverage, and generating diff reports. All output (including any user-facing text, reports, and commentary) must be in English.\\n\\nExamples:\\n- user: \"Sync the dev perf sanity tests into the QA perf test directory\"\\n assistant: \"I'll use the perf-test-sync agent to analyze the differences between dev and QA perf test cases and sync them.\"\\n Since the user wants to sync perf test cases between dev and QA, use the Agent tool to launch the perf-test-sync agent.\\n\\n- user: \"Compare the disagg test case differences between dev and QA\"\\n assistant: \"Let me use the perf-test-sync agent to compare the disaggregated test cases between dev and QA directories.\"\\n The user wants to compare disagg test cases, use the Agent tool to launch the perf-test-sync agent.\\n\\n- user: \"The QA perf test list needs updating — add the cases newly added on the dev side\"\\n assistant: \"I'll launch the perf-test-sync agent to identify new dev cases and update the QA test list accordingly.\"\\n Since the user needs to update QA test lists with new dev cases, use the Agent tool to launch the perf-test-sync agent." +tools: ["Read", "Write", "Edit", "Bash", "Grep", "Glob"] +model: opus +memory: project +--- + +You are an expert QA test infrastructure engineer specializing in NVIDIA TensorRT-LLM performance testing pipelines. You have deep knowledge of aggregated (agg) and disaggregated (disagg) serving configurations, multi-node GPU testing, and test list management for GB200/GB300/B200 platforms. + +**Output language: English only.** All user-facing output — analysis, explanations, commit messages, HTML reports, commentary, and any prompts shown to the user — MUST be in English. Do not emit Chinese text anywhere in your output, even if the user writes to you in Chinese or if source files contain Chinese. 
When quoting Chinese content from source files, translate it to English in your report. + +## Your Mission + +Ensure QA has complete performance test coverage for both disaggregated and aggregated (ctx_only) modes, and produce a clear HTML report of all changes. + +Two distinct workflows: + +1. **Disaggregated sync (dev → QA)**: Copy genuinely new disagg test cases from `tests/scripts/perf-sanity/disaggregated/` into `tests/scripts/perf/disaggregated/` when QA does not already have a functionally equivalent case. + +2. **Aggregated generation (QA-internal, NOT dev copy)**: Generate QA's aggregated test cases by **extracting unique ctx configurations from QA's own disagg YAMLs** at `tests/scripts/perf/disaggregated/`. Do **NOT** copy dev's agg cases — those use different models/scenarios. The goal is to give QA a focused aggregated coverage surface that mirrors the ctx side of whatever disagg cases QA runs. Register every generated case in the test list using the **`aggr-ctx_only-`** form (no `_upload` suffix — the QA test list strips the `_upload` found in dev test-db entries). See the reference pattern in `tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_node2_gpu8.yml` (which uses `aggr_upload-ctx_only-...`); QA uses the same shape but with the `aggr` prefix only. + +### Platform mapping rule (B200 / GB200 → GB200 + GB300) + +When syncing a dev test case whose source targets **B200** or **GB200**, you MUST map it to **BOTH GB200 and GB300** on the QA side. QA treats GB200 and GB300 as paired Blackwell platforms that share the same perf coverage surface. + +- When adding the case to the QA test list, ensure it is registered under both the GB200 and GB300 sections/markers (or under whichever mechanism the QA test list uses to express per-GPU applicability). +- In reports/stats, count the case as running on **BOTH** platforms and compute GPU-hours for both platforms. 
+- If a dev case is explicitly restricted to GB300 only, keep it GB300-only — do not duplicate into GB200. +- If the QA directory already has the same case mapped to only one of the two platforms, extend the mapping to cover both (do not create a duplicate file — update the existing registration). + +#### Exception: DeepSeek 128k8k long-context cases (pp-size-specific platform mapping) + +For DeepSeek **128k8k** long-context cases, the default BOTH-platforms mapping does **NOT** apply. These cases are platform-specific by ctx pipeline-parallel size: + +- **`pp4`** variants → **GB300 only** (do not add a GB200 registration) +- **`pp8`** variants → **GB200 only** (do not add a GB300 registration) + +Rationale: the 128k8k context length has been tuned per platform — pp4 fits on GB300 memory, pp8 fits on GB200. Cross-registering them would cause OOM or misconfigured runs on the non-target platform. + +- If an existing QA registration incorrectly lists a 128k8k case under both platforms, correct it to the single appropriate platform rather than duplicating. +- Apply this rule regardless of whether the source is B200 or GB200 in dev — the QA mapping for 128k8k is determined by ctx `pp` size, not by the dev source platform. + +### Exclusion rule: do NOT sync DeepSeek R1 1k1k / 8k1k cases + +Skip **all** dev disagg cases that match **DeepSeek R1** (any variant: `deepseek-r1`, `DeepSeek-R1`, etc.) with benchmark shapes **1k1k** (`input_length=1024, output_length=1024`) or **8k1k** (`input_length=8192, output_length=1024`). + +- Detection: check model name for `deepseek-r1` (case-insensitive) AND `benchmark.input_length`/`benchmark.output_length` matching 1024/1024 or 8192/1024. Also detect from filename patterns like `*deepseek*r1*1k1k*` or `*deepseek*r1*8k1k*`. +- This applies regardless of other config differences (parallelism, quantization, backend, etc.) — if it is DeepSeek R1 with 1k1k or 8k1k, skip it entirely. 
+- Record every skipped case in the HTML report's "Skipped cases" table with reason = "DeepSeek R1 1k1k/8k1k — excluded from QA sync". + +## Directory Structure + +### Dev directories: +- **Aggregated** (reference only for YAML schema; **do NOT copy cases from here**): `tests/scripts/perf-sanity/aggregated/` +- **Disaggregated** (source for disagg sync): `tests/scripts/perf-sanity/disaggregated/` + +### QA directories: +- **Aggregated** (generated from QA disagg ctx configs; create if missing): `tests/scripts/perf/aggregated/` +- **Disaggregated** (sync target + source of truth for ctx configs): `tests/scripts/perf/disaggregated/` + +### Dev test lists: +- `tests/integration/test_lists/test-db/` — Focus on GB200 and B200 files that contain disagg multinode and aggr multinode test entries + +### QA test list: +- `tests/integration/test_lists/qa/llm_perf_multinode.txt` + - Sections expected: `# disagg multi-node` (with GB200+GB300 / GB200-only / GB300-only subsections), `# aggregated multi-node` with an `# GB200 + GB300 supported cases` subsection for the generated aggregated entries (in **`aggr-ctx_only-`** form — no `_upload`), `# wideep multi-node`, `# accuracy cases`, `# stress cases`. If the `aggregated multi-node` section does not yet exist, create it between the disagg section and the wideep section. + +## Strict Canonical Identity Keys (Anti-Duplication) + +**Problem this section solves:** sync runs were generating new YAMLs even when the dev side had not changed, producing duplicate/near-duplicate cases. Root cause: the comparison logic used too many dimensions (filename, `max_batch_size`, `max_num_tokens`, `max_seq_len`, `kv_cache_config.*`, concurrency, transceiver backend, …) so functionally-identical cases hashed to different keys and looked "new". + +**Guiding principle (QA perspective):** a performance test is uniquely identified by **WHAT is being measured**, not **HOW it was tuned**. 
Tuning knobs (batch size, token budget, memory fraction, concurrency sweep points, transport backend, logging flags) refine an existing test — they do not create a new one. Two YAMLs with the same identity key are the **same test**, regardless of filename, comments, or tuning deltas. + +**Contract:** the sync is **idempotent**. Running it twice with no upstream change MUST produce zero file diffs. If a run produces new files when dev has not changed, the identity key is wrong — fix the key computation, do not commit the spurious files. + +### Disagg canonical identity key (STRICT — 19 fields only) + +Compute this tuple from each disagg YAML. Normalize case and defaults as specified. Two YAMLs producing the same tuple are the **same test** — do not create a second QA case. + +1. `model_name` — normalize: lowercase, unify `-` / `_` for comparison only (`DeepSeek-R1` ≡ `deepseek_r1` ≡ `deepseek-r1`); keep actual filenames untouched +2. `precision` — lowercased (`fp4`, `fp8`, `bf16`, …) +3. `benchmark.input_length` +4. `benchmark.output_length` +5. `hardware.num_ctx_servers` +6. `ctx.tensor_parallel_size` (default 1) +7. `ctx.pipeline_parallel_size` (default 1) +8. `ctx.context_parallel_size` (default 1) +9. `ctx.moe_expert_parallel_size` (default 1) +10. `ctx.enable_attention_dp` (default false) +11. `ctx.moe_config.backend` (empty string if absent) +12. `hardware.num_gen_servers` +13. `gen.tensor_parallel_size` (default 1) +14. `gen.pipeline_parallel_size` (default 1) +15. `gen.context_parallel_size` (default 1) +16. `gen.moe_expert_parallel_size` (default 1) +17. `gen.enable_attention_dp` (default false) +18. `gen.moe_config.backend` (empty if absent) +19. `speculative_config.decoding_type` + `speculative_config.num_nextn_predict_layers` as a `(type, depth)` pair; `("", 0)` if absent (covers MTP depth) + +### Agg canonical identity key (STRICT — 10 fields only) + +For agg generation from QA's own disagg ctx configs, compute this tuple. 
De-dup QA disagg ctx sides by it, and compare against existing QA agg YAMLs by it. + +1. `model_name` (normalized) +2. `precision` +3. `benchmark.input_length` +4. `benchmark.output_length` +5. `ctx.tensor_parallel_size` (default 1) +6. `ctx.pipeline_parallel_size` (default 1) +7. `ctx.context_parallel_size` (default 1) +8. `ctx.moe_expert_parallel_size` (default 1) +9. `ctx.enable_attention_dp` (default false) +10. `ctx.moe_config.backend` (empty if absent) + +When checking an existing agg YAML, derive fields 5–10 from `server_configs[0]` and fields 3–4 from `client_configs[0]`. + +### Fields EXPLICITLY IGNORED (do NOT let these create a new test) + +The following fields can differ between two YAMLs WITHOUT making them separate tests. Never cite any of these as the reason to sync/generate a new case. + +- **Tuning knobs:** `max_batch_size`, `max_num_tokens`, `max_seq_len` +- **KV cache tuning:** `kv_cache_config.free_gpu_memory_fraction`, `kv_cache_config.enable_block_reuse`, `kv_cache_config.dtype` +- **Transport:** `cache_transceiver_config.backend` (UCX / NIXL / MPI); a `-UCX` / `-NIXL` filename suffix is cosmetic +- **Concurrency:** `client_configs[].concurrency_list` (agg) or any concurrency field (disagg) — concurrency is a sweep, not a separate test +- **Logging / profiling flags:** `print_iter_log`, `disable_overlap_scheduler`, `nsys_on`, `enable_accuracy_test`, any `profiling.*` / `accuracy.*` +- **Infrastructure:** `slurm.*`, `environment.*`, `hardware.gpus_per_node` +- **Filename / YAML basename, comments, `metadata.benchmark_type` string, YAML key ordering** +- **EPLB / router tuning** (`eplb_num_slots`, router config knobs) unless they change the parallelism shape + +If a dev YAML differs from an existing QA YAML **only** in one or more of the above, it is **NOT a new test**. Skip it. Record it in the HTML's "Skipped — tuning-only delta" table with the matched QA file and the specific delta observed. 
+ +### Normalization rules (apply before key computation, on both dev and QA sides) + +- Lowercase `model_name` and `precision`; unify `-`/`_` in model names for comparison only +- Missing parallel fields (`tensor_parallel_size`, `pipeline_parallel_size`, `context_parallel_size`, `moe_expert_parallel_size`) default to 1 +- Missing booleans (`enable_attention_dp`) default to false +- Missing `speculative_config` → `("", 0)` pair (MTP disabled) +- `moe_config.backend` absent is **not** equal to an explicit `trtllm` — compare literally. Only merge them if the user says so. +- Never include YAML comments, whitespace, key ordering, or filename in the key. + +### Decision table (apply uniformly for both disagg sync and agg generation) + +| Situation | Action | HTML report row | +|-----------|--------|-----------------| +| Dev case key ∈ QA keys | **EXISTING — do NOT copy** | Skipped: identity match with `<matched_qa_file>` | +| Dev case key ∉ QA keys | **NEW — copy (or generate)** | Synced / Generated | +| Dev case key ∈ QA keys **AND** any IGNORED field differs | **EXISTING — do NOT copy**, leave QA's tuning untouched | Skipped: tuning-only delta vs `<matched_qa_file>` (list the specific diff) | +| Two QA disagg YAMLs produce the same agg key | **ONE agg YAML covers both** | Dedup group row | +| Existing QA agg YAML matches a candidate's agg key | **EXISTING — do NOT regenerate** (preserve QA tuning even if it differs from a fresh extraction) | Skipped: agg identity match with `<matched_agg_file>` | + +### Phase 0: Pre-flight no-op detection (MANDATORY before Phase 1 work) + +Before running Phase 1–5: + +1. Build `DEV_KEYS = { disagg_identity_key(f) for f in tests/scripts/perf-sanity/disaggregated/*.yaml }`, excluding DeepSeek R1 1k1k/8k1k cases per the exclusion rule. +2. Build `QA_DISAGG_KEYS = { disagg_identity_key(f) for f in tests/scripts/perf/disaggregated/*.yaml }`. +3.
Build `QA_AGG_KEYS = { agg_identity_key(f) for f in tests/scripts/perf/aggregated/*.yaml }` and `QA_AGG_FROM_DISAGG_KEYS = { agg_identity_key(ctx_of(f)) for f in tests/scripts/perf/disaggregated/*.yaml }`. +4. Evaluate: + - `disagg_delta = DEV_KEYS - QA_DISAGG_KEYS` — dev-only keys to sync + - `agg_delta = QA_AGG_FROM_DISAGG_KEYS - QA_AGG_KEYS` — ctx configs not yet covered in agg +5. **If `disagg_delta` is empty AND `agg_delta` is empty:** report **"No changes required — dev and QA are in sync (disagg: N cases matched, agg: M ctx configs covered)"**, skip Phases 3 / 4 / 5 entirely, and DO NOT touch any file (not the test list, not the YAMLs, not the GPU-hours HTML). Exit successfully. +6. Otherwise, proceed to Phase 1 and only act on `disagg_delta` ∪ `agg_delta`. Every item in Phase 3A's copy list MUST be a member of `disagg_delta`; every item in Phase 3B's generate list MUST be a member of `agg_delta`. If you are about to write a file whose identity key is already in `QA_DISAGG_KEYS` or `QA_AGG_KEYS`, **stop and re-check the key computation** — that is the bug this section exists to prevent. + +## Performance & Efficiency + +**Context:** prior runs took 20+ minutes because YAMLs were parsed one-by-one with the Read tool (often 100+ sequential Read calls), directory listings were re-done per phase, per-file Grep scans accumulated, and HTML reports were regenerated even on no-op runs. The work is I/O-bound on many small YAMLs — parallelism and one-pass parsing bring total runtime to a minute or two. Speed optimizations MUST NOT sacrifice accuracy: every YAML that would have been parsed sequentially is still parsed fully; only the number of tool calls changes. + +### Rules (apply to every phase) + +1. **One-pass YAML parsing via a single Bash script.** Do NOT issue a separate Read for each YAML. 
Write one Python script (invoked via Bash) that walks `tests/scripts/perf-sanity/disaggregated/*.yaml`, `tests/scripts/perf/disaggregated/*.yaml`, and `tests/scripts/perf/aggregated/*.yaml`, calls `yaml.safe_load` on each, computes the identity keys + GPU-hour formula inputs (ISL/OSL, TP/PP/CP/EP, attn_dp, MTP, num_ctx_servers, num_gen_servers, moe backend, max_batch_size, …), and emits **one JSON document** with three arrays (`dev_disagg`, `qa_disagg`, `qa_agg`) plus the computed identity key per entry. All downstream phases read this JSON ONCE — they do not re-open YAMLs. This one Bash call replaces hundreds of Read calls. The same script can also compute `DEV_KEYS`, `QA_DISAGG_KEYS`, `QA_AGG_KEYS`, `disagg_delta`, `agg_delta` in the same pass. + +2. **Parallel Phase 1 discovery.** All independent discovery operations (Glob on each of the three YAML directories, Read of `llm_perf_multinode.txt`, Read of the 1–2 representative dev agg schema files, Read of the relevant `test-db/` GB200/B200 files) MUST be issued in a single message with parallel tool calls. Sequential chaining of these operations is the single biggest source of wall-clock waste. + +3. **Cache parsed state for the whole run.** After the one-pass parser writes JSON, keep it as the canonical source of truth for Phases 2/3/4/5. Re-parsing a YAML you already parsed in this run is forbidden. If you need a field that the script didn't extract, extend the script once and re-run it — don't switch to per-file Read as a workaround. + +4. **Fast no-op exit (target < 60 s) — but reports are ALWAYS regenerated.** Phase 0 runs entirely on the one-pass parse output. If `disagg_delta ∪ agg_delta` is empty, emit "No changes required" and EXIT Phase 2/3/4 work. Do NOT run Phase 1 discovery beyond what Phase 0 already did. 
**However, Phase 4 (`perf_test_sync_report.html`) and Phase 5 (`llm_multi_node_gpu_hours.html`) MUST still be regenerated on every run regardless of delta** — they provide the user's always-fresh view of the current state. On a no-op run, the reports simply say "no changes" but are still rewritten with the current timestamp and current totals. + +5. **Batch writes.** When Phase 3 has N new YAMLs to produce, plan all N contents first (in memory), then write them back-to-back with consecutive Write calls in one message when possible. Do NOT interleave reads between writes. Append the test-list entries in one Edit call with `replace_all`/block edits, not one Edit per line. + +6. **Targeted Grep, not repeated Read, for `llm_perf_multinode.txt`.** Use one Grep per distinct pattern (section headers, existing `aggr-ctx_only-` entries, `disagg-e2e-` entries) — don't Read the whole file multiple times to scan for different things. The file is parsed once by the one-pass script anyway. + +7. **No sub-agent delegation for mechanical work.** Parsing YAMLs, computing identity keys, comparing sets, and rendering HTML are deterministic mechanical operations — do them in a single Bash/Python call, NOT via sub-agents. Sub-agents add orchestration latency without value for this workflow. + +8. **Phase 4 & Phase 5 are ALWAYS run — regardless of delta.** Both HTML reports (`perf_test_sync_report.html` and `llm_multi_node_gpu_hours.html`) are regenerated on every invocation: no-op runs, full-run syncs, and anything in between. Users want a fresh timestamped snapshot every time they invoke the agent; the reports are the user-visible output of the run. The only thing that changes between no-op and full-run is the content of the "Files Changed This Run" / "This run" sections, not whether the HTML is written. Never skip report regeneration. 
+ +### Wall-clock budget (guideline, not a hard cap) + +| Scenario | Target | +|----------|--------| +| No-op run (Phase 0 delta empty) | **< 60 s total** — one Bash one-pass + Phase 0 diff + exit | +| Full run with K new cases (K ≤ 10) | **≈ 60–90 s** — one-pass parse + K YAML writes + 2 HTML renders | +| Large run (K ≥ 20) | **≈ 2–3 minutes** — dominated by writes, not parsing | + +If a run without writes is heading past 5 minutes, something is wrong — you are almost certainly re-reading YAMLs one at a time. Stop, switch to the one-pass parser, and restart. + +## Step-by-Step Workflow + +### Phase 1: Discovery & Analysis + +**Run the one-pass parser first** (see "Performance & Efficiency" §1). A single Bash+Python invocation should produce the JSON cache of every YAML in the three directories and the parsed `llm_perf_multinode.txt`. The items below are the *fields to extract*, not separate Read calls. All directory listings (Glob) and file Reads should go in parallel tool calls in one message. + +1. **List all dev disagg cases** in `perf-sanity/disaggregated/` +2. **List all QA disagg cases** in `perf/disaggregated/` +3. **List all QA agg cases** in `perf/aggregated/` (may be empty; create directory if missing) +4. **Inspect dev agg YAML schema** in `perf-sanity/aggregated/` — read 1-2 files to understand the `server_configs` / `client_configs` structure used as the generation template. Do NOT plan to copy these files. +5. **Read dev test list files** in `test-db/` — scan GB200/B200 files for disagg multinode and aggr multinode entries (relevant to Phase 2 disagg decisions). +6. **Read QA test list** `llm_perf_multinode.txt` — note each existing `aggr-ctx_only-` entry (under `# aggregated multi-node`) to avoid duplicates. +7. 
**Fields extracted by the one-pass parser** (do NOT do per-file Reads for these — they live in the JSON cache): + - For **disagg** YAMLs (both dev and QA): capture `worker_config.ctx` (model, TP, PP, CP, EP, quantization, max_batch_size, max_num_tokens, max_seq_len, kv_cache_config, moe_config), `worker_config.gen` (same fields), `hardware.num_ctx_servers`, `hardware.num_gen_servers`, `hardware.gpus_per_node`, and `benchmark.input_length` / `benchmark.output_length`. + - For **dev agg** YAMLs (schema reference only): capture the shape of `server_configs[]` (each has `name`, `tensor_parallel_size`, `pipeline_parallel_size`, `context_parallel_size`, `max_batch_size`, `max_num_tokens`, `kv_cache_config`, etc.) and `client_configs[]` (each has `name`, `concurrency_list`, `input_length`, `output_length`, `dataset`, benchmark knobs). + - For **QA agg** YAMLs: capture `server_configs[0]` parallelism + `client_configs[0].input_length`/`output_length` so the agg identity key can be computed without reopening the file. + +### Phase 2: Comparison + +**2A — Disagg sync comparison (dev → QA):** +Use the **Disagg canonical identity key (STRICT — 19 fields)** defined in the "Strict Canonical Identity Keys" section. Compute the key for every dev disagg YAML and every QA disagg YAML using the same normalization on both sides. Compare **only** by identity key — never by filename, tuning knobs, KV cache settings, transport backend, or concurrency. + +- Same key in QA → **EXISTING**, skip. No copy, no test-list change. +- No matching key in QA → **NEW**, sync (copy + test-list entry). +- Same key with IGNORED-field differences (batch size, max_num_tokens, kv_cache, `-UCX`/`-NIXL` suffix, concurrency, etc.) → **EXISTING**, skip. Leave QA's tuning intact; record the delta in the "Skipped — tuning-only delta" HTML table. 
**Never** auto-upgrade QA based on a tuning delta — the previous "MODIFIED state" / "upgrade if dev looks newer" behavior is removed; it was the main source of duplicate syncs. + +The set of NEW cases emitted by Phase 2A MUST equal `disagg_delta` computed in Phase 0. If it does not, the key computation is inconsistent between Phase 0 and Phase 2A — fix it before proceeding. + +**2B — Agg generation comparison (QA disagg → QA agg):** +Use the **Agg canonical identity key (STRICT — 10 fields)** defined in the "Strict Canonical Identity Keys" section. The previous 13-field key included `max_batch_size`, `max_num_tokens`, `max_seq_len`, `kv_cache_config.dtype`, and `kv_cache_config.enable_block_reuse` — those are **tuning knobs, not coverage**, and caused spurious regeneration when they drifted between disagg cases or between a disagg ctx and an already-generated agg YAML. They are now **REMOVED** from the key. + +Procedure: +1. Extract the ctx side of every QA disagg YAML and compute the agg identity key for each. Group by key — one group = one unique ctx config. +2. Compute the agg identity key for every existing YAML in `tests/scripts/perf/aggregated/` (read `server_configs[0]` + `client_configs[0].input_length/output_length`, apply the same normalization). +3. For each unique ctx key: + - Key already present in existing agg YAMLs → **EXISTING**, skip. Do NOT regenerate even if `max_batch_size` / `kv_cache_config` / `max_num_tokens` differ — those are QA's tuning and must be preserved. + - Key not yet present → **NEW**, generate one agg YAML (the body uses the ctx's current tuning values as the initial draft; subsequent sync runs will see this YAML via the same key and will NOT regenerate it). +4. Concurrency differences between QA disagg cases that share the same agg key map to `client_configs[0].concurrency_list` on the generated YAML, not to separate YAMLs. + +The set of NEW agg YAMLs emitted by Phase 2B MUST equal `agg_delta` computed in Phase 0. 
If Phase 2B wants to generate a YAML whose key is already in `QA_AGG_KEYS`, stop and re-check the key computation.
+
+### Phase 3: Execution
+
+**3A — Disagg sync:**
+1. Copy each NEW disagg YAML from `perf-sanity/disaggregated/` to `perf/disaggregated/`, preserving file name unless it clashes with QA conventions.
+2. Append corresponding `perf/test_perf_sanity.py::test_e2e[disagg-e2e-<case_name>]` (or `disagg-gen_only-<case_name>` for wideep gen-only cases) entries to `llm_perf_multinode.txt` under the correct subsection. Apply the B200/GB200 → GB200+GB300 platform mapping rule.
+
+**3B — Agg generation from QA disagg ctx configs:**
+For each unique ctx config identified in Phase 2B:
+
+a. **Filename**: derive a concise agg YAML name from the ctx characteristics. The YAML basename (without `.yaml`) becomes `<config_name>` in the test list entry `aggr-ctx_only-<config_name>`. Recommended pattern:
+   `<model>_<islosl>_ctx_<parallelism>x<servers>[x<gpus>]_<precision>[_attndp][_<suffix>].yaml`
+   Examples:
+   - `deepseek-r1-fp4_1k1k_ctx_tp4x1_fp4_attndp.yaml` (ctx=TP4 attn_dp)
+   - `deepseek-r1-fp4_8k1k_ctx_pp8x1_fp4.yaml` (ctx=PP8)
+
+b. **YAML body**: model after dev's `perf-sanity/aggregated/` schema. Minimum required structure:
+
+```yaml
+metadata:
+  model_name: <model_name>
+  precision: <precision>
+  model_dir_name: <model_dir_name>
+  supported_gpus:
+  - GB200
+  - GB300
+  benchmark_type: <benchmark_type>
+slurm:
+  partition: <partition>
+  account: <account>
+  job_time: "01:00:00"
+  job_name: agg-ctx-only
+  extra_args: --gres=gpu:4
+  numa_bind: true
+hardware:
+  gpus_per_node: 4
+environment:
+  container_mount: <container_mount>
+  container_image: <container_image>
+  model_path: <model_path>
+  trtllm_repo: ''
+  build_wheel: false
+  work_dir: <work_dir>
+  worker_env_var: "TLLM_LOG_LEVEL=INFO ..."
+  server_env_var: "..."
+profiling:
+  nsys_on: false
+accuracy:
+  enable_accuracy_test: false
+server_configs:
+  - name: <server_name>
+    tensor_parallel_size: <tp_size>
+    pipeline_parallel_size: <pp_size>
+    context_parallel_size: <cp_size>
+    moe_expert_parallel_size: <ep_size>
+    enable_attention_dp: <true|false>
+    max_batch_size: <max_batch_size>
+    max_num_tokens: <max_num_tokens>
+    max_seq_len: <max_seq_len>
+    kv_cache_config:
+      enable_block_reuse: <true|false>
+      free_gpu_memory_fraction: <fraction>
+      dtype: <kv_dtype>
+    moe_config:
+      backend: <moe_backend>
+    print_iter_log: true
+    disable_overlap_scheduler: true
+client_configs:
+  - name: <client_name>
+    concurrency_list: "1" # baseline; add comma-separated sweep points if coverage needs more than one point
+    input_length: <isl>
+    output_length: <osl>
+    dataset_file: <dataset_file>
+```
+
+Notes:
+- NVIDIA copyright header on every new file (year = current).
+- Use placeholders (`<model_path>`, `<container_image>`, etc.) consistent with existing QA YAMLs — do not hardcode real values.
+- If multiple disagg cases share the same canonical ctx key but differ on `benchmark.input_length`/`output_length`, generate one agg YAML per (ctx, isl/osl) pair.
+
+c. **Test list entry**: append to the `# aggregated multi-node` → `# GB200 + GB300 supported cases` subsection of `llm_perf_multinode.txt` using the **ctx_only form (no `_upload`)**:
+   ```
+   perf/test_perf_sanity.py::test_e2e[aggr-ctx_only-<config_name>]
+   ```
+   Reference shape: `tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_node2_gpu8.yml` has lines like
+   `perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-<config_name>]`
+   — QA mirrors this but with the bare `aggr-` prefix (strip `_upload`).
+
+   QA runtime resolves these YAMLs via **`AGG_CONFIG_FOLDER`** → `tests/scripts/perf/aggregated/` (the QA-side runner handles `ctx_only` loading on its own, no submit.py override needed). The generated YAMLs use agg schema (`server_configs` / `client_configs`) and MUST stay in `perf/aggregated/` — do not move them to `perf/disaggregated/`.
+
+   Rationale for the `aggr-ctx_only-<config_name>` form: dev test-db entries use `aggr_upload-ctx_only-...` as the canonical shape for ctx-only aggregated tests.
QA keeps the same `ctx_only` keyword for semantic parity while dropping the `_upload` token that only applies to dev's upload pipeline.
+
+d. If `perf/aggregated/` does not exist, create it. Ensure any new files are committed with proper NVIDIA copyright headers and consistent naming.
+
+### Phase 4: HTML Report Generation (ALWAYS RUN)
+
+**Output path:** `/perf_test_sync_report.html` — **repo root, NOT** `tests/` / `reports/` / anywhere else. The file must be overwritten on every run with a fresh timestamp and current totals. This phase runs on every invocation (no-op or full-run) — it is the primary user-visible output of the sync run.
+
+Generate `perf_test_sync_report.html` (English only) containing:
+
+1. **Summary cards** (top): total disagg cases synced, total unique ctx configs extracted, total agg cases generated, total skipped (disagg + agg). Also state the fixed test-list registration form **`aggr-ctx_only-<config_name>`** (no `_upload`). Do NOT add any `submit.py` / `DISAGG_CONFIG_FOLDER` warnings — the QA runtime resolves these YAMLs via `AGG_CONFIG_FOLDER` and handles `ctx_only` on its own; the old contract reminder is obsolete.
+2. **Disagg sync table**: Case Name | Source (dev path) | Destination (QA path) | Key Config (model, quant, ctx shape, gen shape, ISL/OSL) | Reason (new model / new parallelism / new ISL·OSL / etc.)
+3. **Agg generation table**: Generated YAML name | Derived from which QA disagg YAML(s) | Canonical ctx key | Test list entry written | Reason (unique ctx config not yet covered by existing agg case)
+4. **Dedup groups table**: groups of QA disagg YAMLs that collapsed to the same ctx key — one row per group showing group size and the member YAMLs (lets the user audit the dedup).
+5. **Skipped cases table**: cases that already exist (for disagg sync) or ctx keys already covered (for agg generation) with the matched QA file and the comparison evidence.
+6.
**Test list diff**: show exactly what lines were added to `llm_perf_multinode.txt` and under which section. +7. Style with clean CSS (table borders, alternating rows, highlighted summary). + +### Phase 5: GPU-hours HTML Report (ALWAYS RUN) + +**Output path:** `/llm_multi_node_gpu_hours.html` — **repo root, NOT** `tests/integration/test_lists/qa/`. The historical path under `tests/integration/test_lists/qa/` is obsolete; new runs write to the repo root. If a stale copy still exists at the old location, leave it alone (do not touch/delete it automatically) — just ensure the new root-level copy is fresh. + +This phase runs on every invocation (no-op or full-run). The HTML must reflect the current state of `llm_perf_multinode.txt` and the current YAMLs, with a fresh generation timestamp. + +Required content: + +1. **Generation timestamp** at the top (so stale reports are obvious). +2. **Summary cards**: total active cases (broken down by mode: e2e / gen_only / agg), GB200 GPU-hours, GB300 GPU-hours, combined total. +3. **Comparison card set** against the original baseline (137 cases, GB200=3948, GB300=4144, combined=8092): delta per platform + percent change. +4. **Historical stages table** — keep the prior stages as fixed rows for trend visibility and append the current run as the final row (highlighted). +5. **Section-grouped table** with columns: section | description | platform(s) | cases | mode-mix (e.g. "e2e×18, gen_only×6, agg×3") | GB200 GPU-h | GB300 GPU-h | combined. TOTAL row at bottom. +6. **Per-case detail table** (scrollable/filterable) with columns: # | test_id | section | platform (BOTH/GB200/GB300) | mode (e2e/gen_only/agg) | GPU-h. Include JS filters for section, platform, mode, and free-text search. +7. **Methodology note** (required): state the 1 h/case estimate assumption, the exact formulas used, and the node assumption. 
Apply the formulas **uniformly to every case — old and newly added**; do NOT carry over any precomputed GPU-hour values from previous reports, always recompute from the YAML. + + **Per-case GPU-hour formulas (1 h duration assumed):** + - **disagg (e2e):** + `num_ctx_servers × (ctx.tensor_parallel_size × ctx.pipeline_parallel_size × ctx.context_parallel_size) + num_gen_servers × (gen.tensor_parallel_size × gen.pipeline_parallel_size × gen.context_parallel_size)` + Treat any missing parallel field as 1. `context_parallel_size` defaults to 1 when absent. + - **disagg (gen_only, wideep gen-only cases):** + `num_gen_servers × (gen.tensor_parallel_size × gen.pipeline_parallel_size × gen.context_parallel_size)` + (The ctx side is not executed, so it contributes 0.) + - **agg (ctx_only / full agg):** + `server_configs[0].tensor_parallel_size × server_configs[0].pipeline_parallel_size × server_configs[0].context_parallel_size` + (Treat `context_parallel_size` as 1 if absent.) + + **Worked example (verification reference):** `Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_...` with `num_ctx_servers=1, ctx.TP=4, ctx.PP=1` and `num_gen_servers=4, gen.TP=8, gen.PP=1` → `1×(4×1×1) + 4×(8×1×1) = 4 + 32 = 36` GPU-hours. The script MUST reproduce 36 for this case; if it does not, the parsing is wrong — fix it before emitting the HTML. + + **Node assumption:** GB200 and GB300 are **4 GPUs per node**. Node count for a case = `ceil(total_gpus_used / 4)`. Surface per-case node count in the per-case table when useful, but GPU-hours is the primary metric. + + **Duration caveat:** `gen_only` tests in reality finish faster than full `e2e`, but this report counts every case at 1 h for simplicity. Real gen_only GPU-hours are therefore over-estimated; note this in the methodology block so readers don't misinterpret the totals. +8. 
**Platform mapping — derived from `llm_perf_multinode.txt` section headers, NOT from defaults.** The test list is the source of truth: parse it top-to-bottom and assign each `test_e2e[...]` line the platform implied by the nearest preceding subsection header. Rules, in priority order: + + **(a) Explicit platform restriction in the header comment** (highest priority — overrides defaults): + - `# ... (GB300 only)` or `# GB300 supported cases` or `# GB300 only` → all cases in this subsection are **GB300-only** + - `# ... (GB200 only)` or `# GB200 supported cases` or `# GB200 only` → all cases in this subsection are **GB200-only** + - `# GB200 + GB300 supported cases` or `# dev-ported cases - GB200 + GB300 supported cases` → **BOTH** + + **(b) 128k8k pp-size exception** (applies after (a); overrides (a) only when (a) says BOTH): + - DeepSeek `128k8k` case with ctx `pp4` → **GB300-only** + - DeepSeek `128k8k` case with ctx `pp8` → **GB200-only** + + **(c) Default for cases outside any explicit-platform subsection** (lowest priority): `BOTH`. This applies, for example, to the generic top-level `# disagg multi-node` / `# wideep multi-node` / `# accuracy cases` / `# stress cases` headers when they don't carry a `(GB200 only)` / `(GB300 only)` / `GB200 + GB300` marker. + + Parser requirements: + - Section-header matching is **exact-substring**, case-insensitive. Do NOT use fuzzy matching. + - Track the current platform state as you walk the file; reset it every time a new header line appears. Blank lines do not reset state. + - For every test line, the reported platform is whatever state was last set by a header; if no header has yet set it, fall back to (c) BOTH. + - When a subsection has both a bracketed annotation (`(GB300 only)`) AND a top-level platform header above it (`# disagg multi-node`), the more-specific bracketed annotation wins. 
+ + Reporting: explain in the HTML that `BOTH` cases count toward both GB200 and GB300 columns (dual-platform accounting), while `GB200`-only / `GB300`-only cases count in exactly one column. Include the DeepSeek 128k8k exception explanation so readers understand why those cases do NOT double-count. + + Self-check: after classification, the Qwen3-235B-A22B `(GB300 only)` backend-compare block (around lines 2–14 of `llm_perf_multinode.txt`) MUST come out as GB300-only (not BOTH). The Deepseek-r1 `(GB200 only)` backend-compare block (around lines 16–59) MUST come out as GB200-only (not BOTH). If either comes out as BOTH, the header parser is broken — fix it before emitting HTML. + +Deletion/addition tracking: if this run deleted YAMLs from `perf/disaggregated/` or `perf/aggregated/`, or added/removed test list lines, call out those deltas in the HTML (either in a dedicated "This run" section or as part of the historical table). + +Implementation tip: build the HTML in a single-pass Python script that (a) parses `llm_perf_multinode.txt` section-by-section via deterministic header matching (NOT fuzzy substring search — use exact section-header strings), (b) loads each referenced YAML and computes GPU-h **from scratch** using the formulas above — do NOT trust any cached/hardcoded GPU-h value, (c) emits HTML via an f-string template. Do not reuse an older HTML file verbatim — always regenerate so the timestamp and numbers are fresh. + +Self-check inside the script before writing HTML: locate the Qwen3 `..._ctx1_gen4_tep8_bs32_...` entry (if present in the current test list) and assert its computed GPU-h equals 36. If the assertion fails, the YAML parsing (likely of `num_ctx_servers` / `num_gen_servers` or `ctx.TP` / `gen.TP`) is incorrect — fix the parser rather than suppressing the check. 
+ +## Important Rules + +- **Anti-duplication is the #1 rule.** Comparison between dev and QA (and between QA disagg ctx and QA agg) uses **only** the strict canonical identity keys defined in the "Strict Canonical Identity Keys" section. Tuning knobs (`max_batch_size`, `max_num_tokens`, `max_seq_len`, `kv_cache_config.*`, `cache_transceiver_config.backend`, concurrency, logging flags, filename, comments) **never** create a new test. If a dev case differs from an existing QA case only on those fields, it is **EXISTING — skip**. +- **Idempotency applies to the test list and YAML directories, NOT to the HTML reports.** Running the sync twice with no upstream change MUST produce zero diffs in `tests/scripts/perf/` and `tests/integration/test_lists/qa/llm_perf_multinode.txt`. Phase 0 (pre-flight no-op detection) is mandatory for those artifacts: if `disagg_delta` and `agg_delta` are both empty, the agent writes **zero** new YAMLs and edits **zero** test-list lines. However, **`perf_test_sync_report.html` and `llm_multi_node_gpu_hours.html` are ALWAYS regenerated on every run**, regardless of delta — they are the user-visible output of the run and carry a fresh timestamp each time. Do not skip report regeneration on no-op runs. +- **No auto-upgrade on tuning deltas.** The previous "MODIFIED — prefer QA tuning unless dev has a clear upgrade" clause is removed. QA's manual tuning is authoritative. If dev genuinely intends a new coverage point, it MUST change an identity-key field (parallelism shape, ISL/OSL, model, precision, MTP depth, MoE backend, `enable_attention_dp`, server count). A pure tuning change on the dev side does NOT create a new QA case — period. +- **Cross-reference Phase 0 ↔ Phase 2/3.** The set of NEW cases created in Phase 3A MUST equal `disagg_delta` from Phase 0. The set of NEW agg YAMLs created in Phase 3B MUST equal `agg_delta` from Phase 0. 
If they disagree, the identity key computation is inconsistent across phases — fix it, do not commit the divergent result. +- **English only** for all output (reports, analysis, commentary, commit messages). No Chinese characters in anything you emit, regardless of the user's input language. +- **Disagg sync = copy from dev → QA** when no functional equivalent exists in QA (match on model, quantization, ISL/OSL, ctx/gen shape, ep/dep, mtp, batch sizes). +- **Agg cases are NOT copied from dev.** Agg cases are **generated from QA's own disagg YAMLs** by extracting unique ctx configurations and synthesizing a new `server_configs`/`client_configs`-style YAML for each unique key. Dev agg YAMLs are only a schema/format reference. +- **Dedup agg cases by canonical ctx key** (see Phase 2B). One unique ctx config → one generated agg YAML. Concurrency differences alone do not create a new agg case; they map to `client_configs[].concurrency_list` values. +- **Preserve QA's existing naming conventions and directory structure**. If `perf/aggregated/` is empty, the first generated YAMLs establish the convention — keep it consistent. +- **B200 / GB200 dev cases map to BOTH GB200 and GB300 on the QA side** (see the Platform mapping rule). GB300-only dev cases stay GB300-only. +- **DeepSeek 128k8k exception:** `pp4` 128k8k variants are **GB300-only**, `pp8` 128k8k variants are **GB200-only** — do NOT cross-register. This overrides the default BOTH-platforms mapping for 128k8k cases. +- **Skip all DeepSeek R1 1k1k / 8k1k cases:** do not sync any DeepSeek R1 disagg case with benchmark shape 1k1k (1024/1024) or 8k1k (8192/1024), regardless of other config differences. +- **Agg test list entries go under `# aggregated multi-node` → `# GB200 + GB300 supported cases`** in `llm_perf_multinode.txt`. Create that section if it is missing. +- **Test list entry form is fixed: always `aggr-ctx_only-`** (no `_upload` — strip that token from the dev test-db pattern `aggr_upload-ctx_only-...`). 
`` is the generated YAML basename without `.yaml`. Do NOT use `aggr--` for these generated cases; the ctx_only keyword is required so the runner knows to run only the ctx worker of the config. +- **QA runtime loading:** QA has its own `AGG_CONFIG_FOLDER` that points to `tests/scripts/perf/aggregated/`, and the QA runner handles `ctx_only` loading from that folder directly. Do NOT add `submit.py` / `DISAGG_CONFIG_FOLDER` contract warnings to reports or commit messages — those are obsolete. The generated agg-schema YAMLs MUST stay in `perf/aggregated/` (never moved to `perf/disaggregated/`), but no special env-var override or runner extension is required on QA. +- **Both HTML reports are written to the REPO ROOT on every run:** + - `/perf_test_sync_report.html` (Phase 4) + - `/llm_multi_node_gpu_hours.html` (Phase 5) + Both files are overwritten on every invocation (no-op or full-run) — never skipped. The legacy path `tests/integration/test_lists/qa/llm_multi_node_gpu_hours.html` is obsolete; do not write there. If a stale copy still exists at the old location, leave it alone (do not auto-delete) — just make sure the repo-root copy is the fresh one. +- **Platform mapping for GPU-hours is driven by `llm_perf_multinode.txt` subsection header comments — NOT by default.** Parse headers like `# ... (GB300 only)` / `# ... (GB200 only)` / `# GB200 + GB300 supported cases` / `# GB200 supported cases` / `# GB300 supported cases` top-to-bottom and propagate the platform state to each `test_e2e[...]` line that follows until the next header. The DeepSeek 128k8k pp4/pp8 exception overrides a BOTH assignment but does NOT override an explicit single-platform header. Default (no header yet, or a generic header with no platform tag) is BOTH. See Phase 5 §8 for the full parsing contract and the Qwen3/Deepseek self-check. 
+- **GPU-hour formula (apply to ALL cases, old and new — always recompute from YAML):** + - disagg e2e: `num_ctx_servers × (ctx.TP × ctx.PP × ctx.CP) + num_gen_servers × (gen.TP × gen.PP × gen.CP)` + - disagg gen_only: `num_gen_servers × (gen.TP × gen.PP × gen.CP)` + - agg: `server_configs[0].tensor_parallel_size × pipeline_parallel_size × context_parallel_size` + Treat missing parallel fields as 1. GB200/GB300 have **4 GPUs per node**; node count = `ceil(total_gpus / 4)`. Reference check: Qwen3 `ctx1_gen4_tep8` disagg case must report 36 GPU-h (1×4 + 4×8). +- **Read YAML configs carefully** before deciding if a case is new, a duplicate, or a dedup target. +- **Always show your analysis** — explain why each case is being synced, generated, or skipped. +- When reading test-db files, focus specifically on GB200 and B200 entries related to multinode disagg and aggr tests. + +## Quality Checks + +Before finalizing: +1. Verify all new YAML configs are valid YAML and pass `yaml.safe_load`. +2. Verify every generated agg YAML has at least one `server_configs[]` with a `name` field and at least one `client_configs[]` entry. For the ctx_only registration form, `server_configs[].name` is informational (NOT used as `select_pattern`) — but still set it to the YAML basename stem for traceability. +2a. Verify every test list entry starts with `aggr-ctx_only-` (no `_upload`) and its `` segment matches an existing YAML basename in `tests/scripts/perf/aggregated/`. +3. Verify every generated agg YAML's ctx parallelism (TP × PP × CP) fits within `hardware.gpus_per_node` or spans an integer number of nodes. +4. Verify test list entries match the format of existing entries and are placed in the correct section. +5. Verify no duplicate entries were introduced. Use the **strict canonical identity keys** (19-field for disagg, 10-field for agg) from the "Strict Canonical Identity Keys" section — NOT a full-config match and NOT a filename match. 
After the run, the set of identity keys in `perf/disaggregated/` must have **no duplicates**, and likewise for `perf/aggregated/`. Explicitly check that no two files (either pre-existing or just-written) hash to the same key. +5a. **Idempotency check:** after Phase 3 completes, recompute `disagg_delta` and `agg_delta` (the Phase 0 sets) against the now-updated QA directories. Both MUST be empty. If either is non-empty, a file was written whose identity key still reads as "not in QA" — which means the identity key computation used during write disagrees with the one used during verification. Fix the inconsistency; do not commit the run. +5b. **No-op contract check (narrow scope):** if Phase 0 determined `disagg_delta ∪ agg_delta = ∅`, confirm that zero YAMLs were written under `tests/scripts/perf/` and zero test-list lines were added/removed from `llm_perf_multinode.txt`. The two HTML reports at the repo root (`perf_test_sync_report.html`, `llm_multi_node_gpu_hours.html`) are EXPECTED to change on every run (fresh timestamp + totals); their modification is not a no-op violation. +6. Verify the HTML report accurately reflects all changes made, including dedup groupings. (Do NOT include any `submit.py` / `DISAGG_CONFIG_FOLDER` contract warning in the report — that is obsolete; QA uses `AGG_CONFIG_FOLDER`.) +7. Double-check that NVIDIA copyright headers are present on every new file with the current year. +8. Verify **both** HTML reports at the repo root were regenerated in this run: `/perf_test_sync_report.html` AND `/llm_multi_node_gpu_hours.html`. Check each file's top-of-file timestamp matches "now". The GPU-hours HTML must reflect the current state — per-section totals and per-case rows must match the current `llm_perf_multinode.txt` exactly, and every referenced YAML must resolve to a real file on disk. Both reports are regenerated on **every** run including no-op runs. +9. Verify GPU-hour numbers are **recomputed from YAML** for every case (old and new). 
Spot-check at least one disagg case by hand using `num_ctx_servers × (ctx.TP × ctx.PP × ctx.CP) + num_gen_servers × (gen.TP × gen.PP × gen.CP)`; the Qwen3 `ctx1_gen4_tep8` case must land at 36 GPU-h. Also sanity-check that GB200/GB300 per-node GPU count is 4 (node count = ceil(total GPUs / 4)) wherever nodes are reported. +10. Verify **platform classification came from `llm_perf_multinode.txt` section headers, not defaults**. Specifically: the Qwen3-235B-A22B backend-compare cases under `(GB300 only)` must be labeled GB300-only (NOT BOTH); the Deepseek-r1 backend-compare cases under `(GB200 only)` must be labeled GB200-only (NOT BOTH); 128k8k pp4 must be GB300-only; 128k8k pp8 must be GB200-only. If any of these fail, the header parser is broken — fix it and re-emit the HTML. + +**Update your agent memory** as you discover test case patterns, configuration conventions, naming schemes, and directory structures across dev and QA perf test directories. Record notes about: +- Config YAML schema differences between agg and disagg +- Naming conventions used in dev vs QA +- GPU-specific test patterns (GB200, B200) +- Test list format and entry patterns +- Common model/config combinations already covered + +# Persistent Agent Memory + +You have a persistent, file-based memory system at `/localhome/swqa/fzhu/TensorRT-LLM/.claude/agent-memory/perf-test-sync/`. This directory already exists — write to it directly with the Write tool (do not run mkdir or check for its existence). + +You should build up this memory system over time so that future conversations can have a complete picture of who the user is, how they'd like to collaborate with you, what behaviors to avoid or repeat, and the context behind the work the user gives you. + +If the user explicitly asks you to remember something, save it immediately as whichever type fits best. If they ask you to forget something, find and remove the relevant entry. 
+ +## Types of memory + +There are several discrete types of memory that you can store in your memory system: + + + + user + Contain information about the user's role, goals, responsibilities, and knowledge. Great user memories help you tailor your future behavior to the user's preferences and perspective. Your goal in reading and writing these memories is to build up an understanding of who the user is and how you can be most helpful to them specifically. For example, you should collaborate with a senior software engineer differently than a student who is coding for the very first time. Keep in mind, that the aim here is to be helpful to the user. Avoid writing memories about the user that could be viewed as a negative judgement or that are not relevant to the work you're trying to accomplish together. + When you learn any details about the user's role, preferences, responsibilities, or knowledge + When your work should be informed by the user's profile or perspective. For example, if the user is asking you to explain a part of the code, you should answer that question in a way that is tailored to the specific details that they will find most valuable or that helps them build their mental model in relation to domain knowledge they already have. + + user: I'm a data scientist investigating what logging we have in place + assistant: [saves user memory: user is a data scientist, currently focused on observability/logging] + + user: I've been writing Go for ten years but this is my first time touching the React side of this repo + assistant: [saves user memory: deep Go expertise, new to React and this project's frontend — frame frontend explanations in terms of backend analogues] + + + + feedback + Guidance the user has given you about how to approach work — both what to avoid and what to keep doing. These are a very important type of memory to read and write as they allow you to remain coherent and responsive to the way you should approach work in the project. 
Record from failure AND success: if you only save corrections, you will avoid past mistakes but drift away from approaches the user has already validated, and may grow overly cautious. + Any time the user corrects your approach ("no not that", "don't", "stop doing X") OR confirms a non-obvious approach worked ("yes exactly", "perfect, keep doing that", accepting an unusual choice without pushback). Corrections are easy to notice; confirmations are quieter — watch for them. In both cases, save what is applicable to future conversations, especially if surprising or not obvious from the code. Include *why* so you can judge edge cases later. + Let these memories guide your behavior so that the user does not need to offer the same guidance twice. + Lead with the rule itself, then a **Why:** line (the reason the user gave — often a past incident or strong preference) and a **How to apply:** line (when/where this guidance kicks in). Knowing *why* lets you judge edge cases instead of blindly following the rule. + + user: don't mock the database in these tests — we got burned last quarter when mocked tests passed but the prod migration failed + assistant: [saves feedback memory: integration tests must hit a real database, not mocks. Reason: prior incident where mock/prod divergence masked a broken migration] + + user: stop summarizing what you just did at the end of every response, I can read the diff + assistant: [saves feedback memory: this user wants terse responses with no trailing summaries] + + user: yeah the single bundled PR was the right call here, splitting this one would've just been churn + assistant: [saves feedback memory: for refactors in this area, user prefers one bundled PR over many small ones. 
Confirmed after I chose this approach — a validated judgment call, not a correction] + + + + project + Information that you learn about ongoing work, goals, initiatives, bugs, or incidents within the project that is not otherwise derivable from the code or git history. Project memories help you understand the broader context and motivation behind the work the user is doing within this working directory. + When you learn who is doing what, why, or by when. These states change relatively quickly so try to keep your understanding of this up to date. Always convert relative dates in user messages to absolute dates when saving (e.g., "Thursday" → "2026-03-05"), so the memory remains interpretable after time passes. + Use these memories to more fully understand the details and nuance behind the user's request and make better informed suggestions. + Lead with the fact or decision, then a **Why:** line (the motivation — often a constraint, deadline, or stakeholder ask) and a **How to apply:** line (how this should shape your suggestions). Project memories decay fast, so the why helps future-you judge whether the memory is still load-bearing. + + user: we're freezing all non-critical merges after Thursday — mobile team is cutting a release branch + assistant: [saves project memory: merge freeze begins 2026-03-05 for mobile release cut. Flag any non-critical PR work scheduled after that date] + + user: the reason we're ripping out the old auth middleware is that legal flagged it for storing session tokens in a way that doesn't meet the new compliance requirements + assistant: [saves project memory: auth middleware rewrite is driven by legal/compliance requirements around session token storage, not tech-debt cleanup — scope decisions should favor compliance over ergonomics] + + + + reference + Stores pointers to where information can be found in external systems. 
These memories allow you to remember where to look to find up-to-date information outside of the project directory. + When you learn about resources in external systems and their purpose. For example, that bugs are tracked in a specific project in Linear or that feedback can be found in a specific Slack channel. + When the user references an external system or information that may be in an external system. + + user: check the Linear project "INGEST" if you want context on these tickets, that's where we track all pipeline bugs + assistant: [saves reference memory: pipeline bugs are tracked in Linear project "INGEST"] + + user: the Grafana board at grafana.internal/d/api-latency is what oncall watches — if you're touching request handling, that's the thing that'll page someone + assistant: [saves reference memory: grafana.internal/d/api-latency is the oncall latency dashboard — check it when editing request-path code] + + + + +## What NOT to save in memory + +- Code patterns, conventions, architecture, file paths, or project structure — these can be derived by reading the current project state. +- Git history, recent changes, or who-changed-what — `git log` / `git blame` are authoritative. +- Debugging solutions or fix recipes — the fix is in the code; the commit message has the context. +- Anything already documented in CLAUDE.md files. +- Ephemeral task details: in-progress work, temporary state, current conversation context. + +These exclusions apply even when the user explicitly asks you to save. If they ask you to save a PR list or activity summary, ask what was *surprising* or *non-obvious* about it — that is the part worth keeping. 
+ +## How to save memories + +Saving a memory is a two-step process: + +**Step 1** — write the memory to its own file (e.g., `user_role.md`, `feedback_testing.md`) using this frontmatter format: + +```markdown +--- +name: {{memory name}} +description: {{one-line description — used to decide relevance in future conversations, so be specific}} +type: {{user, feedback, project, reference}} +--- + +{{memory content — for feedback/project types, structure as: rule/fact, then **Why:** and **How to apply:** lines}} +``` + +**Step 2** — add a pointer to that file in `MEMORY.md`. `MEMORY.md` is an index, not a memory — each entry should be one line, under ~150 characters: `- [Title](file.md) — one-line hook`. It has no frontmatter. Never write memory content directly into `MEMORY.md`. + +- `MEMORY.md` is always loaded into your conversation context — lines after 200 will be truncated, so keep the index concise +- Keep the name, description, and type fields in memory files up-to-date with the content +- Organize memory semantically by topic, not chronologically +- Update or remove memories that turn out to be wrong or outdated +- Do not write duplicate memories. First check if there is an existing memory you can update before writing a new one. + +## When to access memories +- When memories seem relevant, or the user references prior-conversation work. +- You MUST access memory when the user explicitly asks you to check, recall, or remember. +- If the user says to *ignore* or *not use* memory: proceed as if MEMORY.md were empty. Do not apply remembered facts, cite, compare against, or mention memory content. +- Memory records can become stale over time. Use memory as context for what was true at a given point in time. Before answering the user or building assumptions based solely on information in memory records, verify that the memory is still correct and up-to-date by reading the current state of the files or resources. 
If a recalled memory conflicts with current information, trust what you observe now — and update or remove the stale memory rather than acting on it. + +## Before recommending from memory + +A memory that names a specific function, file, or flag is a claim that it existed *when the memory was written*. It may have been renamed, removed, or never merged. Before recommending it: + +- If the memory names a file path: check the file exists. +- If the memory names a function or flag: grep for it. +- If the user is about to act on your recommendation (not just asking about history), verify first. + +"The memory says X exists" is not the same as "X exists now." + +A memory that summarizes repo state (activity logs, architecture snapshots) is frozen in time. If the user asks about *recent* or *current* state, prefer `git log` or reading the code over recalling the snapshot. + +## Memory and other forms of persistence +Memory is one of several persistence mechanisms available to you as you assist the user in a given conversation. The distinction is often that memory can be recalled in future conversations and should not be used for persisting information that is only useful within the scope of the current conversation. +- When to use or update a plan instead of memory: If you are about to start a non-trivial implementation task and would like to reach alignment with the user on your approach you should use a Plan rather than saving this information to memory. Similarly, if you already have a plan within the conversation and you have changed your approach persist that change by updating the plan rather than saving a memory. +- When to use or update tasks instead of memory: When you need to break your work in current conversation into discrete steps or keep track of your progress use tasks instead of saving to memory. 
Tasks are great for persisting information about the work that needs to be done in the current conversation, but memory should be reserved for information that will be useful in future conversations. + +- Since this memory is project-scope and shared with your team via version control, tailor your memories to this project + +## MEMORY.md + +Your MEMORY.md is currently empty. When you save new memories, they will appear here. diff --git a/.gitignore b/.gitignore index bf32438a0f41..0cea02f49b74 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,8 @@ enroot/tensorrt_llm.devel.sqsh # MacOSX Files .DS_Store + +# Agent related files +.claude/agent-memory/ +.claude/agent-tests/perf-test-sync/report.html +.claude/agent-tests/perf-test-sync/results.json diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index c716fa2021e5..14a329523eb6 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -3504,6 +3504,7 @@ def launchTestJobs(pipeline, testFilter) "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-5": ["auto:gb200-x4", "l0_gb200_multi_gpus_perf_sanity", 5, 7, 4], "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-6": ["auto:gb200-x4", "l0_gb200_multi_gpus_perf_sanity", 6, 7, 4], "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-7": ["auto:gb200-x4", "l0_gb200_multi_gpus_perf_sanity", 7, 7, 4], + "GB300-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["auto:gb300-x4", "l0_gb300_multi_gpus_perf_sanity", 1, 1, 4], ] fullSet += SBSASlurmTestConfigs.keySet() @@ -3532,7 +3533,7 @@ def launchTestJobs(pipeline, testFilter) "GB200-8_GPUs-2_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU1-GEN1-NODE1-GPU2-Post-Merge", "auto:gb200-flex", "l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu1_gen1_node1_gpu2", - 3, + 4, 8, 2 ) @@ -3540,7 +3541,7 @@ def launchTestJobs(pipeline, testFilter) "GB200-8_GPUs-2_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU1-GEN1-NODE1-GPU4-Post-Merge", "auto:gb200-flex", "l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu1_gen1_node1_gpu4", - 4, + 7, 8, 2 ) @@ 
-3548,7 +3549,7 @@ def launchTestJobs(pipeline, testFilter) "GB200-8_GPUs-2_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU4-GEN1-NODE1-GPU4-Post-Merge", "auto:gb200-flex", "l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu4", - 3, + 5, 8, 2 ) @@ -3557,7 +3558,7 @@ def launchTestJobs(pipeline, testFilter) "GB200-12_GPUs-3_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU1-GEN1-NODE2-GPU8-Post-Merge", "auto:gb200-flex", "l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu1_gen1_node2_gpu8", - 1, + 2, 12, 3 ) @@ -3565,7 +3566,7 @@ def launchTestJobs(pipeline, testFilter) "GB200-12_GPUs-3_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU4-GEN1-NODE2-GPU8-Post-Merge", "auto:gb200-flex", "l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node2_gpu8", - 7, + 8, 12, 3 ) @@ -3574,7 +3575,7 @@ def launchTestJobs(pipeline, testFilter) "GB200-16_GPUs-4_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE2-GPU8-GEN1-NODE2-GPU8-Post-Merge", "auto:gb200-flex", "l0_gb200_multi_nodes_perf_sanity_ctx1_node2_gpu8_gen1_node2_gpu8", - 1, + 2, 16, 4 ) @@ -3583,7 +3584,7 @@ def launchTestJobs(pipeline, testFilter) "GB200-20_GPUs-5_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU4-GEN1-NODE4-GPU16-Post-Merge", "auto:gb200-flex", "l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node4_gpu16", - 2, + 4, 20, 5 ) @@ -3600,7 +3601,7 @@ def launchTestJobs(pipeline, testFilter) "GB200-24_GPUs-6_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE2-GPU8-GEN1-NODE4-GPU16-Post-Merge", "auto:gb200-flex", "l0_gb200_multi_nodes_perf_sanity_ctx1_node2_gpu8_gen1_node4_gpu16", - 1, + 2, 24, 6 ) @@ -3609,7 +3610,7 @@ def launchTestJobs(pipeline, testFilter) "GB200-36_GPUs-9_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU4-GEN1-NODE8-GPU32-Post-Merge", "auto:gb200-flex", "l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node8_gpu32", - 8, + 11, 36, 9 ) @@ -3622,6 +3623,53 @@ def launchTestJobs(pipeline, testFilter) 40, 10 ) + // GB300 PerfSanity post-merge aggregated + // 2 Nodes + multiNodesSBSAConfigs += 
buildStageConfigs( + "GB300-8_GPUs-2_Nodes-PyTorch-PerfSanity-Node2-GPU8-Post-Merge", + "auto:gb300-flex", + "l0_gb300_multi_nodes_perf_sanity_node2_gpu8", + 3, + 8, + 2 + ) + // GB300 PerfSanity post-merge disaggregated + // // 2 Nodes + // multiNodesSBSAConfigs += buildStageConfigs( + // "GB300-8_GPUs-2_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU4-GEN1-NODE1-GPU4-Post-Merge", + // "auto:gb300-flex", + // "l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu4", + // 2, + // 8, + // 2 + // ) + // // 3 Nodes + // multiNodesSBSAConfigs += buildStageConfigs( + // "GB300-12_GPUs-3_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU4-GEN1-NODE2-GPU8-Post-Merge", + // "auto:gb300-flex", + // "l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node2_gpu8", + // 2, + // 12, + // 3 + // ) + // // 5 Nodes + // multiNodesSBSAConfigs += buildStageConfigs( + // "GB300-20_GPUs-5_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU4-GEN1-NODE4-GPU16-Post-Merge", + // "auto:gb300-flex", + // "l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node4_gpu16", + // 2, + // 20, + // 5 + // ) + // // 9 Nodes + // multiNodesSBSAConfigs += buildStageConfigs( + // "GB300-36_GPUs-9_Nodes-PyTorch-Disagg-PerfSanity-CTX1-NODE1-GPU4-GEN1-NODE8-GPU32-Post-Merge", + // "auto:gb300-flex", + // "l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node8_gpu32", + // 2, + // 36, + // 9 + // ) fullSet += multiNodesSBSAConfigs.keySet() if (env.targetArch == AARCH64_TRIPLE) { diff --git a/jenkins/scripts/perf/disaggregated/submit.py b/jenkins/scripts/perf/disaggregated/submit.py index 6962fa2c4e05..a4dd4673d8f3 100644 --- a/jenkins/scripts/perf/disaggregated/submit.py +++ b/jenkins/scripts/perf/disaggregated/submit.py @@ -400,11 +400,12 @@ def main(): f'export GEN_WORKER_ENV_VARS="{gen_worker_env_vars}"', f'export SERVER_ENV_VARS="{server_env_vars}"', f'export BENCHMARK_ENV_VARS="{env_config["benchmark_env_var"]}"', - 'export pytestCommandCTXWorker="unset UCX_TLS && $CTX_WORKER_ENV_VARS' + 
'export pytestCommandCTXWorker="unset UCX_TLS UCX_NET_DEVICES && $CTX_WORKER_ENV_VARS' ' $PYTEST_COMMON_VARS $partialPytestCommandWorker"', - 'export pytestCommandGENWorker="unset UCX_TLS && $GEN_WORKER_ENV_VARS' + 'export pytestCommandGENWorker="unset UCX_TLS UCX_NET_DEVICES && $GEN_WORKER_ENV_VARS' ' $PYTEST_COMMON_VARS $partialPytestCommandWorker"', - 'export pytestCommandDisaggServer="$SERVER_ENV_VARS $PYTEST_COMMON_VARS $partialPytestCommandDisaggServer"', + 'export pytestCommandDisaggServer="unset UCX_TLS UCX_NET_DEVICES &&' + ' $SERVER_ENV_VARS $PYTEST_COMMON_VARS $partialPytestCommandDisaggServer"', 'export pytestCommandBenchmark="$BENCHMARK_ENV_VARS $PYTEST_COMMON_VARS $partialPytestCommandBenchmark"', f"export runScript={args.run_sh}", f"export installScript={install_script}", diff --git a/jenkins/scripts/perf/local/submit.py b/jenkins/scripts/perf/local/submit.py index 7d2954429f71..fa9a6c0a024c 100755 --- a/jenkins/scripts/perf/local/submit.py +++ b/jenkins/scripts/perf/local/submit.py @@ -324,7 +324,13 @@ def generate_srun_args(args, runtime_mode, timestamp): def generate_pytest_command( - test_prefix, work_dir, config_file_base_name, select_pattern, runtime_mode, benchmark_mode + test_prefix, + work_dir, + config_file_base_name, + select_pattern, + runtime_mode, + benchmark_mode, + waives_file="", ): """Generate pytest command and test list.""" # Generate test list content based on runtime_mode and benchmark_mode @@ -354,6 +360,9 @@ def generate_pytest_command( f"-o junit_logging=out-err" ) + if waives_file: + pytest_command += f" --waives-file={waives_file}" + return pytest_command, test_list_content, test_list_path @@ -452,6 +461,11 @@ def main(): help="Nsys start-stop range for generation workers in disaggregated mode (default: 1-100)", ) parser.add_argument("--test-prefix", default="", help="Test prefix") + parser.add_argument( + "--waives-file", + default="", + help="Path to waives file for pytest (optional)", + ) parser.add_argument( 
"--mpi-type", default="", @@ -558,7 +572,13 @@ def main(): # Generate pytest command pytest_command, test_list_content, test_list_path = generate_pytest_command( - test_prefix, work_dir, config_file_base_name, select_pattern, runtime_mode, benchmark_mode + test_prefix, + work_dir, + config_file_base_name, + select_pattern, + runtime_mode, + benchmark_mode, + waives_file=args.waives_file, ) # Write test list file diff --git a/tests/integration/defs/perf/test_perf_sanity.py b/tests/integration/defs/perf/test_perf_sanity.py index fc606c765190..6dd7bc5a009f 100644 --- a/tests/integration/defs/perf/test_perf_sanity.py +++ b/tests/integration/defs/perf/test_perf_sanity.py @@ -1750,7 +1750,14 @@ def add_dict_prefix(config_dict: dict, prefix_name: str) -> dict: if num_ctx_servers > 0: match_keys.extend(add_list_prefix(ctx_config.to_match_keys(), "ctx")) if num_gen_servers > 0: - match_keys.extend(add_list_prefix(gen_config.to_match_keys(), "gen")) + gen_match_keys = add_list_prefix(gen_config.to_match_keys(), "gen") + if disagg_config.benchmark_mode == "gen_only": + gen_match_keys = [ + k + for k in gen_match_keys + if k != "gen_s_cache_transceiver_backend" + ] + match_keys.extend(gen_match_keys) match_keys.extend(client_config.to_match_keys()) else: return diff --git a/tests/integration/test_lists/qa/llm_perf_multinode.txt b/tests/integration/test_lists/qa/llm_perf_multinode.txt index cf9e938c8153..d820a33ccf8e 100644 --- a/tests/integration/test_lists/qa/llm_perf_multinode.txt +++ b/tests/integration/test_lists/qa/llm_perf_multinode.txt @@ -1,171 +1,157 @@ # disagg multi-node -# GB200 + GB300 supported cases -# perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con512_ccb-NIXL] -# perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con1024_ccb-NIXL] -# perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con512_ccb-UCX] 
-# perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con1024_ccb-UCX] -# perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-NIXL] -# perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-UCX] -# perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_con2048_ccb-NIXL] -# perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_con2048_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX] 
-perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con36_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con1_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con2_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con4_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con8_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con16_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con36_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_con1024_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_con1024_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL] 
-perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con2_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con4_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con8_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con16_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con32_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-UCX] 
-perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con2_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con4_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con8_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con16_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con32_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_con1024_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con1_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con2_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con4_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con8_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con16_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con1_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con2_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con4_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con8_ccb-UCX] 
-perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con16_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con2_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con4_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con8_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con16_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con32_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_con1024_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_con1024_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-UCX] - -# GB200 supported cases 
-perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0_con2-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0_con1-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3_con1-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2_con4-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0_con1-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3_con1-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3_con2-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3_con2-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2_con2-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0_con1-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0_con4-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep16_bs8_eplb0_mtp0_con8-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0_con2-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0_con16-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2_con8-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3_con2-Default] 
-perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0_con4-Default] - -# GB300 supported cases -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0_con1-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0_con4-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3_con1-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0_con1-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0_con2-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0_con1-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3_con2-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1_con128-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0_con256-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3_con128-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3_con64-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0_con128-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1_con256-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0_con512-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1_con512-Default] 
-perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3_con128-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0_con256-Default] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3_con256-Default] +# GB200 DeepSeek-R1 +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp2_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp2_ccb-UCX] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-r1-fp4_128k8k_con1_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0_con2-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb0_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX] 
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL] +# GB200 DeepSeek-V32 +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL] 
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX] + +# GB200 GPT-OSS-120B +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL] + +# GB200 Kimi-K2.5-Thinking +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL] 
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] + +# GB200 Qwen3-235B +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_qwen3-235b-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_qwen3-235b-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-UCX] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_qwen3-235b-fp4_8k1k_con1_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL] + +# GB300 DeepSeek-R1 +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-r1-fp4_128k8k_con1_ctx1_pp4_gen1_tep8_eplb0_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-UCX] 
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-r1-fp4_128k8k_con64_ctx1_pp4_gen1_dep16_eplb0_mtp2_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX] + +# GB300 DeepSeek-V32 +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX] 
+perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX] + +# GB300 Kimi-K2.5-Thinking +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] + +# GB300 Qwen3-235B +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_qwen3-235b-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL] # wideep multi-node -# GB200 + GB300 
supported cases -# perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_con512_ccb-NIXL] -# perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_con1024_ccb-NIXL] -# perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_con512_ccb-UCX] -# perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_con1024_ccb-UCX] -# perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-NIXL] -# perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-UCX] -# perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_con2048_ccb-NIXL] -# perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_con2048_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL_kv-reuse] -perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-DEFAULT] 
-perf/test_perf_sanity.py::test_e2e[disagg-e2e-wideep_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-DEFAULT] +# GB200 wideep +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_wideep_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_wideep_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-gen_only-gb200_wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-gen_only-gb200_wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL_kv-reuse] +perf/test_perf_sanity.py::test_e2e[disagg-gen_only-gb200_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-gen_only-gb200_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-UCX] +perf/test_perf_sanity.py::test_e2e[disagg-gen_only-gb200_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-gen_only-gb200_wideep_deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-gen_only-gb200_wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-gen_only-gb200_wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-NIXL] + +# GB300 wideep +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_wideep_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb300_wideep_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-gen_only-gb300_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL] 
+perf/test_perf_sanity.py::test_e2e[disagg-gen_only-gb300_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-UCX] +perf/test_perf_sanity.py::test_e2e[disagg-gen_only-gb300_wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL] + +# external wideep configs perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-UCX] perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-UCX] -perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-DEFAULT] -perf/test_perf_sanity.py::test_e2e[disagg-e2e-wideep_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-DEFAULT] perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-NIXL] perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-NIXL] perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_con16384_ccb-NIXL] perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_con8192_ccb-NIXL] # accuracy cases 
-perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_accuracy-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL] -perf/test_perf_sanity.py::test_e2e[disagg-gen_only-wideep_accuracy-deepseek-r1-fp4_gpqa_diamond_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-gen_only-gb200_wideep_accuracy-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-gen_only-gb200_wideep_accuracy-deepseek-r1-fp4_gpqa_diamond_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL] perf/test_perf_sanity.py::test_e2e[disagg-e2e-wideep_accuracy-kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL] # stress cases -perf/test_perf_sanity.py::test_e2e[disagg-e2e-wideep_stress-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-e2e-gb200_wideep_stress-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL] + +# aggregated multi-node (ctx_only and gen_only reuse disagg config yamls) + +# ctx_only +perf/test_perf_sanity.py::test_e2e[aggr-ctx_only-gb200_qwen3-235b-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[aggr-ctx_only-gb200_qwen3-235b-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[aggr-ctx_only-gb300_qwen3-235b-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL] -# GB200 supported cases -# GB300 supported cases +# gen_only +perf/test_perf_sanity.py::test_e2e[disagg-gen_only-gb200_qwen3-235b-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-gen_only-gb200_qwen3-235b-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL] +perf/test_perf_sanity.py::test_e2e[disagg-gen_only-gb300_qwen3-235b-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL] diff --git a/tests/integration/test_lists/test-db/l0_b200_multi_gpus_perf_sanity.yml 
b/tests/integration/test_lists/test-db/l0_b200_multi_gpus_perf_sanity.yml index ff841be8868c..c026356682ae 100644 --- a/tests/integration/test_lists/test-db/l0_b200_multi_gpus_perf_sanity.yml +++ b/tests/integration/test_lists/test-db/l0_b200_multi_gpus_perf_sanity.yml @@ -43,9 +43,9 @@ l0_b200_multi_gpus_perf_sanity: - perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_blackwell-k25_thinking_fp4_dep8_32k8k] TIMEOUT (90) - perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_k25_thinking_fp4_blackwell-k25_thinking_fp4_tep8_adp_2k1k] # ctx_only tests (disagg config) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-b200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-b200_deepseek-r1-fp4_1k1k_con2048_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-b200_deepseek-r1-fp4_1k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-b200_deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-b200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-b200_deepseek-r1-fp4_8k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-b200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-b200_deepseek-r1-fp4_1k1k_con2048_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-b200_deepseek-r1-fp4_1k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - 
perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-b200_deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-b200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-b200_deepseek-r1-fp4_8k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_b200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu8.yml b/tests/integration/test_lists/test-db/l0_b200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu8.yml index f1d122854018..da5fde3c3f94 100644 --- a/tests/integration/test_lists/test-db/l0_b200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu8.yml +++ b/tests/integration/test_lists/test-db/l0_b200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu8.yml @@ -14,15 +14,15 @@ l0_b200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu8: stage: post_merge backend: pytorch tests: - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-b200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-b200_deepseek-r1-fp4_1k1k_con2048_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-b200_deepseek-r1-fp4_1k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-b200_deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-b200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-b200_deepseek-r1-fp4_8k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (120) - # - 
perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-b200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-b200_deepseek-r1-fp4_1k1k_con2048_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-b200_deepseek-r1-fp4_1k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-b200_deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-b200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-b200_deepseek-r1-fp4_8k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-b200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-b200_deepseek-r1-fp4_1k1k_con2048_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-b200_deepseek-r1-fp4_1k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-b200_deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-b200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-b200_deepseek-r1-fp4_8k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-b200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + # - 
perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-b200_deepseek-r1-fp4_1k1k_con2048_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-b200_deepseek-r1-fp4_1k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-b200_deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-b200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-b200_deepseek-r1-fp4_8k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml index aaee4cfe2b33..29aa69a6c8ad 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml @@ -62,37 +62,38 @@ l0_gb200_multi_gpus_perf_sanity: - perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_grace_blackwell-k25_thinking_fp4_dep4_8k1k] TIMEOUT (90) # ctx_only tests (disagg config) # deepseek-r1-fp4 - # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX] TIMEOUT (120) - # - 
perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) # deepseek-v32-fp4 - # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX] TIMEOUT (120) - - 
perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL] TIMEOUT (120) + - 
perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL] TIMEOUT (120) # gpt-oss-120b-fp4 - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - 
perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) # kimi-k25-thinking-fp4 - # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - 
perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) # qwen3-235b-fp4 - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_qwen3-235b-fp4_8k1k_con1_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu1_gen1_node1_gpu2.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu1_gen1_node1_gpu2.yml index c551a6ce311f..f8e458307a4e 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu1_gen1_node1_gpu2.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu1_gen1_node1_gpu2.yml @@ -14,9 +14,7 @@ l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu1_gen1_node1_gpu2: stage: post_merge backend: pytorch tests: - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - # - 
perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu1_gen1_node1_gpu4.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu1_gen1_node1_gpu4.yml index 6717b08dda45..fa6b4054a1a4 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu1_gen1_node1_gpu4.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu1_gen1_node1_gpu4.yml @@ -14,13 +14,10 @@ l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu1_gen1_node1_gpu4: stage: post_merge backend: pytorch tests: - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - - 
perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_qwen3-235b-fp4_8k1k_con1_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - 
perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu1_gen1_node2_gpu8.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu1_gen1_node2_gpu8.yml index 494530f88091..c1759f5ec4dd 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu1_gen1_node2_gpu8.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu1_gen1_node2_gpu8.yml @@ -14,5 +14,5 @@ l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu1_gen1_node2_gpu8: stage: post_merge backend: pytorch tests: - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu4.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu4.yml index 612377ed287f..1716a1d548ca 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu4.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu4.yml @@ -14,9 
+14,8 @@ l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu4: stage: post_merge backend: pytorch tests: - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node2_gpu8.yml 
b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node2_gpu8.yml index 30d6500886c3..528c185416e1 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node2_gpu8.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node2_gpu8.yml @@ -14,19 +14,12 @@ l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node2_gpu8: stage: post_merge backend: pytorch tests: - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX] TIMEOUT (120) # Failed requests - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - # - 
perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - 
perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) # Failed requests + # - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node4_gpu16.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node4_gpu16.yml index 0e4e68522b1e..f134442dd0ba 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node4_gpu16.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node4_gpu16.yml @@ -14,7 +14,7 @@ l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node4_gpu16: stage: post_merge backend: pytorch tests: - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + - 
perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node8_gpu32.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node8_gpu32.yml index 9260d25ca87f..5955e9b8b35c 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node8_gpu32.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node8_gpu32.yml @@ -14,23 +14,15 @@ l0_gb200_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node8_gpu32: stage: post_merge backend: pytorch tests: - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-DEFAULT] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT 
(120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-DEFAULT] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-UCX] TIMEOUT (120) 
- # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL] TIMEOUT (120) + - 
perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node2_gpu8_gen1_node2_gpu8.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node2_gpu8_gen1_node2_gpu8.yml index 2b13922ab83a..6daea3c4ce62 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node2_gpu8_gen1_node2_gpu8.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node2_gpu8_gen1_node2_gpu8.yml @@ -14,7 +14,5 @@ l0_gb200_multi_nodes_perf_sanity_ctx1_node2_gpu8_gen1_node2_gpu8: stage: post_merge backend: pytorch tests: - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_128k8k_con1_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-r1-fp4_128k8k_con1_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_128k8k_con1_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node2_gpu8_gen1_node4_gpu16.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node2_gpu8_gen1_node4_gpu16.yml index 83727909da7b..fd19669b6d0b 100644 --- 
a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node2_gpu8_gen1_node4_gpu16.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node2_gpu8_gen1_node4_gpu16.yml @@ -14,5 +14,5 @@ l0_gb200_multi_nodes_perf_sanity_ctx1_node2_gpu8_gen1_node4_gpu16: stage: post_merge backend: pytorch tests: - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp2_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp2_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node2_gpu8_gen1_node8_gpu32.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node2_gpu8_gen1_node8_gpu32.yml index 462b4f43c1e7..c6096c4ce8ee 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node2_gpu8_gen1_node8_gpu32.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx1_node2_gpu8_gen1_node8_gpu32.yml @@ -14,5 +14,4 @@ l0_gb200_multi_nodes_perf_sanity_ctx1_node2_gpu8_gen1_node8_gpu32: stage: post_merge backend: pytorch tests: - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120) + - 
perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx2_node1_gpu4_gen1_node4_gpu16.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx2_node1_gpu4_gen1_node4_gpu16.yml index fb3ae40b3ae3..95b1470487e5 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx2_node1_gpu4_gen1_node4_gpu16.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_ctx2_node1_gpu4_gen1_node4_gpu16.yml @@ -14,7 +14,5 @@ l0_gb200_multi_nodes_perf_sanity_ctx2_node1_gpu4_gen1_node4_gpu16: stage: post_merge backend: pytorch tests: - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-NIXL] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_node2_gpu8.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_node2_gpu8.yml index ae3d5665c4ca..501ecb45ebd7 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_node2_gpu8.yml 
+++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity_node2_gpu8.yml @@ -24,6 +24,6 @@ l0_gb200_multi_nodes_perf_sanity_node2_gpu8: - perf/test_perf_sanity.py::test_e2e[aggr_upload-dynamo_deepseek_v32_fp4_2_nodes_grace_blackwell-dsv32_fp4_dep8_trtllm_lpc_mnnvl_8k1k] TIMEOUT (120) # ctx only tests (disagg config) # deepseek-r1-fp4 - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp2_ccb-UCX] TIMEOUT (120) - - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_128k8k_con1_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-UCX] TIMEOUT (120) - # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-UCX] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_128k8k_con1_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-NIXL] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb300_multi_gpus_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_gb300_multi_gpus_perf_sanity.yml new file mode 100644 index 000000000000..a2706e3ba79a --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb300_multi_gpus_perf_sanity.yml @@ -0,0 +1,28 @@ +version: 0.0.1 +l0_gb300_multi_gpus_perf_sanity: +- condition: + ranges: + system_gpu_count: + gte: 4 + lte: 4 + wildcards: + gpu: + - '*gb300*' + linux_distribution_name: ubuntu* + cpu: aarch64 + terms: + stage: post_merge + backend: pytorch + tests: + # ctx_only tests (disagg config) + # deepseek-r1-fp4 + - 
perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb300_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb300_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + # kimi-k25-thinking-fp4 + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb300_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb300_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + # deepseek-v32-fp4 + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb300_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb300_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL] TIMEOUT (120) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb300_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu4.yml b/tests/integration/test_lists/test-db/l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu4.yml new file mode 100644 index 000000000000..1a73d8776e93 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu4.yml @@ -0,0 +1,22 @@ +version: 0.0.1 +l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node1_gpu4: +- condition: + ranges: + # 1 ctx worker with each 1 node and 4 GPUs + # 1 gen worker with each 1 node and 4 GPUs + system_gpu_count: + gte: 8 + lte: 8 + wildcards: + gpu: + - '*gb300*' + terms: + stage: post_merge + backend: 
pytorch + tests: + # deepseek-r1-fp4 + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb300_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + # deepseek-v32-fp4 + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb300_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb300_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb300_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node2_gpu8.yml b/tests/integration/test_lists/test-db/l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node2_gpu8.yml new file mode 100644 index 000000000000..cafd273a44b7 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node2_gpu8.yml @@ -0,0 +1,22 @@ +version: 0.0.1 +l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node2_gpu8: +- condition: + ranges: + # 1 ctx worker with each 1 node and 4 GPUs + # 1 gen worker with each 2 node and 8 GPUs + system_gpu_count: + gte: 12 + lte: 12 + wildcards: + gpu: + - '*gb300*' + terms: + stage: post_merge + backend: pytorch + tests: + # deepseek-r1-fp4 128k8k + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + # kimi-k25-thinking-fp4 1k1k + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb300_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + # - 
perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb300_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node4_gpu16.yml b/tests/integration/test_lists/test-db/l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node4_gpu16.yml new file mode 100644 index 000000000000..7f31f3b4ae25 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node4_gpu16.yml @@ -0,0 +1,22 @@ +version: 0.0.1 +l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node4_gpu16: +- condition: + ranges: + # 1 ctx worker with each 1 node and 4 GPUs + # 1 gen worker with each 4 node and 16 GPUs + system_gpu_count: + gte: 20 + lte: 20 + wildcards: + gpu: + - '*gb300*' + terms: + stage: post_merge + backend: pytorch + tests: + # deepseek-r1-fp4 8k1k + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb300_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + # kimi-k25-thinking-fp4 8k1k + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb300_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb300_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb300_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node8_gpu32.yml b/tests/integration/test_lists/test-db/l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node8_gpu32.yml new file mode 100644 index 000000000000..ae8ebf170361 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node8_gpu32.yml @@ -0,0 
+1,22 @@ +version: 0.0.1 +l0_gb300_multi_nodes_perf_sanity_ctx1_node1_gpu4_gen1_node8_gpu32: +- condition: + ranges: + # 1 ctx worker with 1 node and 4 GPUs + # 1 gen worker with 8 nodes and 32 GPUs + system_gpu_count: + gte: 36 + lte: 36 + wildcards: + gpu: + - '*gb300*' + terms: + stage: post_merge + backend: pytorch + tests: + # deepseek-v32-fp4 8k1k + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb300_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL] TIMEOUT (120) + # deepseek-v32-fp4 32k4k + - perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb300_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb300_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL] TIMEOUT (120) + # - perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb300_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL] TIMEOUT (120) diff --git a/tests/integration/test_lists/test-db/l0_gb300_multi_nodes_perf_sanity_node2_gpu8.yml b/tests/integration/test_lists/test-db/l0_gb300_multi_nodes_perf_sanity_node2_gpu8.yml new file mode 100644 index 000000000000..61a822d5b29c --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb300_multi_nodes_perf_sanity_node2_gpu8.yml @@ -0,0 +1,20 @@ +version: 0.0.1 +l0_gb300_multi_nodes_perf_sanity_node2_gpu8: +- condition: + ranges: + # 2 nodes, each with 4 GPUs + system_gpu_count: + gte: 8 + lte: 8 + wildcards: + gpu: + - '*gb300*' + terms: + stage: post_merge + backend: pytorch + tests: + # aggr config + # deepseek-r1-fp4 + - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_dep8_mtp1_1k1k] TIMEOUT (90) + - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_dep8_mtp1_8k1k] TIMEOUT (90) + -
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_tep8_mtp3] TIMEOUT (90) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 90d82c4650a2..f90d6e54931c 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -283,7 +283,10 @@ visual_gen/test_visual_gen_benchmark.py::test_offline_benchmark SKIP (https://nv accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6050489) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6050489) disaggregated/test_disaggregated.py::test_disaggregated_overlap_gen_first[ctx_pp1-TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6057459) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX] SKIP (https://nvbugs/6085022) +perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL] SKIP (https://nvbugs/6085022) +perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL] SKIP (https://nvbugs/6085022) +perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL] SKIP (https://nvbugs/6085022) +perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL] SKIP (https://nvbugs/6085022) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[v1_kv_cache-True-True-triton-auto] SKIP (https://nvbugs/6026676) 
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[4gpus] SKIP (https://nvbugs/6069790) accuracy/test_llm_api_pytorch.py::TestGLM4_5Air::test_nvfp4_2_model_mtp[2model_trtllm] SKIP (https://nvbugs/5981293) @@ -328,7 +331,8 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughp accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6084824) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding_4gpus[attention_dp=True-mtp_nextn=0] SKIP (https://nvbugs/6084447) accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp] SKIP (https://nvbugs/6084568) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/6088149) +perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL] SKIP (https://nvbugs/6088149) +perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-NIXL] SKIP (https://nvbugs/6088149) accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8 SKIP (https://nvbugs/6070857) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=FLASHINFER-torch_compile=False] SKIP (https://nvbugs/6094071) accuracy/test_llm_api_pytorch_ray.py::TestLlama3_1_8BInstruct::test_pp2_ray SKIP (https://nvbugs/6094070) @@ -410,8 +414,12 @@ full:H20/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[t accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_bf16 SKIP (https://nvbugs/6069543) 
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_grace_blackwell-v32_fp4_tep4_mtp3_8k1k] SKIP (https://nvbugs/6110326) perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_v32_fp4_grace_blackwell-v32_fp4_tep4_mtp3_1k1k] SKIP (https://nvbugs/6110326) -perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/6110326) -perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX] SKIP (https://nvbugs/6110326) +perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL] SKIP (https://nvbugs/6110326) +perf/test_perf_sanity.py::test_e2e[disagg_upload-gen_only-gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL] SKIP (https://nvbugs/6110326) +perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL] SKIP (https://nvbugs/6110326) +perf/test_perf_sanity.py::test_e2e[disagg_upload-e2e-gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL] SKIP (https://nvbugs/6110326) +perf/test_perf_sanity.py::test_e2e[aggr_upload-ctx_only-gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL] SKIP (https://nvbugs/6110326) +perf/test_perf_sanity.py::test_e2e[aggr_upload-k25_thinking_fp4_2_nodes_grace_blackwell-k25_thinking_fp4_dep8_32k8k] SKIP (https://nvbugs/6110326) unittest/_torch/visual_gen/multi_gpu/test_ulysses_sage_attention.py::TestSageUlyssesAttention::test_sage_ulysses_forward[False] SKIP (https://nvbugs/6111076) unittest/_torch/visual_gen/multi_gpu/test_ulysses_sage_attention.py::TestSageUlyssesAttention::test_sage_ulysses_forward[True] SKIP (https://nvbugs/6111076) 
unittest/_torch/visual_gen/multi_gpu/test_ulysses_sage_attention.py::TestSageUlyssesAttention::test_sage_ulysses_vs_reference[False-1] SKIP (https://nvbugs/6111076) diff --git a/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index df7f341f7b40..94e8e0e4b4a2 100644 --- a/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -65,7 +65,7 @@ worker_config: backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -92,6 +92,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con2048_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con2048_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con2048_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con2048_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml index 0b2867475c24..ac5c36f16665 100644 --- 
a/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con2048_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con2048_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml @@ -67,7 +67,7 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -94,6 +94,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp3_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp3_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp3_ccb-NIXL.yaml index d78acf4be157..27ce86b321ef 100644 --- a/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_1k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp3_ccb-NIXL.yaml @@ -67,7 +67,7 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -94,6 +94,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git 
a/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml index 3736a01be558..f986eeda6983 100644 --- a/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con1536_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml @@ -67,7 +67,7 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -94,6 +94,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index a5c30735c80c..2b39be1b676d 100644 --- a/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml +++ 
b/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -65,7 +65,7 @@ worker_config: backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -92,6 +92,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml index 7e2e14961855..c2f518f836c0 100644 --- a/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/b200_deepseek-r1-fp4_8k1k_con256_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml @@ -67,7 +67,7 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -94,6 +94,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml 
b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml new file mode 100644 index 000000000000..12916108ff24 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml @@ -0,0 +1,98 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '128' + input_length: 131072 + output_length: 8192 + dataset_file: datasets/perf-ci/deepseek_r1-128k8k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 8 + max_num_tokens: 32 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + 
backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 8 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con1_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con1_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con1_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con1_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index 285013491e52..eafb048e68b9 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con1_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con1_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -65,7 +65,7 @@ worker_config: backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 131104 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -92,6 +92,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 131104 - backend: 
UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml index 50a59ad1a996..0117cf4d0b5a 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -67,7 +67,7 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 131104 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -93,6 +93,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 131104 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-DEFAULT.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml similarity index 97% rename from tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-DEFAULT.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml index f751ba6f9eb3..5f9722f34ff2 100644 --- 
a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-DEFAULT.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml @@ -79,10 +79,10 @@ worker_config: free_gpu_memory_fraction: 0.7 dtype: fp8 moe_config: - backend: WIDEEP + backend: CUTEDSL cache_transceiver_config: max_tokens_in_buffer: 4608 - backend: DEFAULT + backend: NIXL stream_interval: 20 ctx: print_iter_log: true @@ -102,4 +102,4 @@ worker_config: dtype: fp8 cache_transceiver_config: max_tokens_in_buffer: 4608 - backend: DEFAULT + backend: NIXL diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml index 144eb1adf065..aa31b144efbc 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -67,7 +67,7 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -94,6 +94,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git 
a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml similarity index 96% rename from tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml index 1e24af5f6c49..20706a01c585 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml @@ -18,9 +18,9 @@ slurm: numa_bind: true benchmark: mode: e2e - use_nv_sa_benchmark: true + use_nv_sa_benchmark: false multi_round: 8 - benchmark_ratio: 0.8 + benchmark_ratio: 0.0 streaming: true concurrency_list: '1024' input_length: 1024 @@ -67,7 +67,7 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL stream_interval: 100 num_postprocess_workers: 4 ctx: @@ -91,4 +91,4 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index 7fb18c0f8f71..ce7d3b5be45d 100644 --- 
a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -65,7 +65,7 @@ worker_config: backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -93,6 +93,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-NIXL.yaml index a188f164f95b..3370d1faa8cb 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-NIXL.yaml @@ -70,7 +70,7 @@ worker_config: layer_updates_per_iter: 1 cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -97,6 +97,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git 
a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml new file mode 100644 index 000000000000..43f0535d8383 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '3072' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 768 + max_num_tokens: 1536 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 
768 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml index 41efce28454b..8c08fcde53d2 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -67,7 +67,7 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true 
speculative_config: &id001 decoding_type: MTP @@ -94,6 +94,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index 397b88b39469..60bec288c4a2 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -65,7 +65,7 @@ worker_config: backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -93,6 +93,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml new file mode 100644 index 000000000000..c31910ec28f1 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: 
deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4096' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 256 + max_num_tokens: 512 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 
16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml index 4179f8d9edb2..fcfbe0616235 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml @@ -71,7 +71,7 @@ worker_config: layer_updates_per_iter: 1 cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -105,6 +105,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml 
b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index 6011e0560e51..2c3b2b19759d 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -66,7 +66,7 @@ worker_config: backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -101,6 +101,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml new file mode 100644 index 000000000000..bf9ba21f0835 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml @@ -0,0 +1,107 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 
+ benchmark_ratio: 0.0 + streaming: true + concurrency_list: '2048' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 512 + max_num_tokens: 1024 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + 
dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index faaf881af078..2a91f3633c0d 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -69,7 +69,7 @@ worker_config: backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 32832 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -106,6 +106,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 32832 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL.yaml new file mode 100644 index 000000000000..930876aa4403 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL.yaml @@ -0,0 +1,114 @@ +metadata: + model_name: 
deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 32k4k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '2048' + input_length: 32768 + output_length: 4096 + dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 64 + max_num_tokens: 128 + max_input_len: 32784 + max_seq_len: 40960 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 32832 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + 
num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 32784 + max_input_len: 32784 + max_seq_len: 32832 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 32832 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml index bdb9d1ded5ef..99002c9f2558 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -70,7 +70,7 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 32832 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -106,6 +106,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 32832 - backend: UCX + 
backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml index c54a25675b93..9158df38b979 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml @@ -67,11 +67,11 @@ worker_config: dtype: fp8 tokens_per_block: 64 moe_config: - backend: WIDEEP + backend: CUTEDSL use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 120000 - backend: UCX + backend: NIXL num_postprocess_workers: 8 stream_interval: 10 ctx: @@ -98,4 +98,4 @@ worker_config: backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 120000 - backend: UCX + backend: NIXL diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml index 26f3078eb126..e40afbe44267 100644 --- 
a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml @@ -71,7 +71,7 @@ worker_config: layer_updates_per_iter: 1 cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -105,6 +105,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index 0c8289465e6e..0a7b3a052e37 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -66,7 +66,7 @@ worker_config: backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -101,6 +101,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git 
a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..877516abbe54 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL.yaml @@ -0,0 +1,106 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4096' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 128 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + 
max_batch_size: 128 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index 2742f8eeebbb..23f9239f77d7 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -65,7 +65,7 @@ worker_config: backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL 
disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -93,6 +93,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml index 98411eea80e6..f378906e9b0e 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml @@ -67,7 +67,7 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -94,6 +94,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_1k1k_con512_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_1k1k_con512_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_1k1k_con512_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml rename to 
tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_1k1k_con512_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml index bf35904af644..696147466387 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_1k1k_con512_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_1k1k_con512_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -67,7 +67,7 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -94,6 +94,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_8k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_8k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_8k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_8k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml index 2ede9a3e2209..1e55e8fe0bc7 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_8k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_8k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml @@ -67,7 +67,7 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -94,6 +94,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX 
+ backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index 22ee060e180b..021265456d6d 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -65,7 +65,7 @@ worker_config: backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -93,6 +93,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_8k1k_con512_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_8k1k_con512_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_8k1k_con512_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_8k1k_con512_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml index 619394b4932c..b6fba51707bf 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_8k1k_con512_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml +++ 
b/tests/scripts/perf-sanity/disaggregated/gb200_glm-5-fp4_8k1k_con512_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -67,7 +67,7 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -94,6 +94,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..6d2ef88af19e --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,96 @@ +metadata: + model_name: gpt_oss_120b_fp4 + precision: fp4 + model_dir_name: GPT-OSS-120B-FP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '2048' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/gpt_oss_120b-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 
+profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1536 + max_num_tokens: 20000 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + cuda_graph_config: + enable_padding: true + max_batch_size: 1536 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 1024 + backend: NIXL + disable_overlap_scheduler: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 20000 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 1024 + backend: NIXL + disable_overlap_scheduler: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml index 3cdb95f39179..3a3747731f78 100644 --- 
a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml @@ -69,7 +69,7 @@ worker_config: backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 1024 - backend: UCX + backend: NIXL disable_overlap_scheduler: true num_postprocess_workers: 4 stream_interval: 20 @@ -92,5 +92,5 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 1024 - backend: UCX + backend: NIXL disable_overlap_scheduler: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml index b0894bc54572..ca773d4b268d 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml @@ -65,7 +65,7 @@ worker_config: backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 1024 - backend: UCX + backend: NIXL disable_overlap_scheduler: true num_postprocess_workers: 4 stream_interval: 20 @@ -88,5 +88,5 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 1024 - backend: UCX + backend: NIXL disable_overlap_scheduler: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml 
b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..6417a0c38c8d --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,106 @@ +metadata: + model_name: gpt_oss_120b_fp4 + precision: fp4 + model_dir_name: GPT-OSS-120B-FP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k + # Mirror of the Dynamo GPT-OSS-120B TRT-LLM disagg deployment recipe: + # https://github.com/ai-dynamo/dynamo/tree/main/recipes/gpt-oss-120b/trtllm/disagg +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/gpt_oss_120b-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + # Env vars taken from the Dynamo deploy.yaml (OVERRIDE_QUANT_ALGO selects + # W4A8_MXFP4_MXFP8, which is the distinguishing feature of this recipe vs. + # existing gpt-oss perf-sanity configs; PDL / no-parallel-weight-load / + # disabled NCCL graph register / disabled UCC collective are preserved so + # the same load & runtime paths are exercised). 
+ worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes OVERRIDE_QUANT_ALGO=W4A8_MXFP4_MXFP8 TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL=True NCCL_GRAPH_REGISTER=0 OMPI_MCA_coll_ucc_enable=0" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + print_iter_log: true + tensor_parallel_size: 4 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + max_batch_size: 1280 + max_num_tokens: 20000 + max_seq_len: 11000 + trust_remote_code: true + allreduce_strategy: AUTO + attention_dp_config: + enable_balance: true + cuda_graph_config: + enable_padding: true + max_batch_size: 1280 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 9216 + backend: NIXL + disable_overlap_scheduler: false + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + max_batch_size: 64 + max_num_tokens: 20000 + max_seq_len: 9000 + trust_remote_code: true + cuda_graph_config: + enable_padding: true + max_batch_size: 30 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 9216 + backend: NIXL + disable_overlap_scheduler: true + num_postprocess_workers: 4 + stream_interval: 20 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml similarity index 98% rename from 
tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml index 6cbe2b02d0b3..0765412f2e62 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml @@ -65,7 +65,7 @@ worker_config: backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 8448 - backend: UCX + backend: NIXL disable_overlap_scheduler: true num_postprocess_workers: 4 stream_interval: 20 @@ -88,5 +88,5 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 8448 - backend: UCX + backend: NIXL disable_overlap_scheduler: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml index 9d7a5dd02ed9..bb027c657d2c 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml @@ -65,7 +65,7 @@ worker_config: backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 8448 - backend: UCX + backend: NIXL disable_overlap_scheduler: true num_postprocess_workers: 4 stream_interval: 20 @@ -88,5 +88,5 @@ worker_config: 
use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 8448 - backend: UCX + backend: NIXL disable_overlap_scheduler: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml index be873c25903c..3a1d7fdbeffd 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml @@ -69,7 +69,7 @@ worker_config: backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 8448 - backend: UCX + backend: NIXL disable_overlap_scheduler: true num_postprocess_workers: 4 stream_interval: 20 @@ -92,5 +92,5 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 8448 - backend: UCX + backend: NIXL disable_overlap_scheduler: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml index 4e3eb972d195..8eb7293b9a87 100644 --- 
a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml @@ -67,7 +67,7 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true trust_remote_code: true num_postprocess_workers: 4 @@ -92,6 +92,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true trust_remote_code: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..b03708e45ad6 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,97 @@ +metadata: + model_name: k25_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2.5-NVFP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4096' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 
TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 512 + max_num_tokens: 512 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + trust_remote_code: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 8192 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + trust_remote_code: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml similarity index 98% rename from 
tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml index d3fd2febaa92..3ee92df5b4a2 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml @@ -65,7 +65,7 @@ worker_config: backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true trust_remote_code: true num_postprocess_workers: 4 @@ -90,6 +90,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true trust_remote_code: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml index d4ab81988a60..00028abf8eee 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -67,7 +67,7 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: 
NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: Eagle @@ -97,7 +97,7 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 trust_remote_code: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..4b7f537cb9a3 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,97 @@ +metadata: + model_name: k25_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2.5-NVFP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4096' + input_length: 8192 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + 
max_batch_size: 256 + max_num_tokens: 256 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + trust_remote_code: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 8768 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + trust_remote_code: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index b19ce6dfe52b..cef09d7dd066 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml +++ 
b/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -65,7 +65,7 @@ worker_config: backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: Eagle @@ -96,7 +96,7 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 trust_remote_code: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..83aefbff6137 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,94 @@ +metadata: + model_name: qwen3_235b_a22b_fp4 + precision: fp4 + model_dir_name: Qwen3-235B-A22B-FP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/qwen3_235b-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: 
TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 128 + max_num_tokens: 128 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 32768 + backend: NIXL + disable_overlap_scheduler: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 4 + max_num_tokens: 32768 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 32768 + backend: NIXL + disable_overlap_scheduler: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..dde9fb64b2d3 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,93 @@ +metadata: + model_name: qwen3_235b_a22b_fp4 + precision: fp4 + model_dir_name: Qwen3-235B-A22B-FP4 + supported_gpus: + - 
GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/qwen3_235b-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 1 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 32768 + backend: NIXL + disable_overlap_scheduler: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 4 + max_num_tokens: 32768 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + 
moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 32768 + backend: NIXL + disable_overlap_scheduler: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml index 355ca59c970b..6a74a4fcfc36 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml @@ -65,7 +65,7 @@ worker_config: backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 32768 - backend: UCX + backend: NIXL disable_overlap_scheduler: true num_postprocess_workers: 4 stream_interval: 20 @@ -88,5 +88,5 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 32768 - backend: UCX + backend: NIXL disable_overlap_scheduler: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_128k8k_con1_ctx1_pp4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_128k8k_con1_ctx1_pp4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..99440337a369 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_128k8k_con1_ctx1_pp4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,98 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: 
disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 131072 + output_length: 8192 + dataset_file: datasets/perf-ci/deepseek_r1-128k8k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 4 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 4 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + 
kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml new file mode 100644 index 000000000000..686fb268738a --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '256' + input_length: 131072 + output_length: 8192 + dataset_file: datasets/perf-ci/deepseek_r1-128k8k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 
+worker_config: + gen: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 64 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 4 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_128k8k_con64_ctx1_pp4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_128k8k_con64_ctx1_pp4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml new file mode 100644 index 000000000000..1d6d0bdfcae8 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_128k8k_con64_ctx1_pp4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: 
+ job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '64' + input_length: 131072 + output_length: 8192 + dataset_file: datasets/perf-ci/deepseek_r1-128k8k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 4 + max_num_tokens: 16 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 4 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + 
free_gpu_memory_fraction: 0.3 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml rename to tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml index 3e4e1d0d7b0d..2cdb9a868988 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -67,7 +67,7 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -93,6 +93,6 @@ worker_config: backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..9a3f636173c3 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,98 @@ +metadata: + model_name: 
deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 128 + max_num_tokens: 512 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + 
moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml new file mode 100644 index 000000000000..50f19df4ec25 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '3072' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: 
local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 768 + max_num_tokens: 1536 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 768 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..780b4d71e1a0 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + 
model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + 
moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..6f3c36cda444 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,98 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: 
local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 128 + max_num_tokens: 512 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml new file mode 100644 index 000000000000..802e1777d732 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + 
supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4096' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 256 + max_num_tokens: 512 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 
+ context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..669718d13894 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml @@ -0,0 +1,110 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1024' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + 
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..bd6d75d283a1 --- /dev/null +++ 
b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,106 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 512 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + 
num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml new file mode 100644 index 000000000000..73c0910392bb --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml @@ -0,0 +1,107 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '2048' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' 
+ build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 512 + max_num_tokens: 1024 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git 
a/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..0eab593e0908 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,111 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 32k4k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 32768 + output_length: 4096 + dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 8 + max_input_len: 32784 + max_seq_len: 40960 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + cuda_graph_config: + 
enable_padding: true + max_batch_size: 1 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.25 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 32832 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 32784 + max_input_len: 32784 + max_seq_len: 32832 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 32832 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL.yaml new file mode 100644 index 000000000000..fea3b1a1cfc5 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL.yaml @@ -0,0 +1,114 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 32k4k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 
+ numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '2048' + input_length: 32768 + output_length: 4096 + dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 64 + max_num_tokens: 128 + max_input_len: 32784 + max_seq_len: 40960 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 32832 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 32784 + max_input_len: 32784 + max_seq_len: 32832 + tensor_parallel_size: 4 + 
moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 32832 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..609c0c43d6d2 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,111 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 32k4k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '256' + input_length: 32768 + output_length: 4096 + dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + 
enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 8 + max_num_tokens: 64 + max_input_len: 32784 + max_seq_len: 40960 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 32832 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 32784 + max_input_len: 32784 + max_seq_len: 32832 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 32832 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 
index 000000000000..fef69dbf9866 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,101 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-Exp-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 32k4k + # Mirror of the Dynamo disagg-kv-router DSv3.2 FP4 recipe: + # https://github.com/ai-dynamo/dynamo/tree/main/recipes/deepseek-v32-fp4/trtllm/disagg-kv-router +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '256' + input_length: 32768 + output_length: 4096 + dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + # Env vars taken from the Dynamo deploy.yaml (NCCL MNNVL/CUMEM/NVLS, UCX MNNVL + # IPC, MoE all-to-all without all-gather, PDL, GC disable) so the perf test + # exercises the same code paths as the production deployment. 
+ worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes NCCL_MNNVL_ENABLE=1 NCCL_CUMEM_ENABLE=1 NCCL_NVLS_ENABLE=1 NVIDIA_GDRCOPY=1 UCX_CUDA_IPC_ENABLE_MNNVL=1 NCCL_GRAPH_MIXING_SUPPORT=0 TRTLLM_MOE_ENABLE_ALLTOALL_WITHOUT_ALLGATHER=1 TRTLLM_FORCE_COMM_METHOD=NVLINK_TWO_SIDED ENABLE_CONFIGURABLE_MOE=1" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + print_iter_log: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + max_batch_size: 8 + max_num_tokens: 8192 + max_seq_len: 121000 + allreduce_strategy: MNNVL + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 120000 + backend: NIXL + num_postprocess_workers: 8 + stream_interval: 10 + ctx: + print_iter_log: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + max_batch_size: 32 + max_num_tokens: 8192 + max_seq_len: 121000 + enable_chunked_prefill: true + disable_overlap_scheduler: true + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + enable_block_reuse: true + free_gpu_memory_fraction: 0.9 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 120000 + backend: NIXL diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml 
b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..5d4325064c64 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml @@ -0,0 +1,110 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + tokens_per_block: 64 + 
moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..16d743d373d8 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,106 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: 
'1' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 512 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + 
cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..ab0c5dd4949a --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb300_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL.yaml @@ -0,0 +1,106 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4096' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 128 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 
1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..2b50cd55d8a7 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,97 @@ +metadata: + model_name: k25_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2.5-NVFP4 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: 
unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '2048' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 64 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + trust_remote_code: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 8192 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + 
cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + trust_remote_code: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..640d363ba39c --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,97 @@ +metadata: + model_name: k25_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2.5-NVFP4 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4096' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 512 + max_num_tokens: 512 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + 
enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + trust_remote_code: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 8192 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + trust_remote_code: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..40c2b2f3b6d7 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,95 @@ +metadata: + model_name: k25_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2.5-NVFP4 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4' + input_length: 1024 + 
output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 4 + max_num_tokens: 128 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + trust_remote_code: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 8192 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + trust_remote_code: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml 
b/tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..a8ef83c55353 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,103 @@ +metadata: + model_name: k25_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2.5-NVFP4 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + 
cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: Eagle + max_draft_len: 3 + eagle3_one_model: true + speculative_model: Kimi-K2.5-Thinking-Eagle3 + trust_remote_code: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 8768 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 + trust_remote_code: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..31a59ef73017 --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,97 @@ +metadata: + model_name: k25_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2.5-NVFP4 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4096' + input_length: 8192 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 
+ num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 256 + max_num_tokens: 256 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + trust_remote_code: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 8768 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + trust_remote_code: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml 
b/tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..90f3bcbf737b --- /dev/null +++ b/tests/scripts/perf-sanity/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,102 @@ +metadata: + model_name: k25_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2.5-NVFP4 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4' + input_length: 8192 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 4 + max_num_tokens: 128 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: 
true + speculative_config: &id001 + decoding_type: Eagle + max_draft_len: 3 + eagle3_one_model: true + speculative_model: Kimi-K2.5-Thinking-Eagle3 + trust_remote_code: true + num_postprocess_workers: 4 + stream_interval: 20 + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 8768 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 + trust_remote_code: true diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con1024_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con1024_ccb-NIXL.yaml deleted file mode 100644 index aef1a1cd8c52..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con1024_ccb-NIXL.yaml +++ /dev/null @@ -1,105 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp4 - precision: fp4 - model_dir_name: Qwen3-235B-A22B-FP4 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1024' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - 
trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 16 - moe_expert_parallel_size: 16 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 256 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: WIDEEP - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con1024_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con1024_ccb-UCX.yaml deleted file mode 100644 index 2c4d11208df7..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con1024_ccb-UCX.yaml +++ /dev/null @@ -1,105 +0,0 @@ -metadata: - model_name: 
qwen3_235b_a22b_fp4 - precision: fp4 - model_dir_name: Qwen3-235B-A22B-FP4 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1024' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 16 - moe_expert_parallel_size: 16 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 256 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: WIDEEP - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: 
false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con512_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con512_ccb-NIXL.yaml deleted file mode 100644 index 87939ffce62e..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con512_ccb-NIXL.yaml +++ /dev/null @@ -1,105 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp4 - precision: fp4 - model_dir_name: Qwen3-235B-A22B-FP4 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '512' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 16 - moe_expert_parallel_size: 16 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 256 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 
1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: WIDEEP - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con512_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con512_ccb-UCX.yaml deleted file mode 100644 index 1be6cd5d8399..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_con512_ccb-UCX.yaml +++ /dev/null @@ -1,105 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp4 - precision: fp4 - model_dir_name: Qwen3-235B-A22B-FP4 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '512' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - 
trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 16 - moe_expert_parallel_size: 16 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 256 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: WIDEEP - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-NIXL.yaml deleted file mode 100644 index 255efb49512d..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-NIXL.yaml +++ /dev/null @@ -1,105 +0,0 @@ -metadata: - model_name: 
qwen3_235b_a22b_fp4 - precision: fp4 - model_dir_name: Qwen3-235B-A22B-FP4 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '512' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 32 - moe_expert_parallel_size: 32 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 16 - max_num_tokens: 64 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.6 - dtype: fp8 - moe_config: - backend: WIDEEP - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false 
- free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-UCX.yaml deleted file mode 100644 index e675219be991..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-UCX.yaml +++ /dev/null @@ -1,105 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp4 - precision: fp4 - model_dir_name: Qwen3-235B-A22B-FP4 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '512' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 32 - moe_expert_parallel_size: 32 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 16 - max_num_tokens: 64 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 
2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.6 - dtype: fp8 - moe_config: - backend: WIDEEP - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml deleted file mode 100644 index 16f9cdbfa56c..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp4 - precision: fp4 - model_dir_name: Qwen3-235B-A22B-FP4 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '16' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - 
build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml deleted file mode 100644 index 41598d88f684..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp4 - precision: fp4 - model_dir_name: Qwen3-235B-A22B-FP4 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - 
benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '16' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX diff --git 
a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml deleted file mode 100644 index 6cd7320e367b..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp4 - precision: fp4 - model_dir_name: Qwen3-235B-A22B-FP4 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - 
backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml deleted file mode 100644 index a12241eec623..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp4 - precision: fp4 - model_dir_name: Qwen3-235B-A22B-FP4 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '2' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - 
tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml deleted file mode 100644 index 3a4b226b8a54..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp4 - precision: fp4 - model_dir_name: Qwen3-235B-A22B-FP4 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '2' - input_length: 
1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml deleted file mode 100644 index 2688bc7fb2c2..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml +++ 
/dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp4 - precision: fp4 - model_dir_name: Qwen3-235B-A22B-FP4 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '32' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - 
enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml deleted file mode 100644 index c2cf88e83fc4..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp4 - precision: fp4 - model_dir_name: Qwen3-235B-A22B-FP4 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '32' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - 
enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml deleted file mode 100644 index 9f1ccb70faa9..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp4 - precision: fp4 - model_dir_name: Qwen3-235B-A22B-FP4 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '4' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 
ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml deleted file mode 100644 index 1a5bb960bdb7..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp4 - precision: fp4 - model_dir_name: Qwen3-235B-A22B-FP4 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - 
numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '4' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml deleted file 
mode 100644 index 78cde1a8cf0c..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp4 - precision: fp4 - model_dir_name: Qwen3-235B-A22B-FP4 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '8' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - 
enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml deleted file mode 100644 index 9a857429126b..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp4 - precision: fp4 - model_dir_name: Qwen3-235B-A22B-FP4 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '8' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - 
batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml deleted file mode 100644 index 4526cc4dd51e..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml +++ /dev/null @@ -1,90 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp8 - precision: fp8 - model_dir_name: Qwen3-235B-A22B-FP8 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '16' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: 
false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 2048 - max_seq_len: 2051 - cuda_graph_config: - enable_padding: true - max_batch_size: 128 - print_iter_log: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 2048 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - disable_overlap_scheduler: false - ctx: - max_batch_size: 32 - max_num_tokens: 2048 - max_seq_len: 2051 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 2048 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml deleted file mode 100644 index fa3c3da43612..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml +++ /dev/null @@ -1,90 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp8 - precision: fp8 - model_dir_name: Qwen3-235B-A22B-FP8 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - 
script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '16' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 2048 - max_seq_len: 2051 - cuda_graph_config: - enable_padding: true - max_batch_size: 128 - print_iter_log: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 2048 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - disable_overlap_scheduler: false - ctx: - max_batch_size: 32 - max_num_tokens: 2048 - max_seq_len: 2051 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 2048 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml 
b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml deleted file mode 100644 index c301e3fba76e..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml +++ /dev/null @@ -1,90 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp8 - precision: fp8 - model_dir_name: Qwen3-235B-A22B-FP8 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 2048 - max_seq_len: 2051 - cuda_graph_config: - enable_padding: true - max_batch_size: 128 - print_iter_log: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 2048 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - disable_overlap_scheduler: false - ctx: - max_batch_size: 32 - max_num_tokens: 2048 - max_seq_len: 2051 - 
tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 2048 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml deleted file mode 100644 index e1e660432a3d..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml +++ /dev/null @@ -1,90 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp8 - precision: fp8 - model_dir_name: Qwen3-235B-A22B-FP8 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 64 - 
max_num_tokens: 2048 - max_seq_len: 2051 - cuda_graph_config: - enable_padding: true - max_batch_size: 128 - print_iter_log: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 2048 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - disable_overlap_scheduler: false - ctx: - max_batch_size: 32 - max_num_tokens: 2048 - max_seq_len: 2051 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 2048 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml deleted file mode 100644 index 106ff6e8c0a1..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml +++ /dev/null @@ -1,90 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp8 - precision: fp8 - model_dir_name: Qwen3-235B-A22B-FP8 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '2' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - 
container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 2048 - max_seq_len: 2051 - cuda_graph_config: - enable_padding: true - max_batch_size: 128 - print_iter_log: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 2048 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - disable_overlap_scheduler: false - ctx: - max_batch_size: 32 - max_num_tokens: 2048 - max_seq_len: 2051 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 2048 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml deleted file mode 100644 index 22972782d2de..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml +++ /dev/null @@ -1,90 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp8 - precision: fp8 - model_dir_name: Qwen3-235B-A22B-FP8 - supported_gpus: - - GB200 - - GB300 - 
script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '2' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 2048 - max_seq_len: 2051 - cuda_graph_config: - enable_padding: true - max_batch_size: 128 - print_iter_log: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 2048 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - disable_overlap_scheduler: false - ctx: - max_batch_size: 32 - max_num_tokens: 2048 - max_seq_len: 2051 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 2048 - backend: UCX diff --git 
a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con36_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con36_ccb-NIXL.yaml deleted file mode 100644 index f8f454f1f8ec..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con36_ccb-NIXL.yaml +++ /dev/null @@ -1,90 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp8 - precision: fp8 - model_dir_name: Qwen3-235B-A22B-FP8 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '36' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 2048 - max_seq_len: 2051 - cuda_graph_config: - enable_padding: true - max_batch_size: 128 - print_iter_log: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 2048 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: 
MNNVL - disable_overlap_scheduler: false - ctx: - max_batch_size: 32 - max_num_tokens: 2048 - max_seq_len: 2051 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 2048 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con36_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con36_ccb-UCX.yaml deleted file mode 100644 index 5819abaf3ec1..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con36_ccb-UCX.yaml +++ /dev/null @@ -1,90 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp8 - precision: fp8 - model_dir_name: Qwen3-235B-A22B-FP8 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '36' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 
4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 2048 - max_seq_len: 2051 - cuda_graph_config: - enable_padding: true - max_batch_size: 128 - print_iter_log: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 2048 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - disable_overlap_scheduler: false - ctx: - max_batch_size: 32 - max_num_tokens: 2048 - max_seq_len: 2051 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 2048 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml deleted file mode 100644 index 7beae38501ff..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml +++ /dev/null @@ -1,90 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp8 - precision: fp8 - model_dir_name: Qwen3-235B-A22B-FP8 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '4' - input_length: 1024 - output_length: 1024 - dataset_file: 
-hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 2048 - max_seq_len: 2051 - cuda_graph_config: - enable_padding: true - max_batch_size: 128 - print_iter_log: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 2048 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - disable_overlap_scheduler: false - ctx: - max_batch_size: 32 - max_num_tokens: 2048 - max_seq_len: 2051 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 2048 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml deleted file mode 100644 index 52ed89a6724e..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml +++ /dev/null @@ -1,90 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp8 
- precision: fp8 - model_dir_name: Qwen3-235B-A22B-FP8 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '4' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 2048 - max_seq_len: 2051 - cuda_graph_config: - enable_padding: true - max_batch_size: 128 - print_iter_log: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 2048 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - disable_overlap_scheduler: false - ctx: - max_batch_size: 32 - max_num_tokens: 2048 - max_seq_len: 2051 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - 
max_tokens_in_buffer: 2048 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml deleted file mode 100644 index 9b43900bcfc1..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml +++ /dev/null @@ -1,90 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp8 - precision: fp8 - model_dir_name: Qwen3-235B-A22B-FP8 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '8' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 2048 - max_seq_len: 2051 - cuda_graph_config: - enable_padding: true - max_batch_size: 128 - print_iter_log: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 2048 - backend: NIXL - stream_interval: 20 - 
num_postprocess_workers: 4 - allreduce_strategy: MNNVL - disable_overlap_scheduler: false - ctx: - max_batch_size: 32 - max_num_tokens: 2048 - max_seq_len: 2051 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 2048 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml deleted file mode 100644 index ec6e8c741333..000000000000 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml +++ /dev/null @@ -1,90 +0,0 @@ -metadata: - model_name: qwen3_235b_a22b_fp8 - precision: fp8 - model_dir_name: Qwen3-235B-A22B-FP8 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '8' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false 
-worker_config: - gen: - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 2048 - max_seq_len: 2051 - cuda_graph_config: - enable_padding: true - max_batch_size: 128 - print_iter_log: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 2048 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - disable_overlap_scheduler: false - ctx: - max_batch_size: 32 - max_num_tokens: 2048 - max_seq_len: 2051 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: true - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 2048 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0_con1-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0_con1-Default.yaml deleted file mode 100644 index 9653c39df1f6..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen13_tep4_bs1_eplb0_mtp0_con1-Default.yaml +++ /dev/null @@ -1,98 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1' - 
input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 13 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 1 - max_num_tokens: 128 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 4 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0_con4-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0_con4-Default.yaml deleted file mode 100644 index c2a5a7d5a287..000000000000 --- 
a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen5_tep4_bs4_eplb0_mtp0_con4-Default.yaml +++ /dev/null @@ -1,98 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '4' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 5 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 4 - max_num_tokens: 128 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 4 - print_iter_log: true 
- cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3_con1-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3_con1-Default.yaml deleted file mode 100644 index 7e7b4b69a3b5..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen6_tep8_bs1_eplb0_mtp3_con1-Default.yaml +++ /dev/null @@ -1,104 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 6 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 1 - max_num_tokens: 4 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 
16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 4 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0_con1-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0_con1-Default.yaml deleted file mode 100644 index 80743c8fae0a..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen7_tep8_bs1_eplb0_mtp0_con1-Default.yaml +++ /dev/null @@ -1,98 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 
1 - num_gen_servers: 7 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 1 - max_num_tokens: 128 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 4 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0_con1-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0_con1-Default.yaml deleted file mode 100644 index e025e72a16ed..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep8_bs1_eplb0_mtp0_con1-Default.yaml +++ /dev/null @@ -1,98 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 
- precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 8 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 1 - max_num_tokens: 128 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 4 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - 
max_tokens_in_buffer: 131104 - backend: DEFAULT - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0_con2-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0_con2-Default.yaml deleted file mode 100644 index 456929f4b8ac..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen11_tep4_bs2_eplb0_mtp0_con2-Default.yaml +++ /dev/null @@ -1,98 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '2' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 11 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 2 - max_num_tokens: 128 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - 
backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 8 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0_con1-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0_con1-Default.yaml deleted file mode 100644 index 88f5c7c9e65b..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen14_tep4_bs1_eplb0_mtp0_con1-Default.yaml +++ /dev/null @@ -1,98 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 14 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - 
server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 1 - max_num_tokens: 128 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 8 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3_con1-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3_con1-Default.yaml deleted file mode 100644 index 1cc2b23136b0..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep16_bs1_eplb0_mtp3_con1-Default.yaml +++ /dev/null @@ -1,101 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - 
numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 16 - moe_expert_parallel_size: 16 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 1 - max_num_tokens: 4 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: &id001 - decoding_type: MTP - num_nextn_predict_layers: 3 - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 8 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - speculative_config: *id001 - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2_con4-Default.yaml 
b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2_con4-Default.yaml deleted file mode 100644 index c8ac64526c98..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_dep8_bs4_eplb0_mtp2_con4-Default.yaml +++ /dev/null @@ -1,101 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '4' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 4 - max_num_tokens: 12 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: &id001 - decoding_type: MTP - 
num_nextn_predict_layers: 2 - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 8 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - speculative_config: *id001 - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0_con1-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0_con1-Default.yaml deleted file mode 100644 index 374374d3aa3f..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp0_con1-Default.yaml +++ /dev/null @@ -1,98 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - 
gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 1 - max_num_tokens: 128 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 8 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3_con1-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3_con1-Default.yaml deleted file mode 100644 index 5cac67c003b9..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs1_eplb0_mtp3_con1-Default.yaml +++ /dev/null @@ -1,102 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - 
concurrency_list: '1' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 1 - max_num_tokens: 4 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - speculative_config: &id001 - decoding_type: MTP - num_nextn_predict_layers: 3 - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 8 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - speculative_config: *id001 - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3_con2-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3_con2-Default.yaml deleted file 
mode 100644 index 2ce9353c6efb..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen1_tep8_bs2_eplb0_mtp3_con2-Default.yaml +++ /dev/null @@ -1,102 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '2' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 2 - max_num_tokens: 8 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - speculative_config: &id001 - decoding_type: MTP - num_nextn_predict_layers: 3 - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - 
tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 8 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - speculative_config: *id001 - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3_con2-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3_con2-Default.yaml deleted file mode 100644 index 7aa1ba59116c..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen5_tep8_bs2_eplb0_mtp3_con2-Default.yaml +++ /dev/null @@ -1,102 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '2' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 5 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - 
pipeline_parallel_size: 1 - max_batch_size: 2 - max_num_tokens: 8 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - speculative_config: &id001 - decoding_type: MTP - num_nextn_predict_layers: 3 - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 8 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - speculative_config: *id001 - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2_con2-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2_con2-Default.yaml deleted file mode 100644 index 2f8164b4c3b1..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep4_bs2_eplb0_mtp2_con2-Default.yaml +++ /dev/null @@ -1,102 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: 
true - concurrency_list: '2' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 7 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 2 - max_num_tokens: 6 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - speculative_config: &id001 - decoding_type: MTP - num_nextn_predict_layers: 2 - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 8 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - speculative_config: *id001 - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0_con1-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0_con1-Default.yaml 
deleted file mode 100644 index 36daae740bc1..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen7_tep8_bs1_eplb0_mtp0_con1-Default.yaml +++ /dev/null @@ -1,98 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 7 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 1 - max_num_tokens: 128 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - 
enable_attention_dp: false - pipeline_parallel_size: 8 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0_con4-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0_con4-Default.yaml deleted file mode 100644 index 1e65c75ea942..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp8_gen8_tep4_bs4_eplb0_mtp0_con4-Default.yaml +++ /dev/null @@ -1,98 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '4' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 8 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 4 - max_num_tokens: 128 - max_seq_len: 139296 - 
cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 8 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3_con2-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3_con2-Default.yaml deleted file mode 100644 index dd67225e7831..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp4_gen7_tep8_bs2_eplb0_mtp3_con2-Default.yaml +++ /dev/null @@ -1,104 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '2' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 2 - num_gen_servers: 7 -environment: - container_mount: - 
container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 2 - max_num_tokens: 8 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 4 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep16_bs8_eplb0_mtp0_con8-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep16_bs8_eplb0_mtp0_con8-Default.yaml deleted file mode 100644 index 8d5b7bf31309..000000000000 --- 
a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep16_bs8_eplb0_mtp0_con8-Default.yaml +++ /dev/null @@ -1,97 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '8' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 2 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 16 - moe_expert_parallel_size: 16 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 8 - max_num_tokens: 128 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 8 - print_iter_log: true - cuda_graph_config: null 
- disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0_con2-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0_con2-Default.yaml deleted file mode 100644 index 567a6eadf440..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx2_pp8_gen1_dep32_bs2_eplb0_mtp0_con2-Default.yaml +++ /dev/null @@ -1,97 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '2' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 2 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 32 - moe_expert_parallel_size: 32 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 2 - max_num_tokens: 128 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 
128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 8 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1_con128-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1_con128-Default.yaml deleted file mode 100644 index ab39577367ed..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp4_gen1_dep8_bs16_eplb0_mtp1_con128-Default.yaml +++ /dev/null @@ -1,103 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '128' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 3 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO 
TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 16 - max_num_tokens: 32 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 1 - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 4 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 1 - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0_con16-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0_con16-Default.yaml deleted file mode 100644 index 5ad97f542f29..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs16_eplb0_mtp0_con16-Default.yaml +++ /dev/null @@ -1,97 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: 
DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '16' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 3 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 16 - moe_expert_parallel_size: 16 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 16 - max_num_tokens: 128 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 8 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - moe_config: - 
backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2_con8-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2_con8-Default.yaml deleted file mode 100644 index 10b8cfed99eb..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep16_bs8_eplb0_mtp2_con8-Default.yaml +++ /dev/null @@ -1,101 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '8' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 3 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 16 - moe_expert_parallel_size: 16 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 8 - max_num_tokens: 24 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 
131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: &id001 - decoding_type: MTP - num_nextn_predict_layers: 2 - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 8 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - speculative_config: *id001 - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3_con2-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3_con2-Default.yaml deleted file mode 100644 index cbade5120619..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs2_eplb0_mtp3_con2-Default.yaml +++ /dev/null @@ -1,101 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '2' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 3 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - 
server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 32 - moe_expert_parallel_size: 32 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 2 - max_num_tokens: 8 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: &id001 - decoding_type: MTP - num_nextn_predict_layers: 3 - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 8 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - speculative_config: *id001 - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0_con4-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0_con4-Default.yaml deleted file mode 100644 index 3af38a83d219..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx3_pp8_gen1_dep32_bs4_eplb0_mtp0_con4-Default.yaml +++ /dev/null @@ -1,97 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - 
job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '4' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 3 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 32 - moe_expert_parallel_size: 32 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 4 - max_num_tokens: 128 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 8 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0_con256-Default.yaml 
b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0_con256-Default.yaml deleted file mode 100644 index e4d8dc7346c5..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs16_eplb0_mtp0_con256-Default.yaml +++ /dev/null @@ -1,97 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '256' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 5 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 16 - moe_expert_parallel_size: 16 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 16 - max_num_tokens: 128 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - 
max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 4 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3_con128-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3_con128-Default.yaml deleted file mode 100644 index 8d75cbb5c2a9..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep16_bs8_eplb0_mtp3_con128-Default.yaml +++ /dev/null @@ -1,103 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '128' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 5 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 16 - moe_expert_parallel_size: 16 - enable_attention_dp: true - 
pipeline_parallel_size: 1 - max_batch_size: 8 - max_num_tokens: 32 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 4 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3_con64-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3_con64-Default.yaml deleted file mode 100644 index aeb4581a3b95..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs2_eplb0_mtp3_con64-Default.yaml +++ /dev/null @@ -1,103 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 
0.8 - streaming: true - concurrency_list: '64' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 5 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 32 - moe_expert_parallel_size: 32 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 2 - max_num_tokens: 8 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 4 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0_con128-Default.yaml 
b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0_con128-Default.yaml deleted file mode 100644 index 2ffde0adda0a..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx5_pp4_gen1_dep32_bs4_eplb0_mtp0_con128-Default.yaml +++ /dev/null @@ -1,97 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '128' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 5 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 32 - moe_expert_parallel_size: 32 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 4 - max_num_tokens: 128 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - 
max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 4 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1_con256-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1_con256-Default.yaml deleted file mode 100644 index 67e28bdcc347..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs16_eplb0_mtp1_con256-Default.yaml +++ /dev/null @@ -1,103 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '256' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 7 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 16 - moe_expert_parallel_size: 16 - enable_attention_dp: true - 
pipeline_parallel_size: 1 - max_batch_size: 16 - max_num_tokens: 32 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 1 - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 4 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 1 - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0_con512-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0_con512-Default.yaml deleted file mode 100644 index 58fa55138cd5..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx7_pp4_gen1_dep16_bs32_eplb0_mtp0_con512-Default.yaml +++ /dev/null @@ -1,97 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - 
benchmark_ratio: 0.8 - streaming: true - concurrency_list: '512' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 7 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 16 - moe_expert_parallel_size: 16 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 4 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1_con512-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1_con512-Default.yaml deleted file mode 100644 index fc316de34aa5..000000000000 --- 
a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep16_bs32_eplb0_mtp1_con512-Default.yaml +++ /dev/null @@ -1,103 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '512' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 8 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 16 - moe_expert_parallel_size: 16 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 64 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 1 - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: 
false - pipeline_parallel_size: 4 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 1 - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3_con128-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3_con128-Default.yaml deleted file mode 100644 index 7857a0b22339..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs4_eplb0_mtp3_con128-Default.yaml +++ /dev/null @@ -1,103 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '128' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 8 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 32 - moe_expert_parallel_size: 32 - enable_attention_dp: true - pipeline_parallel_size: 1 - 
max_batch_size: 4 - max_num_tokens: 16 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 4 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0_con256-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0_con256-Default.yaml deleted file mode 100644 index 161ef82e7c40..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp0_con256-Default.yaml +++ /dev/null @@ -1,97 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - 
concurrency_list: '256' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 8 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 32 - moe_expert_parallel_size: 32 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 8 - max_num_tokens: 128 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false - pipeline_parallel_size: 4 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3_con256-Default.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3_con256-Default.yaml deleted file mode 100644 index 42f6c6b9bca8..000000000000 --- 
a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx8_pp4_gen1_dep32_bs8_eplb0_mtp3_con256-Default.yaml +++ /dev/null @@ -1,103 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 128k8k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '256' - input_length: 131072 - output_length: 8192 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 8 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 32 - moe_expert_parallel_size: 32 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 8 - max_num_tokens: 32 - max_seq_len: 139296 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - ctx: - max_batch_size: 1 - max_num_tokens: 131104 - max_seq_len: 131104 - tensor_parallel_size: 1 - moe_expert_parallel_size: 1 - enable_attention_dp: false 
- pipeline_parallel_size: 4 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.4 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 131104 - backend: DEFAULT - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - moe_config: - backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_con1024_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_con1024_ccb-UCX.yaml deleted file mode 100644 index f46e66f455ec..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_con1024_ccb-UCX.yaml +++ /dev/null @@ -1,99 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: "--gres=gpu:4" - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1024' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes MIMALLOC_PURGE_DELAY=0" - server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 32 - moe_expert_parallel_size: 32 - enable_attention_dp: true - pipeline_parallel_size: 1 - 
max_batch_size: 32 - max_num_tokens: 32 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: WIDEEP - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml deleted file mode 100644 index 7b0b8b627f26..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '16' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - 
container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml deleted file mode 100644 index ed1783847a3d..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - 
supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '16' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX diff 
--git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml deleted file mode 100644 index ba1e40738ce7..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - 
backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml deleted file mode 100644 index 13d3d5515089..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - 
tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml deleted file mode 100644 index 90ef56d11751..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '2' - input_length: 
1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml deleted file mode 100644 index 3dcbecb8fdcc..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml +++ /dev/null @@ -1,100 
+0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '2' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - 
free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml deleted file mode 100644 index 1947ac703aed..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '32' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - 
free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml deleted file mode 100644 index de7b1291bb46..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '32' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - 
server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml deleted file mode 100644 index a46d642607aa..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - 
mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '4' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml deleted file mode 100644 index 
d01e1fac9f7d..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '4' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - 
pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml deleted file mode 100644 index bc4618fe923d..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '8' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 
8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml deleted file mode 100644 index 4bda05c3487e..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '8' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: 
"TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con16_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con16_ccb-NIXL.yaml deleted file mode 100644 index 9829e43592ad..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con16_ccb-NIXL.yaml +++ /dev/null @@ -1,106 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: 
disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '16' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - speculative_config: - decoding_type: MTP - 
num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con16_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con16_ccb-UCX.yaml deleted file mode 100644 index f42cbd4138d3..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con16_ccb-UCX.yaml +++ /dev/null @@ -1,106 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '16' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - 
cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-UCX.yaml deleted file mode 100644 index ae464650ee1f..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-UCX.yaml +++ /dev/null @@ -1,106 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 
MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con2_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con2_ccb-NIXL.yaml deleted file mode 100644 index 6689708c5bbe..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con2_ccb-NIXL.yaml +++ /dev/null @@ -1,106 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: 
disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '2' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 
4608 - backend: NIXL - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con2_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con2_ccb-UCX.yaml deleted file mode 100644 index e147acb03f5c..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con2_ccb-UCX.yaml +++ /dev/null @@ -1,106 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '2' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: 
fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con32_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con32_ccb-NIXL.yaml deleted file mode 100644 index 069fef04d023..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con32_ccb-NIXL.yaml +++ /dev/null @@ -1,106 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '32' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO 
TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con32_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con32_ccb-UCX.yaml deleted file mode 100644 index d36db50a2bab..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con32_ccb-UCX.yaml +++ /dev/null @@ -1,106 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - 
supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '32' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - 
cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con4_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con4_ccb-NIXL.yaml deleted file mode 100644 index faef25215337..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con4_ccb-NIXL.yaml +++ /dev/null @@ -1,106 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '4' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - 
enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con4_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con4_ccb-UCX.yaml deleted file mode 100644 index 8cd81db3c117..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con4_ccb-UCX.yaml +++ /dev/null @@ -1,106 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '4' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - 
work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con8_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con8_ccb-NIXL.yaml deleted file mode 100644 index f818041c76c0..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con8_ccb-NIXL.yaml +++ /dev/null @@ -1,106 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 
- model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '8' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - 
free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: NIXL - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con8_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con8_ccb-UCX.yaml deleted file mode 100644 index ed8490bf114e..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con8_ccb-UCX.yaml +++ /dev/null @@ -1,106 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '8' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 4 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 128 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - 
print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-UCX.yaml deleted file mode 100644 index 3c9278738e4c..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-UCX.yaml +++ /dev/null @@ -1,105 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: "--gres=gpu:4" - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '2048' - input_length: 1024 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 2 - num_gen_servers: 1 -environment: - container_mount: - container_image: - 
model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes MIMALLOC_PURGE_DELAY=0" - server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 16 - moe_expert_parallel_size: 16 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 128 - max_num_tokens: 512 - max_seq_len: 2251 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.9 - dtype: fp8 - moe_config: - backend: WIDEEP - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 4608 - backend: UCX - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_con1024_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_con1024_ccb-UCX.yaml deleted file mode 100644 index 2afc97ccceb4..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_con1024_ccb-UCX.yaml +++ /dev/null @@ -1,119 +0,0 @@ -metadata: - model_name: 
deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k - config_index: -1 -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: "--gres=gpu:4" - numa_bind: true -benchmark: - mode: gen_only - use_nv_sa_benchmark: true - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1024' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes MIMALLOC_PURGE_DELAY=0" - server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 32 - moe_expert_parallel_size: 32 - context_parallel_size: 1 - enable_attention_dp: true - enable_lm_head_tp_in_adp: true - pipeline_parallel_size: 1 - max_batch_size: 128 - max_num_tokens: 512 - max_seq_len: 9256 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 24 - - 32 - - 40 - - 48 - - 56 - - 64 - - 72 - - 80 - - 88 - - 96 - - 104 - - 112 - - 120 - - 128 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - moe_config: - backend: CUTEDSL - use_low_precision_moe_combine: true - load_balancer: - num_slots: 288 - layer_updates_per_iter: 1 - cache_transceiver_config: - max_tokens_in_buffer: 16384 - backend: UCX - stream_interval: 100 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - ctx: - max_batch_size: 2 - max_num_tokens: 16896 - max_seq_len: 9256 
- tensor_parallel_size: 4 - context_parallel_size: 1 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 16384 - backend: UCX - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con16_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con16_ccb-NIXL.yaml deleted file mode 100644 index 34baacf0729f..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con16_ccb-NIXL.yaml +++ /dev/null @@ -1,106 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '16' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 3 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - 
enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 16 - max_num_tokens: 64 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con16_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con16_ccb-UCX.yaml deleted file mode 100644 index b1258b3ff451..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con16_ccb-UCX.yaml +++ /dev/null @@ -1,106 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - 
multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '16' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 3 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 16 - max_num_tokens: 64 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con1_ccb-NIXL.yaml 
b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con1_ccb-NIXL.yaml deleted file mode 100644 index ed47d2c1c7c7..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con1_ccb-NIXL.yaml +++ /dev/null @@ -1,106 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 3 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 16 - max_num_tokens: 64 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - 
num_nextn_predict_layers: 3 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con1_ccb-UCX.yaml deleted file mode 100644 index 0ab9b2e25dce..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con1_ccb-UCX.yaml +++ /dev/null @@ -1,106 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 3 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: 
false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 16 - max_num_tokens: 64 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con2_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con2_ccb-NIXL.yaml deleted file mode 100644 index cf6db3500f06..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con2_ccb-NIXL.yaml +++ /dev/null @@ -1,106 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: 
--gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '2' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 3 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 16 - max_num_tokens: 64 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git 
a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con2_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con2_ccb-UCX.yaml deleted file mode 100644 index 0c413f332660..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con2_ccb-UCX.yaml +++ /dev/null @@ -1,106 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '2' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 3 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 16 - max_num_tokens: 64 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: 
UCX - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con4_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con4_ccb-NIXL.yaml deleted file mode 100644 index 8e6cbb5e484d..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con4_ccb-NIXL.yaml +++ /dev/null @@ -1,106 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '4' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 3 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - 
server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 16 - max_num_tokens: 64 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con4_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con4_ccb-UCX.yaml deleted file mode 100644 index 49fb3db339f0..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con4_ccb-UCX.yaml +++ /dev/null @@ -1,106 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: 
disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '4' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 3 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 16 - max_num_tokens: 64 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX - speculative_config: - decoding_type: MTP - 
num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con8_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con8_ccb-NIXL.yaml deleted file mode 100644 index fcc652a4b1a3..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con8_ccb-NIXL.yaml +++ /dev/null @@ -1,106 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '8' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 3 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 16 - max_num_tokens: 64 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: 
- max_tokens_in_buffer: 8448 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con8_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con8_ccb-UCX.yaml deleted file mode 100644 index 68005a14da7e..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_con8_ccb-UCX.yaml +++ /dev/null @@ -1,106 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '8' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 3 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 
TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 16 - max_num_tokens: 64 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml deleted file mode 100644 index 18e10cbdb5ad..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con16_ccb-NIXL.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - 
benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '16' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 3 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 32 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL diff --git 
a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml deleted file mode 100644 index 239e1c608895..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con16_ccb-UCX.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '16' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 3 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 32 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 8448 - 
backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml deleted file mode 100644 index a5b87b237c4a..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-UCX.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 3 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - 
tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 32 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml deleted file mode 100644 index 29e92e4b48c9..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con2_ccb-NIXL.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '2' - input_length: 8192 
- output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 3 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 32 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml deleted file mode 100644 index 6f773ca92606..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con2_ccb-UCX.yaml +++ /dev/null @@ -1,100 +0,0 
@@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '2' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 3 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 32 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - 
free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml deleted file mode 100644 index a7d9ec5458ae..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con32_ccb-NIXL.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '32' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 3 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 32 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - 
free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml deleted file mode 100644 index 193e2d19d3f0..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con32_ccb-UCX.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '32' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 3 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - 
server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 32 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml deleted file mode 100644 index e26e9a057d54..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con4_ccb-NIXL.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - 
mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '4' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 3 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 32 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml deleted file mode 100644 index 
770803ce8ab3..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con4_ccb-UCX.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '4' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 3 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 32 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - 
pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml deleted file mode 100644 index 4ba41f92cd31..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con8_ccb-NIXL.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '8' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 3 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 32 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 
- - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml deleted file mode 100644 index 033dfdf378fc..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con8_ccb-UCX.yaml +++ /dev/null @@ -1,100 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '8' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 1 - num_gen_servers: 3 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: 
"TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - enable_attention_dp: false - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 32 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - allreduce_strategy: MNNVL - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_con1024_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_con1024_ccb-NIXL.yaml deleted file mode 100644 index a13f4dca63eb..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_con1024_ccb-NIXL.yaml +++ /dev/null @@ -1,99 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: 
disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: "--gres=gpu:4" - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1024' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 6 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes MIMALLOC_PURGE_DELAY=0" - server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 16 - moe_expert_parallel_size: 16 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 64 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: WIDEEP - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_con1024_ccb-UCX.yaml 
b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_con1024_ccb-UCX.yaml deleted file mode 100644 index 1b36fa91a381..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_con1024_ccb-UCX.yaml +++ /dev/null @@ -1,99 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: "--gres=gpu:4" - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1024' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 6 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes MIMALLOC_PURGE_DELAY=0" - server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 16 - moe_expert_parallel_size: 16 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 64 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: WIDEEP - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - ctx: - max_batch_size: 1 - 
max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-NIXL.yaml deleted file mode 100644 index 29fa7e3eb95c..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-NIXL.yaml +++ /dev/null @@ -1,105 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: "--gres=gpu:4" - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '512' - input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 8 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes MIMALLOC_PURGE_DELAY=0" - server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 32 - moe_expert_parallel_size: 32 - enable_attention_dp: true - pipeline_parallel_size: 1 - 
max_batch_size: 16 - max_num_tokens: 64 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: WIDEEP - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-UCX.yaml deleted file mode 100644 index e9aa9d4ebb6b..000000000000 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_con512_ccb-UCX.yaml +++ /dev/null @@ -1,105 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: "--gres=gpu:4" - numa_bind: true -benchmark: - mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '512' - 
input_length: 8192 - output_length: 1024 - dataset_file: -hardware: - gpus_per_node: 4 - num_ctx_servers: 8 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes MIMALLOC_PURGE_DELAY=0" - server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 32 - moe_expert_parallel_size: 32 - enable_attention_dp: true - pipeline_parallel_size: 1 - max_batch_size: 16 - max_num_tokens: 64 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: WIDEEP - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml new file mode 100644 index 
000000000000..12916108ff24 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml @@ -0,0 +1,98 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '128' + input_length: 131072 + output_length: 8192 + dataset_file: datasets/perf-ci/deepseek_r1-128k8k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 8 + max_num_tokens: 32 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: NIXL + disable_overlap_scheduler: 
true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 8 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp2_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp2_ccb-UCX.yaml rename to tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml index 7f4889a7b4a3..28f8296df512 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp2_ccb-UCX.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con128_ctx1_pp8_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml @@ -71,7 +71,7 @@ worker_config: disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP - num_nextn_predict_layers: 2 + num_nextn_predict_layers: 1 num_postprocess_workers: 4 stream_interval: 20 ctx: diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con1_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con1_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 
000000000000..eafb048e68b9 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con1_ctx1_pp8_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,97 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 131072 + output_length: 8192 + dataset_file: datasets/perf-ci/deepseek_r1-128k8k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 4 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 
3 + num_postprocess_workers: 4 + stream_interval: 20 + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 8 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..0117cf4d0b5a --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_con64_ctx1_pp8_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,98 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '64' + input_length: 131072 + output_length: 8192 + dataset_file: datasets/perf-ci/deepseek_r1-128k8k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 
ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 8 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 2 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 8 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0_con2-Default.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0_con2-NIXL.yaml similarity index 95% rename from tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0_con2-Default.yaml rename to 
tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0_con2-NIXL.yaml index b622b893d7a8..eb3f24492539 100644 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0_con2-Default.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_128k8k_ctx1_pp4_gen8_tep4_bs2_eplb0_mtp0_con2-NIXL.yaml @@ -3,7 +3,7 @@ metadata: precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - - GB300 + - GB200 script_file: disaggr_torch.slurm benchmark_type: 128k8k slurm: @@ -16,9 +16,9 @@ slurm: numa_bind: true benchmark: mode: e2e - use_nv_sa_benchmark: true + use_nv_sa_benchmark: false multi_round: 1 - benchmark_ratio: 0.8 + benchmark_ratio: 0.0 streaming: true concurrency_list: '2' input_length: 131072 @@ -72,7 +72,7 @@ worker_config: backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 131104 - backend: DEFAULT + backend: NIXL stream_interval: 20 num_postprocess_workers: 4 allreduce_strategy: MNNVL @@ -93,6 +93,6 @@ worker_config: dtype: fp8 cache_transceiver_config: max_tokens_in_buffer: 131104 - backend: DEFAULT + backend: NIXL moe_config: backend: TRTLLM diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_con1024_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml similarity index 73% rename from tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_con1024_ccb-NIXL.yaml rename to tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml index 2ce6a987e6f6..5f9722f34ff2 100644 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_con1024_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml @@ -4,9 +4,10 @@ metadata: model_dir_name: 
DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k + # Mirror of the Dynamo wide-EP DSR1 FP4 GB200 deployment recipe: + # https://github.com/ai-dynamo/dynamo/blob/main/recipes/deepseek-r1/trtllm/disagg/wide_ep/gb200/deploy.yaml slurm: script_file: disaggr_torch.slurm partition: @@ -17,14 +18,14 @@ slurm: numa_bind: true benchmark: mode: e2e - use_nv_sa_benchmark: true - multi_round: 8 - benchmark_ratio: 0.8 + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 streaming: true concurrency_list: '1024' input_length: 1024 output_length: 1024 - dataset_file: + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json hardware: gpus_per_node: 4 num_ctx_servers: 1 @@ -35,9 +36,11 @@ environment: model_path: trtllm_repo: '' build_wheel: false - trtllm_wheel_path: '' work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes MIMALLOC_PURGE_DELAY=0" + # Env vars taken from the Dynamo deploy.yaml (NCCL_MNNVL/CUMEM, MoE all-to-all + # without all-gather, PDL) so that the perf test exercises the same code paths + # as the production deployment. 
+ worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes NCCL_MNNVL_ENABLE=1 NCCL_CUMEM_ENABLE=1 TRTLLM_MOE_ENABLE_ALLTOALL_WITHOUT_ALLGATHER=1" server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" profiling: nsys_on: false @@ -45,10 +48,12 @@ accuracy: enable_accuracy_test: false worker_config: gen: + print_iter_log: true tensor_parallel_size: 32 moe_expert_parallel_size: 32 - enable_attention_dp: true pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true max_batch_size: 32 max_num_tokens: 32 max_seq_len: 2251 @@ -64,31 +69,31 @@ worker_config: - 64 - 128 - 256 + - 384 - 512 - 768 - 1024 - 2048 - print_iter_log: true kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.9 + free_gpu_memory_fraction: 0.7 dtype: fp8 moe_config: - backend: WIDEEP + backend: CUTEDSL cache_transceiver_config: max_tokens_in_buffer: 4608 backend: NIXL stream_interval: 20 - num_postprocess_workers: 4 ctx: - max_batch_size: 4 - max_num_tokens: 4608 - max_seq_len: 2251 + print_iter_log: true tensor_parallel_size: 4 moe_expert_parallel_size: 4 - enable_attention_dp: true pipeline_parallel_size: 1 - print_iter_log: true + context_parallel_size: 1 + enable_attention_dp: true + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 1227 cuda_graph_config: null disable_overlap_scheduler: true kv_cache_config: diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..aa31b144efbc --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 
+ script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1024' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 
1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..20706a01c585 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,94 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k + config_index: -1 +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 8 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1024' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 
8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + context_parallel_size: 1 + max_batch_size: 768 + max_num_tokens: 768 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + max_batch_size: 768 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + stream_interval: 100 + num_postprocess_workers: 4 + ctx: + max_batch_size: 16 + max_num_tokens: 16896 + max_seq_len: 2044 + tensor_parallel_size: 4 + context_parallel_size: 1 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..ce7d3b5be45d --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,98 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + 
concurrency_list: '1' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 128 + max_num_tokens: 512 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: 
*id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb0_mtp3_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb0_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb0_mtp3_ccb-UCX.yaml rename to tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb0_mtp3_ccb-NIXL.yaml index 6fae08024fc1..73dae2e6d765 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb0_mtp3_ccb-NIXL.yaml @@ -67,7 +67,7 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: MTP @@ -94,6 +94,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..3370d1faa8cb --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con2048_ctx2_dep4_gen1_dep16_eplb288_mtp3_ccb-NIXL.yaml @@ -0,0 +1,102 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 
02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '2048' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 2 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 768 + max_num_tokens: 1536 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 768 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + 
cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml new file mode 100644 index 000000000000..43f0535d8383 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '3072' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 
+worker_config: + gen: + print_iter_log: true + max_batch_size: 768 + max_num_tokens: 1536 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 768 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml similarity index 100% rename from tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml rename to tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-NIXL.yaml 
b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-NIXL.yaml similarity index 97% rename from tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-NIXL.yaml rename to tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-NIXL.yaml index 9d75b2608e96..58ff5712bf75 100644 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_con1_ccb-NIXL.yaml @@ -4,7 +4,6 @@ metadata: model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k slurm: @@ -17,9 +16,9 @@ slurm: numa_bind: true benchmark: mode: e2e - use_nv_sa_benchmark: true + use_nv_sa_benchmark: false multi_round: 8 - benchmark_ratio: 0.8 + benchmark_ratio: 0.0 streaming: true concurrency_list: '1' input_length: 1024 diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml similarity index 87% rename from tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml rename to tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml index 3cd56a5e25ae..ab47b8c1ef94 100644 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml @@ -4,7 +4,6 @@ metadata: model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k slurm: @@ -13,13 +12,13 @@ 
slurm: account: job_time: 02:00:00 job_name: unified-benchmark - extra_args: "--gres=gpu:4" + extra_args: --gres=gpu:4 numa_bind: true benchmark: mode: e2e - use_nv_sa_benchmark: true + use_nv_sa_benchmark: false multi_round: 8 - benchmark_ratio: 0.8 + benchmark_ratio: 0.0 streaming: true concurrency_list: '2048' input_length: 1024 @@ -36,8 +35,9 @@ environment: trtllm_repo: '' build_wheel: false work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes MIMALLOC_PURGE_DELAY=0" - server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + MIMALLOC_PURGE_DELAY=0 + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 profiling: nsys_on: false accuracy: @@ -73,7 +73,7 @@ worker_config: free_gpu_memory_fraction: 0.9 dtype: fp8 moe_config: - backend: WIDEEP + backend: CUTEDSL cache_transceiver_config: max_tokens_in_buffer: 4608 backend: NIXL diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..8c08fcde53d2 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1024' + 
input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + 
speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..60bec288c4a2 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,98 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 128 + max_num_tokens: 512 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + kv_cache_config: + 
enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml new file mode 100644 index 000000000000..c31910ec28f1 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4096' + input_length: 8192 + output_length: 1024 + dataset_file: 
datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 256 + max_num_tokens: 512 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git 
a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml similarity index 100% rename from tests/scripts/perf-sanity/disaggregated/gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml rename to tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml diff --git a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml similarity index 97% rename from tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml rename to tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml index 1116c0051979..baf99fb99dcf 100644 --- a/tests/scripts/perf/disaggregated/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml @@ -4,7 +4,6 @@ metadata: model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - - GB300 script_file: disaggr_torch.slurm benchmark_type: 8k1k slurm: @@ -17,9 +16,9 @@ slurm: numa_bind: true benchmark: mode: e2e - use_nv_sa_benchmark: true + use_nv_sa_benchmark: false multi_round: 8 - benchmark_ratio: 0.8 + benchmark_ratio: 0.0 streaming: true concurrency_list: '1' input_length: 8192 diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..fcfbe0616235 --- /dev/null +++ 
b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml @@ -0,0 +1,110 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1024' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL 
+ disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..2c3b2b19759d --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,106 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + 
container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 512 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git 
a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml new file mode 100644 index 000000000000..bf9ba21f0835 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml @@ -0,0 +1,107 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '2048' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 512 + max_num_tokens: 1024 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + 
enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml similarity index 100% rename from tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml rename to tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..2a91f3633c0d --- /dev/null +++ 
b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,111 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 32k4k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 32768 + output_length: 4096 + dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 8 + max_input_len: 32784 + max_seq_len: 40960 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.25 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 32832 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + 
decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 32784 + max_input_len: 32784 + max_seq_len: 32832 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 32832 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL.yaml new file mode 100644 index 000000000000..930876aa4403 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL.yaml @@ -0,0 +1,114 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 32k4k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '2048' + input_length: 32768 + output_length: 4096 + dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + 
num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 64 + max_num_tokens: 128 + max_input_len: 32784 + max_seq_len: 40960 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 32832 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 32784 + max_input_len: 32784 + max_seq_len: 32832 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: 
true + cache_transceiver_config: + max_tokens_in_buffer: 32832 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX.yaml similarity index 100% rename from tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX.yaml rename to tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX.yaml diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..99002c9f2558 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,111 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 32k4k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '256' + input_length: 32768 + output_length: 4096 + dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 
TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 8 + max_num_tokens: 64 + max_input_len: 32784 + max_seq_len: 40960 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 32832 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 32784 + max_input_len: 32784 + max_seq_len: 32832 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 32832 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git 
a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..9158df38b979 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,101 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-Exp-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 32k4k + # Mirror of the Dynamo disagg-kv-router DSv3.2 FP4 recipe: + # https://github.com/ai-dynamo/dynamo/tree/main/recipes/deepseek-v32-fp4/trtllm/disagg-kv-router +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '256' + input_length: 32768 + output_length: 4096 + dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + # Env vars taken from the Dynamo deploy.yaml (NCCL MNNVL/CUMEM/NVLS, UCX MNNVL + # IPC, MoE all-to-all without all-gather, PDL, GC disable) so the perf test + # exercises the same code paths as the production deployment. 
+ worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes NCCL_MNNVL_ENABLE=1 NCCL_CUMEM_ENABLE=1 NCCL_NVLS_ENABLE=1 NVIDIA_GDRCOPY=1 UCX_CUDA_IPC_ENABLE_MNNVL=1 NCCL_GRAPH_MIXING_SUPPORT=0 TRTLLM_MOE_ENABLE_ALLTOALL_WITHOUT_ALLGATHER=1 TRTLLM_FORCE_COMM_METHOD=NVLINK_TWO_SIDED ENABLE_CONFIGURABLE_MOE=1" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + print_iter_log: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + max_batch_size: 8 + max_num_tokens: 8192 + max_seq_len: 121000 + allreduce_strategy: MNNVL + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 120000 + backend: NIXL + num_postprocess_workers: 8 + stream_interval: 10 + ctx: + print_iter_log: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + max_batch_size: 32 + max_num_tokens: 8192 + max_seq_len: 121000 + enable_chunked_prefill: true + disable_overlap_scheduler: true + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + enable_block_reuse: true + free_gpu_memory_fraction: 0.9 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 120000 + backend: NIXL diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml new file 
mode 100644 index 000000000000..e40afbe44267 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml @@ -0,0 +1,110 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + 
cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..0a7b3a052e37 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,106 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 
+ num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 512 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git 
a/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..877516abbe54 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL.yaml @@ -0,0 +1,106 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4096' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 128 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + 
kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX.yaml similarity index 100% rename from tests/scripts/perf-sanity/disaggregated/gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX.yaml rename to tests/scripts/perf/disaggregated/gb200_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX.yaml diff --git a/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..6d2ef88af19e --- /dev/null +++ 
b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,96 @@ +metadata: + model_name: gpt_oss_120b_fp4 + precision: fp4 + model_dir_name: GPT-OSS-120B-FP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '2048' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/gpt_oss_120b-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1536 + max_num_tokens: 20000 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + cuda_graph_config: + enable_padding: true + max_batch_size: 1536 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 1024 + backend: NIXL + disable_overlap_scheduler: true + num_postprocess_workers: 4 + 
stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 20000 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 1024 + backend: NIXL + disable_overlap_scheduler: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX.yaml similarity index 100% rename from tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX.yaml rename to tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con2048_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-UCX.yaml diff --git a/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..3a3747731f78 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,96 @@ +metadata: + model_name: gpt_oss_120b_fp4 + precision: fp4 + model_dir_name: GPT-OSS-120B-FP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '512' + input_length: 1024 + 
output_length: 1024 + dataset_file: datasets/perf-ci/gpt_oss_120b-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1536 + max_num_tokens: 20000 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + cuda_graph_config: + enable_padding: true + max_batch_size: 1536 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 1024 + backend: NIXL + disable_overlap_scheduler: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 20000 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 1024 + backend: NIXL + disable_overlap_scheduler: true diff --git 
a/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..ca773d4b268d --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_1k1k_con64_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,92 @@ +metadata: + model_name: gpt_oss_120b_fp4 + precision: fp4 + model_dir_name: GPT-OSS-120B-FP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '64' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/gpt_oss_120b-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 256 + max_num_tokens: 20000 + tensor_parallel_size: 4 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + kv_cache_config: + enable_block_reuse: false + 
free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 1024 + backend: NIXL + disable_overlap_scheduler: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 20000 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 1024 + backend: NIXL + disable_overlap_scheduler: true diff --git a/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..6417a0c38c8d --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,106 @@ +metadata: + model_name: gpt_oss_120b_fp4 + precision: fp4 + model_dir_name: GPT-OSS-120B-FP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k + # Mirror of the Dynamo GPT-OSS-120B TRT-LLM disagg deployment recipe: + # https://github.com/ai-dynamo/dynamo/tree/main/recipes/gpt-oss-120b/trtllm/disagg +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/gpt_oss_120b-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 
1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + # Env vars taken from the Dynamo deploy.yaml (OVERRIDE_QUANT_ALGO selects + # W4A8_MXFP4_MXFP8, which is the distinguishing feature of this recipe vs. + # existing gpt-oss perf-sanity configs; PDL / no-parallel-weight-load / + # disabled NCCL graph register / disabled UCC collective are preserved so + # the same load & runtime paths are exercised). + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes OVERRIDE_QUANT_ALGO=W4A8_MXFP4_MXFP8 TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL=True NCCL_GRAPH_REGISTER=0 OMPI_MCA_coll_ucc_enable=0" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + print_iter_log: true + tensor_parallel_size: 4 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + max_batch_size: 1280 + max_num_tokens: 20000 + max_seq_len: 11000 + trust_remote_code: true + allreduce_strategy: AUTO + attention_dp_config: + enable_balance: true + cuda_graph_config: + enable_padding: true + max_batch_size: 1280 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 9216 + backend: NIXL + disable_overlap_scheduler: false + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + max_batch_size: 64 + max_num_tokens: 20000 + max_seq_len: 9000 + trust_remote_code: true + cuda_graph_config: + enable_padding: true + max_batch_size: 30 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 
+ moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 9216 + backend: NIXL + disable_overlap_scheduler: true + num_postprocess_workers: 4 + stream_interval: 20 diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX.yaml similarity index 100% rename from tests/scripts/perf-sanity/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX.yaml rename to tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con1024_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-UCX.yaml diff --git a/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..0765412f2e62 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con128_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,92 @@ +metadata: + model_name: gpt_oss_120b_fp4 + precision: fp4 + model_dir_name: GPT-OSS-120B-FP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '128' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/gpt_oss_120b-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO 
TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1024 + max_num_tokens: 20000 + tensor_parallel_size: 4 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 1024 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL + disable_overlap_scheduler: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 20000 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL + disable_overlap_scheduler: true diff --git a/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..bb027c657d2c --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con4_ctx1_tp1_gen1_tp4_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,92 @@ +metadata: + model_name: gpt_oss_120b_fp4 + precision: fp4 + model_dir_name: GPT-OSS-120B-FP4 + 
supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/gpt_oss_120b-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1024 + max_num_tokens: 20000 + tensor_parallel_size: 4 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 1024 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL + disable_overlap_scheduler: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 20000 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + 
free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL + disable_overlap_scheduler: true diff --git a/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..3a1d7fdbeffd --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_gpt-oss-120b-fp4_8k1k_con512_ctx1_tp1_gen1_dep2_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,96 @@ +metadata: + model_name: gpt_oss_120b_fp4 + precision: fp4 + model_dir_name: GPT-OSS-120B-FP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '512' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/gpt_oss_120b-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 512 + max_num_tokens: 20000 + tensor_parallel_size: 2 + 
moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL + disable_overlap_scheduler: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 20000 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 8448 + backend: NIXL + disable_overlap_scheduler: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml similarity index 92% rename from tests/scripts/perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-UCX.yaml rename to tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml index 6d65a89bf440..8eb7293b9a87 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml @@ -1,7 +1,7 @@ metadata: - model_name: k2_thinking_fp4 + model_name: k25_thinking_fp4 
precision: fp4 - model_dir_name: Kimi-K2-Thinking-NVFP4 + model_dir_name: Kimi-K2.5-NVFP4 supported_gpus: - GB200 script_file: disaggr_torch.slurm @@ -23,7 +23,7 @@ benchmark: concurrency_list: '2048' input_length: 1024 output_length: 1024 - dataset_file: datasets/perf-ci/k2_thinking-1k1k-20480-ratio-1_for_serve.json + dataset_file: hardware: gpus_per_node: 4 num_ctx_servers: 1 @@ -67,7 +67,7 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true trust_remote_code: true num_postprocess_workers: 4 @@ -92,6 +92,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true trust_remote_code: true diff --git a/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..b03708e45ad6 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,97 @@ +metadata: + model_name: k25_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2.5-NVFP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4096' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + 
build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 512 + max_num_tokens: 512 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + trust_remote_code: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 8192 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + trust_remote_code: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml similarity index 100% rename from 
tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml rename to tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml similarity index 92% rename from tests/scripts/perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-UCX.yaml rename to tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml index 4c7c0ca5f5c8..3ee92df5b4a2 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml @@ -1,7 +1,7 @@ metadata: - model_name: k2_thinking_fp4 + model_name: k25_thinking_fp4 precision: fp4 - model_dir_name: Kimi-K2-Thinking-NVFP4 + model_dir_name: Kimi-K2.5-NVFP4 supported_gpus: - GB200 script_file: disaggr_torch.slurm @@ -23,7 +23,7 @@ benchmark: concurrency_list: '4' input_length: 1024 output_length: 1024 - dataset_file: datasets/perf-ci/k2_thinking-1k1k-20480-ratio-1_for_serve.json + dataset_file: hardware: gpus_per_node: 4 num_ctx_servers: 1 @@ -65,7 +65,7 @@ worker_config: backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true trust_remote_code: true num_postprocess_workers: 4 @@ -90,6 +90,6 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true trust_remote_code: true diff --git 
a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml similarity index 91% rename from tests/scripts/perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml rename to tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml index a9292d239bf5..00028abf8eee 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -1,7 +1,7 @@ metadata: - model_name: k2_thinking_fp4 + model_name: k25_thinking_fp4 precision: fp4 - model_dir_name: Kimi-K2-Thinking-NVFP4 + model_dir_name: Kimi-K2.5-NVFP4 supported_gpus: - GB200 script_file: disaggr_torch.slurm @@ -23,7 +23,7 @@ benchmark: concurrency_list: '1024' input_length: 8192 output_length: 1024 - dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json + dataset_file: hardware: gpus_per_node: 4 num_ctx_servers: 1 @@ -67,13 +67,13 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: Eagle max_draft_len: 3 eagle3_one_model: true - speculative_model: Kimi-K2-Thinking-NVFP4-Eagle3 + speculative_model: Kimi-K2.5-Thinking-Eagle3 trust_remote_code: true num_postprocess_workers: 4 stream_interval: 20 @@ -97,7 +97,7 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 trust_remote_code: true diff --git 
a/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..4b7f537cb9a3 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,97 @@ +metadata: + model_name: k25_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2.5-NVFP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4096' + input_length: 8192 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 256 + max_num_tokens: 256 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + kv_cache_config: + enable_block_reuse: false + 
free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + trust_remote_code: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 8768 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + trust_remote_code: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX.yaml similarity index 100% rename from tests/scripts/perf-sanity/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX.yaml rename to tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX.yaml diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml similarity index 91% rename from tests/scripts/perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml rename to tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml index fcd9eac1591e..cef09d7dd066 100644 --- 
a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/scripts/perf/disaggregated/gb200_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -1,7 +1,7 @@ metadata: - model_name: k2_thinking_fp4 + model_name: k25_thinking_fp4 precision: fp4 - model_dir_name: Kimi-K2-Thinking-NVFP4 + model_dir_name: Kimi-K2.5-NVFP4 supported_gpus: - GB200 script_file: disaggr_torch.slurm @@ -23,7 +23,7 @@ benchmark: concurrency_list: '4' input_length: 8192 output_length: 1024 - dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json + dataset_file: hardware: gpus_per_node: 4 num_ctx_servers: 1 @@ -65,13 +65,13 @@ worker_config: backend: TRTLLM cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: &id001 decoding_type: Eagle max_draft_len: 3 eagle3_one_model: true - speculative_model: Kimi-K2-Thinking-NVFP4-Eagle3 + speculative_model: Kimi-K2.5-Thinking-Eagle3 trust_remote_code: true num_postprocess_workers: 4 stream_interval: 20 @@ -96,7 +96,7 @@ worker_config: use_low_precision_moe_combine: true cache_transceiver_config: max_tokens_in_buffer: 16384 - backend: UCX + backend: NIXL disable_overlap_scheduler: true speculative_config: *id001 trust_remote_code: true diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml similarity index 97% rename from tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml rename to tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml index b9b208e78c8e..0848573d59fc 100644 --- 
a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_con1_ccb-NIXL.yaml @@ -4,7 +4,6 @@ metadata: model_dir_name: Qwen3-235B-A22B-FP4 supported_gpus: - GB200 - - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k slurm: @@ -17,9 +16,9 @@ slurm: numa_bind: true benchmark: mode: e2e - use_nv_sa_benchmark: true + use_nv_sa_benchmark: false multi_round: 8 - benchmark_ratio: 0.8 + benchmark_ratio: 0.0 streaming: true concurrency_list: '1' input_length: 1024 diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_con2048_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml similarity index 87% rename from tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_con2048_ccb-NIXL.yaml rename to tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml index b05cf6fca544..c2eb0f6677ce 100644 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_con2048_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml @@ -4,7 +4,6 @@ metadata: model_dir_name: Qwen3-235B-A22B-FP4 supported_gpus: - GB200 - - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k slurm: @@ -17,9 +16,9 @@ slurm: numa_bind: true benchmark: mode: e2e - use_nv_sa_benchmark: true + use_nv_sa_benchmark: false multi_round: 8 - benchmark_ratio: 0.8 + benchmark_ratio: 0.0 streaming: true concurrency_list: '2048' input_length: 1024 @@ -73,15 +72,17 @@ worker_config: free_gpu_memory_fraction: 0.7 dtype: fp8 moe_config: - backend: WIDEEP + backend: CUTEDSL cache_transceiver_config: max_tokens_in_buffer: 4608 backend: NIXL 
stream_interval: 20 num_postprocess_workers: 4 speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 1 + decoding_type: Eagle + max_draft_len: 3 + eagle3_one_model: true + speculative_model: Qwen3/qwen3-235B-eagle3 ctx: max_batch_size: 4 max_num_tokens: 4608 @@ -101,5 +102,7 @@ worker_config: max_tokens_in_buffer: 4608 backend: NIXL speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 1 + decoding_type: Eagle + max_draft_len: 3 + eagle3_one_model: true + speculative_model: Qwen3/qwen3-235B-eagle3 diff --git a/tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..83aefbff6137 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,94 @@ +metadata: + model_name: qwen3_235b_a22b_fp4 + precision: fp4 + model_dir_name: Qwen3-235B-A22B-FP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/qwen3_235b-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + 
enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 128 + max_num_tokens: 128 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 32768 + backend: NIXL + disable_overlap_scheduler: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 4 + max_num_tokens: 32768 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 32768 + backend: NIXL + disable_overlap_scheduler: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml similarity index 100% rename from tests/scripts/perf-sanity/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml rename to tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1024_ctx1_tp1_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml diff --git a/tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml 
b/tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..dde9fb64b2d3 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_8k1k_con1_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,93 @@ +metadata: + model_name: qwen3_235b_a22b_fp4 + precision: fp4 + model_dir_name: Qwen3-235B-A22B-FP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/qwen3_235b-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 1 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 32768 + 
backend: NIXL + disable_overlap_scheduler: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 4 + max_num_tokens: 32768 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 32768 + backend: NIXL + disable_overlap_scheduler: true diff --git a/tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..6a74a4fcfc36 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb200_qwen3-235b-fp4_8k1k_con64_ctx1_tp1_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,92 @@ +metadata: + model_name: qwen3_235b_a22b_fp4 + precision: fp4 + model_dir_name: Qwen3-235B-A22B-FP4 + supported_gpus: + - GB200 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '64' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/qwen3_235b-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: 
TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 64 + max_num_tokens: 64 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 32768 + backend: NIXL + disable_overlap_scheduler: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 4 + max_num_tokens: 32768 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 32768 + backend: NIXL + disable_overlap_scheduler: true diff --git a/tests/scripts/perf/disaggregated/wideep_accuracy-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_wideep_accuracy-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf/disaggregated/wideep_accuracy-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml rename to tests/scripts/perf/disaggregated/gb200_wideep_accuracy-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml index 253ed65a71c8..ad6b5d9a8c47 100644 --- 
a/tests/scripts/perf/disaggregated/wideep_accuracy-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_wideep_accuracy-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml @@ -4,7 +4,6 @@ metadata: model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k accuracy: @@ -25,7 +24,7 @@ benchmark: mode: gen_only use_nv_sa_benchmark: false multi_round: 1 - benchmark_ratio: 0.8 + benchmark_ratio: 0.0 streaming: true concurrency_list: '2048' input_length: 1024 @@ -90,7 +89,7 @@ worker_config: free_gpu_memory_fraction: 0.9 dtype: fp8 moe_config: - backend: WIDEEP + backend: CUTEDSL load_balancer: num_slots: 288 layer_updates_per_iter: 1 diff --git a/tests/scripts/perf/disaggregated/wideep_accuracy-deepseek-r1-fp4_gpqa_diamond_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_wideep_accuracy-deepseek-r1-fp4_gpqa_diamond_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf/disaggregated/wideep_accuracy-deepseek-r1-fp4_gpqa_diamond_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml rename to tests/scripts/perf/disaggregated/gb200_wideep_accuracy-deepseek-r1-fp4_gpqa_diamond_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml index bad1520972c3..c455cc0644d4 100644 --- a/tests/scripts/perf/disaggregated/wideep_accuracy-deepseek-r1-fp4_gpqa_diamond_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_wideep_accuracy-deepseek-r1-fp4_gpqa_diamond_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml @@ -4,7 +4,6 @@ metadata: model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k accuracy: @@ -25,7 +24,7 @@ benchmark: mode: gen_only use_nv_sa_benchmark: false multi_round: 1 - benchmark_ratio: 0.8 + benchmark_ratio: 
0.0 streaming: true concurrency_list: '2048' input_length: 1024 @@ -91,7 +90,7 @@ worker_config: free_gpu_memory_fraction: 0.9 dtype: fp8 moe_config: - backend: WIDEEP + backend: CUTEDSL load_balancer: num_slots: 288 layer_updates_per_iter: 1 diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml similarity index 97% rename from tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml rename to tests/scripts/perf/disaggregated/gb200_wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml index 9ccc8bcdf0a1..2095c7e71106 100644 --- a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml @@ -4,7 +4,6 @@ metadata: model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k slurm: @@ -19,7 +18,7 @@ benchmark: mode: gen_only use_nv_sa_benchmark: false multi_round: 1 - benchmark_ratio: 0.8 + benchmark_ratio: 0.0 streaming: true concurrency_list: '1024' input_length: 1024 @@ -75,7 +74,7 @@ worker_config: free_gpu_memory_fraction: 0.9 dtype: fp8 moe_config: - backend: WIDEEP + backend: CUTEDSL load_balancer: num_slots: 288 layer_updates_per_iter: 1 diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL_kv-reuse.yaml b/tests/scripts/perf/disaggregated/gb200_wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL_kv-reuse.yaml similarity index 97% rename from 
tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL_kv-reuse.yaml rename to tests/scripts/perf/disaggregated/gb200_wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL_kv-reuse.yaml index 822dad51241a..ddd305d6c631 100644 --- a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL_kv-reuse.yaml +++ b/tests/scripts/perf/disaggregated/gb200_wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL_kv-reuse.yaml @@ -4,7 +4,6 @@ metadata: model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k slurm: @@ -19,7 +18,7 @@ benchmark: mode: gen_only use_nv_sa_benchmark: false multi_round: 1 - benchmark_ratio: 0.8 + benchmark_ratio: 0.0 streaming: true concurrency_list: '1024' input_length: 1024 @@ -75,7 +74,7 @@ worker_config: free_gpu_memory_fraction: 0.9 dtype: fp8 moe_config: - backend: WIDEEP + backend: CUTEDSL load_balancer: num_slots: 288 layer_updates_per_iter: 1 diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml rename to tests/scripts/perf/disaggregated/gb200_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml index a8563bf4ec1d..96d798d29fb3 100644 --- a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml @@ -4,7 +4,6 @@ metadata: model_dir_name: 
DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k slurm: @@ -19,7 +18,7 @@ benchmark: mode: gen_only use_nv_sa_benchmark: false multi_round: 1 - benchmark_ratio: 0.8 + benchmark_ratio: 0.0 streaming: true concurrency_list: '2048' input_length: 1024 @@ -75,7 +74,7 @@ worker_config: free_gpu_memory_fraction: 0.9 dtype: fp8 moe_config: - backend: WIDEEP + backend: CUTEDSL load_balancer: num_slots: 288 layer_updates_per_iter: 1 diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb200_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-UCX.yaml similarity index 97% rename from tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-UCX.yaml rename to tests/scripts/perf/disaggregated/gb200_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-UCX.yaml index 07841f3cccef..9c5e8d11f3e9 100644 --- a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-UCX.yaml +++ b/tests/scripts/perf/disaggregated/gb200_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-UCX.yaml @@ -4,7 +4,6 @@ metadata: model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k slurm: @@ -19,7 +18,7 @@ benchmark: mode: gen_only use_nv_sa_benchmark: false multi_round: 1 - benchmark_ratio: 0.8 + benchmark_ratio: 0.0 streaming: true concurrency_list: '2048' input_length: 1024 @@ -75,7 +74,7 @@ worker_config: free_gpu_memory_fraction: 0.9 dtype: fp8 moe_config: - backend: WIDEEP + backend: CUTEDSL load_balancer: num_slots: 288 layer_updates_per_iter: 1 diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-DEFAULT.yaml 
b/tests/scripts/perf/disaggregated/gb200_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-NIXL.yaml similarity index 96% rename from tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-DEFAULT.yaml rename to tests/scripts/perf/disaggregated/gb200_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-NIXL.yaml index 42d69038bdec..b882f4be46be 100644 --- a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-DEFAULT.yaml +++ b/tests/scripts/perf/disaggregated/gb200_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-NIXL.yaml @@ -4,7 +4,6 @@ metadata: model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k config_index: 7 @@ -20,7 +19,7 @@ benchmark: mode: gen_only use_nv_sa_benchmark: false multi_round: 1 - benchmark_ratio: 0.8 + benchmark_ratio: 0.0 streaming: true concurrency_list: '12288' input_length: 1024 @@ -76,13 +75,13 @@ worker_config: free_gpu_memory_fraction: 0.7 dtype: fp8 moe_config: - backend: WIDEEP + backend: CUTEDSL load_balancer: num_slots: 288 layer_updates_per_iter: 1 cache_transceiver_config: max_tokens_in_buffer: 8320 - backend: DEFAULT + backend: NIXL stream_interval: 20 ctx: enable_layerwise_nvtx_marker: true @@ -102,4 +101,4 @@ worker_config: dtype: fp8 cache_transceiver_config: max_tokens_in_buffer: 8320 - backend: DEFAULT + backend: NIXL diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-DEFAULT.yaml b/tests/scripts/perf/disaggregated/gb200_wideep_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-NIXL.yaml similarity index 96% rename from tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-DEFAULT.yaml rename to 
tests/scripts/perf/disaggregated/gb200_wideep_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-NIXL.yaml index 07c95f6ca9f4..51bf9b6c2dd0 100644 --- a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-DEFAULT.yaml +++ b/tests/scripts/perf/disaggregated/gb200_wideep_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-NIXL.yaml @@ -4,7 +4,6 @@ metadata: model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - - GB300 script_file: disaggr_torch.slurm benchmark_type: 8k1k slurm: @@ -23,7 +22,7 @@ benchmark: mode: e2e use_nv_sa_benchmark: false multi_round: 8 - benchmark_ratio: 0.8 + benchmark_ratio: 0.0 streaming: true concurrency_list: '1024' input_length: 8192 @@ -76,13 +75,13 @@ worker_config: free_gpu_memory_fraction: 0.6 dtype: fp8 moe_config: - backend: WIDEEP + backend: CUTEDSL load_balancer: num_slots: 288 layer_updates_per_iter: 1 cache_transceiver_config: max_tokens_in_buffer: 8448 - backend: DEFAULT + backend: NIXL stream_interval: 20 num_postprocess_workers: 4 speculative_config: @@ -106,7 +105,7 @@ worker_config: dtype: fp8 cache_transceiver_config: max_tokens_in_buffer: 8448 - backend: DEFAULT + backend: NIXL speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_wideep_deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml similarity index 97% rename from tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml rename to tests/scripts/perf/disaggregated/gb200_wideep_deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml index c12ae7ab9f6e..c7d6e1c18dfe 100644 --- 
a/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_wideep_deepseek-v32-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-NIXL.yaml @@ -4,7 +4,6 @@ metadata: model_dir_name: DeepSeek-V3.2-FP4-v2 supported_gpus: - GB200 - - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k config_index: 1 @@ -20,7 +19,7 @@ benchmark: mode: gen_only use_nv_sa_benchmark: false multi_round: 1 - benchmark_ratio: 0.8 + benchmark_ratio: 0.0 streaming: true concurrency_list: '1024' input_length: 1024 @@ -103,7 +102,7 @@ worker_config: cuda_graph_config: null disable_overlap_scheduler: true moe_config: - backend: TRTLLM + backend: CUTEDSL kv_cache_config: enable_block_reuse: false free_gpu_memory_fraction: 0.85 diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml similarity index 98% rename from tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml rename to tests/scripts/perf/disaggregated/gb200_wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml index c68474bc9529..2fe0b91a9c7d 100644 --- a/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml @@ -4,7 +4,6 @@ metadata: model_dir_name: DeepSeek-V3.2-FP4-v2 supported_gpus: - GB200 - - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k config_index: 0 @@ -20,7 +19,7 @@ benchmark: mode: gen_only use_nv_sa_benchmark: false multi_round: 1 - benchmark_ratio: 0.8 + benchmark_ratio: 0.0 streaming: true concurrency_list: 
'2048' input_length: 1024 @@ -106,7 +105,7 @@ worker_config: cuda_graph_config: null disable_overlap_scheduler: true moe_config: - backend: TRTLLM + backend: CUTEDSL kv_cache_config: enable_block_reuse: false free_gpu_memory_fraction: 0.85 diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-DEFAULT.yaml b/tests/scripts/perf/disaggregated/gb200_wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-NIXL.yaml similarity index 96% rename from tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-DEFAULT.yaml rename to tests/scripts/perf/disaggregated/gb200_wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-NIXL.yaml index 263768adc82d..03f871df98a6 100644 --- a/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-DEFAULT.yaml +++ b/tests/scripts/perf/disaggregated/gb200_wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_con12288_ccb-NIXL.yaml @@ -4,7 +4,6 @@ metadata: model_dir_name: DeepSeek-V3.2-FP4-v2 supported_gpus: - GB200 - - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k config_index: 7 @@ -20,7 +19,7 @@ benchmark: mode: gen_only use_nv_sa_benchmark: false multi_round: 1 - benchmark_ratio: 0.8 + benchmark_ratio: 0.0 streaming: true concurrency_list: '12288' input_length: 1024 @@ -88,7 +87,7 @@ worker_config: - cuda_core cache_transceiver_config: max_tokens_in_buffer: 8320 - backend: DEFAULT + backend: NIXL stream_interval: 20 ctx: max_batch_size: 4 @@ -102,11 +101,11 @@ worker_config: cuda_graph_config: null disable_overlap_scheduler: true moe_config: - backend: TRTLLM + backend: CUTEDSL kv_cache_config: enable_block_reuse: false free_gpu_memory_fraction: 0.85 dtype: fp8 cache_transceiver_config: max_tokens_in_buffer: 8320 - backend: DEFAULT + backend: NIXL diff --git 
a/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-DEFAULT.yaml b/tests/scripts/perf/disaggregated/gb200_wideep_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-NIXL.yaml similarity index 96% rename from tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-DEFAULT.yaml rename to tests/scripts/perf/disaggregated/gb200_wideep_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-NIXL.yaml index 2df63073a052..9e738304bb68 100644 --- a/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-DEFAULT.yaml +++ b/tests/scripts/perf/disaggregated/gb200_wideep_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-NIXL.yaml @@ -4,7 +4,6 @@ metadata: model_dir_name: DeepSeek-V3.2-FP4-v2 supported_gpus: - GB200 - - GB300 script_file: disaggr_torch.slurm benchmark_type: 8k1k config_index: 14 @@ -24,7 +23,7 @@ benchmark: mode: e2e use_nv_sa_benchmark: false multi_round: 8 - benchmark_ratio: 0.8 + benchmark_ratio: 0.0 streaming: true concurrency_list: '1024' input_length: 8192 @@ -89,7 +88,7 @@ worker_config: - cuda_core cache_transceiver_config: max_tokens_in_buffer: 8448 - backend: DEFAULT + backend: NIXL stream_interval: 20 num_postprocess_workers: 4 speculative_config: @@ -111,10 +110,10 @@ worker_config: free_gpu_memory_fraction: 0.75 dtype: fp8 moe_config: - backend: TRTLLM + backend: CUTEDSL cache_transceiver_config: max_tokens_in_buffer: 8448 - backend: DEFAULT + backend: NIXL speculative_config: decoding_type: MTP num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/wideep_stress-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb200_wideep_stress-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml similarity index 98% rename from 
tests/scripts/perf/disaggregated/wideep_stress-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml rename to tests/scripts/perf/disaggregated/gb200_wideep_stress-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml index 03cd2d59a34d..06f5edadd981 100644 --- a/tests/scripts/perf/disaggregated/wideep_stress-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb200_wideep_stress-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL.yaml @@ -4,7 +4,6 @@ metadata: model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - GB200 - - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k accuracy: @@ -25,7 +24,7 @@ benchmark: mode: e2e use_nv_sa_benchmark: false multi_round: 20 - benchmark_ratio: 0.8 + benchmark_ratio: 0.0 streaming: true concurrency_list: '2048' input_length: 1024 @@ -90,7 +89,7 @@ worker_config: free_gpu_memory_fraction: 0.9 dtype: fp8 moe_config: - backend: WIDEEP + backend: CUTEDSL load_balancer: num_slots: 288 layer_updates_per_iter: 1 diff --git a/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_128k8k_con1_ctx1_pp4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_128k8k_con1_ctx1_pp4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..99440337a369 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_128k8k_con1_ctx1_pp4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,98 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + 
concurrency_list: '1' + input_length: 131072 + output_length: 8192 + dataset_file: datasets/perf-ci/deepseek_r1-128k8k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 4 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 4 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git 
a/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml new file mode 100644 index 000000000000..686fb268738a --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-NIXL.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '256' + input_length: 131072 + output_length: 8192 + dataset_file: datasets/perf-ci/deepseek_r1-128k8k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 64 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + 
kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 4 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml new file mode 100644 index 000000000000..de316f58a87e --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_128k8k_con256_ctx1_pp4_gen1_dep8_eplb0_mtp1_ccb-UCX.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '256' + input_length: 131072 + output_length: 8192 + dataset_file: 
datasets/perf-ci/deepseek_r1-128k8k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 64 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 4 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git 
a/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_128k8k_con64_ctx1_pp4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_128k8k_con64_ctx1_pp4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml new file mode 100644 index 000000000000..1d6d0bdfcae8 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_128k8k_con64_ctx1_pp4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 128k8k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '64' + input_length: 131072 + output_length: 8192 + dataset_file: datasets/perf-ci/deepseek_r1-128k8k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 + TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 4 + max_num_tokens: 16 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + 
kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 131104 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 4 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.3 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 131104 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..2cdb9a868988 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,98 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1024' + input_length: 1024 + output_length: 1024 + dataset_file: 
datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git 
a/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..9a3f636173c3 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,98 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 128 + max_num_tokens: 512 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + kv_cache_config: + enable_block_reuse: false + 
free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml new file mode 100644 index 000000000000..50f19df4ec25 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '3072' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + 
gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 768 + max_num_tokens: 1536 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 768 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git 
a/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml new file mode 100644 index 000000000000..4dec347902ec --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_1k1k_con3072_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '3072' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 768 + max_num_tokens: 1536 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 768 + kv_cache_config: 
+ enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..780b4d71e1a0 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: 
datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git 
a/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..6f3c36cda444 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,98 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 128 + max_num_tokens: 512 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + kv_cache_config: + enable_block_reuse: false + 
free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml new file mode 100644 index 000000000000..802e1777d732 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-NIXL.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4096' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json +hardware: + 
gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 256 + max_num_tokens: 512 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git 
a/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml new file mode 100644 index 000000000000..4add7253487d --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-r1-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp1_ccb-UCX.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4096' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 256 + max_num_tokens: 512 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + 
kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..669718d13894 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_1k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml @@ -0,0 +1,110 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1024' + input_length: 1024 + output_length: 1024 + dataset_file: 
datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + 
use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..bd6d75d283a1 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_1k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,106 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 512 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + 
context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml new file mode 100644 index 000000000000..73c0910392bb --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-NIXL.yaml @@ -0,0 +1,107 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + 
numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '2048' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 512 + max_num_tokens: 1024 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null 
+ kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml new file mode 100644 index 000000000000..33150a7ba51b --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_1k1k_con2048_ctx1_dep4_gen1_dep4_eplb0_mtp1_ccb-UCX.yaml @@ -0,0 +1,107 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '2048' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + 
gen: + print_iter_log: true + max_batch_size: 512 + max_num_tokens: 1024 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..0eab593e0908 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_32k4k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,111 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB300 + 
script_file: disaggr_torch.slurm + benchmark_type: 32k4k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 32768 + output_length: 4096 + dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 8 + max_input_len: 32784 + max_seq_len: 40960 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + enable_lm_head_tp_in_adp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.25 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 32832 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + 
max_batch_size: 1 + max_num_tokens: 32784 + max_input_len: 32784 + max_seq_len: 32832 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 32832 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL.yaml new file mode 100644 index 000000000000..fea3b1a1cfc5 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-NIXL.yaml @@ -0,0 +1,114 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 32k4k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '2048' + input_length: 32768 + output_length: 4096 + dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 
ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 64 + max_num_tokens: 128 + max_input_len: 32784 + max_seq_len: 40960 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 32832 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 32784 + max_input_len: 32784 + max_seq_len: 32832 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 32832 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git 
a/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX.yaml new file mode 100644 index 000000000000..3b5498cb847d --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_32k4k_con2048_ctx1_dep4_gen1_dep32_eplb288_mtp1_ccb-UCX.yaml @@ -0,0 +1,114 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 32k4k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '2048' + input_length: 32768 + output_length: 4096 + dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 64 + max_num_tokens: 128 + max_input_len: 32784 + max_seq_len: 40960 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + 
enable_padding: true + max_batch_size: 64 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 32832 + backend: UCX + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 1 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 32784 + max_input_len: 32784 + max_seq_len: 32832 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 32832 + backend: UCX + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..609c0c43d6d2 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_32k4k_con256_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,111 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 32k4k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: 
unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '256' + input_length: 32768 + output_length: 4096 + dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 8 + max_num_tokens: 64 + max_input_len: 32784 + max_seq_len: 40960 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.5 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 32832 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 32784 + max_input_len: 32784 + max_seq_len: 32832 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + 
pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.4 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 32832 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..fef69dbf9866 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_32k4k_con256_ctx1_dep8_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,101 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-Exp-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 32k4k + # Mirror of the Dynamo disagg-kv-router DSv3.2 FP4 recipe: + # https://github.com/ai-dynamo/dynamo/tree/main/recipes/deepseek-v32-fp4/trtllm/disagg-kv-router +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '256' + input_length: 32768 + output_length: 4096 + dataset_file: datasets/perf-ci/deepseek_v32-32k4k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + # Env vars taken from the Dynamo deploy.yaml (NCCL MNNVL/CUMEM/NVLS, UCX MNNVL + # IPC, MoE all-to-all without 
all-gather, PDL, GC disable) so the perf test + # exercises the same code paths as the production deployment. + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes NCCL_MNNVL_ENABLE=1 NCCL_CUMEM_ENABLE=1 NCCL_NVLS_ENABLE=1 NVIDIA_GDRCOPY=1 UCX_CUDA_IPC_ENABLE_MNNVL=1 NCCL_GRAPH_MIXING_SUPPORT=0 TRTLLM_MOE_ENABLE_ALLTOALL_WITHOUT_ALLGATHER=1 TRTLLM_FORCE_COMM_METHOD=NVLINK_TWO_SIDED ENABLE_CONFIGURABLE_MOE=1" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + print_iter_log: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + max_batch_size: 8 + max_num_tokens: 8192 + max_seq_len: 121000 + allreduce_strategy: MNNVL + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 120000 + backend: NIXL + num_postprocess_workers: 8 + stream_interval: 10 + ctx: + print_iter_log: true + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + max_batch_size: 32 + max_num_tokens: 8192 + max_seq_len: 121000 + enable_chunked_prefill: true + disable_overlap_scheduler: true + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + enable_block_reuse: true + free_gpu_memory_fraction: 0.9 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 120000 + backend: NIXL diff --git a/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml 
b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..5d4325064c64 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb256_mtp3_ccb-NIXL.yaml @@ -0,0 +1,110 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + tokens_per_block: 64 + moe_config: + 
backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..16d743d373d8 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_8k1k_con1_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,106 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1' + input_length: 8192 + 
output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 1 + max_num_tokens: 512 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: MTP + num_nextn_predict_layers: 3 + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + 
max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 diff --git a/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..ab0c5dd4949a --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-NIXL.yaml @@ -0,0 +1,106 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4096' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 128 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + 
enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true diff --git a/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX.yaml new file mode 100644 index 000000000000..5c3217039fcb --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_deepseek-v32-fp4_8k1k_con4096_ctx1_dep4_gen1_dep32_eplb256_mtp0_ccb-UCX.yaml @@ -0,0 +1,106 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true 
+benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4096' + input_length: 8192 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 128 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 256 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true + num_postprocess_workers: 4 + stream_interval: 20 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 16384 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + 
enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + tokens_per_block: 64 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + disable_overlap_scheduler: true diff --git a/tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..2b50cd55d8a7 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con2048_ctx1_dep4_gen1_dep32_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,97 @@ +metadata: + model_name: k25_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2.5-NVFP4 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '2048' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 64 + max_num_tokens: 128 + tensor_parallel_size: 32 + 
moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + trust_remote_code: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 8192 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + trust_remote_code: true diff --git a/tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..640d363ba39c --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,97 @@ +metadata: + model_name: k25_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2.5-NVFP4 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 
5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4096' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 512 + max_num_tokens: 512 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + trust_remote_code: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 8192 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + trust_remote_code: true diff --git 
a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml similarity index 94% rename from tests/scripts/perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml rename to tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml index 9ffcdfe3cbdc..06abcbdbce8d 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con4096_ctx1_dep4_gen1_dep8_eplb0_mtp0_ccb-UCX.yaml @@ -1,9 +1,9 @@ metadata: - model_name: k2_thinking_fp4 + model_name: k25_thinking_fp4 precision: fp4 - model_dir_name: Kimi-K2-Thinking-NVFP4 + model_dir_name: Kimi-K2.5-NVFP4 supported_gpus: - - GB200 + - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k slurm: @@ -23,7 +23,7 @@ benchmark: concurrency_list: '4096' input_length: 1024 output_length: 1024 - dataset_file: datasets/perf-ci/k2_thinking-1k1k-20480-ratio-1_for_serve.json + dataset_file: hardware: gpus_per_node: 4 num_ctx_servers: 1 diff --git a/tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..40c2b2f3b6d7 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_1k1k_con4_ctx1_dep4_gen1_tep4_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,95 @@ +metadata: + model_name: k25_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2.5-NVFP4 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: 
disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 4 + max_num_tokens: 128 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + trust_remote_code: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 16 + max_num_tokens: 8192 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + 
cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + trust_remote_code: true diff --git a/tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..a8ef83c55353 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con1024_ctx1_dep4_gen1_dep32_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,103 @@ +metadata: + model_name: k25_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2.5-NVFP4 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 32 + max_num_tokens: 128 + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: 
true + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: Eagle + max_draft_len: 3 + eagle3_one_model: true + speculative_model: Kimi-K2.5-Thinking-Eagle3 + trust_remote_code: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 8768 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 + trust_remote_code: true diff --git a/tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL.yaml new file mode 100644 index 000000000000..31a59ef73017 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-NIXL.yaml @@ -0,0 +1,97 @@ +metadata: + model_name: k25_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2.5-NVFP4 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + 
use_nv_sa_benchmark: false + multi_round: 5 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4096' + input_length: 8192 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 256 + max_num_tokens: 256 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + trust_remote_code: true + num_postprocess_workers: 4 + stream_interval: 20 + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 8768 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + 
trust_remote_code: true diff --git a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX.yaml similarity index 94% rename from tests/scripts/perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX.yaml rename to tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX.yaml index d0d612be80ab..effc2e77af0e 100644 --- a/tests/scripts/perf-sanity/disaggregated/gb200_kimi-k2-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con4096_ctx1_dep4_gen1_dep16_eplb0_mtp0_ccb-UCX.yaml @@ -1,9 +1,9 @@ metadata: - model_name: k2_thinking_fp4 + model_name: k25_thinking_fp4 precision: fp4 - model_dir_name: Kimi-K2-Thinking-NVFP4 + model_dir_name: Kimi-K2.5-NVFP4 supported_gpus: - - GB200 + - GB300 script_file: disaggr_torch.slurm benchmark_type: 8k1k slurm: @@ -23,7 +23,7 @@ benchmark: concurrency_list: '4096' input_length: 8192 output_length: 1024 - dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json + dataset_file: hardware: gpus_per_node: 4 num_ctx_servers: 1 diff --git a/tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml new file mode 100644 index 000000000000..90f3bcbf737b --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_kimi-k25-thinking-fp4_8k1k_con4_ctx1_dep4_gen1_tep8_eplb0_mtp3_ccb-NIXL.yaml @@ -0,0 +1,102 @@ +metadata: + model_name: k25_thinking_fp4 + precision: fp4 + model_dir_name: Kimi-K2.5-NVFP4 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + 
benchmark_type: 8k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: false + multi_round: 10 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '4' + input_length: 8192 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + print_iter_log: true + max_batch_size: 4 + max_num_tokens: 128 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: false + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: TRTLLM + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: &id001 + decoding_type: Eagle + max_draft_len: 3 + eagle3_one_model: true + speculative_model: Kimi-K2.5-Thinking-Eagle3 + trust_remote_code: true + num_postprocess_workers: 4 + stream_interval: 20 + allreduce_strategy: MNNVL + ctx: + print_iter_log: true + max_batch_size: 2 + max_num_tokens: 8768 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true 
+ cuda_graph_config: null + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.6 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: NIXL + disable_overlap_scheduler: true + speculative_config: *id001 + trust_remote_code: true diff --git a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_con2048_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb300_qwen3-235b-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml similarity index 85% rename from tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_con2048_ccb-UCX.yaml rename to tests/scripts/perf/disaggregated/gb300_qwen3-235b-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml index 49f07a087b6e..ef063f5357eb 100644 --- a/tests/scripts/perf/disaggregated/Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_con2048_ccb-UCX.yaml +++ b/tests/scripts/perf/disaggregated/gb300_qwen3-235b-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_con2048_ccb-NIXL.yaml @@ -3,7 +3,6 @@ metadata: precision: fp4 model_dir_name: Qwen3-235B-A22B-FP4 supported_gpus: - - GB200 - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k @@ -17,9 +16,9 @@ slurm: numa_bind: true benchmark: mode: e2e - use_nv_sa_benchmark: true + use_nv_sa_benchmark: false multi_round: 8 - benchmark_ratio: 0.8 + benchmark_ratio: 0.0 streaming: true concurrency_list: '2048' input_length: 1024 @@ -73,15 +72,17 @@ worker_config: free_gpu_memory_fraction: 0.7 dtype: fp8 moe_config: - backend: WIDEEP + backend: CUTEDSL cache_transceiver_config: max_tokens_in_buffer: 4608 - backend: UCX + backend: NIXL stream_interval: 20 num_postprocess_workers: 4 speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 1 + decoding_type: Eagle + max_draft_len: 3 + eagle3_one_model: true + speculative_model: Qwen3/qwen3-235B-eagle3 ctx: 
max_batch_size: 4 max_num_tokens: 4608 @@ -99,7 +100,9 @@ worker_config: dtype: fp8 cache_transceiver_config: max_tokens_in_buffer: 4608 - backend: UCX + backend: NIXL speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 1 + decoding_type: Eagle + max_draft_len: 3 + eagle3_one_model: true + speculative_model: Qwen3/qwen3-235B-eagle3 diff --git a/tests/scripts/perf/disaggregated/gb300_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml new file mode 100644 index 000000000000..59c7438ce4a2 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml @@ -0,0 +1,110 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 1 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '2048' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 2 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + enable_layerwise_nvtx_marker: true + tensor_parallel_size: 
16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTEDSL + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + enable_layerwise_nvtx_marker: true + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/gb300_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-UCX.yaml similarity index 86% rename from tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-UCX.yaml rename to tests/scripts/perf/disaggregated/gb300_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-UCX.yaml index 97f665917e5e..1bcc8a7e6d17 100644 --- a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_con1024_ccb-UCX.yaml +++ 
b/tests/scripts/perf/disaggregated/gb300_wideep_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-UCX.yaml @@ -3,7 +3,6 @@ metadata: precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - - GB200 - GB300 script_file: disaggr_torch.slurm benchmark_type: 1k1k @@ -19,15 +18,15 @@ benchmark: mode: gen_only use_nv_sa_benchmark: false multi_round: 1 - benchmark_ratio: 0.8 + benchmark_ratio: 0.0 streaming: true - concurrency_list: '1024' + concurrency_list: '2048' input_length: 1024 output_length: 1024 dataset_file: datasets/perf-ci/deepseek_r1-1k1k-20480-ratio-1_for_serve.json hardware: gpus_per_node: 4 - num_ctx_servers: 1 + num_ctx_servers: 2 num_gen_servers: 1 environment: container_mount: @@ -45,13 +44,13 @@ accuracy: worker_config: gen: enable_layerwise_nvtx_marker: true - tensor_parallel_size: 32 - moe_expert_parallel_size: 32 + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 enable_attention_dp: true enable_lm_head_tp_in_adp: true pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 32 + max_batch_size: 128 + max_num_tokens: 512 max_seq_len: 2251 cuda_graph_config: enable_padding: true @@ -75,7 +74,7 @@ worker_config: free_gpu_memory_fraction: 0.9 dtype: fp8 moe_config: - backend: WIDEEP + backend: CUTEDSL load_balancer: num_slots: 288 layer_updates_per_iter: 1 @@ -84,6 +83,9 @@ worker_config: backend: UCX stream_interval: 20 num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 ctx: enable_layerwise_nvtx_marker: true max_batch_size: 4 @@ -103,3 +105,6 @@ worker_config: cache_transceiver_config: max_tokens_in_buffer: 4608 backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_wideep_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-NIXL.yaml 
similarity index 89% rename from tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-NIXL.yaml rename to tests/scripts/perf/disaggregated/gb300_wideep_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-NIXL.yaml index 70830aa8d366..918fd1b8f227 100644 --- a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb300_wideep_deepseek-r1-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-NIXL.yaml @@ -3,7 +3,6 @@ metadata: precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: - - GB200 - GB300 script_file: disaggr_torch.slurm benchmark_type: 8k1k @@ -12,23 +11,23 @@ slurm: partition: account: job_time: 02:00:00 - job_name: unified-benchmark + job_name: disaggr-test extra_args: --gres=gpu:4 numa_bind: true +hardware: + gpus_per_node: 4 + num_ctx_servers: 2 + num_gen_servers: 1 benchmark: - mode: gen_only + mode: e2e use_nv_sa_benchmark: false - multi_round: 1 - benchmark_ratio: 0.8 + multi_round: 8 + benchmark_ratio: 0.0 streaming: true - concurrency_list: '512' + concurrency_list: '1024' input_length: 8192 output_length: 1024 dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json -hardware: - gpus_per_node: 4 - num_ctx_servers: 8 - num_gen_servers: 1 environment: container_mount: container_image: @@ -50,9 +49,9 @@ worker_config: enable_attention_dp: true enable_lm_head_tp_in_adp: true pipeline_parallel_size: 1 - max_batch_size: 16 - max_num_tokens: 64 - max_seq_len: 9419 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 9423 cuda_graph_config: enable_padding: true batch_sizes: @@ -69,13 +68,14 @@ worker_config: - 768 - 1024 - 2048 + - 128 print_iter_log: true kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.7 + free_gpu_memory_fraction: 0.6 dtype: fp8 moe_config: - backend: WIDEEP + backend: CUTEDSL load_balancer: 
num_slots: 288 layer_updates_per_iter: 1 @@ -91,7 +91,7 @@ worker_config: enable_layerwise_nvtx_marker: true max_batch_size: 1 max_num_tokens: 8448 - max_seq_len: 9419 + max_seq_len: 9423 tensor_parallel_size: 4 moe_expert_parallel_size: 4 enable_attention_dp: true diff --git a/tests/scripts/perf/disaggregated/gb300_wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml new file mode 100644 index 000000000000..b97d19707710 --- /dev/null +++ b/tests/scripts/perf/disaggregated/gb300_wideep_deepseek-v32-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_con2048_ccb-NIXL.yaml @@ -0,0 +1,118 @@ +metadata: + model_name: deepseek_v32_fp4 + precision: fp4 + model_dir_name: DeepSeek-V3.2-FP4-v2 + supported_gpus: + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k + config_index: 0 +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 04:00:00 + job_name: unified-benchmark + extra_args: --gres=gpu:4 + numa_bind: true +benchmark: + mode: gen_only + use_nv_sa_benchmark: false + multi_round: 1 + benchmark_ratio: 0.0 + streaming: true + concurrency_list: '2048' + input_length: 1024 + output_length: 1024 + dataset_file: datasets/perf-ci/deepseek_v32-1k1k-20480-ratio-1_for_serve.json +hardware: + gpus_per_node: 4 + num_ctx_servers: 2 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: TRTLLM_SERVER_DISABLE_GC=1 +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false +worker_config: + gen: + tensor_parallel_size: 16 + moe_expert_parallel_size: 16 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + 
pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 2251 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + - 1024 + - 2048 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.9 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + nvfp4_gemm_config: + allowed_backends: + - cutlass + - cublaslt + - cutedsl + - cuda_core + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + stream_interval: 20 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + max_batch_size: 4 + max_num_tokens: 4608 + max_seq_len: 2251 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + moe_config: + backend: CUTEDSL + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 4608 + backend: NIXL + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/gb300_wideep_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-NIXL.yaml similarity index 89% rename from tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-NIXL.yaml rename to tests/scripts/perf/disaggregated/gb300_wideep_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-NIXL.yaml index 8b558ba9c902..a825adb88bae 100644 --- 
a/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-NIXL.yaml +++ b/tests/scripts/perf/disaggregated/gb300_wideep_deepseek-v32-fp4_8k1k_ctx2_gen1_dep32_bs128_eplb288_mtp3_con1024_ccb-NIXL.yaml @@ -3,33 +3,32 @@ metadata: precision: fp4 model_dir_name: DeepSeek-V3.2-FP4-v2 supported_gpus: - - GB200 - GB300 script_file: disaggr_torch.slurm benchmark_type: 8k1k - config_index: 4 + config_index: 14 slurm: script_file: disaggr_torch.slurm partition: account: job_time: 04:00:00 - job_name: unified-benchmark + job_name: disaggr-test extra_args: --gres=gpu:4 numa_bind: true +hardware: + gpus_per_node: 4 + num_ctx_servers: 2 + num_gen_servers: 1 benchmark: - mode: gen_only + mode: e2e use_nv_sa_benchmark: false - multi_round: 1 - benchmark_ratio: 0.8 + multi_round: 8 + benchmark_ratio: 0.0 streaming: true - concurrency_list: '512' + concurrency_list: '1024' input_length: 8192 output_length: 1024 dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json -hardware: - gpus_per_node: 4 - num_ctx_servers: 8 - num_gen_servers: 1 environment: container_mount: container_image: @@ -50,9 +49,9 @@ worker_config: enable_attention_dp: true enable_lm_head_tp_in_adp: true pipeline_parallel_size: 1 - max_batch_size: 16 - max_num_tokens: 64 - max_seq_len: 9419 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 9423 cuda_graph_config: enable_padding: true batch_sizes: @@ -69,10 +68,11 @@ worker_config: - 768 - 1024 - 2048 + - 128 print_iter_log: true kv_cache_config: enable_block_reuse: false - free_gpu_memory_fraction: 0.7 + free_gpu_memory_fraction: 0.6 dtype: fp8 moe_config: backend: CUTEDSL @@ -97,7 +97,7 @@ worker_config: ctx: max_batch_size: 1 max_num_tokens: 8448 - max_seq_len: 9419 + max_seq_len: 9423 tensor_parallel_size: 4 moe_expert_parallel_size: 4 enable_attention_dp: true @@ -110,7 +110,7 @@ worker_config: free_gpu_memory_fraction: 0.75 dtype: fp8 moe_config: - backend: TRTLLM + backend: 
CUTEDSL cache_transceiver_config: max_tokens_in_buffer: 8448 backend: NIXL diff --git a/tests/scripts/perf/disaggregated/wideep_accuracy-kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/wideep_accuracy-kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml deleted file mode 100644 index ebe795585fe9..000000000000 --- a/tests/scripts/perf/disaggregated/wideep_accuracy-kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_ccb-NIXL.yaml +++ /dev/null @@ -1,112 +0,0 @@ -metadata: - model_name: k2_thinking_fp4 - precision: fp4 - model_dir_name: Kimi-K2-Thinking-NVFP4 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k - accuracy: - datasets: - - dataset_name: gpqa_diamond_local - expected_value: 0.65 - threshold_type: hypothesis_test - filter_type: strict-match -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 03:00:00 - job_name: unified-benchmark - extra_args: "--gres=gpu:4" - numa_bind: true -benchmark: - enable_benchmark: false - mode: e2e - use_nv_sa_benchmark: false - multi_round: 1 - benchmark_ratio: 1.0 - streaming: true - concurrency_list: '8192' - input_length: 1024 - output_length: 1024 - dataset_file: datasets/perf-ci/k2_thinking-1k1k-20480-ratio-1_for_serve.json -hardware: - gpus_per_node: 4 - num_ctx_servers: 3 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: true - env_var: - HF_HOME: - tasks: - gpqa_diamond_local: - model: "local-chat-completions" - model_args_extra: 
"num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=7200,max_gen_toks=16384" - extra_kwargs: - apply_chat_template: true - trust_remote_code: true - custom_config: tests/integration/lm_eval_configs/gpqa_diamond_local.yaml -worker_config: - gen: - enable_layerwise_nvtx_marker: true - tensor_parallel_size: 16 - moe_expert_parallel_size: 16 - enable_attention_dp: true - enable_lm_head_tp_in_adp: false - pipeline_parallel_size: 1 - max_batch_size: 512 - max_num_tokens: 512 - max_seq_len: 16384 - cuda_graph_config: - enable_padding: true - max_batch_size: 512 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.85 - dtype: fp8 - moe_config: - backend: WIDEEP - use_low_precision_moe_combine: true - load_balancer: - num_slots: 384 - layer_updates_per_iter: 1 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - stream_interval: 100 - num_postprocess_workers: 4 - trust_remote_code: true - ctx: - enable_layerwise_nvtx_marker: true - max_batch_size: 32 - max_num_tokens: 8448 - max_seq_len: 8448 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - trust_remote_code: true diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-NIXL.yaml deleted file mode 100644 index c83bc20c7498..000000000000 --- a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-NIXL.yaml +++ /dev/null @@ -1,105 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - 
model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: gen_only - use_nv_sa_benchmark: false - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1024' - input_length: 8192 - output_length: 1024 - dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json -hardware: - gpus_per_node: 4 - num_ctx_servers: 6 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - enable_layerwise_nvtx_marker: true - tensor_parallel_size: 16 - moe_expert_parallel_size: 16 - enable_attention_dp: true - enable_lm_head_tp_in_adp: true - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 64 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: WIDEEP - load_balancer: - num_slots: 288 - layer_updates_per_iter: 1 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - ctx: - enable_layerwise_nvtx_marker: true - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 
1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-UCX.yaml deleted file mode 100644 index 0b3c24058ce3..000000000000 --- a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-UCX.yaml +++ /dev/null @@ -1,105 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: gen_only - use_nv_sa_benchmark: false - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1024' - input_length: 8192 - output_length: 1024 - dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json -hardware: - gpus_per_node: 4 - num_ctx_servers: 6 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - enable_layerwise_nvtx_marker: true - tensor_parallel_size: 16 - moe_expert_parallel_size: 16 - enable_attention_dp: true - enable_lm_head_tp_in_adp: true - 
pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 64 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: WIDEEP - load_balancer: - num_slots: 288 - layer_updates_per_iter: 1 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - ctx: - enable_layerwise_nvtx_marker: true - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX diff --git a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-UCX.yaml b/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-UCX.yaml deleted file mode 100644 index 6baf445d017f..000000000000 --- a/tests/scripts/perf/disaggregated/wideep_deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb288_mtp3_con512_ccb-UCX.yaml +++ /dev/null @@ -1,111 +0,0 @@ -metadata: - model_name: deepseek_r1_0528_fp4_v2 - precision: fp4 - model_dir_name: DeepSeek-R1-0528-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 02:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: gen_only - use_nv_sa_benchmark: false - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: 
'512' - input_length: 8192 - output_length: 1024 - dataset_file: datasets/perf-ci/deepseek_r1-8k1k-20480-ratio-1_for_serve.json -hardware: - gpus_per_node: 4 - num_ctx_servers: 8 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - enable_layerwise_nvtx_marker: true - tensor_parallel_size: 32 - moe_expert_parallel_size: 32 - enable_attention_dp: true - enable_lm_head_tp_in_adp: true - pipeline_parallel_size: 1 - max_batch_size: 16 - max_num_tokens: 64 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: WIDEEP - load_balancer: - num_slots: 288 - layer_updates_per_iter: 1 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX - stream_interval: 20 - num_postprocess_workers: 4 - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 - ctx: - enable_layerwise_nvtx_marker: true - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: UCX - speculative_config: - decoding_type: MTP - num_nextn_predict_layers: 3 diff --git 
a/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-NIXL.yaml deleted file mode 100644 index 9a899579cacc..000000000000 --- a/tests/scripts/perf/disaggregated/wideep_deepseek-v32-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_con1024_ccb-NIXL.yaml +++ /dev/null @@ -1,113 +0,0 @@ -metadata: - model_name: deepseek_v32_fp4 - precision: fp4 - model_dir_name: DeepSeek-V3.2-FP4-v2 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k - config_index: 5 -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 04:00:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: gen_only - use_nv_sa_benchmark: false - multi_round: 1 - benchmark_ratio: 0.8 - streaming: true - concurrency_list: '1024' - input_length: 8192 - output_length: 1024 - dataset_file: datasets/perf-ci/deepseek_v32-8k1k-20480-ratio-1_for_serve.json -hardware: - gpus_per_node: 4 - num_ctx_servers: 6 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 MIMALLOC_PURGE_DELAY=0 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" - server_env_var: TRTLLM_SERVER_DISABLE_GC=1 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - tensor_parallel_size: 16 - moe_expert_parallel_size: 16 - enable_attention_dp: true - enable_lm_head_tp_in_adp: true - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 64 - max_seq_len: 9419 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - print_iter_log: true - kv_cache_config: - 
enable_block_reuse: false - free_gpu_memory_fraction: 0.7 - dtype: fp8 - moe_config: - backend: CUTEDSL - use_low_precision_moe_combine: true - load_balancer: - num_slots: 288 - layer_updates_per_iter: 1 - nvfp4_gemm_config: - allowed_backends: - - cutlass - - cublaslt - - cutedsl - - cuda_core - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - stream_interval: 20 - num_postprocess_workers: 4 - ctx: - max_batch_size: 1 - max_num_tokens: 8448 - max_seq_len: 9419 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - moe_config: - backend: TRTLLM - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL diff --git a/tests/scripts/perf/disaggregated/wideep_kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_con16384_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/wideep_kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_con16384_ccb-NIXL.yaml deleted file mode 100644 index 71aeffa7ba22..000000000000 --- a/tests/scripts/perf/disaggregated/wideep_kimi-k2-thinking-fp4_1k1k_ctx3_gen1_dep32_bs1024_eplb384_mtp0_con16384_ccb-NIXL.yaml +++ /dev/null @@ -1,108 +0,0 @@ -metadata: - model_name: k2_thinking_fp4 - precision: fp4 - model_dir_name: Kimi-K2-Thinking-NVFP4 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 1k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 00:45:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: gen_only - use_nv_sa_benchmark: false - multi_round: 1 - benchmark_ratio: 1.0 - streaming: true - concurrency_list: '16384' - input_length: 1024 - output_length: 1024 - dataset_file: datasets/perf-ci/k2_thinking-1k1k-20480-ratio-1_for_serve.json 
-hardware: - gpus_per_node: 4 - num_ctx_servers: 3 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: MIMALLOC_PURGE_DELAY=0 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - enable_layerwise_nvtx_marker: true - tensor_parallel_size: 16 - moe_expert_parallel_size: 16 - enable_attention_dp: true - enable_lm_head_tp_in_adp: false - pipeline_parallel_size: 1 - max_batch_size: 1024 - max_num_tokens: 1024 - max_seq_len: 2068 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - - 1024 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - dtype: fp8 - moe_config: - backend: WIDEEP - use_low_precision_moe_combine: true - load_balancer: - num_slots: 384 - layer_updates_per_iter: 1 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - stream_interval: 100 - num_postprocess_workers: 4 - trust_remote_code: true - ctx: - enable_layerwise_nvtx_marker: true - max_batch_size: 8 - max_num_tokens: 8448 - max_seq_len: 1044 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - trust_remote_code: true diff --git a/tests/scripts/perf/disaggregated/wideep_kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_con8192_ccb-NIXL.yaml b/tests/scripts/perf/disaggregated/wideep_kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_con8192_ccb-NIXL.yaml deleted file mode 100644 index 78fc502d960b..000000000000 --- 
a/tests/scripts/perf/disaggregated/wideep_kimi-k2-thinking-fp4_8k1k_ctx8_gen1_dep32_bs256_eplb416_mtp0_con8192_ccb-NIXL.yaml +++ /dev/null @@ -1,108 +0,0 @@ -metadata: - model_name: k2_thinking_fp4 - precision: fp4 - model_dir_name: Kimi-K2-Thinking-NVFP4 - supported_gpus: - - GB200 - - GB300 - script_file: disaggr_torch.slurm - benchmark_type: 8k1k -slurm: - script_file: disaggr_torch.slurm - partition: - account: - job_time: 00:45:00 - job_name: unified-benchmark - extra_args: --gres=gpu:4 - numa_bind: true -benchmark: - mode: gen_only - use_nv_sa_benchmark: false - multi_round: 1 - benchmark_ratio: 1.0 - streaming: true - concurrency_list: '8192' - input_length: 8192 - output_length: 1024 - dataset_file: datasets/perf-ci/k2_thinking-8k1k-20480-ratio-1_for_serve.json -hardware: - gpus_per_node: 4 - num_ctx_servers: 8 - num_gen_servers: 1 -environment: - container_mount: - container_image: - model_path: - trtllm_repo: '' - build_wheel: false - work_dir: - worker_env_var: MIMALLOC_PURGE_DELAY=0 -profiling: - nsys_on: false -accuracy: - enable_accuracy_test: false -worker_config: - gen: - enable_layerwise_nvtx_marker: true - tensor_parallel_size: 32 - moe_expert_parallel_size: 32 - enable_attention_dp: true - enable_lm_head_tp_in_adp: false - pipeline_parallel_size: 1 - max_batch_size: 256 - max_num_tokens: 256 - max_seq_len: 9256 - cuda_graph_config: - enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 512 - - 768 - - 1024 - - 2048 - - 256 - print_iter_log: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.6 - dtype: fp8 - moe_config: - backend: WIDEEP - use_low_precision_moe_combine: true - load_balancer: - num_slots: 416 - layer_updates_per_iter: 1 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - stream_interval: 100 - num_postprocess_workers: 4 - trust_remote_code: true - ctx: - enable_layerwise_nvtx_marker: true - max_batch_size: 1 - max_num_tokens: 8448 
- max_seq_len: 8232 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - enable_attention_dp: true - pipeline_parallel_size: 1 - print_iter_log: true - cuda_graph_config: null - disable_overlap_scheduler: true - kv_cache_config: - enable_block_reuse: false - free_gpu_memory_fraction: 0.75 - dtype: fp8 - cache_transceiver_config: - max_tokens_in_buffer: 8448 - backend: NIXL - trust_remote_code: true