From da3a0eab4abdddffd26c15953f2fe946de787435 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Thu, 22 Jan 2026 10:43:11 +0200 Subject: [PATCH 1/7] Add `MegatronRunReportGenerationStrategy` --- src/cloudai/registration.py | 2 + .../workloads/megatron_run/__init__.py | 3 +- .../report_generation_strategy.py | 142 +++++++++++++++++- 3 files changed, 144 insertions(+), 3 deletions(-) diff --git a/src/cloudai/registration.py b/src/cloudai/registration.py index f9be227e6..d44a273cc 100644 --- a/src/cloudai/registration.py +++ b/src/cloudai/registration.py @@ -98,6 +98,7 @@ def register_all(): ) from cloudai.workloads.megatron_run import ( CheckpointTimingReportGenerationStrategy, + MegatronRunReportGenerationStrategy, MegatronRunSlurmCommandGenStrategy, MegatronRunTestDefinition, ) @@ -259,6 +260,7 @@ def register_all(): Registry().add_report(GPTTestDefinition, JaxToolboxReportGenerationStrategy) Registry().add_report(GrokTestDefinition, JaxToolboxReportGenerationStrategy) Registry().add_report(MegatronRunTestDefinition, CheckpointTimingReportGenerationStrategy) + Registry().add_report(MegatronRunTestDefinition, MegatronRunReportGenerationStrategy) Registry().add_report(MegatronBridgeTestDefinition, MegatronBridgeReportGenerationStrategy) Registry().add_report(NCCLTestDefinition, NcclTestPerformanceReportGenerationStrategy) Registry().add_report(NeMoLauncherTestDefinition, NeMoLauncherReportGenerationStrategy) diff --git a/src/cloudai/workloads/megatron_run/__init__.py b/src/cloudai/workloads/megatron_run/__init__.py index 960461256..473203447 100644 --- a/src/cloudai/workloads/megatron_run/__init__.py +++ b/src/cloudai/workloads/megatron_run/__init__.py @@ -15,12 +15,13 @@ # limitations under the License. from .megatron_run import MegatronRunCmdArgs, MegatronRunTestDefinition -from .report_generation_strategy import CheckpointTimingReportGenerationStrategy +from .report_generation_strategy import CheckpointTimingReportGenerationStrategy, MegatronRunReportGenerationStrategy from .slurm_command_gen_strategy import MegatronRunSlurmCommandGenStrategy __all__ = [ "CheckpointTimingReportGenerationStrategy", "MegatronRunCmdArgs", + "MegatronRunReportGenerationStrategy", "MegatronRunSlurmCommandGenStrategy", "MegatronRunTestDefinition", ] diff --git a/src/cloudai/workloads/megatron_run/report_generation_strategy.py b/src/cloudai/workloads/megatron_run/report_generation_strategy.py index 50723a2ca..76a2eafff 100644 --- a/src/cloudai/workloads/megatron_run/report_generation_strategy.py +++ b/src/cloudai/workloads/megatron_run/report_generation_strategy.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -14,13 +14,28 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + import logging import re +from pathlib import Path +from statistics import mean, median, pstdev +from typing import ClassVar -from cloudai.core import ReportGenerationStrategy +from cloudai.core import METRIC_ERROR, ReportGenerationStrategy CHECKPOINT_REGEX = re.compile(r"(save|load)-checkpoint\s.*:\s\((\d+\.\d+),\s(\d+\.\d+)\)") +# Pattern to match lines like: +# [2026-01-16 07:32:39] iteration 6/100 | ... 
| +# elapsed time per iteration (ms): 15639.0 | throughput per GPU (TFLOP/s/GPU): 494.6 | ... +ITERATION_REGEX = re.compile( + r"elapsed time per iteration \(ms\):\s*([0-9]+(?:\.[0-9]+)?)" + r".*?" + r"throughput per GPU \(TFLOP/s/GPU\):\s*([0-9]+(?:\.[0-9]+)?)", + re.IGNORECASE, +) + class CheckpointTimingReportGenerationStrategy(ReportGenerationStrategy): """Strategy for generating reports from Checkpoint Timing test outputs.""" @@ -59,3 +74,126 @@ def generate_report(self) -> None: for checkpoint_type, timings in [("save", save_timings), ("load", load_timings)]: for t in timings: file.write(f"{checkpoint_type},{t[0]},{t[1]}\n") + + +class MegatronRunReportGenerationStrategy(ReportGenerationStrategy): + """Parse Megatron-Run stdout.txt for iteration time and GPU TFLOP/s per GPU.""" + + metrics: ClassVar[list[str]] = ["default", "iteration-time", "tflops-per-gpu"] + + def get_log_file(self) -> Path | None: + log = self.test_run.output_path / "stdout.txt" + return log if log.is_file() else None + + @property + def results_file(self) -> Path: + return self.get_log_file() or (self.test_run.output_path / "stdout.txt") + + def can_handle_directory(self) -> bool: + log_file = self.get_log_file() + if not log_file: + return False + with log_file.open("r", encoding="utf-8", errors="ignore") as f: + for line in f: + if ITERATION_REGEX.search(line): + return True + return False + + def _extract(self, log_path: Path) -> tuple[list[float], list[float]]: + """Extract iteration times (ms) and GPU TFLOPS from the log file.""" + iter_times_ms: list[float] = [] + gpu_tflops: list[float] = [] + with log_path.open("r", encoding="utf-8", errors="ignore") as f: + for line in f: + m = ITERATION_REGEX.search(line) + if m: + try: + iter_times_ms.append(float(m.group(1))) + gpu_tflops.append(float(m.group(2))) + except (ValueError, TypeError): + logging.debug("Failed to parse iteration metrics line: %s", line.rstrip("\n")) + + # Keep only the last 10 iterations for statistics (to exclude warmup) + if len(iter_times_ms) > 10: + iter_times_ms = iter_times_ms[-10:] + gpu_tflops = gpu_tflops[-10:] + return iter_times_ms, gpu_tflops + + def _get_extracted_data(self) -> tuple[Path | None, list[float], list[float]]: + log_file = self.get_log_file() + if not log_file: + return None, [], [] + iter_times_ms, gpu_tflops = self._extract(log_file) + return log_file, iter_times_ms, gpu_tflops + + def generate_report(self) -> None: + log_file, iter_times_ms, gpu_tflops = self._get_extracted_data() + if not log_file: + logging.error( + "No stdout.txt file found in: %s", + self.test_run.output_path, + ) + return + + summary_file = self.test_run.output_path / "megatron_run_report.txt" + if not iter_times_ms: + with summary_file.open("w") as f: + f.write("MegatronRun report\n") + f.write("No iteration timing lines were found.\n\n") + f.write("Searched file:\n") + f.write(f" - {log_file}\n") + logging.warning("No iteration metrics found under %s (wrote %s)", self.test_run.output_path, summary_file) + return + + iter_stats = { + "avg": mean(iter_times_ms), + "median": median(iter_times_ms), + "min": min(iter_times_ms), + "max": max(iter_times_ms), + "std": pstdev(iter_times_ms) if len(iter_times_ms) > 1 else 0.0, + } + if gpu_tflops: + tflops_stats = { + "avg": mean(gpu_tflops), + "median": median(gpu_tflops), + "min": min(gpu_tflops), + "max": max(gpu_tflops), + "std": pstdev(gpu_tflops) if len(gpu_tflops) > 1 else 0.0, + } + else: + tflops_stats = {"avg": 0.0, "median": 0.0, "min": 0.0, "max": 0.0, "std": 0.0} + + with 
summary_file.open("w") as f: + f.write(f"Source log: {log_file}\n\n") + f.write("Iteration Time (ms)\n") + f.write(f" avg: {iter_stats['avg']}\n") + f.write(f" median: {iter_stats['median']}\n") + f.write(f" min: {iter_stats['min']}\n") + f.write(f" max: {iter_stats['max']}\n") + f.write(f" std: {iter_stats['std']}\n") + f.write("\n") + f.write("TFLOP/s per GPU\n") + f.write(f" avg: {tflops_stats['avg']}\n") + f.write(f" median: {tflops_stats['median']}\n") + f.write(f" min: {tflops_stats['min']}\n") + f.write(f" max: {tflops_stats['max']}\n") + f.write(f" std: {tflops_stats['std']}\n") + + def get_metric(self, metric: str) -> float: + if metric not in {"default", "iteration-time", "tflops-per-gpu"}: + return METRIC_ERROR + log_file, iter_times_ms, gpu_tflops = self._get_extracted_data() + if not log_file: + logging.error( + "No stdout.txt file found in: %s", + self.test_run.output_path, + ) + return METRIC_ERROR + if not iter_times_ms: + return METRIC_ERROR + + if metric in {"default", "iteration-time"}: + return float(mean(iter_times_ms)) + if metric == "tflops-per-gpu": + return float(mean(gpu_tflops)) if gpu_tflops else METRIC_ERROR + return METRIC_ERROR From 8b2dbeaef357e93005423cd85bdb2e232db5bba6 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Thu, 22 Jan 2026 10:50:35 +0200 Subject: [PATCH 2/7] Add tests --- ...megatron_run_report_generation_strategy.py | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 tests/report_generation_strategy/test_megatron_run_report_generation_strategy.py diff --git a/tests/report_generation_strategy/test_megatron_run_report_generation_strategy.py b/tests/report_generation_strategy/test_megatron_run_report_generation_strategy.py new file mode 100644 index 000000000..e9c2aa5a6 --- /dev/null +++ b/tests/report_generation_strategy/test_megatron_run_report_generation_strategy.py @@ -0,0 +1,149 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from pathlib import Path + +import pytest + +from cloudai import TestRun +from cloudai.core import METRIC_ERROR +from cloudai.systems.slurm.slurm_system import SlurmSystem +from cloudai.workloads.megatron_run import ( + MegatronRunCmdArgs, + MegatronRunReportGenerationStrategy, + MegatronRunTestDefinition, +) + + +@pytest.fixture +def megatron_run_tr(tmp_path: Path) -> TestRun: + test = MegatronRunTestDefinition( + name="megatron_run", + description="desc", + test_template_name="t", + cmd_args=MegatronRunCmdArgs(docker_image_url="http://url", run_script=Path(__file__)), + ) + tr = TestRun(name="megatron_run_test", test=test, num_nodes=1, nodes=[], output_path=tmp_path) + + stdout_content = ( + "[2026-01-16 07:32:24] iteration 5/ 100 | consumed samples: 10240 | " + "elapsed time per iteration (ms): 15800.0 | throughput per GPU (TFLOP/s/GPU): 490.0 | " + "learning rate: 4.134000E-07 | global batch size: 2048 | lm loss: 1.344240E+01 | " + "seq_load_balancing_loss: 1.000203E+00 | loss scale: 1.0 | grad norm: 2.870 | " + "num zeros: 1174412544.0 | params norm: 8660.607 | " + "number of skipped iterations: 0 | number of nan iterations: 0 |\n" + "[2026-01-16 07:32:39] iteration 6/ 100 | consumed samples: 12288 | " + "elapsed time per iteration (ms): 15639.0 | throughput per GPU (TFLOP/s/GPU): 494.6 | " + "learning rate: 4.180800E-07 | global batch size: 2048 | lm loss: 1.342407E+01 | " + "seq_load_balancing_loss: 1.000202E+00 | loss scale: 1.0 | grad norm: 2.867 | " + "num zeros: 1174412672.0 | params norm: 8660.606 | " + "number of skipped iterations: 0 | number of nan iterations: 0 |\n" + "[2026-01-16 07:32:54] iteration 7/ 100 | consumed samples: 14336 | " + "elapsed time per iteration (ms): 15448.5 | throughput per GPU (TFLOP/s/GPU): 500.6 | " + "learning rate: 4.227600E-07 | global batch size: 2048 | lm loss: 1.340574E+01 | " + "seq_load_balancing_loss: 1.000201E+00 | loss scale: 1.0 | grad norm: 2.864 | " + "num zeros: 1174412800.0 | params norm: 8660.605 | " + "number of skipped iterations: 0 | number of nan iterations: 0 |\n" + ) + (tr.output_path / "stdout.txt").write_text(stdout_content) + + return tr + + +@pytest.fixture +def megatron_run_tr_no_data(tmp_path: Path) -> TestRun: + test = MegatronRunTestDefinition( + name="megatron_run", + description="desc", + test_template_name="t", + cmd_args=MegatronRunCmdArgs(docker_image_url="http://url", run_script=Path(__file__)), + ) + tr = TestRun(name="megatron_run_test", test=test, num_nodes=1, nodes=[], output_path=tmp_path) + + stdout_content = """ +Some random log output without iteration metrics +Starting training... 
+""" + (tr.output_path / "stdout.txt").write_text(stdout_content) + + return tr + + +def test_megatron_run_can_handle_directory(slurm_system: SlurmSystem, megatron_run_tr: TestRun) -> None: + strategy = MegatronRunReportGenerationStrategy(slurm_system, megatron_run_tr) + assert strategy.can_handle_directory() + + +def test_megatron_run_cannot_handle_directory_without_iteration_data( + slurm_system: SlurmSystem, megatron_run_tr_no_data: TestRun +) -> None: + strategy = MegatronRunReportGenerationStrategy(slurm_system, megatron_run_tr_no_data) + assert not strategy.can_handle_directory() + + +def test_megatron_run_extract_and_generate_report(slurm_system: SlurmSystem, megatron_run_tr: TestRun) -> None: + strategy = MegatronRunReportGenerationStrategy(slurm_system, megatron_run_tr) + strategy.generate_report() + report_path = megatron_run_tr.output_path / "megatron_run_report.txt" + assert report_path.is_file() + content = report_path.read_text() + assert "Iteration Time (ms)" in content + assert "TFLOP/s per GPU" in content + assert "avg:" in content + assert "median:" in content + assert "min:" in content + assert "max:" in content + assert "std:" in content + + +def test_megatron_run_get_metric_iteration_time(slurm_system: SlurmSystem, megatron_run_tr: TestRun) -> None: + strategy = MegatronRunReportGenerationStrategy(slurm_system, megatron_run_tr) + # Expected: avg of [15800.0, 15639.0, 15448.5] + expected_avg = (15800.0 + 15639.0 + 15448.5) / 3 + metric = strategy.get_metric("iteration-time") + assert abs(metric - expected_avg) < 0.1 + + +def test_megatron_run_get_metric_default(slurm_system: SlurmSystem, megatron_run_tr: TestRun) -> None: + strategy = MegatronRunReportGenerationStrategy(slurm_system, megatron_run_tr) + # Default should return iteration-time + expected_avg = (15800.0 + 15639.0 + 15448.5) / 3 + metric = strategy.get_metric("default") + assert abs(metric - expected_avg) < 0.1 + + +def test_megatron_run_get_metric_tflops(slurm_system: SlurmSystem, megatron_run_tr: TestRun) -> None: + strategy = MegatronRunReportGenerationStrategy(slurm_system, megatron_run_tr) + # Expected: avg of [490.0, 494.6, 500.6] + expected_avg = (490.0 + 494.6 + 500.6) / 3 + metric = strategy.get_metric("tflops-per-gpu") + assert abs(metric - expected_avg) < 0.1 + + +def test_megatron_run_get_metric_invalid(slurm_system: SlurmSystem, megatron_run_tr: TestRun) -> None: + strategy = MegatronRunReportGenerationStrategy(slurm_system, megatron_run_tr) + metric = strategy.get_metric("invalid-metric") + assert metric == METRIC_ERROR + + +def test_megatron_run_get_metric_no_data(slurm_system: SlurmSystem, megatron_run_tr_no_data: TestRun) -> None: + strategy = MegatronRunReportGenerationStrategy(slurm_system, megatron_run_tr_no_data) + metric = strategy.get_metric("iteration-time") + assert metric == METRIC_ERROR + + +def test_megatron_run_metrics_class_var() -> None: + assert MegatronRunReportGenerationStrategy.metrics == ["default", "iteration-time", "tflops-per-gpu"] From c2570222b2b7748f0f5e92b6db2d2e308cc61e30 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Thu, 22 Jan 2026 10:59:34 +0200 Subject: [PATCH 3/7] Fix copyright year --- src/cloudai/registration.py | 2 +- src/cloudai/workloads/megatron_run/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cloudai/registration.py b/src/cloudai/registration.py index d44a273cc..3b5bfc9af 100644 --- a/src/cloudai/registration.py +++ b/src/cloudai/registration.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & 
AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/cloudai/workloads/megatron_run/__init__.py b/src/cloudai/workloads/megatron_run/__init__.py index 473203447..1f4f1fec9 100644 --- a/src/cloudai/workloads/megatron_run/__init__.py +++ b/src/cloudai/workloads/megatron_run/__init__.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); From 27edf70239a8deafdbe428c742c28b74b37e4dd9 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Thu, 22 Jan 2026 11:18:02 +0200 Subject: [PATCH 4/7] Fix tests --- tests/test_test_scenario.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_test_scenario.py b/tests/test_test_scenario.py index c2af1373b..007acc100 100644 --- a/tests/test_test_scenario.py +++ b/tests/test_test_scenario.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -53,6 +53,7 @@ from cloudai.workloads.megatron_run import ( CheckpointTimingReportGenerationStrategy, MegatronRunCmdArgs, + MegatronRunReportGenerationStrategy, MegatronRunTestDefinition, ) from cloudai.workloads.nccl_test import ( @@ -481,7 +482,10 @@ def test_default_reporters_size(self): (DeepEPTestDefinition, {DeepEPReportGenerationStrategy}), (GPTTestDefinition, {JaxToolboxReportGenerationStrategy}), (GrokTestDefinition, {JaxToolboxReportGenerationStrategy}), - (MegatronRunTestDefinition, {CheckpointTimingReportGenerationStrategy}), + ( + MegatronRunTestDefinition, + {CheckpointTimingReportGenerationStrategy, MegatronRunReportGenerationStrategy}, + ), (MegatronBridgeTestDefinition, {MegatronBridgeReportGenerationStrategy}), (NCCLTestDefinition, {NcclTestPerformanceReportGenerationStrategy}), (NeMoLauncherTestDefinition, {NeMoLauncherReportGenerationStrategy}), From 347aba8d7c77c9473c850f763aa04f455cbddb30 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Thu, 22 Jan 2026 17:25:28 +0200 Subject: [PATCH 5/7] Change report to csv format --- .../report_generation_strategy.py | 63 ++++++++----------- ...megatron_run_report_generation_strategy.py | 41 +++++++++--- 2 files changed, 57 insertions(+), 47 deletions(-) diff --git a/src/cloudai/workloads/megatron_run/report_generation_strategy.py b/src/cloudai/workloads/megatron_run/report_generation_strategy.py index 76a2eafff..33025a101 100644 --- a/src/cloudai/workloads/megatron_run/report_generation_strategy.py +++ b/src/cloudai/workloads/megatron_run/report_generation_strategy.py @@ -16,6 +16,7 @@ from __future__ import annotations +import csv import logging import re from pathlib import Path @@ -135,49 +136,35 @@ def generate_report(self) -> None: ) return - summary_file = self.test_run.output_path / "megatron_run_report.txt" + report_file = self.test_run.output_path / "megatron_run_report.csv" if not iter_times_ms: - with 
summary_file.open("w") as f: - f.write("MegatronRun report\n") - f.write("No iteration timing lines were found.\n\n") - f.write("Searched file:\n") - f.write(f" - {log_file}\n") - logging.warning("No iteration metrics found under %s (wrote %s)", self.test_run.output_path, summary_file) + with report_file.open("w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["metric_type", "avg", "median", "min", "max", "std"]) + writer.writerow(["error: No iteration timing lines were found.", "", "", "", "", ""]) + logging.warning("No iteration metrics found under %s (wrote %s)", self.test_run.output_path, report_file) return - iter_stats = { - "avg": mean(iter_times_ms), - "median": median(iter_times_ms), - "min": min(iter_times_ms), - "max": max(iter_times_ms), - "std": pstdev(iter_times_ms) if len(iter_times_ms) > 1 else 0.0, - } + iter_avg = mean(iter_times_ms) + iter_median = median(iter_times_ms) + iter_min = min(iter_times_ms) + iter_max = max(iter_times_ms) + iter_std = pstdev(iter_times_ms) if len(iter_times_ms) > 1 else 0.0 + if gpu_tflops: - tflops_stats = { - "avg": mean(gpu_tflops), - "median": median(gpu_tflops), - "min": min(gpu_tflops), - "max": max(gpu_tflops), - "std": pstdev(gpu_tflops) if len(gpu_tflops) > 1 else 0.0, - } + tflops_avg = mean(gpu_tflops) + tflops_median = median(gpu_tflops) + tflops_min = min(gpu_tflops) + tflops_max = max(gpu_tflops) + tflops_std = pstdev(gpu_tflops) if len(gpu_tflops) > 1 else 0.0 else: - tflops_stats = {"avg": 0.0, "median": 0.0, "min": 0.0, "max": 0.0, "std": 0.0} - - with summary_file.open("w") as f: - f.write(f"Source log: {log_file}\n\n") - f.write("Iteration Time (ms)\n") - f.write(f" avg: {iter_stats['avg']}\n") - f.write(f" median: {iter_stats['median']}\n") - f.write(f" min: {iter_stats['min']}\n") - f.write(f" max: {iter_stats['max']}\n") - f.write(f" std: {iter_stats['std']}\n") - f.write("\n") - f.write("TFLOP/s per GPU\n") - f.write(f" avg: {tflops_stats['avg']}\n") - f.write(f" median: {tflops_stats['median']}\n") - f.write(f" min: {tflops_stats['min']}\n") - f.write(f" max: {tflops_stats['max']}\n") - f.write(f" std: {tflops_stats['std']}\n") + tflops_avg = tflops_median = tflops_min = tflops_max = tflops_std = 0.0 + + with report_file.open("w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["metric_type", "avg", "median", "min", "max", "std"]) + writer.writerow(["iteration_time_ms", iter_avg, iter_median, iter_min, iter_max, iter_std]) + writer.writerow(["tflops_per_gpu", tflops_avg, tflops_median, tflops_min, tflops_max, tflops_std]) def get_metric(self, metric: str) -> float: if metric not in {"default", "iteration-time", "tflops-per-gpu"}: diff --git a/tests/report_generation_strategy/test_megatron_run_report_generation_strategy.py b/tests/report_generation_strategy/test_megatron_run_report_generation_strategy.py index e9c2aa5a6..d472b6648 100644 --- a/tests/report_generation_strategy/test_megatron_run_report_generation_strategy.py +++ b/tests/report_generation_strategy/test_megatron_run_report_generation_strategy.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import csv from pathlib import Path import pytest @@ -97,16 +98,38 @@ def test_megatron_run_cannot_handle_directory_without_iteration_data( def test_megatron_run_extract_and_generate_report(slurm_system: SlurmSystem, megatron_run_tr: TestRun) -> None: strategy = MegatronRunReportGenerationStrategy(slurm_system, megatron_run_tr) strategy.generate_report() - report_path = megatron_run_tr.output_path / "megatron_run_report.txt" + report_path = megatron_run_tr.output_path / "megatron_run_report.csv" assert report_path.is_file() - content = report_path.read_text() - assert "Iteration Time (ms)" in content - assert "TFLOP/s per GPU" in content - assert "avg:" in content - assert "median:" in content - assert "min:" in content - assert "max:" in content - assert "std:" in content + + with report_path.open() as f: + reader = csv.DictReader(f) + rows = list(reader) + + # Should have 2 rows: iteration_time_ms and tflops_per_gpu + assert len(rows) == 2 + + expected_headers = {"metric_type", "avg", "median", "min", "max", "std"} + assert set(rows[0].keys()) == expected_headers + + data = {row["metric_type"]: row for row in rows} + + # Verify iteration_time_ms stats + assert "iteration_time_ms" in data + iter_stats = data["iteration_time_ms"] + expected_iter_avg = (15800.0 + 15639.0 + 15448.5) / 3 + assert abs(float(iter_stats["avg"]) - expected_iter_avg) < 0.1 + assert abs(float(iter_stats["median"]) - 15639.0) < 0.1 + assert abs(float(iter_stats["min"]) - 15448.5) < 0.1 + assert abs(float(iter_stats["max"]) - 15800.0) < 0.1 + + # Verify tflops_per_gpu stats + assert "tflops_per_gpu" in data + tflops_stats = data["tflops_per_gpu"] + expected_tflops_avg = (490.0 + 494.6 + 500.6) / 3 + assert abs(float(tflops_stats["avg"]) - expected_tflops_avg) < 0.1 + assert abs(float(tflops_stats["median"]) - 494.6) < 0.1 + assert abs(float(tflops_stats["min"]) - 490.0) < 0.1 + assert abs(float(tflops_stats["max"]) - 500.6) < 0.1 def test_megatron_run_get_metric_iteration_time(slurm_system: SlurmSystem, megatron_run_tr: TestRun) -> None: From 0efbc49f021b099fe3a9b17811b79671c4357105 Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Thu, 22 Jan 2026 17:26:37 +0200 Subject: [PATCH 6/7] Skip first 20 iters instead of keeping last 10 --- .../workloads/megatron_run/report_generation_strategy.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cloudai/workloads/megatron_run/report_generation_strategy.py b/src/cloudai/workloads/megatron_run/report_generation_strategy.py index 33025a101..df9e64bf1 100644 --- a/src/cloudai/workloads/megatron_run/report_generation_strategy.py +++ b/src/cloudai/workloads/megatron_run/report_generation_strategy.py @@ -114,10 +114,10 @@ def _extract(self, log_path: Path) -> tuple[list[float], list[float]]: except (ValueError, TypeError): logging.debug("Failed to parse iteration metrics line: %s", line.rstrip("\n")) - # Keep only the last 10 iterations for statistics (to exclude warmup) - if len(iter_times_ms) > 10: - iter_times_ms = iter_times_ms[-10:] - gpu_tflops = gpu_tflops[-10:] + # Skip the first 20 iterations for statistics (to exclude warmup) + if len(iter_times_ms) > 20: + iter_times_ms = iter_times_ms[20:] + gpu_tflops = gpu_tflops[20:] return iter_times_ms, gpu_tflops def _get_extracted_data(self) -> tuple[Path | None, list[float], list[float]]: From 27ac52a41e4ad436212c4980ca69b292748a463d Mon Sep 17 00:00:00 2001 From: Juntao Wang Date: Fri, 23 Jan 2026 10:05:33 +0200 Subject: [PATCH 7/7] Fix overwrite behavior between test and scenario configuration --- 
src/cloudai/models/scenario.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cloudai/models/scenario.py b/src/cloudai/models/scenario.py index 276cac2e5..c0c5abf63 100644 --- a/src/cloudai/models/scenario.py +++ b/src/cloudai/models/scenario.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -102,7 +102,7 @@ def tdef_model_dump(self, by_alias: bool) -> dict: "test_template_name": self.test_template_name, "agent": self.agent, "agent_steps": self.agent_steps, - "agent_metrics": self.agent_metrics, + "agent_metrics": self.agent_metrics if "agent_metrics" in self.model_fields_set else None, "agent_reward_function": self.agent_reward_function, "extra_container_mounts": self.extra_container_mounts, "extra_env_vars": self.extra_env_vars if self.extra_env_vars else None,
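
---

Note (not part of the patches above): as a quick illustration of what the report parsing added in this series does, the standalone sketch below applies the same ITERATION_REGEX and the warmup handling from patch 6 (skip the first 20 iterations) to one of the stdout lines used in the test fixture. The regex and list handling mirror report_generation_strategy.py; the sample line and the final print are illustrative only.

# Minimal sketch mirroring the extraction logic in MegatronRunReportGenerationStrategy.
# Only ITERATION_REGEX and the warmup-skip are taken from the patches; the rest is a
# hypothetical driver for illustration.
import re
from statistics import mean

ITERATION_REGEX = re.compile(
    r"elapsed time per iteration \(ms\):\s*([0-9]+(?:\.[0-9]+)?)"
    r".*?"
    r"throughput per GPU \(TFLOP/s/GPU\):\s*([0-9]+(?:\.[0-9]+)?)",
    re.IGNORECASE,
)

sample = (
    "[2026-01-16 07:32:39] iteration 6/ 100 | consumed samples: 12288 | "
    "elapsed time per iteration (ms): 15639.0 | "
    "throughput per GPU (TFLOP/s/GPU): 494.6 | learning rate: 4.180800E-07 |"
)

iter_times_ms: list[float] = []
gpu_tflops: list[float] = []
for line in [sample]:  # in the strategy this iterates over stdout.txt
    m = ITERATION_REGEX.search(line)
    if m:
        iter_times_ms.append(float(m.group(1)))
        gpu_tflops.append(float(m.group(2)))

# Patch 6: drop the first 20 iterations as warmup before computing statistics.
if len(iter_times_ms) > 20:
    iter_times_ms = iter_times_ms[20:]
    gpu_tflops = gpu_tflops[20:]

print(mean(iter_times_ms), mean(gpu_tflops))  # 15639.0 494.6

On a full training log the same loop feeds the avg/median/min/max/std rows written to megatron_run_report.csv and the averages returned by get_metric for "iteration-time" and "tflops-per-gpu".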