From 63f87e0ca4f67f469b02ebad2c232a72f3b97401 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 19 Feb 2025 22:43:25 +0000 Subject: [PATCH 01/64] [Automated Commit] Format Codebase --- compliance/nvidia/TEST01/verify_performance.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/compliance/nvidia/TEST01/verify_performance.py b/compliance/nvidia/TEST01/verify_performance.py index cc400c8ed7..4b527730b1 100644 --- a/compliance/nvidia/TEST01/verify_performance.py +++ b/compliance/nvidia/TEST01/verify_performance.py @@ -54,12 +54,14 @@ def main(): continue if ref_mode == "SingleStream": - if re.match(".*Early stopping (90th|90.0th|99.9th) percentile estimate", line): + if re.match( + ".*Early stopping (90th|90.0th|99.9th) percentile estimate", line): ref_score = line.split(": ", 1)[1].strip() continue if ref_mode == "MultiStream": - if re.match(".*Early stopping (99th|99.0th) percentile estimate", line): + if re.match( + ".*Early stopping (99th|99.0th) percentile estimate", line): ref_score = line.split(": ", 1)[1].strip() continue @@ -96,7 +98,8 @@ def main(): continue if test_mode == "MultiStream": - if re.match(".*Early stopping (99th|99.0th) percentile estimate", line): + if re.match( + ".*Early stopping (99th|99.0th) percentile estimate", line): test_score = line.split(": ", 1)[1].strip() continue From ec284d277523e5409694b431a94ac0601ab6fc3b Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Mon, 24 Feb 2025 16:37:53 +0000 Subject: [PATCH 02/64] Updated tags for submission checker command in docs --- docs/submission/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/submission/index.md b/docs/submission/index.md index da30c18350..6a6bbfb2f2 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -184,14 +184,14 @@ Once you have all the results on the system, you can upload them to the MLCommon === "via CLI" You can do the following command which will run the submission checker and upload the results to the MLCommons submission server ``` - mlcr run,submission,checker,inference \ + mlcr run,mlperf,submission,checker,inference \ --submitter_id=<> \ --submission_dir= ``` === "via Browser" You can do the following command to generate the final submission tar file and then upload to the [MLCommons Submission UI](https://submissions-ui.mlcommons.org/submission). ``` - mlcr run,submission,checker \ + mlcr run,mlperf,submission,checker,inference \ --submission_dir= \ --tar=yes \ --submission_tar_file=mysubmission.tar.gz From 5335553c19fe0aa464c7c466325c48fd60c4b6df Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 25 Feb 2025 10:44:55 +0000 Subject: [PATCH 03/64] Update mobilenets docs --- .../image_classification/mobilenets.md | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/docs/benchmarks/image_classification/mobilenets.md b/docs/benchmarks/image_classification/mobilenets.md index 7e4605b4b0..9501521b32 100644 --- a/docs/benchmarks/image_classification/mobilenets.md +++ b/docs/benchmarks/image_classification/mobilenets.md @@ -23,10 +23,10 @@ Mobilenet models are not official MLPerf models and so cannot be used for a Clos mlcr run,mobilenet-models,_tflite,_mobilenet-v2 \ --adr.compiler.tags=gcc ``` -=== "Mobilenet-V2" - ### Mobilenet V2 +=== "Mobilenet-V3" + ### Mobilenet V3 ```bash - mlcr run,mobilenet-models,_tflite,_mobilenet-v2 \ + mlcr run,mobilenet-models,_tflite,_mobilenet-v3 \ --adr.compiler.tags=gcc ``` === "Mobilenets" @@ -41,6 +41,12 @@ Mobilenet models are not official MLPerf models and so cannot be used for a Clos mlcr run,mobilenet-models,_tflite,_efficientnet \ --adr.compiler.tags=gcc ``` +=== "Mobilenets and Efficientnet" + ### Mobilenets and Efficientnet + ```bash + mlcr run,mobilenet-models,_tflite \ + --adr.compiler.tags=gcc + ``` ## ARMNN Backend === "Mobilenet-V1" @@ -55,7 +61,7 @@ Mobilenet models are not official MLPerf models and so cannot be used for a Clos mlcr run,mobilenet-models,_tflite,_armnn,_mobilenet-v2 \ --adr.compiler.tags=gcc ``` -=== "Mobilenet-V2" +=== "Mobilenet-V3" ### Mobilenet V2 ```bash mlcr run,mobilenet-models,_tflite,_armnn,_mobilenet-v2 \ @@ -73,4 +79,11 @@ Mobilenet models are not official MLPerf models and so cannot be used for a Clos mlcr run,mobilenet-models,_tflite,_armnn,_efficientnet \ --adr.compiler.tags=gcc ``` +=== "Mobilenets and Efficientnet" + ### Mobilenets and Efficientnet + ```bash + mlcr run,mobilenet-models,_tflite,_armnn \ + --adr.compiler.tags=gcc + ``` + From b9767aa16544a4c759c32767c5ceeb9546b9b195 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 25 Feb 2025 16:55:02 +0000 Subject: [PATCH 04/64] Update main.py --- main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index 2ccfab2958..a8fcb30a49 100755 --- a/main.py +++ b/main.py @@ -28,11 +28,13 @@ def mlperf_inference_implementation_readme( content = "" execution_envs = ["Docker", "Native"] - code_version = "r4.1-dev" + code_version = "r5.0-dev" implementation_run_options = [] if model == "rnnt": code_version = "r4.0" + elif implementation == "intel": + code_version = "r4.1-dev" if implementation == "reference": # Tip From f42aeeb8a96c58dd8ffff8cecde1e08aa5f10d41 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 25 Feb 2025 16:56:51 +0000 Subject: [PATCH 05/64] Update main.py --- main.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/main.py b/main.py index a8fcb30a49..0020d8c4ec 100755 --- a/main.py +++ b/main.py @@ -341,12 +341,7 @@ def mlperf_inference_implementation_readme( and framework.lower() == "deepsparse" ): run_suffix += f"{cur_space3}You can use any model from [NeuralMagic sparse zoo](https://sparsezoo.neuralmagic.com/?modelSet=computer_vision&architectures=resnet_v1) (trained on Imagenet dataset) as --nm_model_zoo_stub" - if ( - "bert" in model.lower() - and framework.lower() == "deepsparse" - ): - run_suffix += "You can use any model from [NeuralMagic sparse zoo](https://sparsezoo.neuralmagic.com/?modelSet=computer_vision&architectures=resnet_v1) (trained on Imagenet dataset) as --nm_model_zoo_stub" - if ( + elif ( "bert" in model.lower() and framework.lower() == "deepsparse" ): From c699ce30ad313a5b9e5f91a9874cf1f4e917772e Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Tue, 25 Feb 2025 22:38:06 +0530 Subject: [PATCH 06/64] update dataset download commands - waymo calib (#2130) --- .../get-pointpainting-data.md | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/docs/benchmarks/automotive/3d_object_detection/get-pointpainting-data.md b/docs/benchmarks/automotive/3d_object_detection/get-pointpainting-data.md index 0a1e65c8ea..6331b3535b 100644 --- a/docs/benchmarks/automotive/3d_object_detection/get-pointpainting-data.md +++ b/docs/benchmarks/automotive/3d_object_detection/get-pointpainting-data.md @@ -9,11 +9,21 @@ hide: > **Note:** By default, the waymo dataset is downloaded from the mlcommons official drive. One has to accept the [MLCommons Waymo Open Dataset EULA](https://waymo.mlcommons.org/) to access the dataset files. -The benchmark implementation run command will automatically download the preprocessed dataset. In case you want to download only the datasets, you can use the below command. +The benchmark implementation run command will automatically download the preprocessed dataset. In case you want to download only the datasets, you can use the below commands. -```bash -mlcr get,dataset,waymo -j -``` +=== "Validation" + + ### Get Validation Dataset + ``` + mlcr get,dataset,waymo -j + ``` + +=== "Calibration" + + ### Get Calibration Dataset + ``` + mlcr get,dataset,waymo,calibration -j + ``` - `--outdirname=` could be provided to download the dataset to a specific location. From 7bf2c5e6bc49e4c814f93e92d77189b0fd83d7ba Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 13 Mar 2025 18:47:52 +0000 Subject: [PATCH 07/64] Merge from Master (#2155) * Update submission_checker.py | Fix open model unit in Results (#2144) * Add Llama 3.1 to special unit dict (#2150) --------- Co-authored-by: Pablo Gonzalez --- tools/submission/submission_checker.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index c3c30c14d7..7ac74be40d 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -1914,50 +1914,38 @@ def log_result( notes = notes + system_json.get("sw_notes") special_unit_dict = { "gptj-99": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", "Offline": "Tokens/s", "Server": "Tokens/s", }, "gptj-99.9": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", "Offline": "Tokens/s", "Server": "Tokens/s", }, "llama2-70b-99": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", "Offline": "Tokens/s", "Server": "Tokens/s", }, "llama2-70b-99.9": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", "Offline": "Tokens/s", "Server": "Tokens/s", }, "llama2-70b-interactive-99": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", "Offline": "Tokens/s", "Server": "Tokens/s", }, "llama2-70b-interactive-99.9": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", + "Offline": "Tokens/s", + "Server": "Tokens/s", + }, + "llama3.1-405b": { "Offline": "Tokens/s", "Server": "Tokens/s", }, "mixtral-8x7b": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", "Offline": "Tokens/s", "Server": "Tokens/s", }, "llama3.1-405b": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", "Offline": "Tokens/s", "Server": "Tokens/s", }, @@ -1977,7 +1965,7 @@ def log_result( if config.version == "v4.0": unit = unit_dict[scenario_fixed] else: - unit = special_unit_dict.get(model_name, unit_dict)[scenario_fixed] + unit = special_unit_dict.get(mlperf_model, unit_dict).get(scenario_fixed, unit_dict[scenario_fixed]) power_unit = power_unit_dict[scenario_fixed] if (power_metric <= 0) or ( From 2a73202e8bdf0028dd7f0e78f4f2d569d6fb1561 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 13 Mar 2025 18:48:21 +0000 Subject: [PATCH 08/64] [Automated Commit] Format Codebase --- tools/submission/submission_checker.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 7ac74be40d..0694273b19 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -1702,7 +1702,7 @@ def get_power_metric(config, scenario_fixed, log_path, is_valid, res): samples_per_query = 8 if (scenario_fixed in ["MultiStream"] - ) and scenario in ["SingleStream"]: + ) and scenario in ["SingleStream"]: power_metric = ( avg_power * power_duration * samples_per_query * 1000 / num_queries ) @@ -1965,7 +1965,9 @@ def log_result( if config.version == "v4.0": unit = unit_dict[scenario_fixed] else: - unit = special_unit_dict.get(mlperf_model, unit_dict).get(scenario_fixed, unit_dict[scenario_fixed]) + unit = special_unit_dict.get( + mlperf_model, unit_dict).get( + scenario_fixed, unit_dict[scenario_fixed]) power_unit = power_unit_dict[scenario_fixed] if (power_metric <= 0) or ( From 2fb105769406a98ffcd587c3b49c303ba6a9de4b Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Tue, 22 Apr 2025 18:53:22 +0530 Subject: [PATCH 09/64] Inference docs - Update model and dataset download commands (#2153) * Update llama2 70b model download docs * changes in model and dataset download commands --- docs/benchmarks/graph/get-rgat-data.md | 33 +++++++--- .../image_classification/get-resnet50-data.md | 38 +++++++---- docs/benchmarks/language/get-bert-data.md | 19 ++++++ docs/benchmarks/language/get-gptj-data.md | 14 +++- .../language/get-llama2-70b-data.md | 66 ++++++++++++++----- .../language/get-llama3_1-405b-data.md | 29 +++++--- .../language/get-mixtral-8x7b-data.md | 13 +++- .../medical_imaging/get-3d-unet-data.md | 30 +++++++-- .../object_detection/get-retinanet-data.md | 33 +++++++--- .../recommendation/get-dlrm-v2-data.md | 7 +- .../benchmarks/text_to_image/get-sdxl-data.md | 27 ++++++-- 11 files changed, 236 insertions(+), 73 deletions(-) diff --git a/docs/benchmarks/graph/get-rgat-data.md b/docs/benchmarks/graph/get-rgat-data.md index 6ab9515e59..bb719fea2e 100644 --- a/docs/benchmarks/graph/get-rgat-data.md +++ b/docs/benchmarks/graph/get-rgat-data.md @@ -9,22 +9,34 @@ hide: The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. -=== "Full Dataset" - R-GAT validation run uses the IGBH dataset consisting of 547,306,935 nodes and 5,812,005,639 edges. +=== "Validation" - ### Get Full Dataset - ``` - mlcr get,dataset,igbh,_full -j - ``` + === "Full Dataset" + R-GAT validation run uses the IGBH dataset consisting of 547,306,935 nodes and 5,812,005,639 edges. + + ### Get Full Dataset + ``` + mlcr get,dataset,igbh,_full -j + ``` -=== "Debug Dataset" - R-GAT debug run uses the IGBH debug dataset(tiny). + === "Debug Dataset" + R-GAT debug run uses the IGBH debug dataset(tiny). - ### Get Full Dataset + ### Get Full Dataset + ``` + mlcr get,dataset,igbh,_debug -j + ``` + +=== "Calibration" + The calibration dataset contains 5000 nodes from the training paper nodes of the IGBH dataset. IGBH `full` dataset would be downloaded for creating calibration dataset. + + ### Get Calibration Dataset ``` - mlcr get,dataset,igbh,_debug -j + mlcr get,dataset,igbh,_full,_calibration -j ``` +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -37,3 +49,4 @@ Get the Official MLPerf R-GAT Model mlcr get,ml-model,rgat -j ``` +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/image_classification/get-resnet50-data.md b/docs/benchmarks/image_classification/get-resnet50-data.md index 771571d5c7..9ecd25c1a5 100644 --- a/docs/benchmarks/image_classification/get-resnet50-data.md +++ b/docs/benchmarks/image_classification/get-resnet50-data.md @@ -9,25 +9,34 @@ hide: The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. -=== "Validation" - ResNet50 validation run uses the Imagenet 2012 validation dataset consisting of 50,000 images. +=== "Unprocessed" + === "Validation" + ResNet50 validation run uses the Imagenet 2012 validation dataset consisting of 50,000 images. - ### Get Validation Dataset - ``` - mlcr get,dataset,imagenet,validation -j - ``` -=== "Calibration" - ResNet50 calibration dataset consist of 500 images selected from the Imagenet 2012 validation dataset. There are 2 alternative options for the calibration dataset. + ### Get Validation Dataset + ``` + mlcr get,dataset,imagenet,validation -j + ``` + === "Calibration" + ResNet50 calibration dataset consist of 500 images selected from the Imagenet 2012 validation dataset. There are 2 alternative options for the calibration dataset. + + ### Get Calibration Dataset Using Option 1 + ``` + mlcr get,dataset,imagenet,calibration,_mlperf.option1 -j + ``` + ### Get Calibration Dataset Using Option 2 + ``` + mlcr get,dataset,imagenet,calibration,_mlperf.option2 -j + ``` +=== "Preprocessed" + ### Get ResNet50 preprocessed dataset - ### Get Calibration Dataset Using Option 1 - ``` - mlcr get,dataset,imagenet,calibration,_mlperf.option1 -j - ``` - ### Get Calibration Dataset Using Option 2 ``` - mlcr get,dataset,imagenet,calibration,_mlperf.option2 -j + mlcr get,dataset,image-classification,imagenet,preprocessed,_pytorch -j ``` +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -46,3 +55,4 @@ Get the Official MLPerf ResNet50 Model mlcr get,ml-model,resnet50,_onnx -j ``` +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/language/get-bert-data.md b/docs/benchmarks/language/get-bert-data.md index 430031f319..ab3ba9b537 100644 --- a/docs/benchmarks/language/get-bert-data.md +++ b/docs/benchmarks/language/get-bert-data.md @@ -17,6 +17,24 @@ The benchmark implementation run command will automatically download the validat mlcr get,dataset,squad,validation -j ``` +=== "Calibration" + + === "Calibration Set 1" + + ### Get Calibration Dataset + ``` + mlcr get,dataset,squad,_calib1 -j + ``` + + === "Calibration Set 2" + + ### Get Calibration Dataset + ``` + mlcr get,dataset,squad,_calib2 -j + ``` + +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -41,3 +59,4 @@ Get the Official MLPerf Bert-Large Model mlcr get,ml-model,bert-large,_tensorflow -j ``` +- `--outdirname=` could be provided to download the model to a specific location. diff --git a/docs/benchmarks/language/get-gptj-data.md b/docs/benchmarks/language/get-gptj-data.md index 34140598e9..60e2568b6e 100644 --- a/docs/benchmarks/language/get-gptj-data.md +++ b/docs/benchmarks/language/get-gptj-data.md @@ -14,9 +14,19 @@ The benchmark implementation run command will automatically download the validat ### Get Validation Dataset ``` - mlcr get,dataset,cnndm,validation -j + mlcr get,dataset,cnndm,_validation -j ``` +=== "Calibration" + GPT-J calibration dataset is extracted from the CNNDM dataset. + + ### Get Validation Dataset + ``` + mlcr get,dataset,cnndm,_calibration -j + ``` + +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -28,3 +38,5 @@ Get the Official MLPerf GPT-J Model ``` mlcr get,ml-model,gptj,_pytorch -j ``` + +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/language/get-llama2-70b-data.md b/docs/benchmarks/language/get-llama2-70b-data.md index 2a31370574..ce7cd996eb 100644 --- a/docs/benchmarks/language/get-llama2-70b-data.md +++ b/docs/benchmarks/language/get-llama2-70b-data.md @@ -9,27 +9,63 @@ hide: The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. -=== "Validation" - LLAMA2-70b validation run uses the Open ORCA dataset. +=== "Preprocessed Dataset" - ### Get Validation Dataset - ``` - mlcr get,dataset,openorca,validation -j - ``` + === "Validation" + LLAMA2-70b validation run uses the Open ORCA dataset. + + ### Get Preprocessed Validation Dataset + ``` + mlcr get,dataset,preprocessed,openorca,_validation -j + ``` + + === "Calibration" + + ### Get Preprocessed Calibration dataset + ``` + mlcr get,dataset,preprocessed,openorca,_calibration -j + ``` + +=== "Unprocessed Dataset" + + === "Validation" + LLAMA2-70b validation run uses the Open ORCA dataset. + + ### Get Unprocessed Validation Dataset + ``` + mlcr get,dataset,openorca,_validation -j + ``` + + === "Calibration" + + ### Get Unprocessed Validation Dataset + ``` + mlcr get,dataset,openorca,_validation -j + ``` + +- `--outdirname=` could be provided to download the dataset to a specific location. ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. -Get the Official MLPerf LLAMA2-70b Model - === "Pytorch" - ### Pytorch - ``` - mlcr get,ml-model,llama2-70b,_pytorch -j --outdirname= - ``` - -!!! tip + === "From MLCOMMONS Google Drive" + + > **Note:** One has to accept the [MLCommons Llama 2 License Confidentiality Notice](https://llama2.mlcommons.org/) to access the model files in MLCOMMONS Google Drive. + + ### Get the Official MLPerf LLAMA2-70B model from MLCOMMONS Google Drive + ``` + mlcr get,ml-model,llama2-70b,_pytorch -j + ``` + + === "From Hugging Face repo" + + > **Note:** Access to the HuggingFace model could be requested [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). - [Access Request Link](https://llama2.mlcommons.org/) for MLCommons members + ### Get model from HuggingFace repo + ``` + mlcr get,ml-model,llama2-70b,_hf --hf_token= -j + ``` +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/language/get-llama3_1-405b-data.md b/docs/benchmarks/language/get-llama3_1-405b-data.md index 62b7bd088a..ad05ca8610 100644 --- a/docs/benchmarks/language/get-llama3_1-405b-data.md +++ b/docs/benchmarks/language/get-llama3_1-405b-data.md @@ -23,18 +23,29 @@ The benchmark implementation run command will automatically download the validat mlcr get,dataset,mlperf,inference,llama3,_calibration --outdirname= -j ``` +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. -Get the Official MLPerf LLAMA3.1-405b Model - === "Pytorch" - ### Pytorch - ``` - mlcr get,ml-model,llama3 --outdirname= -j - ``` - -!!! tip + === "From MLCOMMONS Google Drive" + + > **Note:** One has to accept the [MLCommons Llama 3.1 License Confidentiality Notice](http://llama3-1.mlcommons.org/) to access the model files in MLCOMMONS Google Drive. + + ### Get the Official MLPerf LLAMA3.1-405B model from MLCOMMONS Google Drive + ``` + mlcr get,ml-model,llama3 -j + ``` + + === "From Hugging Face repo" + + > **Note:** Access to the HuggingFace model could be requested [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). + + ### Get model from HuggingFace repo + ``` + mlcr get,ml-model,llama3,_hf --hf_token= -j + ``` - [Access Request Link](https://llama3-1.mlcommons.org/) for MLCommons members +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/language/get-mixtral-8x7b-data.md b/docs/benchmarks/language/get-mixtral-8x7b-data.md index 81b90cdb57..cf5225843d 100644 --- a/docs/benchmarks/language/get-mixtral-8x7b-data.md +++ b/docs/benchmarks/language/get-mixtral-8x7b-data.md @@ -15,6 +15,15 @@ The benchmark implementation run command will automatically download the preproc mlcr get,dataset-mixtral,openorca-mbxp-gsm8k-combined -j ``` +=== "Calibration" + + ### Get Calibration Dataset + ``` + mlcr get,dataset-mixtral,openorca-mbxp-gsm8k-combined,_calibration -j + ``` + +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -25,4 +34,6 @@ Get the Official MLPerf MIXTRAL-8x7b Model ### Pytorch ``` mlcr get,ml-model,mixtral -j - ``` \ No newline at end of file + ``` + +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/medical_imaging/get-3d-unet-data.md b/docs/benchmarks/medical_imaging/get-3d-unet-data.md index d68b769209..9c77fdeaa2 100644 --- a/docs/benchmarks/medical_imaging/get-3d-unet-data.md +++ b/docs/benchmarks/medical_imaging/get-3d-unet-data.md @@ -9,19 +9,34 @@ hide: The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. -=== "Validation" - 3d-unet validation run uses the KiTS19 dataset performing [KiTS 2019](https://kits19.grand-challenge.org/) kidney tumor segmentation task - ### Get Validation Dataset(Original) - ``` - mlcr get,dataset,kits19,_validation -j - ``` +=== "Unprocessed Dataset" + + === "Validation" + 3d-unet validation run uses the KiTS19 dataset performing [KiTS 2019](https://kits19.grand-challenge.org/) kidney tumor segmentation task + + ### Get Validation Dataset + ``` + mlcr get,dataset,kits19,_validation -j + ``` + + === "Calibration" - ### Get Validation Dataset(Preprocessed) + ### Get Calibration Dataset + ``` + mlcr get,dataset,kits19,_calibration -j + ``` + +=== "Preprocessed Dataset" + + ### Get Preprocessed Validation Dataset ``` mlcr get,dataset,kits19,preprocessed -j ``` +- `--outdirname=` could be provided to download the dataset to a specific location. + + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -46,3 +61,4 @@ Get the Official MLPerf 3d-unet Model mlcr get,ml-model,3d-unet,_tensorflow -j ``` +- `--outdirname=` could be provided to download the model to a specific location. diff --git a/docs/benchmarks/object_detection/get-retinanet-data.md b/docs/benchmarks/object_detection/get-retinanet-data.md index 6cd677b4e1..6127eed541 100644 --- a/docs/benchmarks/object_detection/get-retinanet-data.md +++ b/docs/benchmarks/object_detection/get-retinanet-data.md @@ -9,20 +9,34 @@ hide: The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. -=== "Validation" - Retinanet validation run uses the OpenImages v6 MLPerf validation dataset resized to 800x800 and consisting of 24,576 images. +=== "Unprocessed" - ### Get Validation Dataset - ``` - mlcr get,dataset,openimages,_validation -j - ``` -=== "Calibration" - Retinanet calibration dataset consist of 500 images selected from the OpenImages v6 dataset. + === "Validation" + Retinanet validation run uses the OpenImages v6 MLPerf validation dataset resized to 800x800 and consisting of 24,576 images. + + ### Get Validation Dataset + ``` + mlcr get,dataset,openimages,_validation -j + ``` + + === "Calibration" + Retinanet calibration dataset consist of 500 images selected from the OpenImages v6 dataset. + ### Get OpenImages Calibration dataset + ``` + mlcr get,dataset,openimages,_calibration -j + ``` + +=== "Preprocessed" + + ### Get Preprocessed OpenImages dataset ``` - mlcr get,dataset,openimages,_calibration -j + get,dataset,object-detection,open-images,openimages,preprocessed,_validation -j ``` +- `--outdirname=` could be provided to download the dataset to a specific location. + + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -41,3 +55,4 @@ Get the Official MLPerf Retinanet Model mlcr get,ml-model,retinanet,_onnx -j ``` +- `--outdirname=` could be provided to download the model to a specific location. diff --git a/docs/benchmarks/recommendation/get-dlrm-v2-data.md b/docs/benchmarks/recommendation/get-dlrm-v2-data.md index bb35660b68..8505b31bf4 100644 --- a/docs/benchmarks/recommendation/get-dlrm-v2-data.md +++ b/docs/benchmarks/recommendation/get-dlrm-v2-data.md @@ -16,6 +16,9 @@ The benchmark implementation run command will automatically download the validat ``` mlcr get,dataset,criteo,_validation -j ``` + +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. @@ -25,6 +28,8 @@ Get the Official MLPerf DLRM v2 Model ### Pytorch ``` - mlcr get,ml-model,dlrm,_pytorch -j + mlcr get,ml-model,dlrm,_pytorch,_weight_sharded,_rclone -j ``` + +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/text_to_image/get-sdxl-data.md b/docs/benchmarks/text_to_image/get-sdxl-data.md index 6d79e331d1..7c5363415c 100644 --- a/docs/benchmarks/text_to_image/get-sdxl-data.md +++ b/docs/benchmarks/text_to_image/get-sdxl-data.md @@ -17,15 +17,30 @@ The benchmark implementation run command will automatically download the validat mlcr get,dataset,coco2014,_validation -j ``` +=== "Calibration" + + ### Get COCO2014 Calibration Dataset + ``` + mlcr get,dataset,coco2014,_calibration -j + ``` + +- `--outdirname=` could be provided to download the dataset to a specific location. + ## Model The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. Get the Official MLPerf Stable Diffusion Model === "Pytorch" - - ### Pytorch - ``` - mlcr get,ml-model,sdxl,_pytorch -j - ``` - + === "FP 16" + ### Pytorch + ``` + mlcr get,ml-model,sdxl,_pytorch,_fp16 -j + ``` + === "FP 32" + ### Pytorch + ``` + mlcr get,ml-model,sdxl,_pytorch,_fp32 -j + ``` + +- `--outdirname=` could be provided to download the model to a specific location. From d8048376f88d3ad6aabbaddd52018cea5263b117 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Tue, 22 Apr 2025 18:53:40 +0530 Subject: [PATCH 10/64] add powershell command to get result folder structure (#2156) --- docs/submission/index.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/docs/submission/index.md b/docs/submission/index.md index 6a6bbfb2f2..079a513854 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -56,9 +56,15 @@ Please refer to the [installation page](site:inference/install/) to install MLCF === "MLC automation based results" If you have followed the `mlcr` commands under the individual model pages in the [benchmarks](../index.md) directory, all the valid results will get aggregated to the `mlc cache` folder. The following command could be used to browse the structure of inference results folder generated by MLCFlow. ### Get results folder structure - ```bash - mlc find cache --tags=get,mlperf,inference,results,dir | xargs tree - ``` + + === "Unix Terminal" + ```bash + mlc find cache --tags=get,mlperf,inference,results,dir | xargs tree + ``` + === "Windows PowerShell" + ``` + mlc find cache --tags=get,mlperf,inference,results,dir | ForEach-Object { Get-ChildItem -Recurse $_ } + ``` Once all the results across all the models are ready you can use the following the below section to generate a valid submission tree compliant with the [MLPerf requirements](https://github.com/mlcommons/policies/blob/master/submission_rules.adoc#inference-1). From 5b65f7be79a703fa23034f0d406b2e6f1a6134eb Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 22 Apr 2025 17:00:11 +0000 Subject: [PATCH 11/64] [Automated Commit] Format Codebase --- tools/submission/submission_checker.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 41116e2620..edda676c9c 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -2094,7 +2094,8 @@ def log_result( if filter_submitter and submitter != filter_submitter: continue results_path = os.path.join(division, submitter, "results") - measurements_path = os.path.join(division, submitter, "measurements") + measurements_path = os.path.join( + division, submitter, "measurements") systems_path = os.path.join(division, submitter, "systems") if not os.path.exists(results_path): continue @@ -2200,7 +2201,8 @@ def log_result( extra_model_mapping = json.load(fp) if not config.skip_all_systems_with_results: - measurement_diff = list(set(list_dir(measurements_path)) - set(list_dir(results_path))) + measurement_diff = list( + set(list_dir(measurements_path)) - set(list_dir(results_path))) systems_diff = list( set( [ @@ -3173,7 +3175,7 @@ def main(): args.extra_model_benchmark_map, ignore_uncommited=args.submission_exceptions, skip_power_check=args.skip_power_check, - skip_all_systems_with_results = args.skip_all_systems_have_results_check + skip_all_systems_with_results=args.skip_all_systems_have_results_check ) if args.scenarios_to_skip: From 70fcbe0c8b4aeadedd7ba40c17b4077530e3019b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 10 Jun 2025 16:59:39 +0000 Subject: [PATCH 12/64] [Automated Commit] Format Codebase --- language/deepseek-r1/backends/__init__.py | 2 +- .../deepseek-r1/backends/pytorch_backend.py | 52 ++--- .../deepseek-r1/backends/sglang_backend.py | 168 +++++++++------ language/deepseek-r1/backends/utils.py | 173 +++++++++------- language/deepseek-r1/backends/vllm_backend.py | 33 +-- language/deepseek-r1/eval_accuracy.py | 74 ++++--- language/deepseek-r1/mlperf/__init__.py | 6 +- language/deepseek-r1/mlperf/base_sut.py | 28 +-- language/deepseek-r1/mlperf/offline_sut.py | 70 ++++--- language/deepseek-r1/mlperf/qsl.py | 25 +-- language/deepseek-r1/mlperf/server_sut.py | 100 +++++---- language/deepseek-r1/mlperf/utils.py | 69 ++++--- language/deepseek-r1/run_eval.py | 165 ++++++++------- language/deepseek-r1/run_eval_mpi.py | 99 +++++---- language/deepseek-r1/run_mlperf.py | 116 ++++++----- language/deepseek-r1/run_mlperf_mpi.py | 195 ++++++++++-------- language/deepseek-r1/utils/__init__.py | 2 +- .../deepseek-r1/utils/backend_registry.py | 11 +- language/deepseek-r1/utils/data_utils.py | 108 +++++----- language/deepseek-r1/utils/error_handling.py | 11 +- language/deepseek-r1/utils/runner_utils.py | 58 +++--- language/deepseek-r1/utils/tokenization.py | 87 ++++---- language/deepseek-r1/utils/validation.py | 44 ++-- tools/submission/submission_checker.py | 17 +- 24 files changed, 987 insertions(+), 726 deletions(-) diff --git a/language/deepseek-r1/backends/__init__.py b/language/deepseek-r1/backends/__init__.py index 61ad96a3f2..865ed3bd53 100644 --- a/language/deepseek-r1/backends/__init__.py +++ b/language/deepseek-r1/backends/__init__.py @@ -11,4 +11,4 @@ # to avoid dependency issues when only using certain backends __all__ = [ 'BaseBackend', -] \ No newline at end of file +] diff --git a/language/deepseek-r1/backends/pytorch_backend.py b/language/deepseek-r1/backends/pytorch_backend.py index c1e426185d..0742882bca 100644 --- a/language/deepseek-r1/backends/pytorch_backend.py +++ b/language/deepseek-r1/backends/pytorch_backend.py @@ -1,3 +1,17 @@ +from utils.validation import require_initialized, BackendNotInitializedError +from utils.backend_registry import get_backend_config +from .utils import get_cache_directory +from .base_backend import BaseBackend +from transformers import AutoTokenizer +import torch.distributed as dist +import torch +from pathlib import Path +import asyncio +from typing import Any, Dict, List, Optional +import logging +import json +from ref_dsinfer.inference.model import Transformer, ModelArgs +from safetensors.torch import load_model import os import sys @@ -6,23 +20,6 @@ 'REF_DSINFER_PATH', '/opt/ref_dsinfer/inference') sys.path.append(ref_dsinfer_path) -from safetensors.torch import load_model -from ref_dsinfer.inference.model import Transformer, ModelArgs -import json -import logging -from typing import Any, Dict, List, Optional -import asyncio -from pathlib import Path - -import torch -import torch.distributed as dist -from transformers import AutoTokenizer - -from .base_backend import BaseBackend -from .utils import get_cache_directory -from utils.backend_registry import get_backend_config -from utils.validation import require_initialized, BackendNotInitializedError - logger = logging.getLogger(__name__) @@ -115,8 +112,10 @@ def initialize(self) -> None: with torch.device(self.config['device']): self.model = Transformer(self.model_args) - # Load tokenizer (only rank 0 needs it for MLPerf, but all ranks need it for run_eval_mpi) - self.tokenizer = AutoTokenizer.from_pretrained(str(self.model_path), revision=self.config['model_revision']) + # Load tokenizer (only rank 0 needs it for MLPerf, but all ranks need + # it for run_eval_mpi) + self.tokenizer = AutoTokenizer.from_pretrained( + str(self.model_path), revision=self.config['model_revision']) # Load model weights checkpoint_file = self.model_path / \ @@ -133,7 +132,8 @@ def sample(self, logits: torch.Tensor, temperature: float) -> torch.Tensor: """Sample from logits with temperature.""" logits = logits / max(temperature, 1e-5) probs = torch.softmax(logits, dim=-1) - return probs.div_(torch.empty_like(probs).exponential_(1)).argmax(dim=-1) + return probs.div_(torch.empty_like( + probs).exponential_(1)).argmax(dim=-1) @torch.inference_mode() def _generate_batch( @@ -222,7 +222,8 @@ def _generate_batch( return completion_tokens @require_initialized - def generate(self, tokenized_prompts: List[List[int]], **kwargs) -> List[Dict[str, Any]]: + def generate( + self, tokenized_prompts: List[List[int]], **kwargs) -> List[Dict[str, Any]]: """ Generate responses for a list of pre-tokenized prompts. @@ -265,7 +266,8 @@ def generate(self, tokenized_prompts: List[List[int]], **kwargs) -> List[Dict[st return results @require_initialized - def generate_batch_distributed(self, batch_tokens: List[List[int]]) -> List[List[int]]: + def generate_batch_distributed( + self, batch_tokens: List[List[int]]) -> List[List[int]]: """ Generate tokens for a batch in distributed mode. @@ -296,7 +298,8 @@ def generate_batch_distributed(self, batch_tokens: List[List[int]]) -> List[List return [] @require_initialized - def generate_async(self, tokenized_prompts: List[List[int]], **kwargs) -> List[asyncio.Future]: + def generate_async( + self, tokenized_prompts: List[List[int]], **kwargs) -> List[asyncio.Future]: """ Generate responses asynchronously. @@ -331,7 +334,8 @@ async def extract_result(idx): return futures @require_initialized - def generate_batch_distributed_async(self, batch_tokens: List[List[int]]) -> asyncio.Future: + def generate_batch_distributed_async( + self, batch_tokens: List[List[int]]) -> asyncio.Future: """ Generate tokens for a batch in distributed mode asynchronously. diff --git a/language/deepseek-r1/backends/sglang_backend.py b/language/deepseek-r1/backends/sglang_backend.py index 06cf074a96..10be6e1dcd 100644 --- a/language/deepseek-r1/backends/sglang_backend.py +++ b/language/deepseek-r1/backends/sglang_backend.py @@ -66,11 +66,12 @@ def __init__(self, config: Dict[str, Any] = None): # Log monitoring self._log_monitor = None - + # Shared semaphore for async concurrency control self._async_semaphore = None - # Configure logging to suppress httpx INFO logs (only show warnings/errors) + # Configure logging to suppress httpx INFO logs (only show + # warnings/errors) import logging logging.getLogger("httpx").setLevel(logging.WARNING) logging.getLogger("openai").setLevel(logging.WARNING) @@ -128,7 +129,8 @@ def _build_server_command(self) -> List[str]: cmd.append('flashinfer') if self.config['enable_dp_attention']: - cmd.extend(['--enable-dp-attention', '--dp', str(self.config['dp'])]) + cmd.extend(['--enable-dp-attention', + '--dp', str(self.config['dp'])]) # Add performance settings cmd.extend([ @@ -175,7 +177,8 @@ def _wait_for_server_ready(self, timeout: int = None) -> bool: # Update progress indicator every 0.5 seconds if time.time() - last_progress_update >= 0.5: last_progress_update = time.time() - progress_idx = (progress_idx + 1) % len(TerminalDisplay.PROGRESS_CHARS) + progress_idx = ( + progress_idx + 1) % len(TerminalDisplay.PROGRESS_CHARS) minutes = elapsed // 60 seconds = elapsed % 60 # Use carriage return to stay on the same line @@ -192,7 +195,8 @@ def _wait_for_server_ready(self, timeout: int = None) -> bool: if response.status_code == 200: # Health check passed, now try a warmup query print(f"\r{' '*80}\r", end='', flush=True) - print(f"\n[SGLANG] Health check passed, running warmup query...") + print( + f"\n[SGLANG] Health check passed, running warmup query...") # Try to send a simple warmup query using OpenAI client try: @@ -210,7 +214,8 @@ def _wait_for_server_ready(self, timeout: int = None) -> bool: # Send a simple warmup request warmup_response = warmup_client.chat.completions.create( model=self.config['served_model_name'], - messages=[{"role": "user", "content": "Hello"}], + messages=[ + {"role": "user", "content": "Hello"}], temperature=0.0, max_tokens=10, seed=self.config['seed'] @@ -218,23 +223,28 @@ def _wait_for_server_ready(self, timeout: int = None) -> bool: # Check if we got a valid response if warmup_response.choices[0].message.content: - print(f"[SGLANG] ✓ Warmup query successful! Response: {warmup_response.choices[0].message.content[:50]}...") + print( + f"[SGLANG] ✓ Warmup query successful! Response: {warmup_response.choices[0].message.content[:50]}...") # Stop log monitoring if self._log_monitor: self._log_monitor.stop() self._log_monitor = None - print(f"\n[SGLANG] " + "="*60) - print(f"[SGLANG] ✓ SERVER READY! (startup took {elapsed}s)") - print(f"[SGLANG] " + "="*60) + print(f"\n[SGLANG] " + "=" * 60) + print( + f"[SGLANG] ✓ SERVER READY! (startup took {elapsed}s)") + print(f"[SGLANG] " + "=" * 60) return True else: - print(f"[SGLANG] Warmup query returned empty response, retrying...") + print( + f"[SGLANG] Warmup query returned empty response, retrying...") except Exception as warmup_error: - print(f"[SGLANG] Warmup query failed: {warmup_error}, retrying...") - # Continue waiting, the server might not be fully ready yet + print( + f"[SGLANG] Warmup query failed: {warmup_error}, retrying...") + # Continue waiting, the server might not be fully + # ready yet except requests.exceptions.RequestException: pass @@ -246,9 +256,11 @@ def _wait_for_server_ready(self, timeout: int = None) -> bool: self._log_monitor = None # Clear progress line print(f"\r{' '*80}\r", end='', flush=True) - print(f"\n[SGLANG] ✗ Server process died with exit code: {self.server_process.returncode}") + print( + f"\n[SGLANG] ✗ Server process died with exit code: {self.server_process.returncode}") if self.server_log_file: - print(f"[SGLANG] Check server logs at: {self.server_log_file}") + print( + f"[SGLANG] Check server logs at: {self.server_log_file}") return False time.sleep(0.1) # Check every 100ms for smoother progress @@ -264,17 +276,21 @@ def _wait_for_server_ready(self, timeout: int = None) -> bool: def _start_server(self) -> None: """Start the SGLang server as a subprocess.""" - print(f"\n[SGLANG] Starting SGLang server for {self.config['model']}...") + print( + f"\n[SGLANG] Starting SGLang server for {self.config['model']}...") print(f"[SGLANG] Configuration:") print(f"[SGLANG] - Port: {self.port}") - print(f"[SGLANG] - Tensor Parallel: {self.config['tensor_parallel_size']}") - print(f"[SGLANG] - Context Length: {self.config['context_length']:,} tokens") + print( + f"[SGLANG] - Tensor Parallel: {self.config['tensor_parallel_size']}") + print( + f"[SGLANG] - Context Length: {self.config['context_length']:,} tokens") print(f"[SGLANG] - dtype: {self.config['dtype']}") # Create log file for server output log_dir = Path("/work/logs") log_dir.mkdir(exist_ok=True) - self.server_log_file = log_dir / f"sglang_server_{self.port}_{int(time.time())}.log" + self.server_log_file = log_dir / \ + f"sglang_server_{self.port}_{int(time.time())}.log" cmd = self._build_server_command() print(f"\n[SGLANG] Command: {' '.join(cmd)}") @@ -315,7 +331,10 @@ def _stop_server(self) -> None: except subprocess.TimeoutExpired: # Force kill if not stopped print("[SGLANG] Server didn't stop gracefully, forcing...") - os.killpg(os.getpgid(self.server_process.pid), signal.SIGKILL) + os.killpg( + os.getpgid( + self.server_process.pid), + signal.SIGKILL) self.server_process.wait() print("[SGLANG] Server force stopped") except ProcessLookupError: @@ -332,7 +351,8 @@ def initialize(self) -> None: try: # Load tokenizer for string conversion print(f"[SGLANG] Loading tokenizer: {self.config['tokenizer']}...") - self.tokenizer = AutoTokenizer.from_pretrained(self.config['tokenizer'], revision=self.config['model_revision']) + self.tokenizer = AutoTokenizer.from_pretrained( + self.config['tokenizer'], revision=self.config['model_revision']) # Start SGLang server (with log monitoring) self._start_server() @@ -341,7 +361,8 @@ def initialize(self) -> None: base_url = f"http://localhost:{self.port}/v1" api_key = self.config['api_key'] or "dummy-key" - print(f"[SGLANG] Creating OpenAI clients with base URL: {base_url}") + print( + f"[SGLANG] Creating OpenAI clients with base URL: {base_url}") # Configure timeout settings timeout_config = httpx.Timeout( @@ -371,10 +392,12 @@ def initialize(self) -> None: ) print(f"[SGLANG] Created asynchronous OpenAI client") - + # Create shared semaphore for async concurrency control - self._async_semaphore = asyncio.Semaphore(self.config['max_running_requests']) - print(f"[SGLANG] Created async semaphore with limit: {self.config['max_running_requests']}") + self._async_semaphore = asyncio.Semaphore( + self.config['max_running_requests']) + print( + f"[SGLANG] Created async semaphore with limit: {self.config['max_running_requests']}") # Server readiness was already verified by health endpoint in _wait_for_server_ready() # No need to check models endpoint @@ -403,17 +426,18 @@ def initialize(self) -> None: raise @require_initialized - def generate(self, - tokenized_prompts: Optional[List[List[int]]] = None, - text_prompts: Optional[List[str]] = None, - **kwargs) -> List[Dict[str, Any]]: + def generate(self, + tokenized_prompts: Optional[List[List[int]]] = None, + text_prompts: Optional[List[str]] = None, + **kwargs) -> List[Dict[str, Any]]: """Generate responses synchronously.""" # Check if server process is still alive self._check_server_alive() # Check if client is properly initialized if self.client is None: - raise RuntimeError("SGLang client is not initialized. Server may have failed to start.") + raise RuntimeError( + "SGLang client is not initialized. Server may have failed to start.") # Validate prompts using centralized validation validate_prompts_input( @@ -436,7 +460,8 @@ def generate(self, results = [] # Process prompts with progress bar - for prompt in tqdm(prompt_strings, desc="SGLang sync inference", unit="prompt"): + for prompt in tqdm( + prompt_strings, desc="SGLang sync inference", unit="prompt"): try: completion = self.client.chat.completions.create( model=self.config['served_model_name'], @@ -452,7 +477,8 @@ def generate(self, # Validate response is not empty if not generated_text: - raise RuntimeError(f"Empty response received from SGLang server for prompt: {prompt[:100]}...") + raise RuntimeError( + f"Empty response received from SGLang server for prompt: {prompt[:100]}...") # Tokenize the output to get token IDs tokens = self.tokenizer.encode(generated_text) @@ -464,15 +490,18 @@ def generate(self, except Exception as e: print(f"\nError generating completion: {e}") - raise RuntimeError(f"SGLang backend failed to generate tokens for prompt: {prompt[:100]}...") + raise RuntimeError( + f"SGLang backend failed to generate tokens for prompt: {prompt[:100]}...") return results - async def _async_generate_single(self, prompt: str, idx: int, semaphore: asyncio.Semaphore) -> Tuple[int, Dict[str, Any]]: + async def _async_generate_single( + self, prompt: str, idx: int, semaphore: asyncio.Semaphore) -> Tuple[int, Dict[str, Any]]: """Generate a single response asynchronously with semaphore control.""" # Check if async client is properly initialized if self.async_client is None: - raise RuntimeError(f"SGLang async client is not initialized for prompt {idx}") + raise RuntimeError( + f"SGLang async client is not initialized for prompt {idx}") async with semaphore: try: @@ -490,7 +519,8 @@ async def _async_generate_single(self, prompt: str, idx: int, semaphore: asyncio # Validate response is not empty if not generated_text: - raise RuntimeError(f"Empty response received from SGLang server for prompt: {prompt[:100]}...") + raise RuntimeError( + f"Empty response received from SGLang server for prompt: {prompt[:100]}...") # Tokenize the output to get token IDs tokens = self.tokenizer.encode(generated_text) @@ -499,20 +529,22 @@ async def _async_generate_single(self, prompt: str, idx: int, semaphore: asyncio except Exception as e: print(f"\nError generating completion for prompt {idx}: {e}") - raise RuntimeError(f"SGLang backend failed to generate tokens for prompt {idx}: {e}") + raise RuntimeError( + f"SGLang backend failed to generate tokens for prompt {idx}: {e}") @require_initialized - def generate_async(self, - tokenized_prompts: Optional[List[List[int]]] = None, - text_prompts: Optional[List[str]] = None, - **kwargs) -> List[asyncio.Future]: + def generate_async(self, + tokenized_prompts: Optional[List[List[int]]] = None, + text_prompts: Optional[List[str]] = None, + **kwargs) -> List[asyncio.Future]: """Generate responses asynchronously using shared semaphore.""" # Check if server process is still alive self._check_server_alive() # Check if client is properly initialized if self.async_client is None: - raise RuntimeError("SGLang async client is not initialized. Server may have failed to start.") + raise RuntimeError( + "SGLang async client is not initialized. Server may have failed to start.") # Validate prompts using centralized validation validate_prompts_input( @@ -542,44 +574,49 @@ def generate_async(self, futures = [] for idx, prompt in enumerate(prompt_strings): # Create a task for each prompt using the shared semaphore - task = asyncio.create_task(self._async_generate_single(prompt, idx, self._async_semaphore)) - + task = asyncio.create_task( + self._async_generate_single( + prompt, idx, self._async_semaphore)) + # Create a future that will hold the result future = asyncio.Future() - + # Setup callback to extract just the result (not the index) def make_callback(future_obj, expected_idx): def callback(task_obj): try: idx, result = task_obj.result() if idx != expected_idx: - future_obj.set_exception(Exception(f"Index mismatch: expected {expected_idx}, got {idx}")) + future_obj.set_exception( + Exception(f"Index mismatch: expected {expected_idx}, got {idx}")) else: future_obj.set_result(result) except Exception as e: future_obj.set_exception(e) return callback - + task.add_done_callback(make_callback(future, idx)) futures.append(future) return futures - async def generate_stream(self, - tokenized_prompts: Optional[List[List[int]]] = None, - text_prompts: Optional[List[str]] = None, - **kwargs) -> List[AsyncIterator[StreamingChunk]]: + async def generate_stream(self, + tokenized_prompts: Optional[List[List[int]]] = None, + text_prompts: Optional[List[str]] = None, + **kwargs) -> List[AsyncIterator[StreamingChunk]]: """Generate responses for a list of prompts with streaming.""" if not self.is_initialized: - raise RuntimeError("Backend not initialized. Call initialize() first.") - + raise RuntimeError( + "Backend not initialized. Call initialize() first.") + # Check if server process is still alive self._check_server_alive() - + # Check if async client is properly initialized if self.async_client is None: - raise RuntimeError("SGLang async client is not initialized. Server may have failed to start.") - + raise RuntimeError( + "SGLang async client is not initialized. Server may have failed to start.") + # Validate prompts validate_prompts_input( backend_name='sglang', @@ -587,7 +624,7 @@ async def generate_stream(self, text_prompts=text_prompts, input_type='text' ) - + # SGLang prefers text prompts if text_prompts is None: # Convert tokenized prompts to strings @@ -597,8 +634,9 @@ async def generate_stream(self, ] else: prompt_strings = text_prompts - - async def stream_single_prompt(prompt: str) -> AsyncIterator[StreamingChunk]: + + async def stream_single_prompt( + prompt: str) -> AsyncIterator[StreamingChunk]: try: stream = await self.async_client.chat.completions.create( model=self.config['served_model_name'], @@ -609,14 +647,14 @@ async def stream_single_prompt(prompt: str) -> AsyncIterator[StreamingChunk]: seed=self.config.get('seed'), stream=True ) - + async for chunk in stream: if not chunk.choices: continue - + delta = chunk.choices[0].delta finish_reason = chunk.choices[0].finish_reason - + if delta.content: yield StreamingChunk( token=delta.content, @@ -635,7 +673,7 @@ async def stream_single_prompt(prompt: str) -> AsyncIterator[StreamingChunk]: except Exception as e: print(f"[SGLANG] Streaming error for prompt: {e}") raise - + return [stream_single_prompt(prompt) for prompt in prompt_strings] def shutdown(self) -> None: @@ -650,7 +688,7 @@ def shutdown(self) -> None: # Close clients self.client = None self.async_client = None - + # Clear async semaphore self._async_semaphore = None @@ -665,4 +703,4 @@ def shutdown(self) -> None: torch.cuda.empty_cache() self.is_initialized = False - print("[SGLANG] Backend shutdown complete") \ No newline at end of file + print("[SGLANG] Backend shutdown complete") diff --git a/language/deepseek-r1/backends/utils.py b/language/deepseek-r1/backends/utils.py index 0e4c7732da..ebd6ce3719 100644 --- a/language/deepseek-r1/backends/utils.py +++ b/language/deepseek-r1/backends/utils.py @@ -19,50 +19,50 @@ def get_cache_directory() -> Path: """ Get the cache directory at /raid/data/$USER/.cache - + Returns: Path: The cache directory path """ # Get the current user user = os.environ.get('USER', os.environ.get('USERNAME', 'unknown')) - + # Use /raid/data/$USER/.cache cache_dir = Path(f'/raid/data/{user}/.cache') - + # Create the cache directory if it doesn't exist cache_dir.mkdir(parents=True, exist_ok=True) - + return cache_dir def setup_huggingface_cache() -> Path: """ Set up HuggingFace cache environment variables using the preferred cache directory. - + Returns: Path: The cache directory being used """ cache_dir = get_cache_directory() - + # Set HuggingFace cache environment variables os.environ['HF_HOME'] = str(cache_dir) os.environ['HF_HUB_CACHE'] = str(cache_dir) os.environ['HUGGINGFACE_HUB_CACHE'] = str(cache_dir) - + return cache_dir def find_free_port(start_port: int = 30000, max_attempts: int = 100) -> int: """ Find a free port starting from start_port. - + Args: start_port: The port number to start searching from max_attempts: Maximum number of ports to try - + Returns: int: A free port number - + Raises: RuntimeError: If no free port is found after max_attempts """ @@ -75,13 +75,14 @@ def find_free_port(start_port: int = 30000, max_attempts: int = 100) -> int: return port except OSError: continue - raise RuntimeError(f"Could not find free port after {max_attempts} attempts starting from {start_port}") + raise RuntimeError( + f"Could not find free port after {max_attempts} attempts starting from {start_port}") def set_all_seeds(seed: int = 42) -> None: """ Set seeds for all random number generators for reproducibility. - + Args: seed: The seed value to use """ @@ -96,73 +97,76 @@ def set_all_seeds(seed: int = 42) -> None: set_seed(seed) -def validate_prompts(tokenized_prompts: Optional[list] = None, - text_prompts: Optional[list] = None, - backend_type: str = "") -> None: +def validate_prompts(tokenized_prompts: Optional[list] = None, + text_prompts: Optional[list] = None, + backend_type: str = "") -> None: """ Validate that at least one type of prompts is provided. - + Args: tokenized_prompts: List of tokenized prompts text_prompts: List of text prompts backend_type: Name of the backend for error messages - + Raises: ValueError: If neither prompt type is provided """ if tokenized_prompts is None and text_prompts is None: - raise ValueError(f"{backend_type + ' backend' if backend_type else 'Backend'} requires either text_prompts or tokenized_prompts") + raise ValueError( + f"{backend_type + ' backend' if backend_type else 'Backend'} requires either text_prompts or tokenized_prompts") # Terminal display utilities class TerminalDisplay: """ANSI escape codes and utilities for terminal display formatting.""" - + # ANSI escape codes for cursor control CLEAR_SCREEN = "\033[2J" MOVE_CURSOR_UP = "\033[{}A" CLEAR_LINE = "\033[K" SAVE_CURSOR = "\033[s" RESTORE_CURSOR = "\033[u" - + # Progress spinner characters PROGRESS_CHARS = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'] - + @staticmethod def clear_lines(num_lines: int) -> None: """Clear the specified number of lines above the cursor.""" - print(TerminalDisplay.MOVE_CURSOR_UP.format(num_lines), end='', flush=True) + print(TerminalDisplay.MOVE_CURSOR_UP.format( + num_lines), end='', flush=True) for _ in range(num_lines): print(TerminalDisplay.CLEAR_LINE) - print(TerminalDisplay.MOVE_CURSOR_UP.format(num_lines), end='', flush=True) - + print(TerminalDisplay.MOVE_CURSOR_UP.format( + num_lines), end='', flush=True) + @staticmethod def save_cursor_position() -> None: """Save the current cursor position.""" print(TerminalDisplay.SAVE_CURSOR, end='', flush=True) - + @staticmethod def restore_cursor_position() -> None: """Restore the previously saved cursor position.""" print(TerminalDisplay.RESTORE_CURSOR, end='', flush=True) - + @staticmethod def clear_current_line() -> None: """Clear the current line.""" print("\r" + " " * 80 + "\r", end='', flush=True) - + @staticmethod def truncate_line(line: str, max_length: int = 110) -> str: """Truncate a line to fit within the specified length.""" if len(line) <= max_length: return line - return line[:max_length - 3] + "..." + return line[:max_length - 3] + "..." class LogMonitor: """Real-time log file monitor with terminal display.""" - - def __init__(self, + + def __init__(self, log_file_path: Union[str, Path], prefix: str = "LOG", max_lines: int = 5, @@ -170,7 +174,7 @@ def __init__(self, header_text: Optional[str] = None): """ Initialize the log monitor. - + Args: log_file_path: Path to the log file to monitor prefix: Prefix for display lines (e.g., "[SGLANG]") @@ -183,42 +187,43 @@ def __init__(self, self.max_lines = max_lines self.display_interval = display_interval self.header_text = header_text or f"Server startup logs (last {max_lines} lines):" - + # Threading control self._monitor_thread = None self._stop_event = None self._ready_event = None - + # Display dimensions self.total_lines = max_lines + 3 # 2 header lines + 1 blank separator - - def start(self, wait_for_file: bool = True, file_wait_timeout: float = 30.0) -> bool: + + def start(self, wait_for_file: bool = True, + file_wait_timeout: float = 30.0) -> bool: """ Start the log monitor in a background thread. - + Args: wait_for_file: Whether to wait for the log file to exist file_wait_timeout: How long to wait for the file (seconds) - + Returns: bool: True if monitor started successfully """ if self._monitor_thread is not None: return True # Already running - + self._stop_event = threading.Event() self._ready_event = threading.Event() - + self._monitor_thread = threading.Thread( target=self._monitor_loop, args=(wait_for_file, file_wait_timeout), daemon=True ) self._monitor_thread.start() - + # Wait for the monitor to set up its display area return self._ready_event.wait(timeout=2.0) - + def stop(self) -> None: """Stop the log monitor and clean up display.""" if self._stop_event and self._monitor_thread: @@ -227,36 +232,39 @@ def stop(self) -> None: self._monitor_thread = None self._stop_event = None self._ready_event = None - - def _monitor_loop(self, wait_for_file: bool, file_wait_timeout: float) -> None: + + def _monitor_loop(self, wait_for_file: bool, + file_wait_timeout: float) -> None: """Main monitoring loop that runs in a separate thread.""" # Wait for log file if requested if wait_for_file: start_time = time.time() while not self.log_file_path.exists(): if time.time() - start_time > file_wait_timeout: - print(f"[{self.prefix}] Warning: Log file not found after {file_wait_timeout}s: {self.log_file_path}") + print( + f"[{self.prefix}] Warning: Log file not found after {file_wait_timeout}s: {self.log_file_path}") self._ready_event.set() return time.sleep(0.5) elif not self.log_file_path.exists(): - print(f"[{self.prefix}] Warning: Log file not found: {self.log_file_path}") + print( + f"[{self.prefix}] Warning: Log file not found: {self.log_file_path}") self._ready_event.set() return - + print(f"\n[{self.prefix}] Monitoring logs: {self.log_file_path.name}") - print(f"[{self.prefix}] " + "="*60) - + print(f"[{self.prefix}] " + "=" * 60) + # Initialize display area self._setup_display_area() - + # Signal that we're ready self._ready_event.set() - + # Buffer for log lines line_buffer = [] last_display_time = 0 - + try: # Use tail -f to follow the log file process = subprocess.Popen( @@ -267,11 +275,11 @@ def _monitor_loop(self, wait_for_file: bool, file_wait_timeout: float) -> None: bufsize=1, universal_newlines=True ) - + while not self._stop_event.is_set(): if process.poll() is not None: break - + # Read available lines without blocking line_added = False try: @@ -285,7 +293,7 @@ def _monitor_loop(self, wait_for_file: bool, file_wait_timeout: float) -> None: line_added = True else: break - except: + except BaseException: # Fallback for systems without select line = process.stdout.readline() if line: @@ -293,65 +301,69 @@ def _monitor_loop(self, wait_for_file: bool, file_wait_timeout: float) -> None: if len(line_buffer) > self.max_lines: line_buffer.pop(0) line_added = True - + # Update display if needed current_time = time.time() - if line_added or (current_time - last_display_time >= self.display_interval): + if line_added or ( + current_time - last_display_time >= self.display_interval): last_display_time = current_time self._update_display(line_buffer) - + time.sleep(0.1) - + # Clean up process.terminate() try: process.wait(timeout=2) except subprocess.TimeoutExpired: process.kill() - + except Exception as e: print(f"\n[{self.prefix}] Log monitor error: {e}") finally: self._cleanup_display() - + def _setup_display_area(self) -> None: """Reserve and initialize the display area.""" # Reserve space for _ in range(self.total_lines): print() - + # Move back up to start of reserved area - print(TerminalDisplay.MOVE_CURSOR_UP.format(self.total_lines), end='', flush=True) - + print(TerminalDisplay.MOVE_CURSOR_UP.format( + self.total_lines), end='', flush=True) + # Print initial display print(f"\r[{self.prefix}] {self.header_text}", end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - print(f"\r[{self.prefix}] " + "-"*60, end='') + print(f"\r[{self.prefix}] " + "-" * 60, end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - + # Print empty lines for _ in range(self.max_lines): print(f"\r[{self.prefix}] ", end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - + # Print separator print(f"\r", end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - + def _update_display(self, line_buffer: list) -> None: """Update the display with current log lines.""" # Save cursor position print(TerminalDisplay.SAVE_CURSOR, end='', flush=True) - - # Move to start of reserved area (cursor is on progress line, 1 below our area) - print(TerminalDisplay.MOVE_CURSOR_UP.format(self.total_lines + 1), end='', flush=True) - + + # Move to start of reserved area (cursor is on progress line, 1 below + # our area) + print(TerminalDisplay.MOVE_CURSOR_UP.format( + self.total_lines + 1), end='', flush=True) + # Print header print(f"\r[{self.prefix}] {self.header_text}", end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - print(f"\r[{self.prefix}] " + "-"*60, end='') + print(f"\r[{self.prefix}] " + "-" * 60, end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - + # Print log lines for i in range(self.max_lines): if i < len(line_buffer): @@ -360,22 +372,23 @@ def _update_display(self, line_buffer: list) -> None: else: print(f"\r[{self.prefix}] ", end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - + # Print separator print(f"\r", end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - + # Restore cursor position print(TerminalDisplay.RESTORE_CURSOR, end='', flush=True) - + def _cleanup_display(self) -> None: """Clean up the display area on exit.""" print(TerminalDisplay.SAVE_CURSOR, end='', flush=True) - print(TerminalDisplay.MOVE_CURSOR_UP.format(self.total_lines + 1), end='', flush=True) - + print(TerminalDisplay.MOVE_CURSOR_UP.format( + self.total_lines + 1), end='', flush=True) + # Clear all reserved lines for _ in range(self.total_lines): print(f"\r", end='') print(TerminalDisplay.CLEAR_LINE, flush=True) - - print(TerminalDisplay.RESTORE_CURSOR, end='', flush=True) \ No newline at end of file + + print(TerminalDisplay.RESTORE_CURSOR, end='', flush=True) diff --git a/language/deepseek-r1/backends/vllm_backend.py b/language/deepseek-r1/backends/vllm_backend.py index 4ac408042f..ec49227f41 100644 --- a/language/deepseek-r1/backends/vllm_backend.py +++ b/language/deepseek-r1/backends/vllm_backend.py @@ -148,9 +148,9 @@ def initialize(self) -> None: @require_initialized def generate(self, - tokenized_prompts: Optional[List[List[int]]] = None, - text_prompts: Optional[List[str]] = None, - **kwargs) -> List[Dict[str, Any]]: + tokenized_prompts: Optional[List[List[int]]] = None, + text_prompts: Optional[List[str]] = None, + **kwargs) -> List[Dict[str, Any]]: """Generate responses synchronously using LLM.generate(). Note: vLLM backend only accepts text_prompts parameter. @@ -177,11 +177,14 @@ def generate(self, if not completion.text: # Get the corresponding prompt for context prompt_idx = outputs.index(output) - prompt_preview = text_prompts[prompt_idx][:100] if len(text_prompts[prompt_idx]) > 100 else text_prompts[prompt_idx] - raise RuntimeError(f"Empty response received from vLLM for prompt: {prompt_preview}...") + prompt_preview = text_prompts[prompt_idx][:100] if len( + text_prompts[prompt_idx]) > 100 else text_prompts[prompt_idx] + raise RuntimeError( + f"Empty response received from vLLM for prompt: {prompt_preview}...") results.append({ - 'tokens': list(completion.token_ids), # Convert tuple to list for .copy() compatibility + # Convert tuple to list for .copy() compatibility + 'tokens': list(completion.token_ids), 'text': completion.text, 'finish_reason': completion.finish_reason }) @@ -190,9 +193,9 @@ def generate(self, @require_initialized def generate_async(self, - tokenized_prompts: Optional[List[List[int]]] = None, - text_prompts: Optional[List[str]] = None, - **kwargs) -> List[asyncio.Future]: + tokenized_prompts: Optional[List[List[int]]] = None, + text_prompts: Optional[List[str]] = None, + **kwargs) -> List[asyncio.Future]: """Generate responses asynchronously, returning futures immediately. Note: vLLM backend only accepts text_prompts parameter. @@ -245,11 +248,14 @@ def _generate_batch(self, text_prompts: List[str]) -> List[Dict[str, Any]]: if not completion.text: # Get the corresponding prompt for context prompt_idx = outputs.index(output) - prompt_preview = text_prompts[prompt_idx][:100] if len(text_prompts[prompt_idx]) > 100 else text_prompts[prompt_idx] - raise RuntimeError(f"Empty response received from vLLM for prompt: {prompt_preview}...") + prompt_preview = text_prompts[prompt_idx][:100] if len( + text_prompts[prompt_idx]) > 100 else text_prompts[prompt_idx] + raise RuntimeError( + f"Empty response received from vLLM for prompt: {prompt_preview}...") results.append({ - 'tokens': list(completion.token_ids), # Convert tuple to list for .copy() compatibility + # Convert tuple to list for .copy() compatibility + 'tokens': list(completion.token_ids), 'text': completion.text, 'finish_reason': completion.finish_reason }) @@ -265,7 +271,8 @@ def shutdown(self) -> None: # Access internal executor to ensure proper cleanup if self.llm.llm_engine is not None: try: - # This helps cleanup vLLM's internal Ray/multiprocessing resources + # This helps cleanup vLLM's internal Ray/multiprocessing + # resources del self.llm.llm_engine.model_executor except Exception as e: print(f"Warning: Failed to cleanup model executor: {e}") diff --git a/language/deepseek-r1/eval_accuracy.py b/language/deepseek-r1/eval_accuracy.py index 55647b7ca5..a0b546b600 100644 --- a/language/deepseek-r1/eval_accuracy.py +++ b/language/deepseek-r1/eval_accuracy.py @@ -45,11 +45,11 @@ # ============================================================================= def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], - dataset_file: Union[str, Path], - checkpoint_path: str, - dtype: str = "int32", - output_dir: Optional[Union[str, Path]] = None, - base_filename: Optional[str] = None) -> Tuple[pd.DataFrame, str]: + dataset_file: Union[str, Path], + checkpoint_path: str, + dtype: str = "int32", + output_dir: Optional[Union[str, Path]] = None, + base_filename: Optional[str] = None) -> Tuple[pd.DataFrame, str]: """Process MLPerf log accuracy file and evaluate results. Args: @@ -68,7 +68,8 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], dataset_file = Path(dataset_file) if not mlperf_log_file.exists(): - raise FileNotFoundError(f"MLPerf log file not found: {mlperf_log_file}") + raise FileNotFoundError( + f"MLPerf log file not found: {mlperf_log_file}") if not dataset_file.exists(): raise FileNotFoundError(f"Dataset file not found: {dataset_file}") @@ -86,7 +87,8 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], ) logger.info("Tokenizer loaded successfully") except Exception as e: - raise RuntimeError(f"Failed to load tokenizer from {checkpoint_path}: {e}") + raise RuntimeError( + f"Failed to load tokenizer from {checkpoint_path}: {e}") # Load ground truth dataset try: @@ -99,14 +101,20 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], elif 'ground_truth' in dataset_df.columns: ground_truths = dataset_df['ground_truth'].tolist() else: - raise ValueError("Dataset must contain 'gt_output' or 'ground_truth' column") + raise ValueError( + "Dataset must contain 'gt_output' or 'ground_truth' column") # Get other required columns with fallbacks if 'dataset' in dataset_df.columns: datasets = dataset_df['dataset'].tolist() elif 'metric' in dataset_df.columns: # Infer dataset from metric names - datasets = [metric.replace('_em', '').replace('_', '') for metric in dataset_df['metric'].tolist()] + datasets = [ + metric.replace( + '_em', + '').replace( + '_', + '') for metric in dataset_df['metric'].tolist()] else: datasets = ['unknown'] * len(ground_truths) @@ -138,7 +146,7 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], # First, check if this is a JSON array format or newline-delimited JSON with open(mlperf_log_file, 'r') as f: first_line = f.readline().strip() - + if first_line == '[': # JSON array format - load the entire file logger.info("Detected JSON array format") @@ -146,8 +154,10 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], try: mlperf_results = json.load(f) except json.JSONDecodeError as e: - # If full file parsing fails, try to parse line by line, skipping brackets - logger.warning(f"Failed to parse as complete JSON array: {e}") + # If full file parsing fails, try to parse line by line, + # skipping brackets + logger.warning( + f"Failed to parse as complete JSON array: {e}") logger.info("Attempting line-by-line parsing") mlperf_results = [] with open(mlperf_log_file, 'r') as f2: @@ -162,7 +172,8 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], try: mlperf_results.append(json.loads(line)) except json.JSONDecodeError as e: - logger.warning(f"Failed to parse line {line_num}: {e}") + logger.warning( + f"Failed to parse line {line_num}: {e}") continue else: # Newline-delimited JSON format @@ -180,7 +191,7 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], except json.JSONDecodeError as e: logger.warning(f"Failed to parse line {line_num}: {e}") continue - + logger.info(f"Loaded {len(mlperf_results)} MLPerf results") except Exception as e: raise RuntimeError(f"Failed to load MLPerf log file: {e}") @@ -220,7 +231,8 @@ def process_mlperf_log_accuracy(mlperf_log_file: Union[str, Path], questions_required.append(questions[qsl_idx]) except Exception as e: - logger.warning(f"Error processing entry with qsl_idx {qsl_idx}: {e}") + logger.warning( + f"Error processing entry with qsl_idx {qsl_idx}: {e}") continue if not preds_token_ids: @@ -271,7 +283,11 @@ def validate_dataframe(df: pd.DataFrame) -> None: if not isinstance(df, pd.DataFrame): raise ValueError("Input must be a pandas DataFrame") - required_cols = ['model_output', 'dataset', 'ground_truth', 'tok_model_output_len'] + required_cols = [ + 'model_output', + 'dataset', + 'ground_truth', + 'tok_model_output_len'] missing_cols = [col for col in required_cols if col not in df.columns] if missing_cols: raise ValueError(f"Missing required columns: {missing_cols}") @@ -390,7 +406,8 @@ def parse_code(text: str) -> Optional[str]: # Answer Evaluation Functions # ============================================================================= -def evaluate_multiple_choice(parsed: Optional[str], ground_truth: str, valid_options: str) -> bool: +def evaluate_multiple_choice( + parsed: Optional[str], ground_truth: str, valid_options: str) -> bool: """Evaluate multiple choice answer.""" if not parsed or not ground_truth: return False @@ -414,10 +431,12 @@ def evaluate_math500(parsed: Optional[str], ground_truth: str) -> bool: # Use sys.path approach for proper module importing workspace_path = os.path.dirname(os.path.abspath(__file__)) - prm800k_module_path = os.path.join(workspace_path, "submodules", "prm800k", "prm800k") + prm800k_module_path = os.path.join( + workspace_path, "submodules", "prm800k", "prm800k") if not os.path.exists(prm800k_module_path): - raise FileNotFoundError(f"PRM800K module not found at: {prm800k_module_path}") + raise FileNotFoundError( + f"PRM800K module not found at: {prm800k_module_path}") # Save current directory and sys.path original_cwd = os.getcwd() @@ -427,10 +446,10 @@ def evaluate_math500(parsed: Optional[str], ground_truth: str) -> bool: # Add prm800k module path to sys.path if prm800k_module_path not in sys.path: sys.path.insert(0, prm800k_module_path) - + # Change directory as some imports might use relative paths os.chdir(prm800k_module_path) - + # Now import should work from grading.grader import grade_answer result = grade_answer(given_answer=parsed, ground_truth=ground_truth) @@ -622,7 +641,8 @@ def process_row(row: pd.Series) -> Dict[str, Any]: } -def process_livecodebench_parallel(df: pd.DataFrame, group_indices: pd.Index) -> Tuple[int, int]: +def process_livecodebench_parallel( + df: pd.DataFrame, group_indices: pd.Index) -> Tuple[int, int]: """Process LiveCodeBench items in parallel.""" # Prepare work items work_items = [] @@ -726,7 +746,8 @@ def process_dataframe(df: pd.DataFrame) -> pd.DataFrame: # Unified Evaluation Utilities # ============================================================================= -def print_evaluation_results(df_evaluated: pd.DataFrame, logger: Optional[logging.Logger] = None) -> Dict[str, Any]: +def print_evaluation_results(df_evaluated: pd.DataFrame, + logger: Optional[logging.Logger] = None) -> Dict[str, Any]: """Print evaluation results in a unified format. Args: @@ -762,8 +783,8 @@ def print_evaluation_results(df_evaluated: pd.DataFrame, logger: Optional[loggin def process_and_save_dataframe(df: pd.DataFrame, - output_dir: Optional[Union[str, Path]] = None, - base_filename: Optional[str] = None) -> Tuple[pd.DataFrame, str]: + output_dir: Optional[Union[str, Path]] = None, + base_filename: Optional[str] = None) -> Tuple[pd.DataFrame, str]: """Process dataframe for evaluation and save the results. Args: @@ -779,7 +800,8 @@ def process_and_save_dataframe(df: pd.DataFrame, # Determine output path if output_dir is None: - # Try to infer from existing path info in the dataframe or use current directory + # Try to infer from existing path info in the dataframe or use current + # directory output_dir = Path.cwd() else: output_dir = Path(output_dir) diff --git a/language/deepseek-r1/mlperf/__init__.py b/language/deepseek-r1/mlperf/__init__.py index 33b3154f6b..bfe95c35e3 100644 --- a/language/deepseek-r1/mlperf/__init__.py +++ b/language/deepseek-r1/mlperf/__init__.py @@ -16,8 +16,8 @@ __all__ = [ # SUTs - 'BaseSUT', - 'OfflineSUT', + 'BaseSUT', + 'OfflineSUT', 'ServerSUT', # QSL 'QuerySampleLibrary', @@ -26,4 +26,4 @@ 'prepare_mlperf_dataset', 'process_mlperf_results', 'create_mlperf_output_dataframe' -] \ No newline at end of file +] diff --git a/language/deepseek-r1/mlperf/base_sut.py b/language/deepseek-r1/mlperf/base_sut.py index 7249207aab..f1d32eb869 100644 --- a/language/deepseek-r1/mlperf/base_sut.py +++ b/language/deepseek-r1/mlperf/base_sut.py @@ -12,65 +12,65 @@ class BaseSUT(abc.ABC): """Base class for MLPerf inference System Under Test (SUT). - + This class defines the interface that all SUTs must implement for MLPerf inference benchmarks. It provides two main methods: - issue_queries: to enqueue prompt tokens - flush_queries: to await completion of all issued queries """ - + def __init__(self, name: str = "BaseSUT"): """Initialize the base SUT. - + Args: name: Name of the SUT for logging purposes """ self.name = name self.sut = None logger.info(f"Initializing {self.name}") - + @abc.abstractmethod def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: """Issue queries to the SUT. - + This method should enqueue the provided query samples for processing. It should return immediately without waiting for completion. - + Args: query_samples: List of MLPerf LoadGen query samples to process """ raise NotImplementedError("Subclasses must implement issue_queries") - + @abc.abstractmethod def flush_queries(self) -> None: """Flush all pending queries. - + This method should wait for all previously issued queries to complete before returning. It's called by LoadGen to ensure all work is done. """ raise NotImplementedError("Subclasses must implement flush_queries") - + def start(self) -> lg.ConstructSUT: """Start the SUT and return the LoadGen SUT handle. - + Returns: LoadGen SUT handle for use with LoadGen """ self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries) logger.info(f"{self.name} started") return self.sut - + def stop(self) -> None: """Stop the SUT and clean up resources.""" if self.sut: lg.DestroySUT(self.sut) self.sut = None logger.info(f"{self.name} stopped") - + def __enter__(self): """Context manager entry.""" return self.start() - + def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit.""" - self.stop() \ No newline at end of file + self.stop() diff --git a/language/deepseek-r1/mlperf/offline_sut.py b/language/deepseek-r1/mlperf/offline_sut.py index db1c4feea1..00382f4660 100644 --- a/language/deepseek-r1/mlperf/offline_sut.py +++ b/language/deepseek-r1/mlperf/offline_sut.py @@ -44,11 +44,15 @@ def __init__(self, self.dataset_strings = dataset_strings # Determine backend type using registry - self.backend_name = getattr(backend, 'backend_name', type(backend).__name__.lower()) + self.backend_name = getattr( + backend, + 'backend_name', + type(backend).__name__.lower()) self.uses_text_prompts = uses_text_input(self.backend_name) if self.uses_text_prompts and dataset_strings is None: - raise ValueError(f"Backend {self.backend_name} requires text prompts but dataset_strings was not provided") + raise ValueError( + f"Backend {self.backend_name} requires text prompts but dataset_strings was not provided") # Async event loop and thread self.loop = None @@ -122,12 +126,15 @@ async def _process_all_queries_async(self): # Prepare prompts for batch processing (like run_eval.py) if self.uses_text_prompts: # Use text prompts for vLLM and SGLang - prompts = [self.dataset_strings[sample.index] for sample in query_samples] + prompts = [self.dataset_strings[sample.index] + for sample in query_samples] futures = self.backend.generate_async(text_prompts=prompts) else: # Use tokenized prompts for other backends - prompts = [self.dataset[sample.index] for sample in query_samples] - futures = self.backend.generate_async(tokenized_prompts=prompts) + prompts = [self.dataset[sample.index] + for sample in query_samples] + futures = self.backend.generate_async( + tokenized_prompts=prompts) logger.info(f"Got {len(futures)} futures from backend") @@ -136,7 +143,8 @@ async def _process_all_queries_async(self): indexed_futures = [(i, future) for i, future in enumerate(futures)] completed_indices = set() - # Use asyncio.wait with FIRST_COMPLETED to handle out-of-order completion + # Use asyncio.wait with FIRST_COMPLETED to handle out-of-order + # completion pending = {future for _, future in indexed_futures} while pending: @@ -153,12 +161,14 @@ async def _process_all_queries_async(self): break if original_idx is None: - logger.error("Could not find original index for completed future") + logger.error( + "Could not find original index for completed future") continue # Check for duplicate completion if original_idx in completed_indices: - logger.warning(f"Prompt {original_idx} completed multiple times!") + logger.warning( + f"Prompt {original_idx} completed multiple times!") continue try: @@ -174,36 +184,44 @@ async def _process_all_queries_async(self): await self._send_result_to_loadgen(sample, result) except Exception as e: - logger.error(f"Error processing prompt {original_idx}: {type(e).__name__}: {e}") + logger.error( + f"Error processing prompt {original_idx}: {type(e).__name__}: {e}") # Raise the error instead of handling empty responses - raise RuntimeError(f"Backend failed to generate tokens for prompt {original_idx}: {e}") + raise RuntimeError( + f"Backend failed to generate tokens for prompt {original_idx}: {e}") # Verify all results are populated if len(completed_indices) != len(futures): missing_count = len(futures) - len(completed_indices) - raise RuntimeError(f"Missing results: completed {len(completed_indices)} != {len(futures)} total ({missing_count} missing)") + raise RuntimeError( + f"Missing results: completed {len(completed_indices)} != {len(futures)} total ({missing_count} missing)") for i, result in enumerate(results): if result is None: raise RuntimeError(f"Missing result for prompt {i}") - logger.info(f"Completed all {len(completed_indices)} prompts successfully") + logger.info( + f"Completed all {len(completed_indices)} prompts successfully") except Exception as e: - logger.error(f"Error during batch processing: {type(e).__name__}: {e}") + logger.error( + f"Error during batch processing: {type(e).__name__}: {e}") import traceback traceback.print_exc() raise # Re-raise instead of sending empty responses - async def _send_result_to_loadgen(self, sample: lg.QuerySample, result: Dict[str, Any]): + async def _send_result_to_loadgen( + self, sample: lg.QuerySample, result: Dict[str, Any]): """Send a single result to LoadGen.""" try: # Validate that tokens exist - raise error if missing tokens = result.get('tokens') if tokens is None: - raise ValueError(f"Backend result missing 'tokens' key for query {sample.id}") + raise ValueError( + f"Backend result missing 'tokens' key for query {sample.id}") if not isinstance(tokens, (list, tuple)) or len(tokens) == 0: - raise ValueError(f"Backend returned empty or invalid tokens for query {sample.id}: {tokens}") + raise ValueError( + f"Backend returned empty or invalid tokens for query {sample.id}: {tokens}") # Create a copy of tokens before numpy conversion tokens_copy = tokens.copy() @@ -229,12 +247,15 @@ async def _send_result_to_loadgen(self, sample: lg.QuerySample, result: Dict[str # Send response to LoadGen lg.QuerySamplesComplete([response]) - logger.debug(f"Sent {n_tokens} tokens to LoadGen for query {sample.id}") + logger.debug( + f"Sent {n_tokens} tokens to LoadGen for query {sample.id}") except Exception as e: - logger.error(f"Error sending result to LoadGen for query {sample.id}: {e}") + logger.error( + f"Error sending result to LoadGen for query {sample.id}: {e}") # Raise the error instead of sending empty response - raise RuntimeError(f"Failed to send result to LoadGen for query {sample.id}: {e}") + raise RuntimeError( + f"Failed to send result to LoadGen for query {sample.id}: {e}") def _run_event_loop(self): """Run the async event loop in a separate thread.""" @@ -282,7 +303,8 @@ def get_results(self) -> List[Dict[str, Any]]: # Sort by index to maintain dataset order queried_indices = sorted(self.index_to_id.keys()) - logger.info(f"Retrieving results for {len(queried_indices)} queried samples") + logger.info( + f"Retrieving results for {len(queried_indices)} queried samples") # Process results in order of dataset indices using stored results for i in queried_indices: @@ -296,7 +318,8 @@ def get_results(self) -> List[Dict[str, Any]]: tokens = result['tokens'] output_text = result.get('text', '') if not output_text and self.backend.tokenizer: - output_text = self.backend.tokenizer.decode(result['tokens'], skip_special_tokens=True) + output_text = self.backend.tokenizer.decode( + result['tokens'], skip_special_tokens=True) ordered_results.append({ 'model_output': output_text, @@ -305,6 +328,7 @@ def get_results(self) -> List[Dict[str, Any]]: }) else: # No backend result for this sample - raise RuntimeError(f"No backend result stored for dataset index {i}, sample_id {sample_id}") + raise RuntimeError( + f"No backend result stored for dataset index {i}, sample_id {sample_id}") - return ordered_results \ No newline at end of file + return ordered_results diff --git a/language/deepseek-r1/mlperf/qsl.py b/language/deepseek-r1/mlperf/qsl.py index 59bc5e36a8..d4c9405a4e 100644 --- a/language/deepseek-r1/mlperf/qsl.py +++ b/language/deepseek-r1/mlperf/qsl.py @@ -7,12 +7,12 @@ class QuerySampleLibrary: """MLPerf QuerySampleLibrary implementation for single-process execution.""" - - def __init__(self, dataset: List[List[int]], dataset_strings: List[str], + + def __init__(self, dataset: List[List[int]], dataset_strings: List[str], name: str = "QSL"): """ Initialize QSL with dataset. - + Args: dataset: List of tokenized prompts dataset_strings: List of original prompt strings @@ -24,7 +24,7 @@ def __init__(self, dataset: List[List[int]], dataset_strings: List[str], self.perf_count = self.count self.name = name self.logger = logging.getLogger(__name__) - + # Create LoadGen QSL self.qsl = lg.ConstructQSL( self.count, @@ -33,7 +33,7 @@ def __init__(self, dataset: List[List[int]], dataset_strings: List[str], lambda x: None # UnloadSamplesFromRam ) self.logger.info(f"Created {self.name} with {self.count} samples") - + def __del__(self): """Cleanup QSL.""" if self.qsl is not None: @@ -43,12 +43,12 @@ def __del__(self): class DistributedQuerySampleLibrary: """QuerySampleLibrary for distributed execution (MPI/torchrun).""" - + def __init__(self, dataset: List[List[int]], dataset_strings: List[str], rank: int, world_size: int, name: str = "DistributedQSL"): """ Initialize distributed QSL. - + Args: dataset: List of tokenized prompts dataset_strings: List of original prompt strings @@ -64,10 +64,10 @@ def __init__(self, dataset: List[List[int]], dataset_strings: List[str], self.world_size = world_size self.name = name self.logger = logging.getLogger(__name__) - + # Track if this is rank zero explicitly self.is_rank_zero = (self.rank == 0) - + # Only rank 0 creates the actual QSL if self.is_rank_zero: self.qsl = lg.ConstructQSL( @@ -76,12 +76,13 @@ def __init__(self, dataset: List[List[int]], dataset_strings: List[str], lambda x: None, lambda x: None ) - self.logger.info(f"Created {self.name} with {self.count} samples on rank 0") + self.logger.info( + f"Created {self.name} with {self.count} samples on rank 0") else: self.qsl = None - + def __del__(self): """Cleanup QSL on rank 0.""" if self.is_rank_zero and self.qsl is not None: lg.DestroyQSL(self.qsl) - self.logger.info(f"{self.name} destroyed on rank 0") \ No newline at end of file + self.logger.info(f"{self.name} destroyed on rank 0") diff --git a/language/deepseek-r1/mlperf/server_sut.py b/language/deepseek-r1/mlperf/server_sut.py index 75699e208f..e3acb2bde8 100644 --- a/language/deepseek-r1/mlperf/server_sut.py +++ b/language/deepseek-r1/mlperf/server_sut.py @@ -69,11 +69,15 @@ def __init__(self, self.dataset_strings = dataset_strings # Determine backend type using registry - self.backend_name = getattr(backend, 'backend_name', type(backend).__name__.lower()) + self.backend_name = getattr( + backend, + 'backend_name', + type(backend).__name__.lower()) self.uses_text_prompts = uses_text_input(self.backend_name) if self.uses_text_prompts and dataset_strings is None: - raise ValueError(f"Backend {self.backend_name} requires text prompts but dataset_strings was not provided") + raise ValueError( + f"Backend {self.backend_name} requires text prompts but dataset_strings was not provided") # Async event loop and thread self.loop = None @@ -91,8 +95,6 @@ def __init__(self, self.all_results: Dict[int, Dict[str, Any]] = {} self.results_lock = asyncio.Lock() - - def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: """Issue queries in streaming mode with batching.""" if not supports_streaming(): @@ -123,7 +125,8 @@ async def _start_streaming_query(self, query_info: QueryInfo) -> None: try: # Verify streaming support if not supports_streaming(): - raise RuntimeError(f"Backend {self.backend_name} does not support streaming required for server mode") + raise RuntimeError( + f"Backend {self.backend_name} does not support streaming required for server mode") # Prepare prompt based on backend type if self.uses_text_prompts: @@ -155,8 +158,10 @@ async def _start_streaming_query(self, query_info: QueryInfo) -> None: task.add_done_callback(self._remove_task_from_active) except Exception as e: - logger.error(f"Error starting stream for query {query_info.query_id}: {e}") - raise RuntimeError(f"Failed to start streaming for query {query_info.query_id}: {e}") + logger.error( + f"Error starting stream for query {query_info.query_id}: {e}") + raise RuntimeError( + f"Failed to start streaming for query {query_info.query_id}: {e}") def _remove_task_from_active(self, task: asyncio.Task) -> None: """Remove a completed task from the active set.""" @@ -181,7 +186,8 @@ async def _process_stream(self, state: StreamingQueryState) -> None: state.accumulated_tokens.extend(chunk.token_ids) # Report first token immediately for TTFT measurement - if not state.first_token_sent and (chunk.token or chunk.token_ids): + if not state.first_token_sent and ( + chunk.token or chunk.token_ids): state.first_token_time = current_time - state.start_time state.first_token_sent = True @@ -197,35 +203,43 @@ async def _process_stream(self, state: StreamingQueryState) -> None: except asyncio.CancelledError: # Task was cancelled, clean up gracefully - logger.debug(f"Stream processing cancelled for query {state.query_info.query_id}") - # Close the async generator properly (assume aclose exists in our containerized environment) + logger.debug( + f"Stream processing cancelled for query {state.query_info.query_id}") + # Close the async generator properly (assume aclose exists in our + # containerized environment) try: await state.stream_gen.aclose() except Exception: pass raise except Exception as e: - logger.error(f"Error processing stream for query {state.query_info.query_id}: {e}") - raise RuntimeError(f"Stream processing failed for query {state.query_info.query_id}: {e}") + logger.error( + f"Error processing stream for query {state.query_info.query_id}: {e}") + raise RuntimeError( + f"Stream processing failed for query {state.query_info.query_id}: {e}") finally: # Clean up active stream async with self.active_streams_lock: self.active_streams.pop(state.query_info.query_id, None) - async def _send_first_token_response(self, state: StreamingQueryState) -> None: + async def _send_first_token_response( + self, state: StreamingQueryState) -> None: """Send first token notification to LoadGen for TTFT measurement.""" - logger.debug(f"First token received for query {state.query_info.query_id} at {state.first_token_time:.3f}s") + logger.debug( + f"First token received for query {state.query_info.query_id} at {state.first_token_time:.3f}s") # Convert first tokens to proper format for LoadGen if state.accumulated_tokens: - output_tokens = np.ascontiguousarray(state.accumulated_tokens, dtype=np.int32) + output_tokens = np.ascontiguousarray( + state.accumulated_tokens, dtype=np.int32) else: # If no token IDs available, encode the text if hasattr(self.backend, 'tokenizer') and state.accumulated_text: tokens = self.backend.tokenizer.encode(state.accumulated_text) output_tokens = np.ascontiguousarray(tokens, dtype=np.int32) else: - raise RuntimeError(f"No token IDs available for first token response for query {state.query_info.query_id}") + raise RuntimeError( + f"No token IDs available for first token response for query {state.query_info.query_id}") output_seq_len = len(output_tokens) output_toks_ptr = output_tokens.ctypes.data if output_seq_len > 0 else 0 @@ -248,22 +262,25 @@ async def _send_final_response(self, state: StreamingQueryState) -> None: if state.accumulated_tokens: # Create a copy of tokens before numpy conversion tokens_to_send = state.accumulated_tokens.copy() - token_array = np.array(state.accumulated_tokens, dtype=np.int32) + token_array = np.array( + state.accumulated_tokens, dtype=np.int32) else: # If no tokens, encode the text - if hasattr(self.backend, 'tokenizer') and state.accumulated_text: - tokens = self.backend.tokenizer.encode(state.accumulated_text) + if hasattr(self.backend, + 'tokenizer') and state.accumulated_text: + tokens = self.backend.tokenizer.encode( + state.accumulated_text) # Create a copy of tokens before numpy conversion tokens_to_send = tokens.copy() token_array = np.array(tokens, dtype=np.int32) else: - raise RuntimeError(f"No tokens or tokenizer available for query {state.query_info.query_id}") + raise RuntimeError( + f"No tokens or tokenizer available for query {state.query_info.query_id}") # Validate we have tokens if len(token_array) == 0: - raise RuntimeError(f"No tokens generated for query {state.query_info.query_id}") - - + raise RuntimeError( + f"No tokens generated for query {state.query_info.query_id}") # Create LoadGen response response = lg.QuerySampleResponse( @@ -287,11 +304,14 @@ async def _send_final_response(self, state: StreamingQueryState) -> None: } self.all_results[state.query_info.query_id] = state.query_info.result - logger.debug(f"Sent {len(token_array)} tokens to LoadGen for query {state.query_info.query_id}") + logger.debug( + f"Sent {len(token_array)} tokens to LoadGen for query {state.query_info.query_id}") except Exception as e: - logger.error(f"Error sending final response for query {state.query_info.query_id}: {e}") - raise RuntimeError(f"Failed to send final response for query {state.query_info.query_id}: {e}") + logger.error( + f"Error sending final response for query {state.query_info.query_id}: {e}") + raise RuntimeError( + f"Failed to send final response for query {state.query_info.query_id}: {e}") def flush_queries(self) -> None: """Wait for all active streams to complete.""" @@ -313,13 +333,16 @@ async def wait_for_streams(): async with self.active_streams_lock: if self.active_streams: - logger.warning(f"Timeout: {len(self.active_streams)} streams still active") + logger.warning( + f"Timeout: {len(self.active_streams)} streams still active") # Run the wait task in the event loop if self.loop and not self.loop.is_closed(): - future = asyncio.run_coroutine_threadsafe(wait_for_streams(), self.loop) + future = asyncio.run_coroutine_threadsafe( + wait_for_streams(), self.loop) try: - future.result(timeout=310) # Slightly longer than internal timeout + # Slightly longer than internal timeout + future.result(timeout=310) except Exception as e: logger.error(f"Error waiting for streams to complete: {e}") @@ -352,7 +375,8 @@ async def cancel_all_tasks(): tasks_to_cancel = list(self.active_tasks) if tasks_to_cancel: - logger.info(f"Cancelling {len(tasks_to_cancel)} active streaming tasks...") + logger.info( + f"Cancelling {len(tasks_to_cancel)} active streaming tasks...") for task in tasks_to_cancel: task.cancel() @@ -365,7 +389,8 @@ async def cancel_all_tasks(): self.active_tasks.clear() # Run the cancellation in the event loop - future = asyncio.run_coroutine_threadsafe(cancel_all_tasks(), self.loop) + future = asyncio.run_coroutine_threadsafe( + cancel_all_tasks(), self.loop) try: future.result(timeout=10.0) # Give tasks time to cancel except Exception as e: @@ -405,10 +430,12 @@ def get_results(self) -> List[Dict[str, Any]]: # Only process results for samples that were actually queried # Sort by index to maintain dataset order queried_indices = sorted(index_to_result.keys()) - - logger.info(f"Retrieving results for {len(queried_indices)} queried samples") - # Process results in order of dataset indices using stored backend results + logger.info( + f"Retrieving results for {len(queried_indices)} queried samples") + + # Process results in order of dataset indices using stored backend + # results for i in queried_indices: result = index_to_result[i] @@ -416,7 +443,8 @@ def get_results(self) -> List[Dict[str, Any]]: tokens = result['tokens'] output_text = result.get('text', '') if not output_text and self.backend.tokenizer: - output_text = self.backend.tokenizer.decode(result['tokens'], skip_special_tokens=True) + output_text = self.backend.tokenizer.decode( + result['tokens'], skip_special_tokens=True) ordered_results.append({ 'model_output': output_text, @@ -424,4 +452,4 @@ def get_results(self) -> List[Dict[str, Any]]: 'tok_model_output_len': len(tokens) }) - return ordered_results \ No newline at end of file + return ordered_results diff --git a/language/deepseek-r1/mlperf/utils.py b/language/deepseek-r1/mlperf/utils.py index 973e46c201..f4fbedda41 100644 --- a/language/deepseek-r1/mlperf/utils.py +++ b/language/deepseek-r1/mlperf/utils.py @@ -7,15 +7,15 @@ from utils.tokenization import StandardTokenizer -def prepare_mlperf_dataset(input_file: str, - backend_name: Optional[str] = None, - tokenizer: StandardTokenizer = None, - num_samples: Optional[int] = None, - skip_samples: int = 0, - use_chat_template: Optional[bool] = None) -> Dict[str, Any]: +def prepare_mlperf_dataset(input_file: str, + backend_name: Optional[str] = None, + tokenizer: StandardTokenizer = None, + num_samples: Optional[int] = None, + skip_samples: int = 0, + use_chat_template: Optional[bool] = None) -> Dict[str, Any]: """ Prepare dataset for MLPerf inference. - + Args: input_file: Path to input pickle file backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. @@ -24,29 +24,30 @@ def prepare_mlperf_dataset(input_file: str, num_samples: Number of samples to use skip_samples: Number of samples to skip use_chat_template: Whether to use chat template (if None, determined by registry) - + Returns: Dictionary with prepared dataset components """ if backend_name is None: from utils.backend_registry import detect_backend backend_name = detect_backend() - + # Load and validate dataset df = load_dataset(input_file, num_samples, skip_samples) validate_dataset(df) - + prompts = df['text_input'].tolist() print(f"[MLPerf] Loaded {len(prompts)} prompts from dataset") - + # Check if backend uses text prompts from registry uses_text_prompts = uses_text_input() - + # Determine chat template usage from registry if not specified if use_chat_template is None: use_chat_template = uses_chat_template() - print(f"[MLPerf] Using chat template from registry: {use_chat_template}") - + print( + f"[MLPerf] Using chat template from registry: {use_chat_template}") + if uses_text_prompts: print(f"[MLPerf] Backend {backend_name} uses text prompts directly") return { @@ -62,7 +63,7 @@ def prepare_mlperf_dataset(input_file: str, prompts, use_chat_template ) print(f"[MLPerf] Tokenized {len(tokenized_prompts)} prompts") - + return { 'dataframe': df, 'prompts': prompts, @@ -73,61 +74,63 @@ def prepare_mlperf_dataset(input_file: str, def process_mlperf_results(sut_results: List[Dict[str, Any]], - tokenizer: Optional[StandardTokenizer] = None, - backend_name: Optional[str] = None, - uses_text_prompts: Optional[bool] = None) -> List[Dict[str, Any]]: + tokenizer: Optional[StandardTokenizer] = None, + backend_name: Optional[str] = None, + uses_text_prompts: Optional[bool] = None) -> List[Dict[str, Any]]: """ Process MLPerf SUT results into standardized format. - + Args: sut_results: Raw results from MLPerf SUT tokenizer: StandardTokenizer for decoding backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. (Kept for backward compatibility but not used in our codebase) uses_text_prompts: Whether backend uses text prompts (if None, determined by registry) - + Returns: List of processed result dictionaries """ from utils.tokenization import process_inference_results - + if backend_name is None: from utils.backend_registry import detect_backend backend_name = detect_backend() - + # Determine text prompt usage from registry if not specified if uses_text_prompts is None: uses_text_prompts = uses_text_input() - + # Reuse the general inference result processing - return process_inference_results(sut_results, tokenizer, uses_text_prompts=uses_text_prompts) + return process_inference_results( + sut_results, tokenizer, uses_text_prompts=uses_text_prompts) def create_mlperf_output_dataframe(input_df: pd.DataFrame, - results: List[Dict[str, Any]], - backend_name: Optional[str] = None) -> pd.DataFrame: + results: List[Dict[str, Any]], + backend_name: Optional[str] = None) -> pd.DataFrame: """ Create output dataframe with MLPerf results. - + Args: input_df: Input dataframe results: Processed MLPerf results backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. (Kept for backward compatibility but not used in our codebase) - + Returns: Output dataframe with results """ if backend_name is None: from utils.backend_registry import detect_backend backend_name = detect_backend() - + df_output = input_df.copy() - + # Add result columns df_output['model_output'] = [r['model_output'] for r in results] df_output['tok_model_output'] = [r['tok_model_output'] for r in results] - df_output['tok_model_output_len'] = [r['tok_model_output_len'] for r in results] + df_output['tok_model_output_len'] = [ + r['tok_model_output_len'] for r in results] df_output['model_backend'] = backend_name - - return df_output \ No newline at end of file + + return df_output diff --git a/language/deepseek-r1/run_eval.py b/language/deepseek-r1/run_eval.py index 169b3473e4..8965101bd4 100755 --- a/language/deepseek-r1/run_eval.py +++ b/language/deepseek-r1/run_eval.py @@ -1,4 +1,13 @@ #!/usr/bin/env python3 +from utils import ( + load_dataset, save_results, validate_dataset, generate_timestamped_filename, + validate_runner_for_backend, uses_text_input, uses_chat_template, + StandardTokenizer, process_inference_results, + get_backend_instance, create_base_argument_parser, print_runner_header, + setup_output_paths, validate_runner_args, handle_runner_error, + validate_dataset_extended, supports_async +) +from backends import BaseBackend import argparse import asyncio import os @@ -11,59 +20,51 @@ # Disable tokenizers parallelism to avoid forking issues os.environ["TOKENIZERS_PARALLELISM"] = "false" -from backends import BaseBackend -from utils import ( - load_dataset, save_results, validate_dataset, generate_timestamped_filename, - validate_runner_for_backend, uses_text_input, uses_chat_template, - StandardTokenizer, process_inference_results, - get_backend_instance, create_base_argument_parser, print_runner_header, - setup_output_paths, validate_runner_args, handle_runner_error, - validate_dataset_extended, supports_async -) - def create_argument_parser() -> argparse.ArgumentParser: """Create argument parser with shared arguments only.""" parser = create_base_argument_parser( "Modular backend evaluation system for MLPerf DeepSeek reference implementation" ) - + # Add runner-specific arguments parser.add_argument("--async", action="store_true", - help="Use async generation instead of synchronous") - + help="Use async generation instead of synchronous") + return parser -async def run_async_inference(backend: BaseBackend, - tokenized_prompts: List[List[int]], - text_prompts: Optional[List[str]] = None) -> List[Dict[str, Any]]: +async def run_async_inference(backend: BaseBackend, + tokenized_prompts: List[List[int]], + text_prompts: Optional[List[str]] = None) -> List[Dict[str, Any]]: """Run async inference with proper error handling and progress bar that updates as tasks complete.""" try: # Get futures from backend if uses_text_input(): futures = backend.generate_async(text_prompts=text_prompts) else: - futures = backend.generate_async(tokenized_prompts=tokenized_prompts) - + futures = backend.generate_async( + tokenized_prompts=tokenized_prompts) + # Create a list to store results in order results = [None] * len(futures) - + # Create enumerated futures with their original indices for tracking indexed_futures = [(i, future) for i, future in enumerate(futures)] - + # Track completion for debugging completed_indices = set() - + # Process tasks with progress bar that updates as tasks complete with async_tqdm(total=len(futures), desc="Async inference", unit="prompt") as pbar: - # Use asyncio.wait with FIRST_COMPLETED to handle out-of-order completion + # Use asyncio.wait with FIRST_COMPLETED to handle out-of-order + # completion pending = {future for _, future in indexed_futures} - + while pending: # Wait for at least one future to complete done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED) - + # Process all completed futures in this batch for completed_future in done: # Find the original index for this completed future @@ -72,46 +73,51 @@ async def run_async_inference(backend: BaseBackend, if future is completed_future: original_idx = idx break - + if original_idx is None: - print(f"\nWarning: Could not find original index for completed future") + print( + f"\nWarning: Could not find original index for completed future") continue - + # Check for duplicate completion if original_idx in completed_indices: - print(f"\nWarning: Prompt {original_idx} completed multiple times!") + print( + f"\nWarning: Prompt {original_idx} completed multiple times!") continue - + try: # Get the result from the completed future result = await completed_future - + # Store the result in the correct position results[original_idx] = result completed_indices.add(original_idx) - + except Exception as e: - print(f"\nError processing prompt {original_idx}: {type(e).__name__}: {e}") + print( + f"\nError processing prompt {original_idx}: {type(e).__name__}: {e}") import traceback traceback.print_exception(type(e), e, e.__traceback__) - + # Raise the error instead of using empty tokens - raise RuntimeError(f"Backend failed to generate tokens for prompt {original_idx}: {e}") - + raise RuntimeError( + f"Backend failed to generate tokens for prompt {original_idx}: {e}") + # Update progress bar after each completion pbar.update(1) - + # Verify all results are populated if len(completed_indices) != len(futures): missing_count = len(futures) - len(completed_indices) - raise RuntimeError(f"Missing results: completed {len(completed_indices)} != {len(futures)} total ({missing_count} missing)") - + raise RuntimeError( + f"Missing results: completed {len(completed_indices)} != {len(futures)} total ({missing_count} missing)") + for i, result in enumerate(results): if result is None: raise RuntimeError(f"Missing result for prompt {i}") - + print(f"\nCompleted all {len(completed_indices)} prompts successfully") - + return results except Exception as e: print(f"Error during async inference: {type(e).__name__}: {e}") @@ -120,9 +126,9 @@ async def run_async_inference(backend: BaseBackend, raise -def run_sync_inference(backend: BaseBackend, - tokenized_prompts: List[List[int]], - text_prompts: Optional[List[str]] = None) -> List[Dict[str, Any]]: +def run_sync_inference(backend: BaseBackend, + tokenized_prompts: List[List[int]], + text_prompts: Optional[List[str]] = None) -> List[Dict[str, Any]]: """Run sync inference with proper error handling.""" try: if uses_text_input(): @@ -140,46 +146,52 @@ def main(): # Parse arguments parser = create_argument_parser() args = parser.parse_args() - + try: # Validate arguments validate_runner_args(args, 'eval') - + # Detect backend early backend_name = validate_runner_for_backend('eval') - + # Set up output paths output_dir, output_file = setup_output_paths(args) if args.output_file is None: args.output_file = output_file - - # Generate the actual filename with timestamp that will be used for saving - actual_output_file = generate_timestamped_filename(args.output_file, add_timestamp=True) - + + # Generate the actual filename with timestamp that will be used for + # saving + actual_output_file = generate_timestamped_filename( + args.output_file, add_timestamp=True) + # Get async flag using getattr since 'async' is a reserved keyword use_async = getattr(args, 'async', False) - + # Check if backend supports async if use_async and not supports_async(): - raise RuntimeError(f"Backend {backend_name} does not support async generation") - + raise RuntimeError( + f"Backend {backend_name} does not support async generation") + # Print header - print_runner_header("Modular Backend Evaluation System", backend_name, args) + print_runner_header( + "Modular Backend Evaluation System", + backend_name, + args) print(f"Mode: {'Async' if use_async else 'Sync'}") print("=" * 80) - + # Load and validate dataset df = load_dataset(args.input_file, args.num_samples, args.skip_samples) validate_dataset_extended(df) - + prompts = df['text_input'].tolist() - + # Initialize tokenizer tokenizer = StandardTokenizer() - + # Determine whether to use chat template based on registry use_chat_template = uses_chat_template() - + # For text-prompt backends, we'll pass the prompts directly # For tokenized-prompt backends, we need to tokenize first if uses_text_input(): @@ -195,19 +207,19 @@ def main(): ) print(f"Tokenized {len(tokenized_prompts)} prompts") print(f"Tokenizer Max length: {tokenizer.max_length}") - + # Initialize backend using registry print(f"\nInitializing {backend_name.upper()} backend...") backend = get_backend_instance(backend_name) - + with backend: # Create new output dataframe with only required columns df_output = pd.DataFrame() - + # Copy all columns from input dataframe first for col in df.columns: df_output[col] = df[col] - + # Run inference with appropriate prompt format if use_async: print("Running async inference...") @@ -217,26 +229,31 @@ def main(): print("Running sync inference...") raw_results = run_sync_inference( backend, tokenized_prompts, text_prompts=prompts) - + # Process raw results into standardized format using shared utility print("Processing results...") standardized_results = process_inference_results( raw_results, tokenizer ) - + # Add generated columns - df_output['model_output'] = [r['model_output'] for r in standardized_results] - df_output['tok_model_output'] = [r['tok_model_output'] for r in standardized_results] - df_output['tok_model_output_len'] = [r['tok_model_output_len'] for r in standardized_results] - df_output['model_backend'] = [r['model_backend'] for r in standardized_results] - + df_output['model_output'] = [r['model_output'] + for r in standardized_results] + df_output['tok_model_output'] = [r['tok_model_output'] + for r in standardized_results] + df_output['tok_model_output_len'] = [ + r['tok_model_output_len'] for r in standardized_results] + df_output['model_backend'] = [r['model_backend'] + for r in standardized_results] + # Save results - output_file = save_results(df_output, args.output_file, add_timestamp=True) - + output_file = save_results( + df_output, args.output_file, add_timestamp=True) + print(f"\nEvaluation completed successfully!") print(f"Results saved to: {output_file}") print(f"Output columns: {list(df_output.columns)}") - + except KeyboardInterrupt: print("\nEvaluation interrupted by user") sys.exit(1) @@ -245,4 +262,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/language/deepseek-r1/run_eval_mpi.py b/language/deepseek-r1/run_eval_mpi.py index 37425526e1..4edeae5f8f 100644 --- a/language/deepseek-r1/run_eval_mpi.py +++ b/language/deepseek-r1/run_eval_mpi.py @@ -1,4 +1,11 @@ #!/usr/bin/env python3 +from backends import BaseBackend +from utils.data_utils import load_dataset +from utils.validation import validate_runner_args, ValidationError +from utils.runner_utils import create_base_argument_parser, print_runner_header +from utils.backend_registry import uses_chat_template, get_backend_instance, detect_backend, validate_runner_for_backend +from utils import save_results, generate_timestamped_filename, StandardTokenizer +from backends.pytorch_backend import PyTorchBackend import os import sys import argparse @@ -11,13 +18,6 @@ # Import utilities and backend registry sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from backends.pytorch_backend import PyTorchBackend -from utils import save_results, generate_timestamped_filename, StandardTokenizer -from utils.backend_registry import uses_chat_template, get_backend_instance, detect_backend, validate_runner_for_backend -from utils.runner_utils import create_base_argument_parser, print_runner_header -from utils.validation import validate_runner_args, ValidationError -from utils.data_utils import load_dataset -from backends import BaseBackend def main( @@ -41,7 +41,7 @@ def main( # Detect backend from environment backend_name = detect_backend() - + # Validate backend validate_runner_for_backend('eval_mpi') @@ -49,7 +49,8 @@ def main( use_chat_template = uses_chat_template() # Generate the actual filename with timestamp that will be used for saving - actual_output_file = generate_timestamped_filename(output_pickle_path, add_timestamp=True) + actual_output_file = generate_timestamped_filename( + output_pickle_path, add_timestamp=True) if rank == 0: _print("=" * 80) @@ -63,13 +64,14 @@ def main( _print(f"Sample limit: {num_samples}") if skip_samples: _print(f"Skip samples: {skip_samples}") - _print(f"Chat template: {'enabled' if use_chat_template else 'disabled'} (from registry)") + _print( + f"Chat template: {'enabled' if use_chat_template else 'disabled'} (from registry)") _print("=" * 80) # Initialize PyTorch backend backend = PyTorchBackend() backend.initialize() - + # Initialize StandardTokenizer tokenizer = StandardTokenizer() @@ -82,12 +84,14 @@ def main( _print(f"Loading input DataFrame from {input_pickle_path}...") try: df_for_results = pd.read_pickle(input_pickle_path) - _print(f"Loaded DataFrame with {len(df_for_results)} rows and columns: {df_for_results.columns.tolist()}") - + _print( + f"Loaded DataFrame with {len(df_for_results)} rows and columns: {df_for_results.columns.tolist()}") + # Apply skip_samples if specified if skip_samples > 0: if skip_samples >= len(df_for_results): - _print(f"Error: skip_samples ({skip_samples}) is greater than or equal to total samples ({len(df_for_results)})") + _print( + f"Error: skip_samples ({skip_samples}) is greater than or equal to total samples ({len(df_for_results)})") backend.shutdown() if world_size > 1: dist.destroy_process_group() @@ -96,14 +100,15 @@ def main( df_for_results = df_for_results.iloc[skip_samples:].copy() # Reset index to ensure sequential indices starting from 0 df_for_results = df_for_results.reset_index(drop=True) - + # Apply num_samples limit if specified if num_samples is not None and num_samples < len(df_for_results): - _print(f"Limiting to first {num_samples} samples (out of {len(df_for_results)} total after skipping)") + _print( + f"Limiting to first {num_samples} samples (out of {len(df_for_results)} total after skipping)") df_for_results = df_for_results.head(num_samples).copy() # Reset index to ensure sequential indices starting from 0 df_for_results = df_for_results.reset_index(drop=True) - + except Exception as e: _print(f"Error loading input pickle file: {e}") backend.shutdown() @@ -119,21 +124,25 @@ def main( return prompts_text_list = df_for_results['text_input'].tolist() - _print(f"Extracted {len(prompts_text_list)} prompts from 'text_input' column.") + _print( + f"Extracted {len(prompts_text_list)} prompts from 'text_input' column.") # Pre-initialize output columns df_for_results['model_output'] = "" df_for_results['tok_model_output'] = None - df_for_results['tok_model_output'] = df_for_results['tok_model_output'].astype('object') + df_for_results['tok_model_output'] = df_for_results['tok_model_output'].astype( + 'object') df_for_results['tok_model_output_len'] = 0 df_for_results['model_backend'] = backend_name # Broadcast the number of prompts to all ranks if world_size > 1: if rank == 0: - num_prompts_tensor = torch.tensor(len(prompts_text_list), dtype=torch.long, device="cuda") + num_prompts_tensor = torch.tensor( + len(prompts_text_list), dtype=torch.long, device="cuda") else: - num_prompts_tensor = torch.empty(1, dtype=torch.long, device="cuda") + num_prompts_tensor = torch.empty( + 1, dtype=torch.long, device="cuda") dist.broadcast(num_prompts_tensor, src=0) num_total_prompts = num_prompts_tensor.item() else: @@ -148,13 +157,14 @@ def main( current_batch_prompt_tokens = None if rank == 0: - current_batch_prompt_texts = prompts_text_list[i:i+batch_size] + current_batch_prompt_texts = prompts_text_list[i:i + batch_size] # Tokenize on rank 0 using StandardTokenizer current_batch_prompt_tokens, _ = tokenizer.tokenize_prompts( current_batch_prompt_texts, use_chat_template ) - - _print(f"Processing batch {current_batch_num}, size {len(current_batch_prompt_tokens)}") + + _print( + f"Processing batch {current_batch_num}, size {len(current_batch_prompt_tokens)}") # All ranks call generate_batch_distributed generated_tokens_for_batch = backend.generate_batch_distributed( @@ -164,12 +174,14 @@ def main( if rank == 0: # Validate that we received valid tokens if not generated_tokens_for_batch: - raise RuntimeError(f"Backend returned empty tokens for batch {current_batch_num}") - + raise RuntimeError( + f"Backend returned empty tokens for batch {current_batch_num}") + for batch_idx, tokens in enumerate(generated_tokens_for_batch): if not isinstance(tokens, (list, tuple)) or len(tokens) == 0: - raise RuntimeError(f"Backend returned empty or invalid tokens for batch {current_batch_num}, item {batch_idx}: {tokens}") - + raise RuntimeError( + f"Backend returned empty or invalid tokens for batch {current_batch_num}, item {batch_idx}: {tokens}") + # Decode tokens to text using StandardTokenizer decoded_texts_for_batch = tokenizer.batch_decode( generated_tokens_for_batch @@ -183,23 +195,36 @@ def main( original_df_idx = start_index_in_df + batch_idx if original_df_idx < len(df_for_results): # Use at for assignments with list values - df_for_results.at[original_df_idx, 'model_output'] = decoded_texts_for_batch[batch_idx] - df_for_results.at[original_df_idx, 'tok_model_output'] = generated_tokens_for_batch[batch_idx] - df_for_results.at[original_df_idx, 'tok_model_output_len'] = len(generated_tokens_for_batch[batch_idx]) + df_for_results.at[original_df_idx, + 'model_output'] = decoded_texts_for_batch[batch_idx] + df_for_results.at[original_df_idx, + 'tok_model_output'] = generated_tokens_for_batch[batch_idx] + df_for_results.at[original_df_idx, 'tok_model_output_len'] = len( + generated_tokens_for_batch[batch_idx]) _print(f"Batch {current_batch_num} completed.") if rank == 0 and df_for_results is not None: _print(f"All batches processed. Saving results...") - + # Keep only required columns in the same order as run_eval.py - output_columns = ['text_input', 'ground_truth', 'question', 'dataset', 'model_output', 'tok_model_output', 'tok_model_output_len', 'model_backend'] + output_columns = [ + 'text_input', + 'ground_truth', + 'question', + 'dataset', + 'model_output', + 'tok_model_output', + 'tok_model_output_len', + 'model_backend'] # Filter to only columns that exist - output_columns = [col for col in output_columns if col in df_for_results.columns] + output_columns = [ + col for col in output_columns if col in df_for_results.columns] df_output = df_for_results[output_columns] - + try: - saved_file = save_results(df_output, output_pickle_path, add_timestamp=True) + saved_file = save_results( + df_output, output_pickle_path, add_timestamp=True) _print(f"Successfully saved results to {saved_file}") except Exception as e: _print(f"Error saving output pickle file: {e}") @@ -234,4 +259,4 @@ def main( args.output_file, args.num_samples, args.skip_samples, - ) \ No newline at end of file + ) diff --git a/language/deepseek-r1/run_mlperf.py b/language/deepseek-r1/run_mlperf.py index 7f484e725e..2345cf5b9b 100755 --- a/language/deepseek-r1/run_mlperf.py +++ b/language/deepseek-r1/run_mlperf.py @@ -1,4 +1,23 @@ #!/usr/bin/env python3 +from eval_accuracy import process_dataframe, print_evaluation_results, process_and_save_dataframe, process_mlperf_log_accuracy +from utils import ( + validate_runner_for_backend, uses_text_input, uses_chat_template, + load_dataset, save_results, print_runner_header, StandardTokenizer, + get_backend_instance, create_base_argument_parser, + setup_output_paths, validate_runner_args, handle_runner_error, + validate_dataset_extended, generate_timestamped_filename +) +from mlperf import ( + OfflineSUT, ServerSUT, BaseSUT, + QuerySampleLibrary, + prepare_mlperf_dataset, + process_mlperf_results, + create_mlperf_output_dataframe +) +from backends import BaseBackend +import pandas as pd +import numpy as np +import mlperf_loadgen as lg import argparse import json import logging @@ -10,26 +29,6 @@ # Disable tokenizers parallelism to avoid forking issues os.environ["TOKENIZERS_PARALLELISM"] = "false" -import mlperf_loadgen as lg -import numpy as np -import pandas as pd - -from backends import BaseBackend -from mlperf import ( - OfflineSUT, ServerSUT, BaseSUT, - QuerySampleLibrary, - prepare_mlperf_dataset, - process_mlperf_results, - create_mlperf_output_dataframe -) -from utils import ( - validate_runner_for_backend, uses_text_input, uses_chat_template, - load_dataset, save_results, print_runner_header, StandardTokenizer, - get_backend_instance, create_base_argument_parser, - setup_output_paths, validate_runner_args, handle_runner_error, - validate_dataset_extended, generate_timestamped_filename -) -from eval_accuracy import process_dataframe, print_evaluation_results, process_and_save_dataframe, process_mlperf_log_accuracy # Configure logging logging.basicConfig( @@ -47,39 +46,39 @@ def create_argument_parser() -> argparse.ArgumentParser: # Scenario selection (no backend argument, auto-detected) parser.add_argument("--mode", type=str, default="offline", - choices=["offline", "server"], - help="MLPerf scenario mode") + choices=["offline", "server"], + help="MLPerf scenario mode") # MLPerf configuration parser.add_argument("--mlperf-conf", type=str, default="/inference/mlperf.conf", - help="Path to MLPerf configuration file") + help="Path to MLPerf configuration file") parser.add_argument("--user-conf", type=str, default="mlperf/user.conf", - help="Path to user configuration file") + help="Path to user configuration file") parser.add_argument("--scenario", type=str, default=None, - choices=["Offline", "Server"], - help="MLPerf scenario (overrides --mode)") + choices=["Offline", "Server"], + help="MLPerf scenario (overrides --mode)") parser.add_argument("--accuracy", action="store_true", - help="Run accuracy mode instead of performance") + help="Run accuracy mode instead of performance") # Output configuration parser.add_argument("--output-dir", type=str, default="mlperf_results", - help="Directory for MLPerf output logs") + help="Directory for MLPerf output logs") parser.add_argument("--log-dir", type=str, default=None, - help="Directory for detailed logs") + help="Directory for detailed logs") return parser def configure_loadgen(scenario: str, - accuracy_mode: bool, - mlperf_conf: Optional[str] = None, - user_conf: Optional[str] = None, - log_dir: Optional[str] = None, - model_name: str = "deepseek-r1") -> lg.TestSettings: + accuracy_mode: bool, + mlperf_conf: Optional[str] = None, + user_conf: Optional[str] = None, + log_dir: Optional[str] = None, + model_name: str = "deepseek-r1") -> lg.TestSettings: """Configure LoadGen test settings. Args: @@ -119,9 +118,9 @@ def configure_loadgen(scenario: str, def run_loadgen_test(sut: Union[OfflineSUT, ServerSUT], - qsl: QuerySampleLibrary, - settings: lg.TestSettings, - log_settings: lg.LogSettings) -> None: + qsl: QuerySampleLibrary, + settings: lg.TestSettings, + log_settings: lg.LogSettings) -> None: """Run LoadGen test. Args: @@ -162,7 +161,8 @@ def main(): if args.log_dir: log_dir = Path(args.log_dir) else: - log_dir = output_dir / args.mode / ("accuracy" if args.accuracy else "performance") + log_dir = output_dir / args.mode / \ + ("accuracy" if args.accuracy else "performance") log_dir.mkdir(parents=True, exist_ok=True) # Set up output paths with mode information @@ -170,17 +170,21 @@ def main(): if args.output_file is None: # Create output file path in the log directory mode_str = "accuracy" if args.accuracy else "performance" - output_file_base = str(log_dir / f"{backend_name}_mlperf_{args.mode}_{mode_str}_output.pkl") + output_file_base = str( + log_dir / f"{backend_name}_mlperf_{args.mode}_{mode_str}_output.pkl") else: output_file_base = args.output_file - # Generate the actual filename with timestamp that will be used for saving - actual_output_file = generate_timestamped_filename(output_file_base, add_timestamp=True) + # Generate the actual filename with timestamp that will be used for + # saving + actual_output_file = generate_timestamped_filename( + output_file_base, add_timestamp=True) # Ensure the parent directory of the output file exists output_file_parent = Path(actual_output_file).parent output_file_parent.mkdir(parents=True, exist_ok=True) - logger.info(f"Ensured output file directory exists: {output_file_parent}") + logger.info( + f"Ensured output file directory exists: {output_file_parent}") logger.info("=" * 80) logger.info("MLPerf Inference Benchmark Runner (Async Pattern)") @@ -220,13 +224,14 @@ def main(): # For backends that use text prompts, we pass the processed strings # For tokenized backends, we pass the tokenized prompts if uses_text_prompts: - logger.info(f"Backend {backend_name} will use text prompts directly") + logger.info( + f"Backend {backend_name} will use text prompts directly") dataset_for_sut = tokenized_prompts strings_for_sut = processed_strings else: logger.info(f"Backend {backend_name} will use tokenized prompts") dataset_for_sut = tokenized_prompts - strings_for_sut = processed_strings # This is what gets used for generation now + strings_for_sut = processed_strings # This is what gets used for generation now # Create backend using registry logger.info(f"Initializing {backend_name} backend...") @@ -315,7 +320,8 @@ def main(): try: # Get results from SUT - must have valid results if not sut_results: - raise RuntimeError("No results available from SUT - backend failed to generate tokens") + raise RuntimeError( + "No results available from SUT - backend failed to generate tokens") # Process results using new utility processed_results = process_mlperf_results( @@ -347,16 +353,19 @@ def main(): mlperf_log_file = log_dir / "mlperf_log_accuracy.json" if mlperf_log_file.exists(): - logger.info(f"Found MLPerf log accuracy file: {mlperf_log_file}") + logger.info( + f"Found MLPerf log accuracy file: {mlperf_log_file}") logger.info("Using MLPerf log for accuracy evaluation...") # Get checkpoint path from backend configuration backend_config = get_backend_instance(backend_name).config # Determine checkpoint path based on backend type - if hasattr(get_backend_instance(backend_name), 'model_path'): + if hasattr(get_backend_instance( + backend_name), 'model_path'): # PyTorch backend has model_path - checkpoint_path = str(get_backend_instance(backend_name).model_path) + checkpoint_path = str( + get_backend_instance(backend_name).model_path) elif 'model' in backend_config: # Other backends use model name directly checkpoint_path = backend_config['model'] @@ -376,10 +385,13 @@ def main(): base_filename="mlperf_accuracy_evaluated.pkl" ) - logger.info(f"MLPerf accuracy evaluation saved to: {evaluated_file}") + logger.info( + f"MLPerf accuracy evaluation saved to: {evaluated_file}") else: - logger.info("No MLPerf log accuracy file found, using standard DataFrame evaluation...") - raise RuntimeError("No MLPerf log accuracy file found, using standard DataFrame evaluation...") + logger.info( + "No MLPerf log accuracy file found, using standard DataFrame evaluation...") + raise RuntimeError( + "No MLPerf log accuracy file found, using standard DataFrame evaluation...") # Ensure clean exit gc.collect() @@ -397,4 +409,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/language/deepseek-r1/run_mlperf_mpi.py b/language/deepseek-r1/run_mlperf_mpi.py index 66196645ed..176be41710 100755 --- a/language/deepseek-r1/run_mlperf_mpi.py +++ b/language/deepseek-r1/run_mlperf_mpi.py @@ -1,4 +1,37 @@ #!/usr/bin/env python3 +from eval_accuracy import process_dataframe, print_evaluation_results, process_and_save_dataframe, process_mlperf_log_accuracy +from utils.data_utils import ( + load_dataset, save_results, + generate_timestamped_filename +) +from utils.validation import ( + validate_runner_args, ValidationError, + validate_dataset_extended +) +from utils.backend_registry import ( + uses_chat_template, get_backend_instance, detect_backend, + validate_runner_for_backend +) +from utils.runner_utils import create_base_argument_parser, print_runner_header +from utils import ( + StandardTokenizer, + validate_dataset, + process_inference_results +) +from mlperf import ( + OfflineSUT, ServerSUT, BaseSUT, + DistributedQuerySampleLibrary, + prepare_mlperf_dataset, + process_mlperf_results, + create_mlperf_output_dataframe +) +from backends.pytorch_backend import PyTorchBackend +from transformers import AutoTokenizer +import torch.distributed as dist +import torch +import pandas as pd +import numpy as np +import mlperf_loadgen as lg import argparse import json import logging @@ -12,41 +45,6 @@ # Disable tokenizers parallelism to avoid forking issues os.environ["TOKENIZERS_PARALLELISM"] = "false" -import mlperf_loadgen as lg -import numpy as np -import pandas as pd -import torch -import torch.distributed as dist -from transformers import AutoTokenizer - -from backends.pytorch_backend import PyTorchBackend -from mlperf import ( - OfflineSUT, ServerSUT, BaseSUT, - DistributedQuerySampleLibrary, - prepare_mlperf_dataset, - process_mlperf_results, - create_mlperf_output_dataframe -) -from utils import ( - StandardTokenizer, - validate_dataset, - process_inference_results -) -from utils.runner_utils import create_base_argument_parser, print_runner_header -from utils.backend_registry import ( - uses_chat_template, get_backend_instance, detect_backend, - validate_runner_for_backend -) -from utils.validation import ( - validate_runner_args, ValidationError, - validate_dataset_extended -) -from utils.data_utils import ( - load_dataset, save_results, - generate_timestamped_filename -) -from eval_accuracy import process_dataframe, print_evaluation_results, process_and_save_dataframe, process_mlperf_log_accuracy - # Configure logging - only for rank 0 def setup_logging(rank: int): @@ -119,7 +117,7 @@ def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: batch_size = self.backend.config['batch_size'] for i in range(0, len(query_samples), batch_size): - batch_samples = query_samples[i:i+batch_size] + batch_samples = query_samples[i:i + batch_size] # Prepare batch tokens batch_tokens = [] @@ -141,10 +139,12 @@ def issue_queries(self, query_samples: List[lg.QuerySample]) -> None: # Generate using distributed backend # This will broadcast to all ranks internally - generated_tokens = self.backend.generate_batch_distributed(batch_tokens) + generated_tokens = self.backend.generate_batch_distributed( + batch_tokens) # Process results and send to LoadGen - for j, (sample_id, tokens) in enumerate(zip(batch_ids, generated_tokens)): + for j, (sample_id, tokens) in enumerate( + zip(batch_ids, generated_tokens)): # Create a copy of tokens before numpy conversion tokens_copy = tokens.copy() @@ -219,7 +219,8 @@ def get_results(self) -> List[Dict[str, Any]]: # Decode tokens to get text output output_text = '' if self.backend.tokenizer: - output_text = self.backend.tokenizer.decode(tokens, skip_special_tokens=True) + output_text = self.backend.tokenizer.decode( + tokens, skip_special_tokens=True) ordered_results.append({ 'model_output': output_text, @@ -228,16 +229,16 @@ def get_results(self) -> List[Dict[str, Any]]: }) else: # Result exists but no tokens - this is an error - raise RuntimeError(f"No tokens in result for dataset index {i}, sample_id {sample_id}") + raise RuntimeError( + f"No tokens in result for dataset index {i}, sample_id {sample_id}") else: # No result for this index - this is an error - raise RuntimeError(f"No result for dataset index {i}, sample_id {sample_id}") + raise RuntimeError( + f"No result for dataset index {i}, sample_id {sample_id}") return ordered_results - - def create_argument_parser() -> argparse.ArgumentParser: """Create argument parser for distributed MLPerf runner.""" parser = argparse.ArgumentParser( @@ -247,44 +248,45 @@ def create_argument_parser() -> argparse.ArgumentParser: # Dataset arguments parser.add_argument("--input-file", type=str, - default="data/final_output.pkl", - help="Input pickle file with prompts") + default="data/final_output.pkl", + help="Input pickle file with prompts") # MLPerf configuration parser.add_argument("--mlperf-conf", type=str, default="/inference/mlperf.conf", - help="Path to MLPerf configuration file") + help="Path to MLPerf configuration file") parser.add_argument("--user-conf", type=str, default="mlperf/user.conf", - help="Path to user configuration file") + help="Path to user configuration file") parser.add_argument("--mode", type=str, default="offline", - choices=["offline", "server"], - help="MLPerf scenario mode (only offline supported for distributed)") + choices=["offline", "server"], + help="MLPerf scenario mode (only offline supported for distributed)") parser.add_argument("--accuracy", action="store_true", - help="Run accuracy mode instead of performance") + help="Run accuracy mode instead of performance") # Output configuration parser.add_argument("--output-dir", type=str, default="mlperf_results", - help="Directory for MLPerf output logs") + help="Directory for MLPerf output logs") parser.add_argument("--log-dir", type=str, default=None, - help="Directory for detailed logs") + help="Directory for detailed logs") parser.add_argument("--output-file", type=str, default=None, - help="Output pickle file path (auto-generated if not specified)") + help="Output pickle file path (auto-generated if not specified)") - # Note: --no-chat-template is removed (chat template usage determined by backend registry) + # Note: --no-chat-template is removed (chat template usage determined by + # backend registry) return parser def configure_loadgen(scenario: str, - accuracy_mode: bool, - mlperf_conf: Optional[str] = None, - user_conf: Optional[str] = None, - log_dir: Optional[str] = None, - model_name: str = "deepseek-r1") -> lg.TestSettings: + accuracy_mode: bool, + mlperf_conf: Optional[str] = None, + user_conf: Optional[str] = None, + log_dir: Optional[str] = None, + model_name: str = "deepseek-r1") -> lg.TestSettings: """Configure LoadGen test settings. Args: @@ -324,11 +326,11 @@ def configure_loadgen(scenario: str, def run_loadgen_test(sut: DistributedOfflineSUT, - qsl: DistributedQuerySampleLibrary, - settings: lg.TestSettings, - log_settings: lg.LogSettings, - rank: int, - logger) -> None: + qsl: DistributedQuerySampleLibrary, + settings: lg.TestSettings, + log_settings: lg.LogSettings, + rank: int, + logger) -> None: """Run LoadGen test (only on rank 0). Args: @@ -386,7 +388,8 @@ def main(): # Validate mode for distributed if args.mode != "offline": if rank == 0: - logger.error("Only offline mode is supported for distributed execution") + logger.error( + "Only offline mode is supported for distributed execution") sys.exit(1) # Create output directories (only rank 0) @@ -397,7 +400,8 @@ def main(): if args.log_dir: log_dir = Path(args.log_dir) else: - log_dir = output_dir / args.mode / ("accuracy" if args.accuracy else "performance") + log_dir = output_dir / args.mode / \ + ("accuracy" if args.accuracy else "performance") log_dir.mkdir(parents=True, exist_ok=True) # Determine output file path @@ -405,15 +409,18 @@ def main(): output_file_base = args.output_file else: mode_str = "accuracy" if args.accuracy else "performance" - output_file_base = str(log_dir / f"{backend_name}_mlperf_{args.mode}_{mode_str}_output.pkl") + output_file_base = str( + log_dir / f"{backend_name}_mlperf_{args.mode}_{mode_str}_output.pkl") # Generate the actual filename with timestamp - actual_output_file = generate_timestamped_filename(output_file_base, add_timestamp=True) + actual_output_file = generate_timestamped_filename( + output_file_base, add_timestamp=True) # Ensure the parent directory of the output file exists output_file_parent = Path(actual_output_file).parent output_file_parent.mkdir(parents=True, exist_ok=True) - logger.info(f"Ensured output file directory exists: {output_file_parent}") + logger.info( + f"Ensured output file directory exists: {output_file_parent}") logger.info("=" * 80) logger.info("MLPerf Inference Benchmark Runner (Distributed PyTorch)") @@ -425,7 +432,8 @@ def main(): logger.info(f"Input file: {args.input_file}") logger.info(f"Output directory: {output_dir}") logger.info(f"Output file: {actual_output_file}") - logger.info(f"Chat template: {'enabled' if use_chat_template else 'disabled'} (from registry)") + logger.info( + f"Chat template: {'enabled' if use_chat_template else 'disabled'} (from registry)") logger.info("=" * 80) else: log_dir = None @@ -460,7 +468,8 @@ def main(): tokenized_prompts = dataset_info['tokenized_prompts'] processed_strings = dataset_info['processed_strings'] - logger.info(f"Loaded {len(tokenized_prompts)} prompts from dataset") + logger.info( + f"Loaded {len(tokenized_prompts)} prompts from dataset") # Create SUT sut = DistributedOfflineSUT( @@ -511,7 +520,8 @@ def main(): if rank == 0: # Run test (only rank 0) logger.info("Running test...") - run_loadgen_test(sut, qsl, settings, log_settings, rank, logger) + run_loadgen_test( + sut, qsl, settings, log_settings, rank, logger) logger.info("Completed test...") # Ensure all queries are flushed and async operations complete @@ -524,7 +534,8 @@ def main(): dist.broadcast_object_list(exit_signal, src=0) else: # Non-rank 0 processes participate in distributed generation - # They wait for signals from rank 0 and participate in generate_batch_distributed + # They wait for signals from rank 0 and participate in + # generate_batch_distributed while True: # First, check if we should exit # We use a separate broadcast to signal exit @@ -536,7 +547,8 @@ def main(): break elif exit_check[0] == "generate": # Signal to participate in generation - # The actual batch tokens will be broadcast inside generate_batch_distributed + # The actual batch tokens will be broadcast inside + # generate_batch_distributed backend.generate_batch_distributed(None) # If exit_check[0] is None, continue waiting finally: @@ -563,9 +575,11 @@ def main(): try: # Get results from SUT (if available) - logger.info("Retrieving results from distributed SUT...") + logger.info( + "Retrieving results from distributed SUT...") sut_results = sut.get_results() - logger.info(f"Retrieved {len(sut_results)} results from distributed SUT") + logger.info( + f"Retrieved {len(sut_results)} results from distributed SUT") # Process results using new utility processed_results = process_mlperf_results( @@ -597,11 +611,19 @@ def main(): mlperf_log_file = log_dir / "mlperf_log_accuracy.json" if mlperf_log_file.exists(): - logger.info(f"Found MLPerf log accuracy file: {mlperf_log_file}") - logger.info("Using MLPerf log for accuracy evaluation...") - - # For PyTorch backend (only one supported in MPI), get model path - checkpoint_path = str(backend.model_path) if hasattr(backend, 'model_path') else backend.config.get('model_name', 'deepseek-ai/DeepSeek-R1') + logger.info( + f"Found MLPerf log accuracy file: {mlperf_log_file}") + logger.info( + "Using MLPerf log for accuracy evaluation...") + + # For PyTorch backend (only one supported in MPI), + # get model path + checkpoint_path = str( + backend.model_path) if hasattr( + backend, + 'model_path') else backend.config.get( + 'model_name', + 'deepseek-ai/DeepSeek-R1') # Process MLPerf log accuracy df_evaluated, evaluated_file = process_mlperf_log_accuracy( @@ -612,10 +634,13 @@ def main(): base_filename="mlperf_accuracy_evaluated.pkl" ) - logger.info(f"MLPerf accuracy evaluation saved to: {evaluated_file}") + logger.info( + f"MLPerf accuracy evaluation saved to: {evaluated_file}") else: - logger.info("No MLPerf log accuracy file found, using standard DataFrame evaluation...") - raise RuntimeError("No MLPerf log accuracy file found, using standard DataFrame evaluation...") + logger.info( + "No MLPerf log accuracy file found, using standard DataFrame evaluation...") + raise RuntimeError( + "No MLPerf log accuracy file found, using standard DataFrame evaluation...") except KeyboardInterrupt: if rank == 0: @@ -639,4 +664,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/language/deepseek-r1/utils/__init__.py b/language/deepseek-r1/utils/__init__.py index ce8d10e9fd..65f575da29 100644 --- a/language/deepseek-r1/utils/__init__.py +++ b/language/deepseek-r1/utils/__init__.py @@ -101,4 +101,4 @@ # Error handling 'handle_backend_error', 'handle_runner_error' -] \ No newline at end of file +] diff --git a/language/deepseek-r1/utils/backend_registry.py b/language/deepseek-r1/utils/backend_registry.py index 73aee587e6..aa8d62a18e 100644 --- a/language/deepseek-r1/utils/backend_registry.py +++ b/language/deepseek-r1/utils/backend_registry.py @@ -173,7 +173,8 @@ def validate_backend(backend: str) -> None: f"Unknown backend '{backend}'. Supported backends: {', '.join(supported_backends)}") -def _get_compatibility_error_message(backend: str, runner_type: str, compatible: List[str]) -> str: +def _get_compatibility_error_message( + backend: str, runner_type: str, compatible: List[str]) -> str: """ Generate error message for incompatible backend/runner combinations. @@ -401,7 +402,8 @@ def get_backend_instance(backend_name: Optional[str] = None): return backend_class() -def is_backend_compatible_with_runner(backend_name: Optional[str] = None, runner_type: str = None) -> bool: +def is_backend_compatible_with_runner( + backend_name: Optional[str] = None, runner_type: str = None) -> bool: """Check if a backend is compatible with a specific runner type. Args: @@ -441,7 +443,8 @@ def get_backend_env_vars(backend_name: Optional[str] = None) -> Dict[str, str]: # Get static env vars env_vars = BACKEND_REGISTRY[backend_name]['env_vars'].copy() - # Handle dynamic env vars (e.g., OMP_NUM_THREADS based on tensor_parallel_size) + # Handle dynamic env vars (e.g., OMP_NUM_THREADS based on + # tensor_parallel_size) if backend_name == 'vllm': config = get_backend_config(backend_name) env_vars['OMP_NUM_THREADS'] = str( @@ -461,4 +464,4 @@ def apply_backend_env_vars(backend_name: Optional[str] = None) -> None: env_vars = get_backend_env_vars(backend_name) for key, value in env_vars.items(): - os.environ[key] = value \ No newline at end of file + os.environ[key] = value diff --git a/language/deepseek-r1/utils/data_utils.py b/language/deepseek-r1/utils/data_utils.py index 80acb5c8ce..0eb4cd3dcd 100644 --- a/language/deepseek-r1/utils/data_utils.py +++ b/language/deepseek-r1/utils/data_utils.py @@ -15,54 +15,56 @@ from utils.validation import ValidationError, validate_dataset_extended -def generate_timestamped_filename(output_file: str, add_timestamp: bool = True) -> str: +def generate_timestamped_filename( + output_file: str, add_timestamp: bool = True) -> str: """ Generate the actual filename that will be used when saving, with timestamp if requested. - + Args: output_file: Base output file path add_timestamp: Whether to add timestamp to filename - + Returns: Actual filename that will be used for saving """ if not add_timestamp: return output_file - + timestamp_suffix = time.strftime("%Y%m%d_%H%M%S") base_name, ext = os.path.splitext(output_file) return f"{base_name}_{timestamp_suffix}{ext}" -def load_dataset(file_path: str, num_samples: Optional[int] = None, skip_samples: int = 0) -> pd.DataFrame: +def load_dataset( + file_path: str, num_samples: Optional[int] = None, skip_samples: int = 0) -> pd.DataFrame: """ Load dataset from pickle file. - + Args: file_path: Path to the pickle file num_samples: Optional limit on number of samples to load skip_samples: Number of samples to skip from the beginning - + Returns: Loaded DataFrame - + Raises: ValidationError: If file doesn't exist or validation fails Exception: If file can't be loaded """ if not os.path.exists(file_path): raise ValidationError(f"Input file not found: {file_path}") - + print(f"Loading dataset from {file_path}...") - + try: with open(file_path, "rb") as f: df = pd.read_pickle(f) except Exception as e: raise ValidationError(f"Failed to load dataset: {str(e)}") - + print(f"Loaded {len(df)} samples") - + # Skip samples if specified if skip_samples > 0: if skip_samples >= len(df): @@ -71,31 +73,33 @@ def load_dataset(file_path: str, num_samples: Optional[int] = None, skip_samples ) original_length = len(df) df = df.iloc[skip_samples:].reset_index(drop=True) - print(f"Skipped first {skip_samples} samples (from {original_length} total)") - + print( + f"Skipped first {skip_samples} samples (from {original_length} total)") + # Limit number of samples if specified if num_samples is not None: original_length = len(df) df = df.head(num_samples) - print(f"Limited to {len(df)} samples (from {original_length} total after skipping)") - + print( + f"Limited to {len(df)} samples (from {original_length} total after skipping)") + return df -def save_results(df: pd.DataFrame, - output_file: str, - add_timestamp: bool = True) -> str: +def save_results(df: pd.DataFrame, + output_file: str, + add_timestamp: bool = True) -> str: """ Save results DataFrame to pickle file. - + Args: df: DataFrame to save output_file: Output file path add_timestamp: Whether to add timestamp to filename - + Returns: Actual output file path used - + Raises: ValidationError: If save operation fails """ @@ -104,93 +108,99 @@ def save_results(df: pd.DataFrame, timestamp_suffix = time.strftime("%Y%m%d_%H%M%S") base_name, ext = os.path.splitext(output_file) output_file = f"{base_name}_{timestamp_suffix}{ext}" - + # Ensure output directory exists os.makedirs(os.path.dirname(output_file), exist_ok=True) - + print(f"Saving results to {output_file}...") - + # Reset index before saving df_to_save = df.reset_index(drop=True) - + try: with open(output_file, "wb") as f: pickle.dump(df_to_save, f) - print(f"Save completed: {len(df_to_save)} samples saved to {output_file}") + print( + f"Save completed: {len(df_to_save)} samples saved to {output_file}") except Exception as e: raise ValidationError(f"Failed to save results: {str(e)}") - + return output_file -def prepare_output_dataframe(input_df: pd.DataFrame, - backend_name: Optional[str] = None) -> pd.DataFrame: +def prepare_output_dataframe(input_df: pd.DataFrame, + backend_name: Optional[str] = None) -> pd.DataFrame: """ Prepare output DataFrame by cleaning up old columns. - + Args: input_df: Input DataFrame backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. - + Returns: Cleaned DataFrame ready for new results """ if backend_name is None: from utils.backend_registry import detect_backend backend_name = detect_backend() - + df_output = input_df.copy() - + # Define columns to drop (old model outputs and unwanted columns) columns_to_drop = [ # specify columns to drop here ] - + # Also drop any existing backend-specific columns - backend_columns = [col for col in df_output.columns if col.startswith(f'{backend_name}_')] + backend_columns = [ + col for col in df_output.columns if col.startswith(f'{backend_name}_')] columns_to_drop.extend(backend_columns) - + # Drop columns that exist df_output = df_output.drop( columns=[col for col in columns_to_drop if col in df_output.columns] ) - + return df_output -def add_standardized_columns(df: pd.DataFrame, - results: List[Dict[str, Any]], - tokenized_prompts: List[List[int]] = None) -> pd.DataFrame: +def add_standardized_columns(df: pd.DataFrame, + results: List[Dict[str, Any]], + tokenized_prompts: List[List[int]] = None) -> pd.DataFrame: """ Add standardized output columns to DataFrame. - + Args: df: Input DataFrame results: List of result dictionaries from backend tokenized_prompts: List of tokenized input prompts (deprecated, not used) - + Returns: DataFrame with added standardized columns """ # Add results columns with new naming convention df['model_output'] = [r.get('model_output', '') for r in results] df['tok_model_output'] = [r.get('tok_model_output', []) for r in results] - df['tok_model_output_len'] = [r.get('tok_model_output_len', 0) for r in results] + df['tok_model_output_len'] = [ + r.get( + 'tok_model_output_len', + 0) for r in results] df['model_backend'] = [r.get('model_backend', '') for r in results] - + return df -def validate_dataset(df: pd.DataFrame, backend_name: Optional[str] = None) -> None: +def validate_dataset(df: pd.DataFrame, + backend_name: Optional[str] = None) -> None: """ Validate that the dataset has required columns. - + Args: df: DataFrame to validate backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. - + Raises: ValidationError: If required columns are missing or validation fails """ # Use centralized validation function - validate_dataset_extended(df, backend_name) \ No newline at end of file + validate_dataset_extended(df, backend_name) diff --git a/language/deepseek-r1/utils/error_handling.py b/language/deepseek-r1/utils/error_handling.py index 54ca580135..6b588b9c20 100644 --- a/language/deepseek-r1/utils/error_handling.py +++ b/language/deepseek-r1/utils/error_handling.py @@ -5,17 +5,18 @@ from .validation import BackendError, ValidationError -def handle_backend_error(e: Exception, backend_name: str, operation: str) -> None: +def handle_backend_error(e: Exception, backend_name: str, + operation: str) -> None: """ Standardized error handling for backend operations. - + Args: e: The exception that occurred backend_name: Name of the backend operation: Description of the operation that failed """ error_msg = f"\n[{backend_name.upper()}] Error during {operation}: {type(e).__name__}: {str(e)}" - + if isinstance(e, (RuntimeError, ValueError)): # Known errors - just print the message print(error_msg) @@ -28,7 +29,7 @@ def handle_backend_error(e: Exception, backend_name: str, operation: str) -> Non def handle_runner_error(e: Exception, runner_name: str) -> None: """ Standardized error handling for runners. - + Args: e: The exception that occurred runner_name: Name of the runner @@ -45,4 +46,4 @@ def handle_runner_error(e: Exception, runner_name: str) -> None: else: print(f"\n{runner_name} failed: {e}") traceback.print_exc() - sys.exit(1) \ No newline at end of file + sys.exit(1) diff --git a/language/deepseek-r1/utils/runner_utils.py b/language/deepseek-r1/utils/runner_utils.py index 8c90deb515..fc2d4ad7f2 100644 --- a/language/deepseek-r1/utils/runner_utils.py +++ b/language/deepseek-r1/utils/runner_utils.py @@ -12,29 +12,31 @@ def create_base_argument_parser(description: str) -> argparse.ArgumentParser: description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter ) - + # Common dataset arguments parser.add_argument("--input-file", type=str, - default="data/final_output.pkl", - help="Input pickle file with prompts") - + default="data/final_output.pkl", + help="Input pickle file with prompts") + parser.add_argument("--output-file", type=str, default=None, - help="Output pickle file path (auto-generated if not specified)") - + help="Output pickle file path (auto-generated if not specified)") + parser.add_argument("--num-samples", type=int, default=None, - help="Number of samples to process from dataset") - + help="Number of samples to process from dataset") + parser.add_argument("--skip-samples", type=int, default=0, - help="Number of samples to skip from the beginning") - - # NOTE: --no-chat-template flag is NOT included (chat template usage determined by backend registry) - + help="Number of samples to skip from the beginning") + + # NOTE: --no-chat-template flag is NOT included (chat template usage + # determined by backend registry) + return parser -def print_runner_header(runner_name: str, backend_name: Optional[str] = None, args: argparse.Namespace = None) -> None: +def print_runner_header( + runner_name: str, backend_name: Optional[str] = None, args: argparse.Namespace = None) -> None: """Print standardized header for runners. - + Args: runner_name: Name of the runner backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. @@ -43,7 +45,7 @@ def print_runner_header(runner_name: str, backend_name: Optional[str] = None, ar if backend_name is None: from .backend_registry import detect_backend backend_name = detect_backend() - + print("=" * 80) print(f"{runner_name}") print("=" * 80) @@ -59,22 +61,23 @@ def print_runner_header(runner_name: str, backend_name: Optional[str] = None, ar print("=" * 80) -def setup_output_paths(args: argparse.Namespace, backend_name: Optional[str] = None, mode: Optional[str] = None) -> Tuple[Path, str]: +def setup_output_paths(args: argparse.Namespace, + backend_name: Optional[str] = None, mode: Optional[str] = None) -> Tuple[Path, str]: """ Set up output directories and file paths. - + Args: args: Parsed command line arguments backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. mode: Optional mode (e.g., 'offline', 'server' for MLPerf) - + Returns: Tuple of (output_dir, output_file_path) """ if backend_name is None: from .backend_registry import detect_backend backend_name = detect_backend() - + # Determine output directory if hasattr(args, 'output_dir') and args.output_dir: output_dir = Path(args.output_dir) @@ -84,9 +87,9 @@ def setup_output_paths(args: argparse.Namespace, backend_name: Optional[str] = N output_dir = Path(f"outputs/{backend_name}/{mode}") else: output_dir = Path(f"outputs/{backend_name}") - + output_dir.mkdir(parents=True, exist_ok=True) - + # Determine output file path if args.output_file: output_file = args.output_file @@ -97,10 +100,13 @@ def setup_output_paths(args: argparse.Namespace, backend_name: Optional[str] = N suffix = f"_{args.num_samples}samples" else: suffix = "_full" - + if mode: - output_file = str(output_dir / f"{backend_name}_{mode}_output_{timestamp}{suffix}.pkl") + output_file = str( + output_dir / + f"{backend_name}_{mode}_output_{timestamp}{suffix}.pkl") else: - output_file = str(output_dir / f"{backend_name}_output_{timestamp}{suffix}.pkl") - - return output_dir, output_file \ No newline at end of file + output_file = str(output_dir / + f"{backend_name}_output_{timestamp}{suffix}.pkl") + + return output_dir, output_file diff --git a/language/deepseek-r1/utils/tokenization.py b/language/deepseek-r1/utils/tokenization.py index c5fa77d69d..ec67e1e2eb 100644 --- a/language/deepseek-r1/utils/tokenization.py +++ b/language/deepseek-r1/utils/tokenization.py @@ -7,15 +7,15 @@ class StandardTokenizer: """Standard tokenizer for DeepSeek models.""" - + # Standard configuration used across all runners DEFAULT_MODEL = "deepseek-ai/DeepSeek-R1" DEFAULT_MAX_LENGTH = 32 * 1024 - + def __init__(self, model_name: str = None, max_length: int = None): """ Initialize tokenizer. - + Args: model_name: HuggingFace model name max_length: Maximum sequence length @@ -23,50 +23,54 @@ def __init__(self, model_name: str = None, max_length: int = None): self.model_name = model_name or self.DEFAULT_MODEL self.max_length = max_length or self.DEFAULT_MAX_LENGTH self._tokenizer = None - + @property def tokenizer(self): """Lazy load tokenizer.""" if self._tokenizer is None: print(f"Loading tokenizer: {self.model_name}") - self._tokenizer = AutoTokenizer.from_pretrained(self.model_name, revision="56d4cbbb4d29f4355bab4b9a39ccb717a14ad5ad") + self._tokenizer = AutoTokenizer.from_pretrained( + self.model_name, revision="56d4cbbb4d29f4355bab4b9a39ccb717a14ad5ad") return self._tokenizer - - def tokenize_prompts(self, prompts: List[str], - use_chat_template: Optional[bool] = None, - backend_name: Optional[str] = None) -> Tuple[List[List[int]], List[str]]: + + def tokenize_prompts(self, prompts: List[str], + use_chat_template: Optional[bool] = None, + backend_name: Optional[str] = None) -> Tuple[List[List[int]], List[str]]: """ Tokenize prompts with backend-specific handling. - + Args: prompts: List of text prompts use_chat_template: Whether to use chat template (if None and backend_name provided, uses registry) backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. - + Returns: Tuple of (tokenized_prompts, processed_strings) """ # Auto-detect backend if not provided if backend_name is None: backend_name = detect_backend() - + # Determine chat template usage from registry if backend_name provided if use_chat_template is None: use_chat_template = uses_chat_template(backend_name) - print(f"[{backend_name}] Using chat template from registry: {use_chat_template}") - + print( + f"[{backend_name}] Using chat template from registry: {use_chat_template}") + tokenized = [] processed_strings = [] - + for prompt in prompts: - if use_chat_template and hasattr(self.tokenizer, 'apply_chat_template'): + if use_chat_template and hasattr( + self.tokenizer, 'apply_chat_template'): tokens = self.tokenizer.apply_chat_template( [{"role": "user", "content": prompt}], add_generation_prompt=True, max_length=self.max_length, truncation=True ) - processed_string = self.tokenizer.decode(tokens, skip_special_tokens=False) + processed_string = self.tokenizer.decode( + tokens, skip_special_tokens=False) else: tokens = self.tokenizer.encode( prompt, @@ -74,49 +78,52 @@ def tokenize_prompts(self, prompts: List[str], max_length=self.max_length ) processed_string = prompt - + tokenized.append(tokens) processed_strings.append(processed_string) - + return tokenized, processed_strings - - def decode_tokens(self, tokens: List[int], skip_special_tokens: bool = True) -> str: + + def decode_tokens(self, tokens: List[int], + skip_special_tokens: bool = True) -> str: """Decode tokens to text.""" - return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens) - - def batch_decode(self, token_lists: List[List[int]], - skip_special_tokens: bool = True) -> List[str]: + return self.tokenizer.decode( + tokens, skip_special_tokens=skip_special_tokens) + + def batch_decode(self, token_lists: List[List[int]], + skip_special_tokens: bool = True) -> List[str]: """Batch decode multiple token lists.""" - return self.tokenizer.batch_decode(token_lists, skip_special_tokens=skip_special_tokens) + return self.tokenizer.batch_decode( + token_lists, skip_special_tokens=skip_special_tokens) -def process_inference_results(raw_results: List[dict], - tokenizer: Optional[StandardTokenizer] = None, - backend_name: Optional[str] = None, - uses_text_prompts: bool = False) -> List[dict]: +def process_inference_results(raw_results: List[dict], + tokenizer: Optional[StandardTokenizer] = None, + backend_name: Optional[str] = None, + uses_text_prompts: bool = False) -> List[dict]: """ Process raw inference results into standardized format. - + Args: raw_results: Raw results from backend tokenizer: Tokenizer for decoding backend_name: Optional backend name override. If None, uses MLPERF_BACKEND env var. uses_text_prompts: Whether backend uses text prompts - + Returns: List of standardized result dictionaries """ # Auto-detect backend if not provided if backend_name is None: backend_name = detect_backend() - + if backend_name not in get_supported_backends(): raise ValueError(f"Backend {backend_name} is not supported") - + backend_config = get_backend_config(backend_name) - + standardized_results = [] - + for raw_result in raw_results: # Handle text-prompt backends if uses_text_prompts and 'text' in raw_result: @@ -129,9 +136,9 @@ def process_inference_results(raw_results: List[dict], if tokenizer and tokens: try: text = tokenizer.decode_tokens(tokens) - except: + except BaseException: pass - + standardized = { 'model_output': text, 'tok_model_output': tokens, @@ -139,5 +146,5 @@ def process_inference_results(raw_results: List[dict], 'model_backend': backend_name, } standardized_results.append(standardized) - - return standardized_results \ No newline at end of file + + return standardized_results diff --git a/language/deepseek-r1/utils/validation.py b/language/deepseek-r1/utils/validation.py index 29bebef4f1..768427ada1 100644 --- a/language/deepseek-r1/utils/validation.py +++ b/language/deepseek-r1/utils/validation.py @@ -12,8 +12,10 @@ class BackendError(RuntimeError): class BackendNotInitializedError(BackendError): """Raised when backend operation is called before initialization.""" + def __init__(self, backend_name: str = "Backend"): - super().__init__(f"{backend_name} not initialized. Call initialize() first.") + super().__init__( + f"{backend_name} not initialized. Call initialize() first.") class ValidationError(ValueError): @@ -33,9 +35,9 @@ def wrapper(self, *args, **kwargs): def validate_prompts_input(backend_name: Optional[str] = None, - tokenized_prompts: Optional[List[List[int]]] = None, - text_prompts: Optional[List[str]] = None, - input_type: str = None) -> None: + tokenized_prompts: Optional[List[List[int]]] = None, + text_prompts: Optional[List[str]] = None, + input_type: str = None) -> None: """ Centralized prompt validation with backend-specific requirements. @@ -53,13 +55,16 @@ def validate_prompts_input(backend_name: Optional[str] = None, backend_name = detect_backend() if tokenized_prompts is None and text_prompts is None: - raise ValidationError(f"{backend_name} backend requires either text_prompts or tokenized_prompts") + raise ValidationError( + f"{backend_name} backend requires either text_prompts or tokenized_prompts") if input_type == 'text' and tokenized_prompts is not None and text_prompts is None: - raise ValidationError(f"{backend_name} backend requires text_prompts, not tokenized_prompts") + raise ValidationError( + f"{backend_name} backend requires text_prompts, not tokenized_prompts") if input_type == 'tokenized' and text_prompts is not None and tokenized_prompts is None: - raise ValidationError(f"{backend_name} backend requires tokenized_prompts, not text_prompts") + raise ValidationError( + f"{backend_name} backend requires tokenized_prompts, not text_prompts") # Additional validation for tokenized prompts if tokenized_prompts is not None: @@ -67,9 +72,11 @@ def validate_prompts_input(backend_name: Optional[str] = None, raise ValidationError("tokenized_prompts cannot be empty") for i, prompt in enumerate(tokenized_prompts): if not isinstance(prompt, list): - raise ValidationError(f"tokenized_prompts[{i}] must be a list of integers") + raise ValidationError( + f"tokenized_prompts[{i}] must be a list of integers") if not prompt: - raise ValidationError(f"tokenized_prompts[{i}] cannot be empty") + raise ValidationError( + f"tokenized_prompts[{i}] cannot be empty") # Additional validation for text prompts if text_prompts is not None: @@ -81,8 +88,8 @@ def validate_prompts_input(backend_name: Optional[str] = None, def validate_dataset_extended(df: pd.DataFrame, - backend_name: Optional[str] = None, - required_columns: Optional[List[str]] = None) -> None: + backend_name: Optional[str] = None, + required_columns: Optional[List[str]] = None) -> None: """ Extended dataset validation with backend-specific requirements. @@ -101,9 +108,11 @@ def validate_dataset_extended(df: pd.DataFrame, if required_columns is None: required_columns = ['text_input'] - missing_columns = [col for col in required_columns if col not in df.columns] + missing_columns = [ + col for col in required_columns if col not in df.columns] if missing_columns: - raise ValidationError(f"Dataset missing required columns: {missing_columns}") + raise ValidationError( + f"Dataset missing required columns: {missing_columns}") # Check for empty prompts empty_prompts = df['text_input'].isna().sum() @@ -118,7 +127,8 @@ def validate_dataset_extended(df: pd.DataFrame, config = get_backend_config(backend_name) # Add backend-specific validation based on config if needed - print(f"Dataset validation passed: {len(df)} samples with required columns") + print( + f"Dataset validation passed: {len(df)} samples with required columns") def validate_runner_args(args: argparse.Namespace, runner_type: str) -> None: @@ -133,7 +143,8 @@ def validate_runner_args(args: argparse.Namespace, runner_type: str) -> None: ValidationError: If validation fails """ # Common validations - if hasattr(args, 'num_samples') and args.num_samples is not None and args.num_samples <= 0: + if hasattr( + args, 'num_samples') and args.num_samples is not None and args.num_samples <= 0: raise ValidationError("--num-samples must be positive") if hasattr(args, 'skip_samples') and args.skip_samples < 0: @@ -142,4 +153,5 @@ def validate_runner_args(args: argparse.Namespace, runner_type: str) -> None: # Runner-specific validations if runner_type in ['mlperf', 'mlperf_mpi']: if hasattr(args, 'mode') and args.mode not in ['offline', 'server']: - raise ValidationError(f"Invalid mode: {args.mode}. Must be 'offline' or 'server'") \ No newline at end of file + raise ValidationError( + f"Invalid mode: {args.mode}. Must be 'offline' or 'server'") diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index fa3ae51514..40c83eecc5 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -482,7 +482,7 @@ "rgat": 788379, "pointpainting": 1024, }, - "dataset-size" : { + "dataset-size": { "resnet": 50000, "retinanet": 24781, "bert-99": 10833, @@ -1023,7 +1023,7 @@ def get_min_query_count(self, model, scenario): if model not in self.min_queries: raise ValueError("model not known: " + model) return self.min_queries[model].get(scenario) - + def get_dataset_size(self, model): model = self.get_mlperf_model(model) if model not in self.dataset_size: @@ -2292,9 +2292,12 @@ def log_result( # Check for calibration documentation if not config.skip_calibration_check and division not in ["open"]: - calibration_path_root = os.path.join(division, submitter, "calibration.md") - calibration_path_doc = os.path.join(division, submitter, "documentation", "calibration.md") - if not (os.path.exists(calibration_path_root)) and (not os.path.exists(calibration_path_doc)): + calibration_path_root = os.path.join( + division, submitter, "calibration.md") + calibration_path_doc = os.path.join( + division, submitter, "documentation", "calibration.md") + if not (os.path.exists(calibration_path_root)) and ( + not os.path.exists(calibration_path_doc)): log.error( "%s/%s: has not calibration file. One of %s or %s is required", division, @@ -3248,8 +3251,8 @@ def main(): args.extra_model_benchmark_map, ignore_uncommited=args.submission_exceptions, skip_power_check=args.skip_power_check, - skip_all_systems_with_results = args.skip_all_systems_have_results_check, - skip_calibration_check = args.skip_calibration_check + skip_all_systems_with_results=args.skip_all_systems_have_results_check, + skip_calibration_check=args.skip_calibration_check ) if args.scenarios_to_skip: From ac42a2f4b6f7ce7dc18d41939a4da0fe9cfa89f2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 16 Jun 2025 16:19:58 +0000 Subject: [PATCH 13/64] [Automated Commit] Format Codebase --- speech2text/QSL.py | 18 ++++- speech2text/accuracy_eval.py | 57 ++++++++++++-- speech2text/helpers.py | 45 ++++++++---- speech2text/legacy_helpers.py | 6 +- speech2text/manifest.py | 1 + speech2text/reference_SUT.py | 94 ++++++++++++++++++------ speech2text/reference_mlperf.py | 60 ++++++++++----- speech2text/utils/download_utils.py | 3 +- speech2text/utils/preprocessing_utils.py | 3 +- 9 files changed, 216 insertions(+), 71 deletions(-) diff --git a/speech2text/QSL.py b/speech2text/QSL.py index 19afd49acf..a4882eb32b 100644 --- a/speech2text/QSL.py +++ b/speech2text/QSL.py @@ -30,6 +30,7 @@ Manifest_Global = None max_duration = float(os.environ.get("MAX_DURATION", "30.0")) + def load_sample_from_file(index): global Manifest sample = Manifest_Global[index] @@ -43,12 +44,18 @@ def load_sample_from_file(index): duration = sample['duration'] return prompt + class AudioQSL: def __init__(self, dataset_dir, manifest_filepath, labels, sample_rate=16000, perf_count=None, skip_qsl=False): global Manifest_Global m_paths = [manifest_filepath] - self.manifest = Manifest(dataset_dir, m_paths, labels, len(labels), max_duration=max_duration) + self.manifest = Manifest( + dataset_dir, + m_paths, + labels, + len(labels), + max_duration=max_duration) Manifest_Global = self.manifest self.sample_rate = sample_rate self.count = len(self.manifest) @@ -59,15 +66,15 @@ def __init__(self, dataset_dir, manifest_filepath, labels, self.qsl = None else: self.qsl = lg.ConstructQSL(self.count, perf_count, - self.load_query_samples, - self.unload_query_samples) + self.load_query_samples, + self.unload_query_samples) print( "Dataset loaded with {0:.2f} hours. Filtered {1:.2f} hours. Number of samples: {2}".format( self.manifest.duration / 3600, self.manifest.filtered_duration / 3600, self.count)) - + def load_query_samples(self, sample_list): pass @@ -83,6 +90,8 @@ def __del__(self): # We have no problem fitting all data in memory, so we do that, in # order to speed up execution of the benchmark. + + class AudioQSLInMemory(AudioQSL): def __init__(self, dataset_dir, manifest_filepath, labels, sample_rate=16000, perf_count=None, skip_qsl=True): @@ -104,5 +113,6 @@ def load_query_samples(self, sample_list): def unload_query_samples(self, sample_list): for sample_id in sample_list: del self.sample_id_to_sample[sample_id] + def __del__(self): print("FInished destroying no QSL") diff --git a/speech2text/accuracy_eval.py b/speech2text/accuracy_eval.py index ad741db502..3c7466ff8f 100644 --- a/speech2text/accuracy_eval.py +++ b/speech2text/accuracy_eval.py @@ -28,7 +28,35 @@ max_duration = float(os.environ.get("MAX_DURATION", "30.0")) -labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] +labels = [ + " ", + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", + "'"] dtype_map = { "int8": 'b', "int16": 'h', @@ -36,6 +64,7 @@ "int64": 'q', } + def word_error_rate(hypotheses: List[str], references: List[str]) -> float: """ Computes Average Word Error rate between two texts represented as @@ -61,37 +90,53 @@ def word_error_rate(hypotheses: List[str], references: List[str]) -> float: r = normalizer(r) h_list = h.split() r_list = r.split() - scores_clip, words_clip = compute_wer_with_concatenation(h_list, r_list) + scores_clip, words_clip = compute_wer_with_concatenation( + h_list, r_list) scores += scores_clip words += words_clip wer = scores / words return wer, scores, words + def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--log_dir", required=True) parser.add_argument("--dataset_dir", required=True) parser.add_argument("--manifest", required=True) - parser.add_argument("--output_dtype", default="int64", choices=dtype_map.keys(), help="Output data type") + parser.add_argument( + "--output_dtype", + default="int64", + choices=dtype_map.keys(), + help="Output data type") args = parser.parse_args() return args + def main(): args = get_args() - manifest = Manifest(args.dataset_dir, [args.manifest], labels, len(labels), max_duration=max_duration) + manifest = Manifest(args.dataset_dir, + [args.manifest], + labels, + len(labels), + max_duration=max_duration) with open(os.path.join(args.log_dir, "mlperf_log_accuracy.json")) as fh: results = json.load(fh) hypotheses = [] references = [] for result in results: - hypotheses.append(array.array(dtype_map[args.output_dtype], bytes.fromhex(result["data"])).tolist()) + hypotheses.append(array.array( + dtype_map[args.output_dtype], bytes.fromhex(result["data"])).tolist()) references.append(manifest[result["qsl_idx"]]["transcript"]) references = __gather_predictions([references], labels=labels) hypotheses = __gather_predictions([hypotheses], labels=labels) wer, _, _ = word_error_rate(hypotheses=hypotheses, references=references) - print("Word Error Rate: {:}%, accuracy={:}%".format(wer * 100, (1 - wer) * 100)) + print( + "Word Error Rate: {:}%, accuracy={:}%".format( + wer * 100, + (1 - wer) * 100)) + if __name__ == '__main__': main() diff --git a/speech2text/helpers.py b/speech2text/helpers.py index b89b04e91e..279a3d396c 100644 --- a/speech2text/helpers.py +++ b/speech2text/helpers.py @@ -16,6 +16,7 @@ from typing import List from legacy_helpers import __levenshtein + def compute_wer_with_concatenation(prediction, reference): """ Compute WER considering concatenated words as correct matches using kaldialign @@ -46,7 +47,6 @@ def compute_wer_with_concatenation(prediction, reference): ref_concat = ref_words[i] hyp_concat = hyp_words[j] - # Try concatenating up to 3 words ref_match_len = 1 hyp_match_len = 1 @@ -54,7 +54,7 @@ def compute_wer_with_concatenation(prediction, reference): for k in range(1, 4): if i + k <= len(ref_words): - ref_concat = ''.join(ref_words[i:i+k]) + ref_concat = ''.join(ref_words[i:i + k]) if ref_concat == hyp_words[j]: ref_match_len = k hyp_match_len = 1 @@ -62,7 +62,7 @@ def compute_wer_with_concatenation(prediction, reference): break if j + k <= len(hyp_words): - hyp_concat = ''.join(hyp_words[j:j+k]) + hyp_concat = ''.join(hyp_words[j:j + k]) if hyp_concat == ref_words[i]: ref_match_len = 1 hyp_match_len = k @@ -71,8 +71,8 @@ def compute_wer_with_concatenation(prediction, reference): if match_found: # Add concatenated match - alignment.append((' '.join(ref_words[i:i+ref_match_len]), - ' '.join(hyp_words[j:j+hyp_match_len]))) + alignment.append((' '.join(ref_words[i:i + ref_match_len]), + ' '.join(hyp_words[j:j + hyp_match_len]))) i += ref_match_len j += hyp_match_len @@ -91,14 +91,18 @@ def compute_wer_with_concatenation(prediction, reference): j += 1 # Calculate WER using kaldialign - ref_aligned = [x[0].replace(" ", "") for x in alignment if x[0] is not None] - hyp_aligned = [x[1].replace(" ", "") for x in alignment if x[1] is not None] + ref_aligned = [x[0].replace(" ", "") + for x in alignment if x[0] is not None] + hyp_aligned = [x[1].replace(" ", "") + for x in alignment if x[1] is not None] distance = __levenshtein(ref_aligned, hyp_aligned) wer = distance / len(ref_words) if ref_words else 0 return distance, len(ref_words) if ref_words else 0 -def expand_concatenations(words_list: List, reference_dict: dict, reference_list: List): + +def expand_concatenations( + words_list: List, reference_dict: dict, reference_list: List): """ Finds matching compound words in 'words_list' which exist as keys in 'reference_dict', if any. If found, the compound word will be separated using reference_dict if the substitution reduces @@ -113,15 +117,19 @@ def expand_concatenations(words_list: List, reference_dict: dict, reference_list score = __levenshtein(words_list, reference_list) # Searches each word in 'word_list' for separability using the reference list. Once all options are - # considered, the modified 'word_list' is returned. Length of 'word_list' can grow, but not contract. + # considered, the modified 'word_list' is returned. Length of 'word_list' + # can grow, but not contract. i = 0 words_length = len(words_list) while i < words_length: if words_list[i] in reference_dict.keys(): - words_candidate = words_list[:i] + reference_dict[words_list[i]] + words_list[i + 1:] + words_candidate = words_list[:i] + \ + reference_dict[words_list[i]] + words_list[i + 1:] - # If levenshtein distance reduced, cache new word_list and resume search - candidate_levenshtein = __levenshtein(words_candidate, reference_list) + # If levenshtein distance reduced, cache new word_list and resume + # search + candidate_levenshtein = __levenshtein( + words_candidate, reference_list) if candidate_levenshtein < score: words_list = words_candidate words_length = len(words_list) @@ -129,6 +137,7 @@ def expand_concatenations(words_list: List, reference_dict: dict, reference_list i += 1 return words_list + def get_expanded_wordlist(words_list: List, reference_list: List): """ Provided two lists of English words, the two will be compared, and any compound words found in @@ -141,7 +150,8 @@ def get_expanded_wordlist(words_list: List, reference_list: List): List of words modified from 'word_list' after expanding referenced compound words """ - # If levenshtein distance < 2, there cannot be any compound word separation issues. + # If levenshtein distance < 2, there cannot be any compound word + # separation issues. if __levenshtein(words_list, reference_list) < 2: return words_list @@ -153,9 +163,12 @@ def get_expanded_wordlist(words_list: List, reference_list: List): # Adding three-word compounding candidates to checklist for i in range(len(reference_list) - 2): - compound = reference_list[i] + reference_list[i + 1] + reference_list[i + 2] - checklist[compound] = [reference_list[i], reference_list[i + 1], reference_list[i + 2]] + compound = reference_list[i] + \ + reference_list[i + 1] + reference_list[i + 2] + checklist[compound] = [reference_list[i], + reference_list[i + 1], reference_list[i + 2]] # All compiled candidates will be checked, and after checking for minimal Levenshtein - # distance, the modified list (or original if compounding not found) is directly returned + # distance, the modified list (or original if compounding not found) is + # directly returned return expand_concatenations(words_list, checklist, reference_list) diff --git a/speech2text/legacy_helpers.py b/speech2text/legacy_helpers.py index 45065fff72..17687e4230 100644 --- a/speech2text/legacy_helpers.py +++ b/speech2text/legacy_helpers.py @@ -16,6 +16,7 @@ from enum import Enum from typing import List + def __levenshtein(a: List, b: List) -> int: """Calculates the Levenshtein distance between a and b. """ @@ -37,6 +38,7 @@ def __levenshtein(a: List, b: List) -> int: return current[n] + def __whisper_decoder_predictions_tensor(tensor, labels): """ Takes output of greedy whisper decoder and converts to strings. @@ -58,6 +60,6 @@ def __whisper_decoder_predictions_tensor(tensor, labels): def __gather_predictions(predictions_list: list, labels: list) -> list: results = [] for prediction in predictions_list: - results += __whisper_decoder_predictions_tensor(prediction, labels=labels) + results += __whisper_decoder_predictions_tensor( + prediction, labels=labels) return results - diff --git a/speech2text/manifest.py b/speech2text/manifest.py index d7c0fc88d6..c54be6923b 100644 --- a/speech2text/manifest.py +++ b/speech2text/manifest.py @@ -16,6 +16,7 @@ import string import os + class Manifest(object): def __init__(self, data_dir, manifest_paths, labels, blank_index, max_duration=None, pad_to_max=False, min_duration=None, sort_by_duration=False, max_utts=0, diff --git a/speech2text/reference_SUT.py b/speech2text/reference_SUT.py index 3031281736..3deeea0cec 100644 --- a/speech2text/reference_SUT.py +++ b/speech2text/reference_SUT.py @@ -43,27 +43,59 @@ logging.basicConfig(level=logging.INFO) log = logging.getLogger("SUT") + def get_start_cores(start_cores="0"): start_cores = start_cores.split(",") start_cores = list(map(int, start_cores)) return start_cores + cores_per_inst = int(os.environ.get("CORES_PER_INST", "1")) num_numa_nodes = int(os.environ.get("NUM_NUMA_NODES", "1")) -nodes_per_inst = int(os.environ["NUM_NUMA_NODES"])/int(os.environ["NUM_INSTS"]) +nodes_per_inst = int(os.environ["NUM_NUMA_NODES"] + ) / int(os.environ["NUM_INSTS"]) insts_per_node = int(os.environ["INSTS_PER_NODE"]) -start_cores = os.environ["START_CORES"] +start_cores = os.environ["START_CORES"] precision = torch.float32 n_mels = 128 sample_rate = 16000 model_path = "openai/whisper-large-v3" -labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"] +labels = [ + " ", + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", + "'"] labels_dict = {} for i in range(len(labels)): labels_dict[labels[i]] = i + class Instance(mp.Process): def __init__( self, @@ -123,7 +155,7 @@ def run(self): self.total_sample_count ) - dtype="bfloat16" + dtype = "bfloat16" print(f"Precision: {dtype}") model = LLM( model=model_path, @@ -162,7 +194,7 @@ def process_queries(self): if qitem_list is None: return False - + prompt_list = [] for qitem in qitem_list: prompt = self.qsl[qitem.index] @@ -177,7 +209,8 @@ def process_queries(self): start_time = time.time() outputs = self.model.generate(prompt_list, self.sampling_params) - print(f"Sample number: {self.num_samples} | Step time {time.time()-start_time:.3f}s") + print( + f"Sample number: {self.num_samples} | Step time {time.time()-start_time:.3f}s") for output in outputs: request_id = int(output.request_id) @@ -188,7 +221,7 @@ def process_queries(self): self.num_samples += len(results) - for i,result in enumerate(results): + for i, result in enumerate(results): # Whisper outputs space in the front and capitalizes things result = result.lower().strip() transcript = [] @@ -204,6 +237,7 @@ def process_queries(self): print(f"Finished {qid[i]}") return True + class vllmSUT: def __init__(self, dataset_dir, manifest_filepath, perf_count, num_workers=1, device="cpu"): @@ -211,21 +245,21 @@ def __init__(self, dataset_dir, self.dataset_path = dataset_dir self.manifest_filepath = manifest_filepath self.device = device - self.batch_size = 16 + self.batch_size = 16 self.total_sample_count = perf_count self.num_workers = num_workers self.worker_threads = [None] * self.num_workers dataset_vocab = labels - #self.dev = torch.device("cuda:0") if torch.cuda.is_available() and os.environ.get("USE_GPU", "").lower() not in [ "no", "false" ] else torch.device("cpu") + # self.dev = torch.device("cuda:0") if torch.cuda.is_available() and os.environ.get("USE_GPU", "").lower() not in [ "no", "false" ] else torch.device("cpu") self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries) self.qsl = AudioQSL(dataset_dir, - manifest_filepath, - dataset_vocab, - sample_rate, - perf_count) + manifest_filepath, + dataset_vocab, + sample_rate, + perf_count) self.query_queue = mp.JoinableQueue() self.output_queue = mp.Queue() self.alive_counter = mp.Value("i", 0) @@ -235,10 +269,20 @@ def __init__(self, dataset_dir, def start(self): node_start_cores = get_start_cores(start_cores) core_lists = [] - if insts_per_node>0: + if insts_per_node > 0: for i in range(num_numa_nodes): for j in range(insts_per_node): - core_lists.append(list(range(node_start_cores[i]+j*cores_per_inst, node_start_cores[i]+(j+1)*cores_per_inst))) + core_lists.append( + list( + range( + node_start_cores[i] + + j * + cores_per_inst, + node_start_cores[i] + + ( + j + + 1) * + cores_per_inst))) for j in range(self.num_workers): core_list = core_lists[j] @@ -253,18 +297,19 @@ def start(self): rank=j, dtype=precision, core_list=tuple(core_list), - node_list=tuple([math.floor(j*nodes_per_inst)]), - input_queue = self.query_queue, - output_queue = self.output_queue, - cond_var = self.cond_var, - alive_counter = self.alive_counter, - sample_counter = self.sample_counter + node_list=tuple([math.floor(j * nodes_per_inst)]), + input_queue=self.query_queue, + output_queue=self.output_queue, + cond_var=self.cond_var, + alive_counter=self.alive_counter, + sample_counter=self.sample_counter ) worker.start() self.worker_threads[j] = worker with self.cond_var: - self.cond_var.wait_for(lambda: self.alive_counter.value == self.num_workers) + self.cond_var.wait_for( + lambda: self.alive_counter.value == self.num_workers) log.info(f"Starting Loadgen response thread") response_thread = threading.Thread(target=self.response_loadgen) @@ -276,11 +321,12 @@ def issue_queries(self, query_samples): for query_sample in query_samples: # Continuous batching self.query_queue.put([query_sample]) - if len(query_sample_list)>0: + if len(query_sample_list) > 0: self.query_queue.put(query_sample_list) def flush_queries(self): pass + def response_loadgen(self): keep_alive = True while keep_alive: @@ -293,13 +339,13 @@ def response_loadgen(self): response = lg.QuerySampleResponse(qid, bi[0], bi[1] * response_array.itemsize) lg.QuerySamplesComplete([response]) + def stop(self): for i in range(self.num_workers): self.query_queue.put(None) for worker in self.worker_threads: worker.kill() - def __del__(self): lg.DestroySUT(self.sut) print("Finished destroying SUT.") diff --git a/speech2text/reference_mlperf.py b/speech2text/reference_mlperf.py index 7ca4416a14..cee5843a7a 100644 --- a/speech2text/reference_mlperf.py +++ b/speech2text/reference_mlperf.py @@ -21,13 +21,32 @@ import mlperf_loadgen as lg from reference_SUT import vllmSUT + def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--scenario", choices=["Offline", "Server"], default="Offline", help="Scenario") - parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass") - parser.add_argument("--mlperf_conf", default="mlperf.conf", help="mlperf rules config") - parser.add_argument("--user_conf", default="user.conf", help="user config for user LoadGen settings such as target QPS") - parser.add_argument("--audit_conf", default="audit.conf", help="audit config for LoadGen settings during compliance runs") + parser.add_argument( + "--scenario", + choices=[ + "Offline", + "Server"], + default="Offline", + help="Scenario") + parser.add_argument( + "--accuracy", + action="store_true", + help="enable accuracy pass") + parser.add_argument( + "--mlperf_conf", + default="mlperf.conf", + help="mlperf rules config") + parser.add_argument( + "--user_conf", + default="user.conf", + help="user config for user LoadGen settings such as target QPS") + parser.add_argument( + "--audit_conf", + default="audit.conf", + help="audit config for LoadGen settings during compliance runs") parser.add_argument("--dataset_dir", required=True) parser.add_argument("--manifest", required=True) parser.add_argument("--perf_count", type=int, default=None) @@ -50,17 +69,16 @@ def main(): log_path = args.log_dir os.makedirs(log_path, exist_ok=True) - - sut = vllmSUT( args.dataset_dir, - args.manifest, - args.perf_count, - num_workers=args.num_workers, - device="cpu") + sut = vllmSUT(args.dataset_dir, + args.manifest, + args.perf_count, + num_workers=args.num_workers, + device="cpu") sut.start() settings = lg.TestSettings() settings.scenario = scenario_map[args.scenario] - #settings.FromConfig(args.mlperf_conf, "whisper", args.scenario) + # settings.FromConfig(args.mlperf_conf, "whisper", args.scenario) settings.FromConfig(args.user_conf, "whisper", args.scenario) if args.accuracy: @@ -75,15 +93,23 @@ def main(): log_settings.log_output = log_output_settings print("Running Loadgen test...") - lg.StartTestWithLogSettings(sut.sut, - sut.qsl.qsl, - settings, - log_settings, + lg.StartTestWithLogSettings(sut.sut, + sut.qsl.qsl, + settings, + log_settings, args.audit_conf) sut.stop() if args.accuracy: - cmd = ["python3", "accuracy_eval.py", "--log_dir", log_path, "--dataset_dir", args.dataset_dir, "--manifest", args.manifest] + cmd = [ + "python3", + "accuracy_eval.py", + "--log_dir", + log_path, + "--dataset_dir", + args.dataset_dir, + "--manifest", + args.manifest] print(f"Running accuracy script: {cmd}") subprocess.check_call(cmd) diff --git a/speech2text/utils/download_utils.py b/speech2text/utils/download_utils.py index bda4193fbb..ba5e15b7f0 100644 --- a/speech2text/utils/download_utils.py +++ b/speech2text/utils/download_utils.py @@ -65,5 +65,6 @@ def extract(fpath, dest_folder): with tarfile.open(fpath, mode) as tar: members = tar.getmembers() - for member in tqdm.tqdm(iterable=members, total=len(members), leave=True): + for member in tqdm.tqdm( + iterable=members, total=len(members), leave=True): tar.extract(path=dest_folder, member=member) diff --git a/speech2text/utils/preprocessing_utils.py b/speech2text/utils/preprocessing_utils.py index 260e860b80..5d6673434c 100644 --- a/speech2text/utils/preprocessing_utils.py +++ b/speech2text/utils/preprocessing_utils.py @@ -68,7 +68,8 @@ def preprocess(data, input_dir, dest_dir, target_sr=None, speed=None, return output_dict -def parallel_preprocess(dataset, input_dir, dest_dir, target_sr, speed, overwrite, parallel): +def parallel_preprocess(dataset, input_dir, dest_dir, + target_sr, speed, overwrite, parallel): with multiprocessing.Pool(parallel) as p: func = functools.partial(preprocess, input_dir=input_dir, dest_dir=dest_dir, From 2b371c9028fbaaaa1e121df7e9c6d360ebd2aac0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 16 Jun 2025 18:03:55 +0000 Subject: [PATCH 14/64] [Automated Commit] Format Codebase --- language/llama3.1-8b/ref_eval.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/language/llama3.1-8b/ref_eval.py b/language/llama3.1-8b/ref_eval.py index 0cfc0694d7..bb34847b13 100644 --- a/language/llama3.1-8b/ref_eval.py +++ b/language/llama3.1-8b/ref_eval.py @@ -17,7 +17,8 @@ def rouge(label, pred): def niah_em(label, pred): - label_uuids = re.findall(r'[\w]{8}-[\w]{4}-[\w]{4}-[\w]{4}-[\w]{12}', label) + label_uuids = re.findall( + r'[\w]{8}-[\w]{4}-[\w]{4}-[\w]{4}-[\w]{12}', label) pred_uuids = re.findall(r'[\w]{8}-[\w]{4}-[\w]{4}-[\w]{4}-[\w]{12}', pred) # https://github.com/hsiehjackson/RULER/blob/main/scripts/eval/synthetic/constants.py#L28 @@ -43,7 +44,8 @@ def qa_em(label, pred): return {'exact_match': 100.0} normalized_answer = re.sub(r'\s+', '', answer_substring).lower() - label_entries = [re.sub(r'\s+', '', entry).lower() for entry in label.split('|')] + label_entries = [re.sub(r'\s+', '', entry).lower() + for entry in label.split('|')] match_found = any(entry in normalized_answer for entry in label_entries) return {'exact_match': 100.0 if match_found else 0.0} @@ -63,7 +65,12 @@ def process_row(row): def run_evaluation(df): with Pool(cpu_count()) as pool: - accuracies = list(tqdm(pool.imap(process_row, df.to_dict('records')), total=len(df))) + accuracies = list( + tqdm( + pool.imap( + process_row, + df.to_dict('records')), + total=len(df))) df['accuracy'] = accuracies return df @@ -74,10 +81,10 @@ def run_evaluation(df): df = pd.read_pickle(fname) df = run_evaluation(df) - #df.to_pickle(str(fname).replace(".pkl", "_eval.pkl")) + # df.to_pickle(str(fname).replace(".pkl", "_eval.pkl")) print(f"WROTE: {str(fname).replace('.pkl', '_eval.pkl')}") accuracy = df.accuracy.apply(pd.Series) print(df.dataset.value_counts()) print(accuracy.describe()) - print(df.describe()) \ No newline at end of file + print(df.describe()) From 72d52bb8adb42c9560076194a2c8a82190f37740 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 19 Jun 2025 08:45:56 +0000 Subject: [PATCH 15/64] [Automated Commit] Format Codebase --- speech2text/utils/repackage_librispeech.py | 42 +++++++++++++++------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/speech2text/utils/repackage_librispeech.py b/speech2text/utils/repackage_librispeech.py index 7a2b62f421..4889b815a2 100644 --- a/speech2text/utils/repackage_librispeech.py +++ b/speech2text/utils/repackage_librispeech.py @@ -24,6 +24,7 @@ PAD_DURATION = 0.5 SR = 16000 + def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--manifest", required=True) @@ -33,10 +34,12 @@ def get_args(): args = parser.parse_args() return args + def get_source_name(fname): basename_list, _ = os.path.splitext(fname) return "-".join(basename_list.split("-")[:2]) + def prepare_clip(current_entry, new_fname): pad_audio = np.zeros(int(PAD_DURATION * SR)) new_audio = [] @@ -51,6 +54,7 @@ def prepare_clip(current_entry, new_fname): new_json = get_sample_json(new_audio, new_transcript, new_fname) return new_audio, new_json + def get_sample_json(audio, transcript, fname): json_file = { "transcript": transcript, @@ -73,6 +77,7 @@ def get_sample_json(audio, transcript, fname): } return json_file + def main(): args = get_args() with open(args.manifest, "r") as manifest: @@ -84,15 +89,22 @@ def main(): for data in json_data: original_fname = data["files"][0]["fname"] original_transcript = data["transcript"] - original_audio = librosa.load(os.path.join(args.data_dir, original_fname), sr=SR)[0] - original_json = get_sample_json(original_audio, original_transcript, original_fname) + original_audio = librosa.load( + os.path.join( + args.data_dir, + original_fname), + sr=SR)[0] + original_json = get_sample_json( + original_audio, original_transcript, original_fname) - source_name = get_source_name(os.path.basename(os.path.basename(original_fname))) + source_name = get_source_name( + os.path.basename( + os.path.basename(original_fname))) if source_name not in catalog: catalog[source_name] = [] - + catalog[source_name].append((original_audio, original_json)) - + full_json = [] for key in catalog.keys(): index = 0 @@ -100,13 +112,17 @@ def main(): current_duration = 0 for entry in catalog[key]: clip_duration = entry[1]["original_duration"] - - # Only considering clips <=30s. If single clip duration > 30s, ignore. + + # Only considering clips <=30s. If single clip duration > 30s, + # ignore. if clip_duration > 30: continue - # If new clip would extend compiled entry to >30s, flush the existing entry - if (len(current_entry) > 0) and (current_duration + PAD_DURATION + clip_duration > 30): - new_fname = os.path.join(args.output_dir, key + "_" + str(index) + ".wav") + # If new clip would extend compiled entry to >30s, flush the + # existing entry + if (len(current_entry) > 0) and ( + current_duration + PAD_DURATION + clip_duration > 30): + new_fname = os.path.join( + args.output_dir, key + "_" + str(index) + ".wav") new_audio, new_json = prepare_clip(current_entry, new_fname) sf.write(new_fname, new_audio, SR) full_json.append(new_json) @@ -120,9 +136,11 @@ def main(): if len(current_entry) > 1: current_duration += PAD_DURATION - # After all key clips are processed, if a remaining entry has content, exports it. + # After all key clips are processed, if a remaining entry has content, + # exports it. if len(current_entry) > 0: - new_fname = os.path.join(args.output_dir, key + "_" + str(index) + ".wav") + new_fname = os.path.join( + args.output_dir, key + "_" + str(index) + ".wav") new_audio, new_json = prepare_clip(current_entry, new_fname) sf.write(new_fname, new_audio, SR) full_json.append(new_json) From f841519e77014c799b0e08ad75b23ca2c534b1aa Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 26 Jun 2025 23:30:42 +0100 Subject: [PATCH 16/64] Fix Typo in Interactive Latencies (#2147) (#2225) * Fix Typo in Interactive Latencies * Update submission_checker.py --- language/llama2-70b/README.md | 2 +- tools/submission/submission_checker.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/language/llama2-70b/README.md b/language/llama2-70b/README.md index bbd9889564..0c0ad21952 100644 --- a/language/llama2-70b/README.md +++ b/language/llama2-70b/README.md @@ -291,7 +291,7 @@ Please see the [new docs site](https://docs.mlcommons.org/inference/submission/) # Run llama2-70b-interactive benchmark -For official, Llama2-70b submissions it is also possible to submit in the interactive category. This sets a more strict latency requirements for Time to First Token (ttft) and Time per Output Token (tpot). Specifically, the interactive category requires loadgen to enforce `ttft <= 450ms` and `ttft <= 40ms` +For official, Llama2-70b submissions it is also possible to submit in the interactive category. This sets a more strict latency requirements for Time to First Token (ttft) and Time per Output Token (tpot). Specifically, the interactive category requires loadgen to enforce `ttft <= 450ms` and `tpot <= 40ms` In order to run interactive category, it is sufficient to set the flag `--lg-model-name` as `llama2-70b-interactive` when calling the `main.py` to run the benchmark. For example, to run the server scenario in interactive mode: diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 69b47d6f17..462f3d56ea 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -808,6 +808,18 @@ "ttft": 450 * 1000000, "tpot": 40 * 1000000 }, }, + "llama2-70b-interactive-99": { + "Server": { + "ttft": 450 * 1000000, "tpot": 40 * 1000000 + }, + }, + # for v5.0 + "llama2-70b-interactive-99.9": { + "Server": { + "ttft": 450 * 1000000, "tpot": 40 * 1000000 + }, + }, + # for v5.0 "mixtral-8x7b": { "Server": { "ttft": 2000 * 1000000, "tpot": 200 * 1000000 From 4e7717712ceb8259a703423898fbfcbb5edd057e Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 26 Jun 2025 23:40:19 +0100 Subject: [PATCH 17/64] Fix Typo in Interactive Latencies (#2147) (#2226) * Fix Typo in Interactive Latencies * Update submission_checker.py --------- Co-authored-by: Miro Co-authored-by: github-actions[bot] From 350032a3f13ff25cd5d54c99c0a604de4a155dbd Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Sun, 6 Jul 2025 01:57:04 +0530 Subject: [PATCH 18/64] Update MLCFlow commands for v5.1 (#2237) --- docs/benchmarks/language/deepseek-r1.md | 11 ++++ .../language/get-deepseek-r1-data.md | 24 ++++++++ .../language/get-llama3_1-8b-data.md | 60 +++++++++++++++++++ docs/benchmarks/language/llama3_1-8b.md | 11 ++++ .../speech_to_text/get-whisper-data.md | 40 +++++++++++++ docs/benchmarks/speech_to_text/whisper.md | 11 ++++ language/deepseek-r1/README.md | 30 ++++++++++ language/llama3.1-8b/README.md | 52 ++++++++++++---- main.py | 33 ++++++---- mkdocs.yml | 4 ++ speech2text/README.md | 46 ++++++++++++++ 11 files changed, 298 insertions(+), 24 deletions(-) create mode 100644 docs/benchmarks/language/deepseek-r1.md create mode 100644 docs/benchmarks/language/get-deepseek-r1-data.md create mode 100644 docs/benchmarks/language/get-llama3_1-8b-data.md create mode 100644 docs/benchmarks/language/llama3_1-8b.md create mode 100644 docs/benchmarks/speech_to_text/get-whisper-data.md create mode 100644 docs/benchmarks/speech_to_text/whisper.md diff --git a/docs/benchmarks/language/deepseek-r1.md b/docs/benchmarks/language/deepseek-r1.md new file mode 100644 index 0000000000..f83fe1bb82 --- /dev/null +++ b/docs/benchmarks/language/deepseek-r1.md @@ -0,0 +1,11 @@ +--- +hide: + - toc +--- + +# Reasoning using DeepSeek-R1 + +=== "MLCommons-Python" + ## MLPerf Reference Implementation in Python + +{{ mlperf_inference_implementation_readme (4, "deepseek-r1", "reference", devices=["CUDA"]) }} \ No newline at end of file diff --git a/docs/benchmarks/language/get-deepseek-r1-data.md b/docs/benchmarks/language/get-deepseek-r1-data.md new file mode 100644 index 0000000000..401c4d27bc --- /dev/null +++ b/docs/benchmarks/language/get-deepseek-r1-data.md @@ -0,0 +1,24 @@ +--- +hide: + - toc +--- + +# Reasoning using DeepSeek R1 + +## Dataset + +The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. + +=== "Validation" + + ### Get Validation Dataset + ``` + mlcr get,preprocessed,dataset,deepseek-r1,_validation,_mlc,_rclone --outdirname= -j + ``` + +=== "Calibration" + + ### Get Calibration Dataset + ``` + mlcr get,preprocessed,dataset,deepseek-r1,_calibration,_mlc,_rclone --outdirname= -j + ``` \ No newline at end of file diff --git a/docs/benchmarks/language/get-llama3_1-8b-data.md b/docs/benchmarks/language/get-llama3_1-8b-data.md new file mode 100644 index 0000000000..e24cc37d44 --- /dev/null +++ b/docs/benchmarks/language/get-llama3_1-8b-data.md @@ -0,0 +1,60 @@ +--- +hide: + - toc +--- + +# Text Summarization using LLAMA3.1-8b + +## Dataset + +The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. + +=== "Validation" + + === "Full dataset (Datacenter)" + + ### Get Validation Dataset + ``` + mlcr get,dataset,cnndm,_validation,_datacenter,_llama3,_mlc,_rclone --outdirname= -j + ``` + + === "5000 samples (Edge)" + + ### Get Validation Dataset + ``` + mlcr get,dataset,cnndm,_validation,_edge,_llama3,_mlc,_rclone --outdirname= -j + ``` + +=== "Calibration" + + ### Get Calibration Dataset + ``` + mlcr get,dataset,cnndm,_calibration,_llama3,_mlc,_rclone --outdirname= -j + ``` + +- `--outdirname=` could be provided to download the dataset to a specific location. + +## Model +The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. + +=== "Pytorch" + + === "From MLCOMMONS Google Drive" + + > **Note:** One has to accept the [MLCommons Llama 3.1 License Confidentiality Notice](http://llama3-1.mlcommons.org/) to access the model files in MLCOMMONS Google Drive. + + ### Get the Official MLPerf LLAMA3.1-405B model from MLCOMMONS Cloudfare R2 + ``` + TBD + ``` + + === "From Hugging Face repo" + + > **Note:** Access to the HuggingFace model could be requested [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). + + ### Get model from HuggingFace repo + ``` + mlcr get,ml-model,llama3,_hf,_meta-llama/Llama-3.1-8B-Instruct --hf_token= -j + ``` + +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/language/llama3_1-8b.md b/docs/benchmarks/language/llama3_1-8b.md new file mode 100644 index 0000000000..93f8df2997 --- /dev/null +++ b/docs/benchmarks/language/llama3_1-8b.md @@ -0,0 +1,11 @@ +--- +hide: + - toc +--- + +# Text Summarization using LLAMA3_1-8b + +=== "MLCommons-Python" + ## MLPerf Reference Implementation in Python + +{{ mlperf_inference_implementation_readme (4, "llama3_1-8b", "reference", devices=["CPU","CUDA"]) }} \ No newline at end of file diff --git a/docs/benchmarks/speech_to_text/get-whisper-data.md b/docs/benchmarks/speech_to_text/get-whisper-data.md new file mode 100644 index 0000000000..9bc97ad9a0 --- /dev/null +++ b/docs/benchmarks/speech_to_text/get-whisper-data.md @@ -0,0 +1,40 @@ +--- +hide: + - toc +--- + +# Speech to Text using Whisper + +## Dataset + +The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. + +=== "Validation" + + === "Preprocessed" + + ### Get Preprocessed Validation Dataset + ``` + mlcr get,dataset,whisper,_preprocessed,_mlc,_rclone --outdirname= -j + ``` + + === "Unprocessed" + + ### Get Unprocessed Validation Dataset + ``` + mlcr get,dataset,whisper,_unprocessed --outdirname= -j + ``` + +## Model +The benchmark implementation run command will automatically download the required model and do the necessary conversions if any. In case you want to only download the official model, you can use the below commands. + +=== "Pytorch" + + === "From MLCOMMONS" + + ### Get the Official MLPerf Whisper model from MLCOMMONS Cloudflare R2 + ``` + mlcr get,ml-model,whisper,_rclone,_mlc s-j + ``` + +- `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/speech_to_text/whisper.md b/docs/benchmarks/speech_to_text/whisper.md new file mode 100644 index 0000000000..fddf37b767 --- /dev/null +++ b/docs/benchmarks/speech_to_text/whisper.md @@ -0,0 +1,11 @@ +--- +hide: + - toc +--- + +# Speech to Text using Whisper + +=== "MLCommons-Python" + ## MLPerf Reference Implementation in Python + +{{ mlperf_inference_implementation_readme (4, "whisper", "reference", devices=["CPU","CUDA"]) }} \ No newline at end of file diff --git a/language/deepseek-r1/README.md b/language/deepseek-r1/README.md index 7c2722c7a6..4e0184ff94 100644 --- a/language/deepseek-r1/README.md +++ b/language/deepseek-r1/README.md @@ -1,5 +1,11 @@ # Mlperf Inference DeepSeek Reference Implementation +## Automated command to run the benchmark via MLFlow + +Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/deepseek-r1/) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker. + +You can also do pip install mlc-scripts and then use `mlcr` commands for downloading the model and datasets using the commands given in the later sections. + ## Model & Dataset Download > **Model**: [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) (revision: `56d4cbbb4d29f4355bab4b9a39ccb717a14ad5ad`) @@ -11,6 +17,14 @@ ### Preprocessed +**Using MLCFlow Automation** + +``` +mlcr get,dataset,whisper,_preprocessed,_mlc,_rclone --outdirname= -j +``` + +**Using Native method** + You can use Rclone to download the preprocessed dataset from a Cloudflare R2 bucket. To run Rclone on Windows, you can download the executable [here](https://rclone.org/install/#windows). @@ -30,6 +44,14 @@ rclone copy mlc-inference:mlcommons-inference-wg-public/deepseek_r1/mlperf_deeps ### Calibration +**Using MLCFlow Automation** + +``` +mlcr get,preprocessed,dataset,deepseek-r1,_calibration,_mlc,_rclone --outdirname= -j +``` + +**Using Native method** + Download and install Rclone as described in the previous section. Then navigate in the terminal to your desired download directory and run the following command to download the dataset: @@ -171,6 +193,14 @@ The following table shows which backends support different evaluation and MLPerf ## Accuracy Evaluation +**Using MLCFlow Automation** + +``` +TBD +``` + +**Using Native method** + Accuracy evaluation is handled uniformly across all backends: ```bash diff --git a/language/llama3.1-8b/README.md b/language/llama3.1-8b/README.md index 5947aa0cc4..2b331c98f9 100644 --- a/language/llama3.1-8b/README.md +++ b/language/llama3.1-8b/README.md @@ -9,7 +9,7 @@ ## Automated command to run the benchmark via MLFlow -Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/llama3_1-8b/) (TBD) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker. +Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/llama3_1-8b/) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker. You can also do pip install mlc-scripts and then use `mlcr` commands for downloading the model and datasets using the commands given in the later sections. @@ -99,7 +99,10 @@ pip install -e ../../loadgen ## Get Model ### MLCommons Members Download (Recommended for official submission) -You need to request for access to [MLCommons](http://llama3-1.mlcommons.org/) and you'll receive an email with the download instructions. You can download the model automatically via the below command +You need to request for access to [MLCommons](http://llama3-1.mlcommons.org/) and you'll receive an email with the download instructions. + +**Official Model download using MLCFlow Automation** +You can download the model automatically via the below command ``` TBD ``` @@ -115,6 +118,12 @@ git clone https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct ${CHECKPOINT_P cd ${CHECKPOINT_PATH} && git checkout be673f326cab4cd22ccfef76109faf68e41aa5f1 ``` +**External Model download using MLCFlow Automation** +You can download the model automatically via the below command +``` +mlcr get,ml-model,llama3,_hf,_meta-llama/Llama-3.1-8B-Instruct --hf_token= -j +``` + ### Download huggingface model through MLC ``` @@ -142,24 +151,39 @@ rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5ee You can then navigate in the terminal to your desired download directory and run the following command to download the dataset: #### Full dataset (datacenter) + +**Using MLCFlow Automation** +``` +mlcr get,dataset,cnndm,_validation,_datacenter,_llama3,_mlc,_rclone --outdirname= -j +``` + +**Native method** ``` rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/cnn_eval.json ./ -P ``` #### 5000 samples (edge) + +**Using MLCFlow Automation** +``` +mlcr get,dataset,cnndm,_validation,_edge,_llama3,_mlc,_rclone --outdirname= -j +``` + +**Native method** ``` rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/sample_cnn_eval_5000.json ./ -P ``` #### Calibration + +**Using MLCFlow Automation** ``` -rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/cnn_dailymail_calibration.json ./ -P +mlcr get,dataset,cnndm,_calibration,_llama3,_mlc,_rclone --outdirname= -j ``` -**MLC Command** - +**Native method** ``` -TBD +rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/cnn_dailymail_calibration.json ./ -P ``` You can also download the calibration dataset from the Cloudflare R2 bucket by running the following command: @@ -168,11 +192,6 @@ You can also download the calibration dataset from the Cloudflare R2 bucket by r rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_8b/cnn_eval.json ./ -P ``` -**MLC Command** -``` -TBD -``` - ## Run Performance Benchmarks @@ -265,8 +284,17 @@ The ServerSUT was not tested for GPU runs. ### Evaluate the accuracy using MLCFlow You can also evaulate the accuracy from the generated accuracy log by using the following MLC command + +**Full dataset (datacenter)** + ``` -TBD +mlcr run,accuracy,mlperf,_cnndm_llama_3,_edge --result_dir= +``` + +**5000 samples (edge)** + +``` +mlcr run,accuracy,mlperf,_cnndm_llama_3,_datacenter --result_dir= ``` ## Accuracy Target diff --git a/main.py b/main.py index d2f625fcf4..1e5a20bb96 100755 --- a/main.py +++ b/main.py @@ -33,13 +33,15 @@ def mlperf_inference_implementation_readme( if model == "rnnt": code_version = "r4.0" + if "gpt" in model: + code_version = "r5.0-dev" elif implementation == "intel": code_version = "r4.1-dev" if implementation == "reference": # Tip - if model != "rnnt": - code_version = "r5.0-dev" + if model not in ["rnnt", "gptj-99", "gptj-99.9"]: + code_version = "r5.1-dev" if "99.9" not in model and implementation_tips: content += f"\n{pre_space}!!! tip\n\n" content += f"{pre_space} - MLCommons reference implementations are only meant to provide a rules compliant reference implementation for the submitters and in most cases are not best performing. If you want to benchmark any system, it is advisable to use the vendor MLPerf implementation for that system like Nvidia, Intel etc.\n\n" @@ -54,8 +56,12 @@ def mlperf_inference_implementation_readme( frameworks = ["Onnxruntime", "Pytorch"] elif "bert" in model.lower(): frameworks = ["Pytorch", "Deepsparse"] - elif "llama3" in model.lower(): - frameworks = ["Pytorch"] + elif "whisper" in model.lower(): + frameworks = ["vLLM"] + elif "deepseek" in model.lower(): + frameworks = ["vLLM", "Pytorch", "SGLang"] + elif "llama3_1-8b" in model.lower(): + frameworks = ["vLLM"] else: frameworks = ["Pytorch"] @@ -130,12 +136,7 @@ def mlperf_inference_implementation_readme( categories = ["Datacenter"] elif model.lower() in ["pointpainting"]: categories = ["Edge"] - elif ( - "dlrm" in model.lower() - or "llama2" in model.lower() - or "mixtral" in model.lower() - or "llama3" in model.lower() - ): + elif model.lower() in ["bert-99.9", "dlrm", "llama2", "mixtral", "llama3", "deepseek-r1"]: categories = ["Datacenter"] else: categories = ["Edge", "Datacenter"] @@ -153,8 +154,12 @@ def mlperf_inference_implementation_readme( scenarios.append("MultiStream") if model.lower() in ["pointpainting"]: scenarios.remove("Offline") + if model.lower() in ["whisper"]: + scenarios.remove("SingleStream") elif category == "Datacenter": scenarios = ["Offline", "Server"] + if model.lower() in ["whisper"]: + scenarios.remove("Server") if fixed_scenarios: scenarios = [ scenario for scenario in scenarios if scenario in fixed_scenarios] @@ -164,7 +169,7 @@ def mlperf_inference_implementation_readme( cur_space = pre_space + " " scenarios_string = ", ".join(scenarios) - content += f"{cur_space}### {category} category \n\n{cur_space} In the {category.lower()} category, {model} has {scenarios_string} scenarios and all the scenarios are mandatory for a closed division submission.\n\n" + content += f"""{cur_space}### {category} category \n\n{cur_space} In the {category.lower()} category, {model} has {scenarios_string} scenario{"s" if len(scenarios)>1 else ""} and {"all of the scenarios are" if len(scenarios)>1 else "the scenario is"} mandatory for a closed division submission.\n\n""" for framework in frameworks: cur_space1 = cur_space + " " @@ -539,7 +544,7 @@ def get_common_info(spaces, implementation, model): info += f"{pre_space} - In valid execution mode, the query count for performance mode can be adjusted using `--env.MLC_MLPERF_LOADGEN_QUERY_COUNT=`.\n\n" if implementation.lower() == "reference" and model.lower() not in [ - "pointpainting"]: + "pointpainting", "llama3_1-8b", "deepseek-r1", "whisper"]: info += f"{pre_space} - `_r4.1-dev` could also be given instead of `_r5.0-dev` if you want to run the benchmark with the MLPerf version being 4.1.\n\n" if model == "rgat": @@ -568,6 +573,10 @@ def get_docker_info(spaces, model, implementation, elif "llama3" in model.lower(): info += f"{pre_space} - `--env.MLC_MLPERF_MODEL_LLAMA3_DOWNLOAD_TO_HOST=yes` option can be used to download the model on the host so that it can be reused across different container lanuches. \n\n" info += f"{pre_space} - `--env.MLC_MLPERF_DATASET_LLAMA3_DOWNLOAD_TO_HOST=yes` option can be used to download the dataset on the host so that it can be reused across different container lanuches. \n\n" + elif model.lower() in ["llama3_1-8b", "whisper", "deepseek-r1"]: + info += f"{pre_space} - `--env.MLC_USE_ML_MODEL_FROM_HOST=yes` option can be used to download the model on the host so that it can be reused across different container lanuches. \n\n" + info += f"{pre_space} - `--env.MLC_USE_DATASET_FROM_HOST=yes` option can be used to download the dataset on the host so that it can be reused across different container lanuches. \n\n" + if implementation.lower() == "nvidia": info += f"{pre_space} - Default batch size is assigned based on [GPU memory](https://github.com/mlcommons/cm4mlops/blob/dd0c35856969c68945524d5c80414c615f5fe42c/script/app-mlperf-inference-nvidia/_cm.yaml#L1129) or the [specified GPU](https://github.com/mlcommons/cm4mlops/blob/dd0c35856969c68945524d5c80414c615f5fe42c/script/app-mlperf-inference-nvidia/_cm.yaml#L1370). Please click more option for *docker launch* or *run command* to see how to specify the GPU name.\n\n" info += f"{pre_space} - When run with `--all_models=yes`, all the benchmark models of NVIDIA implementation can be executed within the same container.\n\n" diff --git a/mkdocs.yml b/mkdocs.yml index a0ac88ef98..e4396d2d53 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -46,11 +46,15 @@ nav: - GPT-J: benchmarks/language/gpt-j.md - LLAMA2-70B: benchmarks/language/llama2-70b.md - LLAMA3-405B: benchmarks/language/llama3_1-405b.md + - LLAMA3-8B: benchmarks/language/llama3_1-8b.md - MIXTRAL-8x7B: benchmarks/language/mixtral-8x7b.md + - DeepSeek-R1: benchmarks/language/deepseek-r1.md - Recommendation: - DLRM-v2: benchmarks/recommendation/dlrm-v2.md - Graph Neural Networks: - R-GAT: benchmarks/graph/rgat.md + - Speech to Text: + - Whisper: benchmarks/speech_to_text/whisper.md - Install MLCFlow: - install/index.md - Submission: diff --git a/speech2text/README.md b/speech2text/README.md index 8747cc8557..be325975a2 100644 --- a/speech2text/README.md +++ b/speech2text/README.md @@ -1,5 +1,11 @@ # Reference Implementation for whisper-large-v3 +## Automated command to run the benchmark via MLFlow + +Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/whisper/) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker. + +You can also do pip install mlc-scripts and then use `mlcr` commands for downloading the model and datasets using the commands given in the later sections. + ## Prepare environment ### Docker @@ -92,6 +98,15 @@ VLLM_TARGET_DEVICE=cpu pip install --break-system-packages . --no-build-isolatio ## Get Model ### MLCommons Download +**Official Model download using MLCFlow Automation** + +You can download the model automatically via the below command +``` +mlcr get,ml-model,whisper,_rclone,_mlc --outdirname= -j +``` + +**Official Model download using native method** + You can use Rclone to download the preprocessed dataset from a Cloudflare R2 bucket. To run Rclone on Windows, you can download the executable [here](https://rclone.org/install/#windows). @@ -111,6 +126,15 @@ rclone copy mlc-inference:mlcommons-inference-wg-public/Whisper/model/ ./ -P ### External Download (Not recommended for official submission) +**External Model download using MLCFlow Automation** + +You can download the model automatically via the below command +``` +TBD +``` + +**External Model download using native method** + + Requires Git Large Files Storage ```bash export CHECKPOINT_PATH=whisper-large-v3 @@ -127,6 +151,13 @@ We use dev-clean and dev-other splits, which are approximately 10 hours. ### Preprocessed +**Using MLCFlow Automation** +``` +mlcr get,dataset,whisper,_preprocessed,_mlc,_rclone --outdirname= -j +``` + +**Native method** + Download and install rclone as decribed in the [MLCommons Download section](#mlcommons-download) You can then navigate in the terminal to your desired download directory and run the following command to download the dataset: @@ -136,6 +167,13 @@ rclone copy mlc-inference:mlcommons-inference-wg-public/Whisper/dataset/ ./ -P ### Unprocessed +**Using MLCFlow Automation** +``` +mlcr get,dataset,whisper,_unprocessed --outdirname= -j +``` + +**Native method** + If your are using docker, we provide a script to download and preprocess the dataset from the source. You can download it by running: ```bash ./download_dataset.sh @@ -227,6 +265,14 @@ python reference_mlperf.py \ ### Run Accuracy +**Evaluate Accuracy using MLCFlow Automation** + +``` +mlcr run,accuracy,mlperf,_librispeech_whisper,_int32 --result_dir= +``` + +**Evaluate Accuracy using native method** + ```bash python reference_mlperf.py \ --dataset_dir ${DATA_DIR} \ From 3a8595acd31a5f5a401f52d35449374e9d4cd281 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 5 Jul 2025 20:27:26 +0000 Subject: [PATCH 19/64] [Automated Commit] Format Codebase --- main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index 1e5a20bb96..5b3062fa32 100755 --- a/main.py +++ b/main.py @@ -544,7 +544,7 @@ def get_common_info(spaces, implementation, model): info += f"{pre_space} - In valid execution mode, the query count for performance mode can be adjusted using `--env.MLC_MLPERF_LOADGEN_QUERY_COUNT=`.\n\n" if implementation.lower() == "reference" and model.lower() not in [ - "pointpainting", "llama3_1-8b", "deepseek-r1", "whisper"]: + "pointpainting", "llama3_1-8b", "deepseek-r1", "whisper"]: info += f"{pre_space} - `_r4.1-dev` could also be given instead of `_r5.0-dev` if you want to run the benchmark with the MLPerf version being 4.1.\n\n" if model == "rgat": @@ -576,7 +576,7 @@ def get_docker_info(spaces, model, implementation, elif model.lower() in ["llama3_1-8b", "whisper", "deepseek-r1"]: info += f"{pre_space} - `--env.MLC_USE_ML_MODEL_FROM_HOST=yes` option can be used to download the model on the host so that it can be reused across different container lanuches. \n\n" info += f"{pre_space} - `--env.MLC_USE_DATASET_FROM_HOST=yes` option can be used to download the dataset on the host so that it can be reused across different container lanuches. \n\n" - + if implementation.lower() == "nvidia": info += f"{pre_space} - Default batch size is assigned based on [GPU memory](https://github.com/mlcommons/cm4mlops/blob/dd0c35856969c68945524d5c80414c615f5fe42c/script/app-mlperf-inference-nvidia/_cm.yaml#L1129) or the [specified GPU](https://github.com/mlcommons/cm4mlops/blob/dd0c35856969c68945524d5c80414c615f5fe42c/script/app-mlperf-inference-nvidia/_cm.yaml#L1370). Please click more option for *docker launch* or *run command* to see how to specify the GPU name.\n\n" info += f"{pre_space} - When run with `--all_models=yes`, all the benchmark models of NVIDIA implementation can be executed within the same container.\n\n" From 906c0fcb0e43f3e041011baf9a8fb42ade41f7ec Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 15 Jul 2025 22:11:56 +0000 Subject: [PATCH 20/64] [Automated Commit] Format Codebase --- tools/submission/submission_checker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index feeb0e41ae..291f07476a 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -498,7 +498,7 @@ "rgat": ("acc", 0.7286 * 0.99), "pointpainting": ("mAP", 0.5425 * 0.999), "deepseek-r1": ("exact_match", 0.99 * 81.6773, "TOKENS_PER_SAMPLE", 0.9 * 4043.449), - "whisper": ("ACCURACY", (100.0-2.0671) * 0.99), + "whisper": ("ACCURACY", (100.0 - 2.0671) * 0.99), }, "accuracy-upper-limit": { "stable-diffusion-xl": ( From 024e4aded9afdfffd33aeba0271f20f4acc393f7 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 17 Jul 2025 14:18:54 +0530 Subject: [PATCH 21/64] Update main.py --- main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.py b/main.py index 5b3062fa32..e21582a4af 100755 --- a/main.py +++ b/main.py @@ -66,6 +66,8 @@ def mlperf_inference_implementation_readme( frameworks = ["Pytorch"] elif implementation == "nvidia": + if model in ["retinanet", "resnet50", "3d-unet-99", "3d-unet-99.9]: + code_version = "r5.1-dev" if model in ["mixtral-8x7b"]: return pre_space + " WIP" devices = ["CUDA"] From 9e2c7a982b8d7eed48c508ef6dc0659c6c196db9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 17 Jul 2025 08:50:04 +0000 Subject: [PATCH 22/64] [Automated Commit] Format Codebase --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index e21582a4af..e22e9f0421 100755 --- a/main.py +++ b/main.py @@ -66,7 +66,7 @@ def mlperf_inference_implementation_readme( frameworks = ["Pytorch"] elif implementation == "nvidia": - if model in ["retinanet", "resnet50", "3d-unet-99", "3d-unet-99.9]: + if model in ["retinanet", "resnet50", "3d-unet-99", "3d - unet - 99.9]: code_version = "r5.1-dev" if model in ["mixtral-8x7b"]: return pre_space + " WIP" From 0928e11461625e39b9380f8505a3d8b02a7d06e5 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 17 Jul 2025 14:20:51 +0530 Subject: [PATCH 23/64] updating for 5.1-dev (inference doc) --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index e22e9f0421..a4c3c556e8 100755 --- a/main.py +++ b/main.py @@ -66,7 +66,7 @@ def mlperf_inference_implementation_readme( frameworks = ["Pytorch"] elif implementation == "nvidia": - if model in ["retinanet", "resnet50", "3d-unet-99", "3d - unet - 99.9]: + if model in ["retinanet", "resnet50", "3d-unet-99", "3d - unet - 99.9"]: code_version = "r5.1-dev" if model in ["mixtral-8x7b"]: return pre_space + " WIP" From 7069a9e58be3f40d9acbc08942c31a2840e893c4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 17 Jul 2025 08:51:14 +0000 Subject: [PATCH 24/64] [Automated Commit] Format Codebase --- main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index a4c3c556e8..b4d1f5e74d 100755 --- a/main.py +++ b/main.py @@ -66,7 +66,8 @@ def mlperf_inference_implementation_readme( frameworks = ["Pytorch"] elif implementation == "nvidia": - if model in ["retinanet", "resnet50", "3d-unet-99", "3d - unet - 99.9"]: + if model in ["retinanet", "resnet50", + "3d-unet-99", "3d - unet - 99.9"]: code_version = "r5.1-dev" if model in ["mixtral-8x7b"]: return pre_space + " WIP" From 253854ff4f10d73a94f3a071dcaf5fa2e9f02a5b Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 17 Jul 2025 14:23:03 +0530 Subject: [PATCH 25/64] fix typo --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index b4d1f5e74d..7cdec722ba 100755 --- a/main.py +++ b/main.py @@ -67,7 +67,7 @@ def mlperf_inference_implementation_readme( elif implementation == "nvidia": if model in ["retinanet", "resnet50", - "3d-unet-99", "3d - unet - 99.9"]: + "3d-unet-99", "3d-unet-99.9"]: code_version = "r5.1-dev" if model in ["mixtral-8x7b"]: return pre_space + " WIP" From 371d5835c94c187f5c2af69f9509dbf1b91f4e07 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 21 Jul 2025 13:52:33 +0000 Subject: [PATCH 26/64] [Automated Commit] Format Codebase --- compliance/nvidia/TEST06/run_verification.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/compliance/nvidia/TEST06/run_verification.py b/compliance/nvidia/TEST06/run_verification.py index 70e16f5266..cae64b3f47 100644 --- a/compliance/nvidia/TEST06/run_verification.py +++ b/compliance/nvidia/TEST06/run_verification.py @@ -53,7 +53,12 @@ def get_args(): "--scenario", "-s", required=True, - choices=["Offline", "Server", "Interactive", "SingleStream", "MultiStream"], + choices=[ + "Offline", + "Server", + "Interactive", + "SingleStream", + "MultiStream"], ) args = parser.parse_args() return args From ea86fc011fdeff2f0a1dd462010b4ea16d68c8c5 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Wed, 23 Jul 2025 11:01:30 +0530 Subject: [PATCH 27/64] Update main.py --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index 7cdec722ba..419b76a6da 100755 --- a/main.py +++ b/main.py @@ -190,7 +190,7 @@ def mlperf_inference_implementation_readme( content += f"{cur_space1}=== \"{device}\"\n" content += f"{cur_space2}##### {device} device\n\n" - # minimum system requirements + # get minimum system requirements content += get_min_system_requirements( cur_space2, model, implementation, device From 0828e9ca31c692174d23e1f8baf8eca860381c90 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 24 Jul 2025 18:35:56 +0000 Subject: [PATCH 28/64] [Automated Commit] Format Codebase --- language/deepseek-r1/eval_accuracy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language/deepseek-r1/eval_accuracy.py b/language/deepseek-r1/eval_accuracy.py index bf537e9d3a..9c103fdcba 100644 --- a/language/deepseek-r1/eval_accuracy.py +++ b/language/deepseek-r1/eval_accuracy.py @@ -773,7 +773,7 @@ def print_evaluation_results(df_evaluated: pd.DataFrame, 'tokens_per_sample': mean_output_len, 'num-samples': len(df_evaluated), } - + print("\nResults\n") print(results) From f8c344fa5f5b312908a888be34f5f3fef615629e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 29 Jul 2025 15:52:53 +0000 Subject: [PATCH 29/64] [Automated Commit] Format Codebase --- language/llama3.1-8b/download_cnndm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/language/llama3.1-8b/download_cnndm.py b/language/llama3.1-8b/download_cnndm.py index d8694be720..90c9ad8d7a 100644 --- a/language/llama3.1-8b/download_cnndm.py +++ b/language/llama3.1-8b/download_cnndm.py @@ -100,8 +100,8 @@ def preprocess_function(sample, padding="max_length"): # create list of samples inputs = [] - #print(f"Num samples: {len(sample[text_column])}") - #for i in range(0, len(sample[text_column])): + # print(f"Num samples: {len(sample[text_column])}") + # for i in range(0, len(sample[text_column])): x = dict() x["instruction"] = instruction_template x["input"] = sample[text_column] @@ -109,7 +109,7 @@ def preprocess_function(sample, padding="max_length"): instruction_template[instruction].format_map(x) ) x["output"] = sample[summary_column] - #inputs.append(x) + # inputs.append(x) model_inputs = dict() model_inputs["text"] = x From d0a2ed48ed75803abf6788d742a1dfd4444885e3 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Wed, 30 Jul 2025 19:39:47 +0530 Subject: [PATCH 30/64] Doc updates (#2292) * improve submission doc * Update index.md * Fix for model and dataset download commands * update submission doc * [Automated Commit] Format Codebase * Update index.md * r2_downloader -> r2-downloader * Update multithreading information about SDXL * [Automated Commit] Format Codebase * .lower() for consistency * [Automated Commit] Format Codebase * updation for llama3_1-8b edge * [Automated Commit] Format Codebase --------- Co-authored-by: github-actions[bot] Co-authored-by: Arjun Suresh --- .../image_classification/get-resnet50-data.md | 6 +++--- .../language/get-deepseek-r1-data.md | 18 +++++++++--------- .../benchmarks/language/get-llama2-70b-data.md | 14 ++++++++++++-- .../language/get-llama3_1-405b-data.md | 8 ++++++++ .../language/get-llama3_1-8b-data.md | 17 +++++++++-------- .../object_detection/get-retinanet-data.md | 6 +++--- .../speech_to_text/get-whisper-data.md | 4 ++-- docs/submission/index.md | 15 ++++++++++++++- main.py | 15 +++++++++------ 9 files changed, 69 insertions(+), 34 deletions(-) diff --git a/docs/benchmarks/image_classification/get-resnet50-data.md b/docs/benchmarks/image_classification/get-resnet50-data.md index 9ecd25c1a5..64e17c8246 100644 --- a/docs/benchmarks/image_classification/get-resnet50-data.md +++ b/docs/benchmarks/image_classification/get-resnet50-data.md @@ -15,7 +15,7 @@ The benchmark implementation run command will automatically download the validat ### Get Validation Dataset ``` - mlcr get,dataset,imagenet,validation -j + mlcr get,dataset,imagenet,validation,_full -j ``` === "Calibration" ResNet50 calibration dataset consist of 500 images selected from the Imagenet 2012 validation dataset. There are 2 alternative options for the calibration dataset. @@ -32,7 +32,7 @@ The benchmark implementation run command will automatically download the validat ### Get ResNet50 preprocessed dataset ``` - mlcr get,dataset,image-classification,imagenet,preprocessed,_pytorch -j + mlcr get,dataset,image-classification,imagenet,preprocessed,_pytorch,_full-j ``` - `--outdirname=` could be provided to download the dataset to a specific location. @@ -52,7 +52,7 @@ Get the Official MLPerf ResNet50 Model ### Onnx ``` - mlcr get,ml-model,resnet50,_onnx -j + mlcr get,ml-model,resnet50,image-classification,_onnx -j ``` - `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/benchmarks/language/get-deepseek-r1-data.md b/docs/benchmarks/language/get-deepseek-r1-data.md index 401c4d27bc..1dbce5cdf7 100644 --- a/docs/benchmarks/language/get-deepseek-r1-data.md +++ b/docs/benchmarks/language/get-deepseek-r1-data.md @@ -11,14 +11,14 @@ The benchmark implementation run command will automatically download the validat === "Validation" - ### Get Validation Dataset - ``` - mlcr get,preprocessed,dataset,deepseek-r1,_validation,_mlc,_rclone --outdirname= -j - ``` -=== "Calibration" + ### Get Validation Dataset + ``` + mlcr get,preprocessed,dataset,deepseek-r1,_validation,_mlc,_r2-downloader --outdirname= -j + ``` - ### Get Calibration Dataset - ``` - mlcr get,preprocessed,dataset,deepseek-r1,_calibration,_mlc,_rclone --outdirname= -j - ``` \ No newline at end of file +=== "Calibration" + + ### Get Calibration Dataset + ``` + mlcr get,preprocessed,dataset,deepseek-r1,_calibration,_mlc,_r2-downloader --outdirname= -j \ No newline at end of file diff --git a/docs/benchmarks/language/get-llama2-70b-data.md b/docs/benchmarks/language/get-llama2-70b-data.md index ce7cd996eb..6c22d3658d 100644 --- a/docs/benchmarks/language/get-llama2-70b-data.md +++ b/docs/benchmarks/language/get-llama2-70b-data.md @@ -16,7 +16,7 @@ The benchmark implementation run command will automatically download the validat ### Get Preprocessed Validation Dataset ``` - mlcr get,dataset,preprocessed,openorca,_validation -j + mlcr get,dataset,preprocessed,openorca,_validation,_mlcommons -j ``` === "Calibration" @@ -56,7 +56,17 @@ The benchmark implementation run command will automatically download the require ### Get the Official MLPerf LLAMA2-70B model from MLCOMMONS Google Drive ``` - mlcr get,ml-model,llama2-70b,_pytorch -j + mlcr get,ml-model,llama2-70b,_rclone,_mlc,_70b -j + ``` + + === "From MLCOMMONS Cloudfare R2" + + > **Note:** One has to accept the [MLCommons Llama 2 License Confidentiality Notice](https://llama2.mlcommons.org/) to access the model files in MLCOMMONS Google Drive. + + ### Get the Official MLPerf LLAMA2-70B model from MLCOMMONS Cloudfare R2 + + ``` + mlcr get,ml-model,llama2-70b,_mlc,_r2-downloader,_70b -j ``` === "From Hugging Face repo" diff --git a/docs/benchmarks/language/get-llama3_1-405b-data.md b/docs/benchmarks/language/get-llama3_1-405b-data.md index ad05ca8610..3257cd17b0 100644 --- a/docs/benchmarks/language/get-llama3_1-405b-data.md +++ b/docs/benchmarks/language/get-llama3_1-405b-data.md @@ -38,6 +38,14 @@ The benchmark implementation run command will automatically download the require ``` mlcr get,ml-model,llama3 -j ``` + + === "From Cloudfare R2" + + > **Note:** One has to accept the [MLCommons Llama 3.1 License Confidentiality Notice](http://llama3-1.mlcommons.org/) to access the model files in MLCOMMONS Google Drive. + + ### Get the Official MLPerf LLAMA3.1-405B model from MLCOMMONS Cloudfare R2 + ``` + mlcr get,ml-model,llama3,_mlc,_405b,_r2-downloader --outdirname= -j === "From Hugging Face repo" diff --git a/docs/benchmarks/language/get-llama3_1-8b-data.md b/docs/benchmarks/language/get-llama3_1-8b-data.md index e24cc37d44..26b3cf11d1 100644 --- a/docs/benchmarks/language/get-llama3_1-8b-data.md +++ b/docs/benchmarks/language/get-llama3_1-8b-data.md @@ -10,26 +10,27 @@ hide: The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. === "Validation" - + === "Full dataset (Datacenter)" ### Get Validation Dataset ``` - mlcr get,dataset,cnndm,_validation,_datacenter,_llama3,_mlc,_rclone --outdirname= -j + mlcr get,dataset,cnndm,_validation,_datacenter,_llama3,_mlc,_r2-downloader --outdirname= -j ``` - + === "5000 samples (Edge)" ### Get Validation Dataset ``` - mlcr get,dataset,cnndm,_validation,_edge,_llama3,_mlc,_rclone --outdirname= -j + mlcr get,dataset,cnndm,_validation,_edge,_llama3,_mlc,_r2-downloader --outdirname= -j ``` === "Calibration" - + ``` + ### Get Calibration Dataset ``` - mlcr get,dataset,cnndm,_calibration,_llama3,_mlc,_rclone --outdirname= -j + mlcr get,dataset,cnndm,_calibration,_llama3,_mlc,_r2-downloader --outdirname= -j ``` - `--outdirname=` could be provided to download the dataset to a specific location. @@ -39,13 +40,13 @@ The benchmark implementation run command will automatically download the require === "Pytorch" - === "From MLCOMMONS Google Drive" + === "From Cloudfare R2" > **Note:** One has to accept the [MLCommons Llama 3.1 License Confidentiality Notice](http://llama3-1.mlcommons.org/) to access the model files in MLCOMMONS Google Drive. ### Get the Official MLPerf LLAMA3.1-405B model from MLCOMMONS Cloudfare R2 ``` - TBD + mlcr get,ml-model,llama3,_mlc,_8b,_r2-downloader --outdirname= -j ``` === "From Hugging Face repo" diff --git a/docs/benchmarks/object_detection/get-retinanet-data.md b/docs/benchmarks/object_detection/get-retinanet-data.md index 6127eed541..00c5bf8451 100644 --- a/docs/benchmarks/object_detection/get-retinanet-data.md +++ b/docs/benchmarks/object_detection/get-retinanet-data.md @@ -16,7 +16,7 @@ The benchmark implementation run command will automatically download the validat ### Get Validation Dataset ``` - mlcr get,dataset,openimages,_validation -j + mlcr get,dataset,openimages,original,_validation -j ``` === "Calibration" @@ -24,14 +24,14 @@ The benchmark implementation run command will automatically download the validat ### Get OpenImages Calibration dataset ``` - mlcr get,dataset,openimages,_calibration -j + mlcr get,dataset,openimages,original,_calibration -j ``` === "Preprocessed" ### Get Preprocessed OpenImages dataset ``` - get,dataset,object-detection,open-images,openimages,preprocessed,_validation -j + mlcr get,dataset,object-detection,open-images,openimages,preprocessed,_validation -j ``` - `--outdirname=` could be provided to download the dataset to a specific location. diff --git a/docs/benchmarks/speech_to_text/get-whisper-data.md b/docs/benchmarks/speech_to_text/get-whisper-data.md index 9bc97ad9a0..ed9e3b02b0 100644 --- a/docs/benchmarks/speech_to_text/get-whisper-data.md +++ b/docs/benchmarks/speech_to_text/get-whisper-data.md @@ -15,7 +15,7 @@ The benchmark implementation run command will automatically download the validat ### Get Preprocessed Validation Dataset ``` - mlcr get,dataset,whisper,_preprocessed,_mlc,_rclone --outdirname= -j + mlcr get,dataset,whisper,_preprocessed,_mlc,_r2-downloader --outdirname= -j ``` === "Unprocessed" @@ -34,7 +34,7 @@ The benchmark implementation run command will automatically download the require ### Get the Official MLPerf Whisper model from MLCOMMONS Cloudflare R2 ``` - mlcr get,ml-model,whisper,_rclone,_mlc s-j + mlcr get,ml-model,whisper,_r2-downloader,_mlc -j ``` - `--outdirname=` could be provided to download the model to a specific location. \ No newline at end of file diff --git a/docs/submission/index.md b/docs/submission/index.md index 079a513854..56f872fc6e 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -158,6 +158,18 @@ If there are multiple systems where MLPerf results are collected, the same proce --commit_message="Results on added by " \ --quiet ``` + + The path to the locally synced submission directory from the output below can be used in the next step by passing it to the `--submission_dir` argument. +
+ Click to see the sample output + ``` + [2025-07-23 16:36:56,399 module.py:2197 INFO] - + + Path to the locally synced submission directory: mysubmissions/mlperf_submission + + + ``` +
```mermaid flowchart LR @@ -193,7 +205,8 @@ Once you have all the results on the system, you can upload them to the MLCommon mlcr run,mlperf,submission,checker,inference \ --submitter_id=<> \ --submission_dir= - ``` + ``` + === "via Browser" You can do the following command to generate the final submission tar file and then upload to the [MLCommons Submission UI](https://submissions-ui.mlcommons.org/submission). ``` diff --git a/main.py b/main.py index 419b76a6da..e4782bf3ac 100755 --- a/main.py +++ b/main.py @@ -45,7 +45,8 @@ def mlperf_inference_implementation_readme( if "99.9" not in model and implementation_tips: content += f"\n{pre_space}!!! tip\n\n" content += f"{pre_space} - MLCommons reference implementations are only meant to provide a rules compliant reference implementation for the submitters and in most cases are not best performing. If you want to benchmark any system, it is advisable to use the vendor MLPerf implementation for that system like Nvidia, Intel etc.\n\n" - + if model.lower() in ["sdxl"]: + content += f"\n{pre_space}> **Note:** {model.upper()} reference implementation does not support multithreading.\n\n" if not devices: devices = ["CPU", "CUDA", "ROCm"] @@ -139,10 +140,10 @@ def mlperf_inference_implementation_readme( categories = ["Datacenter"] elif model.lower() in ["pointpainting"]: categories = ["Edge"] - elif model.lower() in ["bert-99.9", "dlrm", "llama2", "mixtral", "llama3", "deepseek-r1"]: + elif model.lower() in ["bert-99.9", "dlrm", "llama2", "mixtral", "llama3_1-405b-99.9", "llama3_1-405b-99", "deepseek-r1"]: categories = ["Datacenter"] else: - categories = ["Edge", "Datacenter"] + categories = ["Datacenter", "Edge"] # model name content += f"{pre_space}{model.upper()}\n\n" @@ -159,6 +160,8 @@ def mlperf_inference_implementation_readme( scenarios.remove("Offline") if model.lower() in ["whisper"]: scenarios.remove("SingleStream") + if model.lower() == "llama3_1-8b": + model = "llama3_1-8b-edge" elif category == "Datacenter": scenarios = ["Offline", "Server"] if model.lower() in ["whisper"]: @@ -547,7 +550,7 @@ def get_common_info(spaces, implementation, model): info += f"{pre_space} - In valid execution mode, the query count for performance mode can be adjusted using `--env.MLC_MLPERF_LOADGEN_QUERY_COUNT=`.\n\n" if implementation.lower() == "reference" and model.lower() not in [ - "pointpainting", "llama3_1-8b", "deepseek-r1", "whisper"]: + "pointpainting", "llama3_1-8b", "llama3_1-8b-edge", "deepseek-r1", "whisper"]: info += f"{pre_space} - `_r4.1-dev` could also be given instead of `_r5.0-dev` if you want to run the benchmark with the MLPerf version being 4.1.\n\n" if model == "rgat": @@ -573,10 +576,10 @@ def get_docker_info(spaces, model, implementation, if model == "sdxl": info += f"{pre_space} - `--env.MLC_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes` option can be used to download the model on the host so that it can be reused across different container lanuches. \n\n" - elif "llama3" in model.lower(): + elif "llama3_1-405b" in model.lower(): info += f"{pre_space} - `--env.MLC_MLPERF_MODEL_LLAMA3_DOWNLOAD_TO_HOST=yes` option can be used to download the model on the host so that it can be reused across different container lanuches. \n\n" info += f"{pre_space} - `--env.MLC_MLPERF_DATASET_LLAMA3_DOWNLOAD_TO_HOST=yes` option can be used to download the dataset on the host so that it can be reused across different container lanuches. \n\n" - elif model.lower() in ["llama3_1-8b", "whisper", "deepseek-r1"]: + elif model.lower() in ["llama3_1-8b", "llama3_1-8b-edge", "whisper", "deepseek-r1"]: info += f"{pre_space} - `--env.MLC_USE_ML_MODEL_FROM_HOST=yes` option can be used to download the model on the host so that it can be reused across different container lanuches. \n\n" info += f"{pre_space} - `--env.MLC_USE_DATASET_FROM_HOST=yes` option can be used to download the dataset on the host so that it can be reused across different container lanuches. \n\n" From 23bd06243d9232728f185bfe39411ec977764712 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 31 Jul 2025 13:15:03 +0530 Subject: [PATCH 31/64] Add quiet flags to MLC commands (#2309) --- docs/submission/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/submission/index.md b/docs/submission/index.md index 56f872fc6e..7b4ab585f5 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -204,7 +204,7 @@ Once you have all the results on the system, you can upload them to the MLCommon ``` mlcr run,mlperf,submission,checker,inference \ --submitter_id=<> \ - --submission_dir= + --submission_dir= --quiet ``` === "via Browser" @@ -213,7 +213,7 @@ Once you have all the results on the system, you can upload them to the MLCommon mlcr run,mlperf,submission,checker,inference \ --submission_dir= \ --tar=yes \ - --submission_tar_file=mysubmission.tar.gz + --submission_tar_file=mysubmission.tar.gz --quiet ``` ```mermaid From 6ec49291d40cf427f5c45428e952db6e6349c9c0 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Thu, 31 Jul 2025 23:27:58 +0530 Subject: [PATCH 32/64] Improve docs - submission generation (#2311) --- docs/submission/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/submission/index.md b/docs/submission/index.md index 7b4ab585f5..f920bbfa11 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -159,7 +159,7 @@ If there are multiple systems where MLPerf results are collected, the same proce --quiet ``` - The path to the locally synced submission directory from the output below can be used in the next step by passing it to the `--submission_dir` argument. + > **Note:** The path to the locally synced submission directory from the output below can be used in the next step by passing it to the `--submission_dir` argument.
Click to see the sample output ``` @@ -204,7 +204,7 @@ Once you have all the results on the system, you can upload them to the MLCommon ``` mlcr run,mlperf,submission,checker,inference \ --submitter_id=<> \ - --submission_dir= --quiet + --submission_dir= --quiet ``` === "via Browser" From ff856b8ff204395338432d7c823f677644c2ffe2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 31 Jul 2025 22:04:42 +0000 Subject: [PATCH 33/64] [Automated Commit] Format Codebase --- tools/submission/submission_checker.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index f124b808c5..7a0c22c902 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -1487,7 +1487,8 @@ def check_accuracy_dir(config, model, path, verbose): def extra_check_llm(mlperf_log, scenario, model): if mlperf_log["requested_use_token_latencies"]: if scenario not in ["Server", "Interactive"]: - # For offline, singlestream and multistream no further checks are necessary + # For offline, singlestream and multistream no further checks are + # necessary return True else: limits = LLM_LATENCY_LIMITS[model][scenario] @@ -1887,7 +1888,7 @@ def get_power_metric(config, scenario_fixed, log_path, is_valid, res): samples_per_query = 8 if (scenario_fixed in ["MultiStream"] - ) and scenario in ["SingleStream"]: + ) and scenario in ["SingleStream"]: power_metric = ( avg_power * power_duration * samples_per_query * 1000 / num_queries ) From bf1469eccc454c3e74733740ad1be0653d8959d6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 4 Nov 2025 22:11:55 +0000 Subject: [PATCH 34/64] [Automated Commit] Format Codebase --- speech2text/accuracy_eval.py | 4 ++-- speech2text/reference_SUT.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/speech2text/accuracy_eval.py b/speech2text/accuracy_eval.py index eb6cc53299..c356ab6398 100644 --- a/speech2text/accuracy_eval.py +++ b/speech2text/accuracy_eval.py @@ -57,12 +57,12 @@ "x", "y", "z", - "'", + "'", "0", "1", "2", "3", - "4", + "4", "5", "6", "7", diff --git a/speech2text/reference_SUT.py b/speech2text/reference_SUT.py index 63d491a00f..0b2f02c490 100644 --- a/speech2text/reference_SUT.py +++ b/speech2text/reference_SUT.py @@ -90,12 +90,12 @@ def get_start_cores(start_cores="0"): "x", "y", "z", - "'", + "'", "0", "1", "2", "3", - "4", + "4", "5", "6", "7", From d9f1bc1d28939f3d6b58678aad7184539f7c072c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 25 Nov 2025 16:56:38 +0000 Subject: [PATCH 35/64] [Automated Commit] Format Codebase --- tools/submission/preprocess_submission.py | 19 +++++++++++++------ tools/submission/submission_checker.py | 5 ++--- tools/submission/truncate_accuracy_log.py | 3 ++- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/tools/submission/preprocess_submission.py b/tools/submission/preprocess_submission.py index 34043208c8..df3d748c5c 100644 --- a/tools/submission/preprocess_submission.py +++ b/tools/submission/preprocess_submission.py @@ -99,7 +99,7 @@ def delete_empty_dirs(src): return False -def copy_submission_dir(src, dst, filter_submitter, keep_structure = True): +def copy_submission_dir(src, dst, filter_submitter, keep_structure=True): """ Copies the submission tree to output directory for processing """ @@ -116,15 +116,18 @@ def copy_submission_dir(src, dst, filter_submitter, keep_structure = True): ) else: for dir in os.listdir(os.path.join(src, division, submitter)): - if os.path.isdir(os.path.join(src, division, submitter, dir)): - target_dir = "results" if dir in ["compliance", "measurements"] else dir + if os.path.isdir(os.path.join( + src, division, submitter, dir)): + target_dir = "results" if dir in [ + "compliance", "measurements"] else dir shutil.copytree( os.path.join(src, division, submitter, dir), os.path.join(dst, division, submitter, target_dir), - dirs_exist_ok = True + dirs_exist_ok=True ) for file in os.listdir(os.path.join(src, division, submitter)): - if os.path.isfile(os.path.join(src, division, submitter, file)): + if os.path.isfile(os.path.join( + src, division, submitter, file)): shutil.copyfile( os.path.join(src, division, submitter, file), os.path.join(dst, division, submitter, file) @@ -561,7 +564,11 @@ def main(): log.error(f"output directory {args.output} already exists") sys.exit(1) os.makedirs(args.output) - copy_submission_dir(args.input, args.output, args.submitter, args.keep_structure) + copy_submission_dir( + args.input, + args.output, + args.submitter, + args.keep_structure) src_dir = args.output config = checker.Config( diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 335485c33c..5c2801bacb 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -1061,7 +1061,7 @@ def set_type(self, submission_type): self.optional = self.base["optional-scenarios-datacenter-edge"] else: raise ValueError("invalid system type") - + def skip_calibration(self): return self.skip_calibration_check or self.version in ["v5.0"] @@ -1893,7 +1893,7 @@ def get_power_metric(config, scenario_fixed, log_path, is_valid, res): samples_per_query = 8 if (scenario_fixed in ["MultiStream"] - ) and scenario in ["SingleStream"]: + ) and scenario in ["SingleStream"]: power_metric = ( avg_power * power_duration * samples_per_query * 1000 / num_queries ) @@ -3040,7 +3040,6 @@ def check_measurement_dir( end = len(".json") break - weight_data_types = None if system_file: with open(os.path.join(measurement_dir, system_file), "r") as f: diff --git a/tools/submission/truncate_accuracy_log.py b/tools/submission/truncate_accuracy_log.py index 6c1267fdf8..87bba5ab98 100755 --- a/tools/submission/truncate_accuracy_log.py +++ b/tools/submission/truncate_accuracy_log.py @@ -172,7 +172,8 @@ def truncate_results_dir(filter_submitter, backup, scenarios_to_skip): acc_path, "accuracy.txt") # only TEST01 has an accuracy log - if str(test).startswith("TEST") and test != "TEST01": + if str(test).startswith( + "TEST") and test != "TEST01": continue if not os.path.exists(acc_log): log.error("%s missing", acc_log) From 8ed956d04605534c25d043318c0d77258d5b7573 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 19 Dec 2025 22:15:22 +0000 Subject: [PATCH 36/64] [Automated Commit] Format Codebase --- language/deepseek-r1/backends/sglang_backend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/language/deepseek-r1/backends/sglang_backend.py b/language/deepseek-r1/backends/sglang_backend.py index 8efc0d76d2..010cc6dd7e 100644 --- a/language/deepseek-r1/backends/sglang_backend.py +++ b/language/deepseek-r1/backends/sglang_backend.py @@ -126,7 +126,8 @@ def _build_server_command(self) -> List[str]: # Add optimization flags if self.config['enable_speculative_decode']: - cmd.extend(['--speculative-algorithm', self.config['speculative_algorithm']]) + cmd.extend(['--speculative-algorithm', + self.config['speculative_algorithm']]) cmd.extend(['--speculative-num-steps', str(self.config['speculative_num_steps'])]) cmd.extend(['--speculative-eagle-topk', From 2d0034fab546b710272713b496cc9cca2ca3ef74 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 22 Dec 2025 17:14:14 +0000 Subject: [PATCH 37/64] [Automated Commit] Format Codebase --- .../wan2.2-t2v-14b/download_model.py | 26 +++--- .../wan2.2-t2v-14b/run_evaluation.py | 60 +++++++------ text_to_video/wan2.2-t2v-14b/run_inference.py | 88 ++++++++++--------- 3 files changed, 94 insertions(+), 80 deletions(-) diff --git a/text_to_video/wan2.2-t2v-14b/download_model.py b/text_to_video/wan2.2-t2v-14b/download_model.py index d0efd0b321..cdcc14cf8c 100755 --- a/text_to_video/wan2.2-t2v-14b/download_model.py +++ b/text_to_video/wan2.2-t2v-14b/download_model.py @@ -16,29 +16,31 @@ sys.exit(1) -def download_model(download_path: str, model_name: str = "Wan-AI/Wan2.2-T2V-A14B-Diffusers"): +def download_model(download_path: str, + model_name: str = "Wan-AI/Wan2.2-T2V-A14B-Diffusers"): """ Download Wan T2V model from HuggingFace. - + Args: download_path: Directory to download the model model_name: HuggingFace model identifier """ download_path = Path(download_path).resolve() - # Extract model name without org prefix (e.g., "Wan-AI/Wan2.2-T2V-A14B-Diffusers" -> "Wan2.2-T2V-A14B-Diffusers") + # Extract model name without org prefix (e.g., + # "Wan-AI/Wan2.2-T2V-A14B-Diffusers" -> "Wan2.2-T2V-A14B-Diffusers") model_dir_name = model_name.split("/")[-1] model_path = download_path / model_dir_name - + # Create download directory download_path.mkdir(parents=True, exist_ok=True) - + print("=" * 60) print(f"{model_name} Model Download") print("=" * 60) print(f"Download path: {model_path}") print("=" * 60) print() - + try: print("Starting download...") snapshot_download( @@ -47,13 +49,13 @@ def download_model(download_path: str, model_name: str = "Wan-AI/Wan2.2-T2V-A14B local_dir_use_symlinks=False, resume_download=True, ) - + print() print("=" * 60) print("? Download completed successfully!") print("=" * 60) print(f"Model location: {model_path}") - + except Exception as e: print() print("=" * 60) @@ -67,21 +69,21 @@ def main(): parser = argparse.ArgumentParser( description="Download Wan2.2 T2V-A14B-Diffusers model from HuggingFace", ) - + parser.add_argument( "-d", "--download-path", default=os.environ.get("DOWNLOAD_PATH", "./models"), help="Download directory (default: ./models or $DOWNLOAD_PATH)" ) - + parser.add_argument( "--model-name", default="Wan-AI/Wan2.2-T2V-A14B-Diffusers", help="HuggingFace model identifier (default: Wan-AI/Wan2.2-T2V-A14B-Diffusers)" ) - + args = parser.parse_args() - + download_model(args.download_path, args.model_name) diff --git a/text_to_video/wan2.2-t2v-14b/run_evaluation.py b/text_to_video/wan2.2-t2v-14b/run_evaluation.py index f1b6352d2f..b925be68aa 100755 --- a/text_to_video/wan2.2-t2v-14b/run_evaluation.py +++ b/text_to_video/wan2.2-t2v-14b/run_evaluation.py @@ -12,6 +12,7 @@ import sys from pathlib import Path + def setup_logging(): """Setup logging configuration.""" logging.basicConfig( @@ -24,36 +25,37 @@ def setup_logging(): def parse_results(output_path): """ Parse VBench evaluation results and print summary. - + Args: output_path: Path to evaluation results directory """ output_path = Path(output_path) - + # Find the most recent eval_results file (contains scores) result_files = sorted(output_path.glob("results_*_eval_results.json")) if not result_files: logging.warning(f"No results found in {output_path}") return - + result_file = result_files[-1] - + try: with open(result_file, 'r') as f: results = json.load(f) - + # Print summary in MLPerf-style format - print("\n" + "="*60) + print("\n" + "=" * 60) print("VBench Evaluation Results") - print("="*60) - - # Extract dimension scores (VBench format: {dimension_name: [avg_score, [video_results]], ...}) + print("=" * 60) + + # Extract dimension scores (VBench format: {dimension_name: [avg_score, + # [video_results]], ...}) if results: print("\nDimension Scores:") print("-" * 60) total_score = 0 num_dimensions = 0 - + for dimension, value in sorted(results.items()): # VBench stores [avg_score, list_of_video_results] if isinstance(value, list) and len(value) > 0: @@ -62,16 +64,16 @@ def parse_results(output_path): total_score += score num_dimensions += 1 print(f" {dimension:30s}: {score:6.4f}") - + if num_dimensions > 0: overall_avg = total_score / num_dimensions print("-" * 60) print(f" {'Overall Average':30s}: {overall_avg:6.4f}") - - print("="*60) + + print("=" * 60) print(f"Detailed results: {result_file}") - print("="*60 + "\n") - + print("=" * 60 + "\n") + except Exception as e: logging.error(f"Failed to parse results: {e}") import traceback @@ -79,7 +81,8 @@ def parse_results(output_path): def main(): - parser = argparse.ArgumentParser(description="VBench evaluation for Wan2.2 T2V videos") + parser = argparse.ArgumentParser( + description="VBench evaluation for Wan2.2 T2V videos") parser.add_argument( "--videos-path", type=str, @@ -111,20 +114,20 @@ def main(): default=8, help="Number of GPUs to use for evaluation (default: 8)" ) - + args = parser.parse_args() - + setup_logging() - + # Validate inputs videos_path = Path(args.videos_path) if not videos_path.exists(): logging.error(f"Videos path does not exist: {videos_path}") return 1 - + output_path = Path(args.output_path) output_path.mkdir(parents=True, exist_ok=True) - + logging.info("=" * 60) logging.info("VBench Evaluation") logging.info("=" * 60) @@ -133,8 +136,9 @@ def main(): logging.info(f"GPUs: {args.num_gpus}") logging.info(f"Dimensions: {', '.join(args.dimensions)}") logging.info("=" * 60) - - vbench_script = Path(__file__).parent / "submodules" / "VBench" / "evaluate.py" + + vbench_script = Path(__file__).parent / "submodules" / \ + "VBench" / "evaluate.py" cmd = [ "python", "-m", "torch.distributed.run", f"--nproc_per_node={args.num_gpus}", @@ -144,21 +148,21 @@ def main(): "--load_ckpt_from_local=True", "--dimension" ] + args.dimensions - + logging.info("\nExecuting VBench evaluation...") logging.info(f"Command: {' '.join(cmd)}") logging.info("") - + # Run evaluation try: result = subprocess.run(cmd, check=True) - + # Parse and print results logging.info("\nParsing evaluation results...") parse_results(output_path) - + return 0 - + except subprocess.CalledProcessError as e: logging.error(f"Evaluation failed with exit code {e.returncode}") return e.returncode diff --git a/text_to_video/wan2.2-t2v-14b/run_inference.py b/text_to_video/wan2.2-t2v-14b/run_inference.py index 43b972800d..801e0c68b4 100755 --- a/text_to_video/wan2.2-t2v-14b/run_inference.py +++ b/text_to_video/wan2.2-t2v-14b/run_inference.py @@ -5,6 +5,10 @@ Supports multi-GPU inference with data parallelism (prompts divided among GPUs). """ +from diffusers.utils import export_to_video +from diffusers import WanPipeline, AutoencoderKLWan +import torch +import yaml import argparse import logging import os @@ -14,13 +18,10 @@ warnings.filterwarnings('ignore') -import yaml -import torch -from diffusers import WanPipeline, AutoencoderKLWan -from diffusers.utils import export_to_video # import modelopt.torch.opt as mto + def setup_logging(rank): """Setup logging configuration for data parallel (all ranks log).""" logging.basicConfig( @@ -50,12 +51,12 @@ def generate_videos(args, config): world_size = int(os.environ.get("WORLD_SIZE", 1)) rank = int(os.environ.get("RANK", 0)) local_rank = int(os.environ.get("LOCAL_RANK", 0)) - + torch.cuda.set_device(local_rank) device = torch.device(f"cuda:{local_rank}") - + setup_logging(rank) - + # Generation parameters from config height = config['height'] width = config['width'] @@ -67,7 +68,7 @@ def generate_videos(args, config): negative_prompt = config['negative_prompt'].strip() sample_steps = config['sample_steps'] base_seed = config['seed'] - + if rank == 0: logging.info(f"Model: Wan2.2 T2V-A14B-Diffusers") logging.info(f"Model path: {args.model_path}") @@ -75,24 +76,25 @@ def generate_videos(args, config): logging.info(f"Sample steps: {sample_steps}") logging.info(f"Base seed: {base_seed}") logging.info(f"Iterations per prompt: {args.num_iterations}") - + all_prompts = load_prompts(args.dataset) - + if rank == 0: logging.info(f"Loaded {len(all_prompts)} prompts from {args.dataset}") - + if args.num_prompts > 0: all_prompts = all_prompts[:args.num_prompts] if rank == 0: logging.info(f"Processing first {args.num_prompts} prompts") - + # Divide prompts among GPUs (data parallelism) prompts = all_prompts[rank::world_size] - logging.info(f"This rank will process {len(prompts)} prompts (indices: {rank}, {rank + world_size}, ...)") - + logging.info( + f"This rank will process {len(prompts)} prompts (indices: {rank}, {rank + world_size}, ...)") + output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) - + logging.info("Loading Diffusers pipeline...") vae = AutoencoderKLWan.from_pretrained( args.model_path, @@ -120,45 +122,49 @@ def generate_videos(args, config): # logging.info("Quantized model loaded successfully!") - fixed_latent = None if args.fixed_latent: fixed_latent = torch.load(args.fixed_latent) - logging.info(f"Loaded fixed latent from {args.fixed_latent} with shape: {fixed_latent.shape}") + logging.info( + f"Loaded fixed latent from {args.fixed_latent} with shape: {fixed_latent.shape}") logging.info(f"This latent will be reused for all generations") else: logging.info("No fixed latent provided - using random initial latents") - + if rank == 0: - logging.info(f"Starting batch generation: {len(all_prompts)} total prompts x {args.num_iterations} iterations") + logging.info( + f"Starting batch generation: {len(all_prompts)} total prompts x {args.num_iterations} iterations") logging.info(f"Each GPU processes ~{len(prompts)} prompts") - - # Generate videos: iterate through all prompts, then repeat for next iteration + + # Generate videos: iterate through all prompts, then repeat for next + # iteration total_videos = 0 for iteration in range(args.num_iterations): if rank == 0: logging.info(f"\n{'='*60}") logging.info(f"ITERATION {iteration + 1}/{args.num_iterations}") logging.info(f"{'='*60}") - + for local_idx, prompt in enumerate(prompts): # Calculate global prompt index global_idx = rank + local_idx * world_size - - logging.info(f"[Prompt {global_idx+1}/{len(all_prompts)}, Iteration {iteration+1}/{args.num_iterations}] {prompt}") - + + logging.info( + f"[Prompt {global_idx+1}/{len(all_prompts)}, Iteration {iteration+1}/{args.num_iterations}] {prompt}") + # Check if video already exists filename = f"{prompt}-{iteration}.mp4" save_path = output_dir / filename - + if save_path.exists(): - logging.info(f"Video already exists at {save_path}, skipping generation") + logging.info( + f"Video already exists at {save_path}, skipping generation") total_videos += 1 continue - + # Generate video with seed based on iteration current_seed = base_seed + iteration - + # Prepare pipeline arguments pipeline_kwargs = { "prompt": prompt, @@ -171,21 +177,22 @@ def generate_videos(args, config): "num_inference_steps": sample_steps, "generator": torch.Generator(device=device).manual_seed(current_seed), } - + # Only pass latents if fixed_latent is provided if fixed_latent is not None: pipeline_kwargs["latents"] = fixed_latent - + output = pipe(**pipeline_kwargs).frames[0] - + # Save video with VBench format: -.mp4 logging.info(f"Saving to {save_path} (seed: {current_seed})") export_to_video(output, str(save_path), fps=25) total_videos += 1 - logging.info(f"Saved! ({total_videos}/{len(prompts) * args.num_iterations} for this GPU)") - + logging.info( + f"Saved! ({total_videos}/{len(prompts) * args.num_iterations} for this GPU)") + torch.cuda.empty_cache() - + logging.info(f"\n{'='*60}") logging.info(f"Batch generation complete for this GPU!") logging.info(f"Generated {total_videos} videos in {output_dir}") @@ -193,8 +200,9 @@ def generate_videos(args, config): def main(): - parser = argparse.ArgumentParser(description="Batch T2V inference with Wan2.2-Diffusers") - + parser = argparse.ArgumentParser( + description="Batch T2V inference with Wan2.2-Diffusers") + parser.add_argument( "--model-path", type=str, @@ -248,11 +256,11 @@ def main(): # default="./models/Wan2.2-T2V-FP8-Torch", # help="Path to quantized model (default: ./models/Wan2.2-T2V-FP8-Torch)" # ) - + args = parser.parse_args() - + config = load_config(args.config) - + generate_videos(args, config) From fa639dcef12d0f26199bc32b4e3e1ff94aacfa7e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 6 Jan 2026 17:36:35 +0000 Subject: [PATCH 38/64] [Automated Commit] Format Codebase --- multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py index 5fc3881d95..7d53022a0f 100644 --- a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py +++ b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py @@ -67,7 +67,8 @@ def __init__( self.openai_api_client = AsyncOpenAI( base_url=endpoint.url, http_client=DefaultAioHttpClient( - timeout=httpx.Timeout(timeout=request_timeout_seconds, connect=5.0), + timeout=httpx.Timeout( + timeout=request_timeout_seconds, connect=5.0), ), api_key=endpoint.api_key, timeout=request_timeout_seconds, @@ -187,7 +188,9 @@ def estimated_num_performance_samples(self) -> int: """ estimation_indices = random.sample( range(self.total_num_samples), - k=min(MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES, self.total_num_samples), + k=min( + MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES, + self.total_num_samples), ) estimation_samples = [ self.formulate_loaded_sample( @@ -274,7 +277,8 @@ def _unload_samples_from_ram(query_sample_indices: list[int]) -> None: _unload_samples_from_ram, ) - async def _query_endpoint_async_batch(self, query_sample: lg.QuerySample) -> None: + async def _query_endpoint_async_batch( + self, query_sample: lg.QuerySample) -> None: """Query the endpoint through the async OpenAI API client.""" try: sample = self.loaded_samples[query_sample.index] @@ -360,7 +364,8 @@ async def _query_endpoint_async_batch(self, query_sample: lg.QuerySample) -> Non ], ) - async def _query_endpoint_async_stream(self, query_sample: lg.QuerySample) -> None: + async def _query_endpoint_async_stream( + self, query_sample: lg.QuerySample) -> None: """Query the endpoint through the async OpenAI API client.""" ttft_set = False try: From d81659ad9a6483a632eadf7746874038d14fe931 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 6 Jan 2026 18:18:32 +0000 Subject: [PATCH 39/64] [Automated Commit] Format Codebase --- recommendation/dlrm_v3/accuracy.py | 5 +- recommendation/dlrm_v3/checkpoint.py | 21 ++- recommendation/dlrm_v3/configs.py | 3 +- recommendation/dlrm_v3/data_producer.py | 14 +- recommendation/dlrm_v3/datasets/dataset.py | 9 +- .../dlrm_v3/datasets/synthetic_streaming.py | 18 ++- recommendation/dlrm_v3/datasets/utils.py | 4 +- .../dlrm_v3/generative_recommenders/common.py | 16 ++- .../modules/action_encoder.py | 6 +- .../modules/content_encoder.py | 3 +- .../contextual_interleave_preprocessor.py | 15 +- .../modules/contextualize_mlps.py | 3 +- .../modules/dlrm_hstu.py | 18 ++- .../modules/dynamic_stu.py | 3 +- .../modules/hstu_transducer.py | 6 +- .../modules/multitask_module.py | 17 ++- .../modules/postprocessors.py | 15 +- .../modules/preprocessors.py | 6 +- .../generative_recommenders/modules/stu.py | 6 +- .../ops/hstu_attention.py | 18 ++- .../ops/jagged_tensors.py | 4 +- .../ops/pytorch/pt_hstu_attention.py | 15 +- .../ops/pytorch/pt_jagged.py | 18 ++- .../ops/pytorch/pt_jagged_tensors.py | 6 +- .../ops/pytorch/pt_position.py | 3 +- .../ops/triton/triton_addmm.py | 87 ++++++++---- .../ops/triton/triton_hstu_attention.py | 100 ++++++++++---- .../ops/triton/triton_hstu_linear.py | 130 ++++++++++++++---- .../triton_hstu_preprocess_and_attention.py | 3 +- .../ops/triton/triton_jagged.py | 120 ++++++++++------ .../ops/triton/triton_jagged_tensors.py | 36 +++-- .../ops/triton/triton_layer_norm.py | 70 +++++++--- .../ops/triton/triton_position.py | 10 +- recommendation/dlrm_v3/inference_modules.py | 3 +- recommendation/dlrm_v3/main.py | 86 ++++++------ recommendation/dlrm_v3/model_family.py | 18 ++- .../dlrm_v3/streaming_synthetic_data.py | 40 ++++-- recommendation/dlrm_v3/utils.py | 15 +- 38 files changed, 664 insertions(+), 306 deletions(-) diff --git a/recommendation/dlrm_v3/accuracy.py b/recommendation/dlrm_v3/accuracy.py index 5d2d0ff11a..569f3d2476 100644 --- a/recommendation/dlrm_v3/accuracy.py +++ b/recommendation/dlrm_v3/accuracy.py @@ -67,9 +67,10 @@ def main() -> None: num_candidates = data[-1].astype(int) assert len(data) == 1 + num_candidates * 3 mt_target_preds = torch.from_numpy(data[0:num_candidates]) - mt_target_labels = torch.from_numpy(data[num_candidates : num_candidates * 2]) + mt_target_labels = torch.from_numpy( + data[num_candidates: num_candidates * 2]) mt_target_weights = torch.from_numpy( - data[num_candidates * 2 : num_candidates * 3] + data[num_candidates * 2: num_candidates * 3] ) num_candidates = torch.tensor([num_candidates]) metrics.update( diff --git a/recommendation/dlrm_v3/checkpoint.py b/recommendation/dlrm_v3/checkpoint.py index 33dbaf3c58..8d5ca17c60 100644 --- a/recommendation/dlrm_v3/checkpoint.py +++ b/recommendation/dlrm_v3/checkpoint.py @@ -46,7 +46,8 @@ class SparseState(Stateful): sparse_tensor_keys: Set of keys identifying sparse tensors in the model's state dict. """ - def __init__(self, model: torch.nn.Module, sparse_tensor_keys: Set[str]) -> None: + def __init__(self, model: torch.nn.Module, + sparse_tensor_keys: Set[str]) -> None: self.model = model self.sparse_tensor_keys = sparse_tensor_keys @@ -62,7 +63,8 @@ def state_dict(self) -> Dict[str, torch.Tensor]: return out_dict def load_state_dict(self, state_dict: Dict[str, torch.Tensor]) -> None: - incompatible_keys = self.model.load_state_dict(state_dict, strict=False) + incompatible_keys = self.model.load_state_dict( + state_dict, strict=False) assert not incompatible_keys.unexpected_keys @@ -70,9 +72,14 @@ def is_sparse_key(k: str, v: torch.Tensor) -> bool: return isinstance(v, ShardedTensor) or "embedding_collection" in k -def load_dense_state_dict(model: torch.nn.Module, state_dict: Dict[str, Any]) -> None: +def load_dense_state_dict(model: torch.nn.Module, + state_dict: Dict[str, Any]) -> None: own_state = model.state_dict() - own_state_dense_keys = {k for k, v in own_state.items() if not is_sparse_key(k, v)} + own_state_dense_keys = { + k for k, + v in own_state.items() if not is_sparse_key( + k, + v)} state_dict_dense_keys = { k for k, v in state_dict.items() if not is_sparse_key(k, v) } @@ -156,7 +163,8 @@ def save_dmp_checkpoint( sparse_dict = {"sparse_dict": SparseState(model, sparse_tensor_keys)} torch.distributed.checkpoint.save( sparse_dict, - storage_writer=torch.distributed.checkpoint.FileSystemWriter(sparse_path), + storage_writer=torch.distributed.checkpoint.FileSystemWriter( + sparse_path), ) torch.distributed.barrier() print("checkpoint successfully saved") @@ -178,7 +186,8 @@ def load_sparse_checkpoint( gc.collect() torch.distributed.checkpoint.load( sparse_dict, - storage_reader=torch.distributed.checkpoint.FileSystemReader(sparse_path), + storage_reader=torch.distributed.checkpoint.FileSystemReader( + sparse_path), ) gc.collect() print("sparse checkpoint successfully loaded") diff --git a/recommendation/dlrm_v3/configs.py b/recommendation/dlrm_v3/configs.py index 3d053b6512..4e59ed9197 100644 --- a/recommendation/dlrm_v3/configs.py +++ b/recommendation/dlrm_v3/configs.py @@ -114,7 +114,8 @@ def get_hstu_configs(dataset: str = "debug") -> DlrmHSTUConfig: return hstu_config -def get_embedding_table_config(dataset: str = "debug") -> Dict[str, EmbeddingConfig]: +def get_embedding_table_config( + dataset: str = "debug") -> Dict[str, EmbeddingConfig]: """ Create and return embedding table configurations. diff --git a/recommendation/dlrm_v3/data_producer.py b/recommendation/dlrm_v3/data_producer.py index a2b8e18e09..0caefba20c 100644 --- a/recommendation/dlrm_v3/data_producer.py +++ b/recommendation/dlrm_v3/data_producer.py @@ -90,7 +90,8 @@ def enqueue( """ with torch.profiler.record_function("data batching"): t0_batching: float = time.time() - samples: Union[Samples, List[Samples]] = self.ds.get_samples(content_ids) + samples: Union[Samples, List[Samples] + ] = self.ds.get_samples(content_ids) dt_batching: float = time.time() - t0_batching if isinstance(samples, Samples): query = QueryItem( @@ -106,7 +107,7 @@ def enqueue( for sample in samples: batch_size: int = sample.batch_size() query = QueryItem( - query_ids=query_ids[start_idx : start_idx + batch_size], + query_ids=query_ids[start_idx: start_idx + batch_size], samples=sample, start=t0, dt_queue=dt_queue, @@ -148,7 +149,9 @@ def __init__( ) self.workers: List[threading.Thread] = [] for _ in range(self.threads): - worker = threading.Thread(target=self.handle_tasks, args=(self.tasks,)) + worker = threading.Thread( + target=self.handle_tasks, args=( + self.tasks,)) worker.daemon = True self.workers.append(worker) worker.start() @@ -172,7 +175,8 @@ def handle_tasks( break query_ids, content_ids, t0, dt_queue = query_and_content_ids t0_batching: float = time.time() - samples: Union[Samples, List[Samples]] = self.ds.get_samples(content_ids) + samples: Union[Samples, List[Samples] + ] = self.ds.get_samples(content_ids) dt_batching: float = time.time() - t0_batching if isinstance(samples, Samples): qitem = QueryItem( @@ -189,7 +193,7 @@ def handle_tasks( for sample in samples: batch_size: int = sample.batch_size() qitem = QueryItem( - query_ids=query_ids[start_idx : start_idx + batch_size], + query_ids=query_ids[start_idx: start_idx + batch_size], samples=sample, start=t0, dt_queue=dt_queue, diff --git a/recommendation/dlrm_v3/datasets/dataset.py b/recommendation/dlrm_v3/datasets/dataset.py index 495c5836c1..3121e4f51b 100644 --- a/recommendation/dlrm_v3/datasets/dataset.py +++ b/recommendation/dlrm_v3/datasets/dataset.py @@ -204,11 +204,13 @@ def kjt_batch_func( bs_offset = torch.ops.fbgemm.asynchronous_complete_cumsum( torch.tensor(bs_list) ).int() - batched_offset = torch.ops.fbgemm.asynchronous_complete_cumsum(batched_length) + batched_offset = torch.ops.fbgemm.asynchronous_complete_cumsum( + batched_length) reorder_length = torch.ops.fbgemm.reorder_batched_ad_lengths( batched_length, bs_offset, bs ) - reorder_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(reorder_length) + reorder_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum( + reorder_length) reorder_indices = torch.ops.fbgemm.reorder_batched_ad_indices( batched_offset, batched_indices, reorder_offsets, bs_offset, bs ) @@ -345,7 +347,8 @@ def __init__( self.num_aggregated_samples = num_aggregated_samples self.items_in_memory = {} - def get_sample(self, id: int) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]: + def get_sample( + self, id: int) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]: """ Get a sample by ID from in-memory storage. diff --git a/recommendation/dlrm_v3/datasets/synthetic_streaming.py b/recommendation/dlrm_v3/datasets/synthetic_streaming.py index 8cddcc36d2..d1a1cc14d5 100644 --- a/recommendation/dlrm_v3/datasets/synthetic_streaming.py +++ b/recommendation/dlrm_v3/datasets/synthetic_streaming.py @@ -146,7 +146,8 @@ def load_query_samples(self, sample_list: List[int]) -> None: def unload_query_samples(self, sample_list: List[int]) -> None: self.items_in_memory = {} - def get_sample(self, id: int) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]: + def get_sample( + self, id: int) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]: return self.items_in_memory[self.ts][id] def get_sample_with_ts( @@ -192,7 +193,8 @@ def _process_line(self, line: str, user_id: int) -> pd.Series: reader = csv.reader([line]) parsed_line = next(reader) # total ts + one more eval ts + one base ts so that uih won't be zero - # for each ts, ordered as candidate_ids, candidate_ratings, uih_ids, uih_ratings + # for each ts, ordered as candidate_ids, candidate_ratings, uih_ids, + # uih_ratings assert len(parsed_line) == 4 * (self.total_ts + 2) uih_item_ids_list = [] uih_ratings_list = [] @@ -290,7 +292,8 @@ def set_ts(self, ts: int) -> None: assert len(row) == 1 requests = json_loads(row[0]) self.requests = requests - logger.warning(f"DLRMv3SyntheticStreamingDataset: ts={ts} requests loaded") + logger.warning( + f"DLRMv3SyntheticStreamingDataset: ts={ts} requests loaded") assert self.ts_to_users_cumsum[self.ts][-1] == len(self.requests) logger.warning( f"DLRMv3SyntheticStreamingDataset: ts={ts} users_cumsum={self.ts_to_users_cumsum[self.ts]}" @@ -336,7 +339,8 @@ def load_item( timestamps_uih = maybe_truncate_seq(timestamps_uih, self._max_uih_len) ids_candidates = maybe_truncate_seq(ids_candidates, max_num_candidates) num_candidates = len(ids_candidates) - ratings_candidates = maybe_truncate_seq(ratings_candidates, max_num_candidates) + ratings_candidates = maybe_truncate_seq( + ratings_candidates, max_num_candidates) action_weights_uih = [ self.action_weights[int(rating) - 1] for rating in ratings_uih ] @@ -366,7 +370,8 @@ def load_item( [ uih_seq_len for _ in range( - len(self._uih_keys) - len(self._contextual_feature_to_max_length) + len(self._uih_keys) - + len(self._contextual_feature_to_max_length) ) ] ) @@ -380,7 +385,8 @@ def load_item( values=torch.tensor(uih_kjt_values).long(), ) - candidates_kjt_lengths = num_candidates * torch.ones(len(self._candidates_keys)) + candidates_kjt_lengths = num_candidates * \ + torch.ones(len(self._candidates_keys)) item_candidate_category_ids = [ id // self.items_per_category for id in ids_candidates ] diff --git a/recommendation/dlrm_v3/datasets/utils.py b/recommendation/dlrm_v3/datasets/utils.py index c85c3cf706..134cd1a2ec 100644 --- a/recommendation/dlrm_v3/datasets/utils.py +++ b/recommendation/dlrm_v3/datasets/utils.py @@ -45,7 +45,7 @@ def json_loads( y = json.loads(x) else: y = x - y_list = [y] if type(y) == int else list(y) + y_list = [y] if isinstance(y, int) else list(y) return y_list @@ -72,7 +72,7 @@ def separate_uih_candidates( y = json.loads(x) else: y = x - y_list = [y] if type(y) == int else list(y) + y_list = [y] if isinstance(y, int) else list(y) candidates, uih = ( y_list[-candidates_max_seq_len:], y_list[:-candidates_max_seq_len], diff --git a/recommendation/dlrm_v3/generative_recommenders/common.py b/recommendation/dlrm_v3/generative_recommenders/common.py index 9ba5821d9f..3b9ca73bb8 100644 --- a/recommendation/dlrm_v3/generative_recommenders/common.py +++ b/recommendation/dlrm_v3/generative_recommenders/common.py @@ -188,7 +188,8 @@ def generate_sparse_seq_len( if sparsity == 0.0: return torch.zeros(size=(size,), device=device, dtype=torch.int) elif sparsity == 1.0: - return torch.ones(size=(size,), device=device, dtype=torch.int) * max_seq_len + return torch.ones(size=(size,), device=device, + dtype=torch.int) * max_seq_len elif sparsity >= 0.5: min_seq_len: int = int((2 * sparsity - 1.0) * max_seq_len) return torch.randint( @@ -265,10 +266,12 @@ def switch_to_contiguous_if_needed(x: torch.Tensor) -> torch.Tensor: def prev_power_of_2(x: int) -> int: if torch.compiler.is_compiling(): # Re-write to make Dynamo happy - x_tensor = torch.scalar_tensor(x, dtype=torch.int64) # type: ignore[arg-type] + x_tensor = torch.scalar_tensor( + x, dtype=torch.int64) # type: ignore[arg-type] x_tensor_orig = x_tensor.clone() out = triton.next_power_of_2(x_tensor) # type: ignore[arg-type] - return int(torch.where(torch.lt(x_tensor_orig, out), out // 2, out).item()) # type: ignore[return-value] + return int(torch.where(torch.lt(x_tensor_orig, out), out // + 2, out).item()) # type: ignore[return-value] else: out = triton.next_power_of_2(x) return out // 2 if out > x else out @@ -340,7 +343,9 @@ def _generate_fine_grained_buckets() -> List[int]: def _fine_grained_bucket_size(x: int) -> int: if torch.compiler.is_compiling(): x_tensor = torch.scalar_tensor(x, dtype=torch.int64) - buckets = torch.tensor(_generate_fine_grained_buckets(), dtype=torch.int64) + buckets = torch.tensor( + _generate_fine_grained_buckets(), + dtype=torch.int64) mask = buckets >= x_tensor valid_buckets = torch.where( @@ -361,7 +366,8 @@ def _fine_grained_bucket_size(x: int) -> int: @torch.fx.wrap -def fx_unwrap_optional_tensor(optional: Optional[torch.Tensor]) -> torch.Tensor: +def fx_unwrap_optional_tensor( + optional: Optional[torch.Tensor]) -> torch.Tensor: assert optional is not None, "Expected optional to be non-None Tensor" return optional diff --git a/recommendation/dlrm_v3/generative_recommenders/modules/action_encoder.py b/recommendation/dlrm_v3/generative_recommenders/modules/action_encoder.py index 0116b99b43..e1282e4015 100644 --- a/recommendation/dlrm_v3/generative_recommenders/modules/action_encoder.py +++ b/recommendation/dlrm_v3/generative_recommenders/modules/action_encoder.py @@ -85,7 +85,8 @@ def forward( watchtimes = seq_payloads[self._watchtime_feature_name] for threshold, weight in self._watchtime_to_action_thresholds_and_weights: seq_actions = torch.bitwise_or( - seq_actions, (watchtimes >= threshold).to(torch.int64) * weight + seq_actions, (watchtimes >= threshold).to( + torch.int64) * weight ) exploded_actions = ( torch.bitwise_and( @@ -94,7 +95,8 @@ def forward( > 0 ) action_embeddings = ( - exploded_actions.unsqueeze(-1) * self._action_embedding_table.unsqueeze(0) + exploded_actions.unsqueeze(-1) * + self._action_embedding_table.unsqueeze(0) ).view(-1, self._num_action_types * self._action_embedding_dim) total_targets: int = seq_embeddings.size(0) - action_embeddings.size(0) action_embeddings = concat_2D_jagged( diff --git a/recommendation/dlrm_v3/generative_recommenders/modules/content_encoder.py b/recommendation/dlrm_v3/generative_recommenders/modules/content_encoder.py index 75d73298a4..303e827f8a 100644 --- a/recommendation/dlrm_v3/generative_recommenders/modules/content_encoder.py +++ b/recommendation/dlrm_v3/generative_recommenders/modules/content_encoder.py @@ -79,7 +79,8 @@ def forward( if self._target_enrich_dummy_embeddings: total_seq_len: int = seq_embeddings.size(0) for name, param in self._target_enrich_dummy_embeddings.items(): - enrich_embeddings_target = seq_payloads[name].to(seq_embeddings.dtype) + enrich_embeddings_target = seq_payloads[name].to( + seq_embeddings.dtype) total_targets: int = enrich_embeddings_target.size(0) total_uih_len: int = total_seq_len - total_targets enrich_embeddings_uih = param.tile(total_uih_len, 1).to( diff --git a/recommendation/dlrm_v3/generative_recommenders/modules/contextual_interleave_preprocessor.py b/recommendation/dlrm_v3/generative_recommenders/modules/contextual_interleave_preprocessor.py index fff0d72f0d..85f8dc21a6 100644 --- a/recommendation/dlrm_v3/generative_recommenders/modules/contextual_interleave_preprocessor.py +++ b/recommendation/dlrm_v3/generative_recommenders/modules/contextual_interleave_preprocessor.py @@ -63,7 +63,8 @@ def __init__( self._contextual_feature_to_min_uih_length: Dict[str, int] = ( contextual_feature_to_min_uih_length ) - std = 1.0 * sqrt(2.0 / float(input_embedding_dim + output_embedding_dim)) + std = 1.0 * \ + sqrt(2.0 / float(input_embedding_dim + output_embedding_dim)) self._batched_contextual_linear_weights = torch.nn.Parameter( torch.empty( ( @@ -141,7 +142,8 @@ def combine_embeddings( valid_mask = torch.logical_and( indices < seq_lengths_by_2.view(-1, 1), torch.logical_or( - indices < (output_seq_lengths - num_targets).view(-1, 1), + indices < (output_seq_lengths - + num_targets).view(-1, 1), torch.remainder(indices, 2) == 0, ), ) @@ -249,7 +251,8 @@ def forward( # noqa C901 with torch.autocast( "cuda", dtype=torch.bfloat16, - enabled=(not self.is_inference and self._training_dtype == torch.bfloat16), + enabled=( + not self.is_inference and self._training_dtype == torch.bfloat16), ): # get contextual_embeddings contextual_embeddings: Optional[torch.Tensor] = None @@ -285,8 +288,10 @@ def forward( # noqa C901 ).transpose(0, 1) # content embeddings - seq_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(seq_lengths) - target_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(num_targets) + seq_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum( + seq_lengths) + target_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum( + num_targets) uih_offsets = seq_offsets - target_offsets content_embeddings = self._content_encoder( max_uih_len=max_uih_len, diff --git a/recommendation/dlrm_v3/generative_recommenders/modules/contextualize_mlps.py b/recommendation/dlrm_v3/generative_recommenders/modules/contextualize_mlps.py index 95c29f0381..1550929aa5 100644 --- a/recommendation/dlrm_v3/generative_recommenders/modules/contextualize_mlps.py +++ b/recommendation/dlrm_v3/generative_recommenders/modules/contextualize_mlps.py @@ -127,7 +127,8 @@ def forward( max_seq_len: int, contextual_embeddings: Optional[torch.Tensor], ) -> torch.Tensor: - shared_input = self._dense_features_compress(none_throws(contextual_embeddings)) + shared_input = self._dense_features_compress( + none_throws(contextual_embeddings)) attn_weights = self._attn_weights_norm( self._attn_raw_weights(shared_input).reshape( -1, self._sequential_input_dim, self._sequential_output_dim diff --git a/recommendation/dlrm_v3/generative_recommenders/modules/dlrm_hstu.py b/recommendation/dlrm_v3/generative_recommenders/modules/dlrm_hstu.py index 003abe77dd..938ea792c2 100644 --- a/recommendation/dlrm_v3/generative_recommenders/modules/dlrm_hstu.py +++ b/recommendation/dlrm_v3/generative_recommenders/modules/dlrm_hstu.py @@ -76,8 +76,10 @@ class DlrmHSTUConfig: hstu_group_norm: bool = False hstu_input_dropout_ratio: float = 0.2 hstu_linear_dropout_rate: float = 0.2 - contextual_feature_to_max_length: Dict[str, int] = field(default_factory=dict) - contextual_feature_to_min_uih_length: Dict[str, int] = field(default_factory=dict) + contextual_feature_to_max_length: Dict[str, int] = field( + default_factory=dict) + contextual_feature_to_min_uih_length: Dict[str, int] = field( + default_factory=dict) candidates_weight_feature_name: str = "" candidates_watchtime_feature_name: str = "" candidates_querytime_feature_name: str = "" @@ -108,7 +110,8 @@ def _get_supervision_labels_and_weights( supervision_weights: Dict[str, torch.Tensor] = {} for task in task_configs: if task.task_type == MultitaskTaskType.REGRESSION: - supervision_labels[task.task_name] = watchtime_sequence.to(torch.float32) + supervision_labels[task.task_name] = watchtime_sequence.to( + torch.float32) elif task.task_type == MultitaskTaskType.BINARY_CLASSIFICATION: supervision_labels[task.task_name] = ( torch.bitwise_and(supervision_bitmasks, task.task_weight) > 0 @@ -292,7 +295,8 @@ def _construct_payload( **{ x + "_offsets": contextual_offsets[i] for i, x in enumerate( - list(self._hstu_configs.contextual_feature_to_max_length.keys()) + list( + self._hstu_configs.contextual_feature_to_max_length.keys()) ) }, **{ @@ -394,7 +398,8 @@ def preprocess( dim=0, ), ) - seq_embeddings_dict = self._embedding_collection(merged_sparse_features) + seq_embeddings_dict = self._embedding_collection( + merged_sparse_features) num_candidates = fx_mark_length_features( candidates_features.lengths().view(len(candidates_features.keys()), -1) )[0] @@ -430,7 +435,8 @@ def preprocess( device=values_left.device, ) else: - values_right = candidates_features[candidate_feature_name].values() + values_right = candidates_features[candidate_feature_name].values( + ) payload_features[uih_feature_name] = values_left payload_features[candidate_feature_name] = values_right payload_features["uih_offsets"] = torch.ops.fbgemm.asynchronous_complete_cumsum( diff --git a/recommendation/dlrm_v3/generative_recommenders/modules/dynamic_stu.py b/recommendation/dlrm_v3/generative_recommenders/modules/dynamic_stu.py index e1fe8ad161..c8d5d4cdbf 100644 --- a/recommendation/dlrm_v3/generative_recommenders/modules/dynamic_stu.py +++ b/recommendation/dlrm_v3/generative_recommenders/modules/dynamic_stu.py @@ -254,7 +254,8 @@ def _preprocess( x_lengths - self._max_l2_len - num_targets - self._contextual_seq_len ) prefix_lengths = torch.clamp(prefix_lengths, min=0) - prefix_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(prefix_lengths) + prefix_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum( + prefix_lengths) l2_lengths = x_lengths - prefix_lengths l2_offsets = x_offsets - prefix_offsets self._runtime_max_l2_len: int = fx_infer_max_len(l2_lengths) diff --git a/recommendation/dlrm_v3/generative_recommenders/modules/hstu_transducer.py b/recommendation/dlrm_v3/generative_recommenders/modules/hstu_transducer.py index b4ae836ada..8e91a020ce 100644 --- a/recommendation/dlrm_v3/generative_recommenders/modules/hstu_transducer.py +++ b/recommendation/dlrm_v3/generative_recommenders/modules/hstu_transducer.py @@ -238,7 +238,8 @@ def _postprocess( ) candidate_timestamps = candidate_timestamps.squeeze(-1) if interleave_targets: - candidate_timestamps = candidate_timestamps.view(-1, 2)[:, 0] + candidate_timestamps = candidate_timestamps.view(-1, 2)[ + :, 0] candidate_embeddings = self._output_postprocessor( seq_embeddings=candidate_embeddings, seq_timestamps=candidate_timestamps, @@ -312,7 +313,8 @@ def forward( ) if not self._is_inference: - encoded_candidate_embeddings = encoded_candidate_embeddings.to(orig_dtype) + encoded_candidate_embeddings = encoded_candidate_embeddings.to( + orig_dtype) if self._return_full_embeddings: encoded_embeddings = fx_unwrap_optional_tensor(encoded_embeddings).to( orig_dtype diff --git a/recommendation/dlrm_v3/generative_recommenders/modules/multitask_module.py b/recommendation/dlrm_v3/generative_recommenders/modules/multitask_module.py index d5efe237ea..1824ebbf76 100644 --- a/recommendation/dlrm_v3/generative_recommenders/modules/multitask_module.py +++ b/recommendation/dlrm_v3/generative_recommenders/modules/multitask_module.py @@ -83,7 +83,7 @@ def _compute_pred_and_logits( mt_preds_list: List[torch.Tensor] = [] for task_type in MultitaskTaskType: logits = mt_logits[ - task_offsets[task_type] : task_offsets[task_type + 1], + task_offsets[task_type]: task_offsets[task_type + 1], :, ] if task_offsets[task_type + 1] - task_offsets[task_type] > 0: @@ -140,15 +140,15 @@ def _compute_loss( for task_type in MultitaskTaskType: if task_offsets[task_type + 1] - task_offsets[task_type] > 0: logits = mt_logits[ - task_offsets[task_type] : task_offsets[task_type + 1], + task_offsets[task_type]: task_offsets[task_type + 1], :, ] labels = mt_labels[ - task_offsets[task_type] : task_offsets[task_type + 1], + task_offsets[task_type]: task_offsets[task_type + 1], :, ] weights = mt_weights[ - task_offsets[task_type] : task_offsets[task_type + 1], + task_offsets[task_type]: task_offsets[task_type + 1], :, ] if task_type == MultitaskTaskType.REGRESSION: @@ -168,7 +168,8 @@ def _compute_loss( else: mt_losses = mt_losses_list[0] mt_losses = ( - mt_losses.sum(-1) / mt_weights.sum(-1).clamp(min=1.0) * causal_multitask_weights + mt_losses.sum(-1) / mt_weights.sum(-1).clamp(min=1.0) * + causal_multitask_weights ) return mt_losses @@ -214,13 +215,15 @@ def forward( ]: orig_dtype = encoded_user_embeddings.dtype if not self._is_inference: - encoded_user_embeddings = encoded_user_embeddings.to(self._training_dtype) + encoded_user_embeddings = encoded_user_embeddings.to( + self._training_dtype) item_embeddings = item_embeddings.to(self._training_dtype) with torch.autocast( "cuda", dtype=torch.bfloat16, - enabled=(not self.is_inference and self._training_dtype == torch.bfloat16), + enabled=( + not self.is_inference and self._training_dtype == torch.bfloat16), ): mt_preds, mt_logits = _compute_pred_and_logits( prediction_module=self._prediction_module, diff --git a/recommendation/dlrm_v3/generative_recommenders/modules/postprocessors.py b/recommendation/dlrm_v3/generative_recommenders/modules/postprocessors.py index 32fa660602..0faf64f81a 100644 --- a/recommendation/dlrm_v3/generative_recommenders/modules/postprocessors.py +++ b/recommendation/dlrm_v3/generative_recommenders/modules/postprocessors.py @@ -92,11 +92,13 @@ def forward( ) -> torch.Tensor: # pyre-fixme[6]: For 1st argument expected `dtype` but got `Union[dtype, # Tensor, Module]`. - return self._layer_norm(seq_embeddings.to(self._layer_norm.weight.dtype)) + return self._layer_norm(seq_embeddings.to( + self._layer_norm.weight.dtype)) @torch.fx.wrap -def _unsqueeze_if_needed(t: torch.Tensor, embedding: torch.Tensor) -> torch.Tensor: +def _unsqueeze_if_needed(t: torch.Tensor, + embedding: torch.Tensor) -> torch.Tensor: if embedding.dim() == 3: return t.unsqueeze(0) return t @@ -141,7 +143,8 @@ def _concat_time_features( timestamps = timestamps.unsqueeze(-1) period_units = _unsqueeze_if_needed(period_units, combined_embeddings) - units_per_period = _unsqueeze_if_needed(units_per_period, combined_embeddings) + units_per_period = _unsqueeze_if_needed( + units_per_period, combined_embeddings) _units_since_epoch = torch.div( timestamps, period_units, rounding_mode="floor" ) # [sum(N_i), num_time_features] or [B, N, num_time_features] @@ -161,7 +164,8 @@ def _concat_time_features( -2, -1 ) # [sum(N_i), num_time_features * 2] or [B, N, num_time_features * 2] _units_elapsed = _cast_dtype(_units_elapsed, _units_elapsed_type) - combined_embeddings = torch.cat([combined_embeddings, _units_elapsed], dim=-1) + combined_embeddings = torch.cat( + [combined_embeddings, _units_elapsed], dim=-1) return combined_embeddings def forward( @@ -171,6 +175,7 @@ def forward( seq_payloads: Dict[str, torch.Tensor], ) -> torch.Tensor: user_embeddings = self._time_feature_combiner( - self._concat_time_features(seq_embeddings, timestamps=seq_timestamps) + self._concat_time_features( + seq_embeddings, timestamps=seq_timestamps) ) return self._layer_norm(user_embeddings) diff --git a/recommendation/dlrm_v3/generative_recommenders/modules/preprocessors.py b/recommendation/dlrm_v3/generative_recommenders/modules/preprocessors.py index dc7806bb45..083277d91a 100644 --- a/recommendation/dlrm_v3/generative_recommenders/modules/preprocessors.py +++ b/recommendation/dlrm_v3/generative_recommenders/modules/preprocessors.py @@ -240,8 +240,10 @@ def forward( # noqa C901 + self._additional_embedding_mlp(additional_embeddings) ) max_seq_len = max_uih_len + max_targets - target_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(num_targets) - seq_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(seq_lengths) + target_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum( + num_targets) + seq_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum( + seq_lengths) uih_offsets = seq_offsets - target_offsets if self._action_weights is not None: action_embeddings = self._action_encoder( diff --git a/recommendation/dlrm_v3/generative_recommenders/modules/stu.py b/recommendation/dlrm_v3/generative_recommenders/modules/stu.py index d186000e38..5e404cd707 100644 --- a/recommendation/dlrm_v3/generative_recommenders/modules/stu.py +++ b/recommendation/dlrm_v3/generative_recommenders/modules/stu.py @@ -196,7 +196,8 @@ def __init__( self._target_aware: bool = config.target_aware self._causal: bool = config.causal self._max_attn_len: int = config.max_attn_len or 0 - self._attn_alpha: float = config.attn_alpha or 1.0 / (self._attention_dim**0.5) + self._attn_alpha: float = config.attn_alpha or 1.0 / \ + (self._attention_dim**0.5) self._use_group_norm: bool = config.use_group_norm self._recompute_normed_x: bool = config.recompute_normed_x self._recompute_uvqk: bool = config.recompute_uvqk @@ -426,7 +427,8 @@ def __init__( is_inference: bool = False, ) -> None: super().__init__(is_inference=is_inference) - self._stu_layers: torch.nn.ModuleList = torch.nn.ModuleList(modules=stu_list) + self._stu_layers: torch.nn.ModuleList = torch.nn.ModuleList( + modules=stu_list) def forward( self, diff --git a/recommendation/dlrm_v3/generative_recommenders/ops/hstu_attention.py b/recommendation/dlrm_v3/generative_recommenders/ops/hstu_attention.py index b7021bb075..01552e1cec 100644 --- a/recommendation/dlrm_v3/generative_recommenders/ops/hstu_attention.py +++ b/recommendation/dlrm_v3/generative_recommenders/ops/hstu_attention.py @@ -34,7 +34,7 @@ from hammer.ops.triton.cc.hstu_attention.triton_cc_hstu_attention import ( triton_cc_hstu_mha, ) -except: +except BaseException: from generative_recommenders.ops.triton.triton_hstu_attention import ( triton_hstu_mha as triton_cc_hstu_mha, ) @@ -75,8 +75,12 @@ def hstu_mha( torch._assert(q.is_cuda, "q must be CUDA tensor") torch._assert(k.is_cuda, "k must be CUDA tensor") torch._assert(v.is_cuda, "v must be CUDA tensor") - torch._assert(seq_offsets.is_cuda, "seq_offsets must be CUDA tensor") - torch._assert(dropout_pr < 1e-6, "dropout for triton path not implemented") + torch._assert( + seq_offsets.is_cuda, + "seq_offsets must be CUDA tensor") + torch._assert( + dropout_pr < 1e-6, + "dropout for triton path not implemented") torch._assert( min_full_attn_seq_len == 0, "min_full_attn_seq_len not implemented" ) @@ -159,9 +163,13 @@ def delta_hstu_mha( if kernel in [HammerKernel.TRITON, HammerKernel.TRITON_CC]: if not is_fx_tracing() and kernel == HammerKernel.TRITON: torch._assert(delta_q.is_cuda, "q must be CUDA tensor") - torch._assert(seq_offsets.is_cuda, "seq_offsets must be CUDA tensor") + torch._assert( + seq_offsets.is_cuda, + "seq_offsets must be CUDA tensor") if num_targets is not None: - torch._assert(num_targets.is_cuda, "num_targets must be CUDA tensor") + torch._assert( + num_targets.is_cuda, + "num_targets must be CUDA tensor") seq_offsets = seq_offsets.contiguous() delta_q = switch_to_contiguous_if_needed(delta_q) k = switch_to_contiguous_if_needed(k) diff --git a/recommendation/dlrm_v3/generative_recommenders/ops/jagged_tensors.py b/recommendation/dlrm_v3/generative_recommenders/ops/jagged_tensors.py index 0ca24daa55..c5daac7292 100644 --- a/recommendation/dlrm_v3/generative_recommenders/ops/jagged_tensors.py +++ b/recommendation/dlrm_v3/generative_recommenders/ops/jagged_tensors.py @@ -222,7 +222,9 @@ def jagged_dense_bmm_broadcast_add( _, K = jagged.shape B, _, N = dense.shape torch._assert(dense.shape[1] == K, "wrong dense shape[1]") - torch._assert(seq_offsets.shape[0] == B + 1, "wrong seq_offsets shape[0]") + torch._assert( + seq_offsets.shape[0] == B + 1, + "wrong seq_offsets shape[0]") torch._assert(bias.shape[0] == B, "wrong bias shape[0]") torch._assert(bias.shape[1] == N, "wrong bias shape[1]") if kernel == HammerKernel.TRITON: diff --git a/recommendation/dlrm_v3/generative_recommenders/ops/pytorch/pt_hstu_attention.py b/recommendation/dlrm_v3/generative_recommenders/ops/pytorch/pt_hstu_attention.py index e4e5f64f61..60b447b6c7 100644 --- a/recommendation/dlrm_v3/generative_recommenders/ops/pytorch/pt_hstu_attention.py +++ b/recommendation/dlrm_v3/generative_recommenders/ops/pytorch/pt_hstu_attention.py @@ -60,9 +60,16 @@ def _get_valid_attn_mask( row_ids = row_ids.view(1, N, N) col_ids = col_ids.view(1, N, N) row_col_dist = row_ids - col_ids - valid_attn_mask = torch.eye(N, device=device, dtype=torch.bool).view(1, N, N) + valid_attn_mask = torch.eye( + N, + device=device, + dtype=torch.bool).view( + 1, + N, + N) if not causal: - row_col_dist = torch.where(row_col_dist > 0, row_col_dist, -row_col_dist) + row_col_dist = torch.where( + row_col_dist > 0, row_col_dist, -row_col_dist) valid_attn_mask = torch.logical_or(valid_attn_mask, row_col_dist > 0) if max_attn_len > 0: if min_full_attn_seq_len > 0: @@ -184,7 +191,9 @@ def pytorch_hstu_mha( qk_attn = F.dropout(qk_attn, p=dropout_pr, training=training) attn_dense = torch.einsum("bhxd,bhdv->bhxv", qk_attn, v) # [B, H, N, V] return torch.ops.fbgemm.dense_to_jagged( - attn_dense.transpose(1, 2).flatten(2, 3), # [B, N, H, V]->[B, N, H * V] + attn_dense.transpose( + 1, 2).flatten( + 2, 3), # [B, N, H, V]->[B, N, H * V] [seq_offsets], L, )[0].view(L, H, V) diff --git a/recommendation/dlrm_v3/generative_recommenders/ops/pytorch/pt_jagged.py b/recommendation/dlrm_v3/generative_recommenders/ops/pytorch/pt_jagged.py index 67de7cbfce..034fccafdf 100644 --- a/recommendation/dlrm_v3/generative_recommenders/ops/pytorch/pt_jagged.py +++ b/recommendation/dlrm_v3/generative_recommenders/ops/pytorch/pt_jagged.py @@ -129,7 +129,8 @@ def pytorch_concat_2D_dense_jagged( ) concatted_dense = torch.cat([dense_values, jagged_dense], dim=1) concatted_offsets = ( - dense_size * _arange(B + 1, device=jagged_offsets.device) + jagged_offsets + dense_size * _arange(B + 1, + device=jagged_offsets.device) + jagged_offsets ) return torch.ops.fbgemm.dense_to_jagged( concatted_dense, @@ -148,7 +149,8 @@ def pytorch_concat_2D_jagged_jagged( is_replace: bool = False, n_prefix_from_right: int = 0, ) -> torch.Tensor: - # is_replace with n_prefix_from_right != 0 is not supported yet (neither in triton) + # is_replace with n_prefix_from_right != 0 is not supported yet (neither + # in triton) if is_replace: return pytorch_replace_last_n_with_jagged( max_seq_len_left, @@ -179,7 +181,11 @@ def pytorch_concat_2D_jagged_jagged( dense_b, [n_prefix_from_right, max_seq_len_right - n_prefix_from_right], dim=1 ) dense = torch.cat([dense_b_prefix, dense_a, dense_b_suffix], dim=1) - mask = _arange(max_seq_len, device=offsets_left.device).expand(B, max_seq_len) + mask = _arange( + max_seq_len, + device=offsets_left.device).expand( + B, + max_seq_len) mask = torch.logical_or( mask < lengths_a.view(B, 1) + n_prefix_from_right, torch.logical_and( @@ -198,8 +204,10 @@ def pytorch_jagged_remove_first_or_last_1D( ) -> Tuple[torch.Tensor, torch.Tensor]: values = values.view(-1, 1) shrunk_lengths = lengths - 1 - k_lengths = torch.stack([shrunk_lengths, torch.ones_like(lengths)], dim=1).view(-1) - q_lengths = torch.stack([torch.ones_like(lengths), shrunk_lengths], dim=1).view(-1) + k_lengths = torch.stack( + [shrunk_lengths, torch.ones_like(lengths)], dim=1).view(-1) + q_lengths = torch.stack( + [torch.ones_like(lengths), shrunk_lengths], dim=1).view(-1) all_indices = torch.arange( start=0, end=q_lengths.numel(), device=values.device ).reshape(-1, 2) diff --git a/recommendation/dlrm_v3/generative_recommenders/ops/pytorch/pt_jagged_tensors.py b/recommendation/dlrm_v3/generative_recommenders/ops/pytorch/pt_jagged_tensors.py index 27817f7fbd..0468115ef5 100644 --- a/recommendation/dlrm_v3/generative_recommenders/ops/pytorch/pt_jagged_tensors.py +++ b/recommendation/dlrm_v3/generative_recommenders/ops/pytorch/pt_jagged_tensors.py @@ -137,7 +137,8 @@ def _split_2D_jagged_jagged( mask >= lengths_left.view(-1, 1), mask < (lengths_left + lengths_right).view(-1, 1), ) - return padded_values[mask_left.view(-1), :], padded_values[mask_right.view(-1), :] + return padded_values[mask_left.view(-1), + :], padded_values[mask_right.view(-1), :] @torch.fx.wrap @@ -233,7 +234,8 @@ def pytorch_hstu_concat_l2_embeddings( ], dim=1, ) - mask = fx_arange(max_prefix_len + max_l2_len, device=prefix_x.device).view(1, -1) + mask = fx_arange(max_prefix_len + max_l2_len, + device=prefix_x.device).view(1, -1) prefix_lengths = prefix_offsets[1:] - prefix_offsets[:-1] l2_lengths = l2_offsets[1:] - l2_offsets[:-1] mask = torch.logical_or( diff --git a/recommendation/dlrm_v3/generative_recommenders/ops/pytorch/pt_position.py b/recommendation/dlrm_v3/generative_recommenders/ops/pytorch/pt_position.py index dbe0c7efe9..5eefd9d30d 100644 --- a/recommendation/dlrm_v3/generative_recommenders/ops/pytorch/pt_position.py +++ b/recommendation/dlrm_v3/generative_recommenders/ops/pytorch/pt_position.py @@ -51,7 +51,8 @@ def _get_col_indices( ) if num_targets is not None: if interleave_targets: - high_inds = seq_lengths - fx_unwrap_optional_tensor(num_targets) * 2 + high_inds = seq_lengths - \ + fx_unwrap_optional_tensor(num_targets) * 2 else: high_inds = seq_lengths - fx_unwrap_optional_tensor(num_targets) col_indices = torch.clamp(col_indices, max=high_inds.view(-1, 1)) diff --git a/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_addmm.py b/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_addmm.py index 2231387fb6..56b6aac9f1 100644 --- a/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_addmm.py +++ b/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_addmm.py @@ -326,9 +326,11 @@ def _addmm_fwd( mask_m = (pid_m * BLOCK_M + offs_m)[:, None] < M mask_n = (pid_n * BLOCK_N + offs_n)[None, :] < N x_ptr += pid_m.to(tl.int64) * BLOCK_M * stride_xm - x_ptrs = x_ptr + (offs_m[:, None] * stride_xm + offs_k[None, :] * stride_xk) + x_ptrs = x_ptr + (offs_m[:, None] * stride_xm + + offs_k[None, :] * stride_xk) w_ptr += pid_n.to(tl.int64) * BLOCK_N * stride_wn - w_ptrs = w_ptr + (offs_k[:, None] * stride_wk + offs_n[None, :] * stride_wn) + w_ptrs = w_ptr + (offs_k[:, None] * stride_wk + + offs_n[None, :] * stride_wn) accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) for k in range(0, tl.cdiv(K, BLOCK_K)): mask_k = offs_k[None, :] < K - k * BLOCK_K @@ -348,7 +350,8 @@ def _addmm_fwd( else: y_ptr += pid_m.to(tl.int64) * BLOCK_M * stride_ym y_ptr += pid_n.to(tl.int64) * BLOCK_N * stride_yn - y_ptrs = y_ptr + stride_ym * offs_m[:, None] + stride_yn * offs_n[None, :] + y_ptrs = y_ptr + stride_ym * \ + offs_m[:, None] + stride_yn * offs_n[None, :] y = tl.load(y_ptrs, mask=z_mask) z = (accumulator + y.to(tl.float32)).to(z_ptr.dtype.element_ty) z_ptr += pid_m.to(tl.int64) * BLOCK_M * stride_zm @@ -454,8 +457,10 @@ def _addmm_fwd_tma_ws( BROADCAST_Y: tl.constexpr, NUM_SMEM_BUFFERS: tl.constexpr, ): - x_buffers = tlx.local_alloc((BLOCK_M, BLOCK_K), x_desc.dtype, NUM_SMEM_BUFFERS) - w_buffers = tlx.local_alloc((BLOCK_K, BLOCK_N), w_desc.dtype, NUM_SMEM_BUFFERS) + x_buffers = tlx.local_alloc( + (BLOCK_M, BLOCK_K), x_desc.dtype, NUM_SMEM_BUFFERS) + w_buffers = tlx.local_alloc( + (BLOCK_K, BLOCK_N), w_desc.dtype, NUM_SMEM_BUFFERS) acc_tmem_buffer = tlx.local_alloc( (BLOCK_M, BLOCK_N), tl.float32, tl.constexpr(1), tlx.storage_kind.tmem ) @@ -463,11 +468,15 @@ def _addmm_fwd_tma_ws( if BROADCAST_Y: y_buffer = tlx.local_alloc((1, BLOCK_N), y_desc.dtype, tl.constexpr(1)) else: - y_buffer = tlx.local_alloc((BLOCK_M, BLOCK_N), y_desc.dtype, tl.constexpr(1)) - z_buffer = tlx.local_alloc((BLOCK_M, BLOCK_N), z_desc.dtype, tl.constexpr(1)) - - smem_full_bars = tlx.alloc_barriers(num_barriers=NUM_SMEM_BUFFERS, arrive_count=1) - smem_empty_bars = tlx.alloc_barriers(num_barriers=NUM_SMEM_BUFFERS, arrive_count=1) + y_buffer = tlx.local_alloc( + (BLOCK_M, BLOCK_N), y_desc.dtype, tl.constexpr(1)) + z_buffer = tlx.local_alloc( + (BLOCK_M, BLOCK_N), z_desc.dtype, tl.constexpr(1)) + + smem_full_bars = tlx.alloc_barriers( + num_barriers=NUM_SMEM_BUFFERS, arrive_count=1) + smem_empty_bars = tlx.alloc_barriers( + num_barriers=NUM_SMEM_BUFFERS, arrive_count=1) y_load_barrier = tlx.alloc_barriers(num_barriers=1, arrive_count=1) with tlx.async_tasks(): @@ -502,10 +511,12 @@ def _addmm_fwd_tma_ws( 2 * (BLOCK_M * BLOCK_K + BLOCK_K * BLOCK_N), ) tlx.async_descriptor_load( - x_desc, x_buffers[buf], [offs_xm, offs_k], smem_full_bars[buf] + x_desc, x_buffers[buf], [ + offs_xm, offs_k], smem_full_bars[buf] ) tlx.async_descriptor_load( - w_desc, w_buffers[buf], [offs_k, offs_wn], smem_full_bars[buf] + w_desc, w_buffers[buf], [ + offs_k, offs_wn], smem_full_bars[buf] ) load_phase = load_phase ^ (buf == NUM_SMEM_BUFFERS - 1) @@ -532,7 +543,9 @@ def _addmm_fwd_tma_ws( y_load_bar = tlx.local_view(y_load_barrier, 0) if BROADCAST_Y: tlx.barrier_expect_bytes(y_load_bar, 1 * BLOCK_N * 2) - tlx.async_descriptor_load(y_desc, y_buf_view, [0, offs_wn], y_load_bar) + tlx.async_descriptor_load( + y_desc, y_buf_view, [ + 0, offs_wn], y_load_bar) else: tlx.barrier_expect_bytes(y_load_bar, BLOCK_M * BLOCK_N * 2) tlx.async_descriptor_load( @@ -595,18 +608,24 @@ def _addmm_fwd_tma_ws_persistent( NUM_SMS: tl.constexpr, ): # Allocate buffers once for all tiles - x_buffers = tlx.local_alloc((BLOCK_M, BLOCK_K), x_desc.dtype, NUM_SMEM_BUFFERS) - w_buffers = tlx.local_alloc((BLOCK_K, BLOCK_N), w_desc.dtype, NUM_SMEM_BUFFERS) + x_buffers = tlx.local_alloc( + (BLOCK_M, BLOCK_K), x_desc.dtype, NUM_SMEM_BUFFERS) + w_buffers = tlx.local_alloc( + (BLOCK_K, BLOCK_N), w_desc.dtype, NUM_SMEM_BUFFERS) tmem_buffers = tlx.local_alloc( (BLOCK_M, BLOCK_N), tl.float32, NUM_TMEM_BUFFERS, tlx.storage_kind.tmem ) # Barriers for producer <-> MMA - smem_full_bars = tlx.alloc_barriers(num_barriers=NUM_SMEM_BUFFERS, arrive_count=1) - smem_empty_bars = tlx.alloc_barriers(num_barriers=NUM_SMEM_BUFFERS, arrive_count=1) + smem_full_bars = tlx.alloc_barriers( + num_barriers=NUM_SMEM_BUFFERS, arrive_count=1) + smem_empty_bars = tlx.alloc_barriers( + num_barriers=NUM_SMEM_BUFFERS, arrive_count=1) # Barriers for MMA <-> Epilogue - tmem_full_bars = tlx.alloc_barriers(num_barriers=NUM_TMEM_BUFFERS, arrive_count=1) - tmem_empty_bars = tlx.alloc_barriers(num_barriers=NUM_TMEM_BUFFERS, arrive_count=1) + tmem_full_bars = tlx.alloc_barriers( + num_barriers=NUM_TMEM_BUFFERS, arrive_count=1) + tmem_empty_bars = tlx.alloc_barriers( + num_barriers=NUM_TMEM_BUFFERS, arrive_count=1) with tlx.async_tasks(): # Epilogue consumer: loads Y, adds bias, stores Z @@ -672,7 +691,9 @@ def _addmm_fwd_tma_ws_persistent( ) # Wait for epilogue to finish with this TMEM buffer - tlx.barrier_wait(tmem_empty_bars[cur_tmem_buf], tmem_write_phase) + tlx.barrier_wait( + tmem_empty_bars[cur_tmem_buf], + tmem_write_phase) tmem_write_phase = tmem_write_phase ^ ( cur_tmem_buf == int(NUM_TMEM_BUFFERS) - 1 ) @@ -694,8 +715,10 @@ def _addmm_fwd_tma_ws_persistent( dot_phase = dot_phase ^ (buf == int(NUM_SMEM_BUFFERS) - 1) # Wait for last MMA to complete - last_buf = (processed_k_iters + k_tiles - 1) % int(NUM_SMEM_BUFFERS) - last_dot_phase = dot_phase ^ (last_buf == int(NUM_SMEM_BUFFERS) - 1) + last_buf = (processed_k_iters + k_tiles - + 1) % int(NUM_SMEM_BUFFERS) + last_dot_phase = dot_phase ^ ( + last_buf == int(NUM_SMEM_BUFFERS) - 1) tlx.barrier_wait(smem_empty_bars[last_buf], last_dot_phase) # Signal epilogue that result is ready @@ -735,13 +758,16 @@ def _addmm_fwd_tma_ws_persistent( 2 * (BLOCK_M + BLOCK_N) * BLOCK_K, ) tlx.async_descriptor_load( - x_desc, x_buffers[buf], [offs_xm, offs_k], smem_full_bars[buf] + x_desc, x_buffers[buf], [ + offs_xm, offs_k], smem_full_bars[buf] ) tlx.async_descriptor_load( - w_desc, w_buffers[buf], [offs_k, offs_wn], smem_full_bars[buf] + w_desc, w_buffers[buf], [ + offs_k, offs_wn], smem_full_bars[buf] ) - load_phase = load_phase ^ (buf == int(NUM_SMEM_BUFFERS) - 1) + load_phase = load_phase ^ ( + buf == int(NUM_SMEM_BUFFERS) - 1) processed_k_iters += k_tiles @@ -763,7 +789,8 @@ def triton_addmm_fwd_tma_persistent( if M == 0 or N == 0: return z - # A dummy block value that will be overwritten when we have the real block size + # A dummy block value that will be overwritten when we have the real block + # size dummy_block = [1, 1] # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional # argument, expected `List[int]` but got `Size` @@ -823,7 +850,8 @@ def triton_addmm_fwd_tma_ws_tlx( if M == 0 or N == 0: return z - # A dummy block value that will be overwritten when we have the real block size + # A dummy block value that will be overwritten when we have the real block + # size dummy_block = [1, 1] # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional # argument, expected `List[int]` but got `Size` @@ -944,7 +972,7 @@ def triton_addmm_fwd( if M == 0 or N == 0: return z - grid = lambda meta: ( # noqa E731 + def grid(meta): return ( # noqa E731 triton.cdiv(M, meta["BLOCK_M"]), triton.cdiv(N, meta["BLOCK_N"]), ) @@ -1015,7 +1043,8 @@ def forward( if is_sm100() and TMA_AVAILABLE and _check_tma_alignment(x, w, y): if x.dtype == torch.float32 or HAS_TLX == False: # use TMA persistent kernel on sm100 - return triton_addmm_fwd_tma_persistent(x, w, y, warp_specialize=True) + return triton_addmm_fwd_tma_persistent( + x, w, y, warp_specialize=True) else: return triton_addmm_fwd_tma_ws_persistent_tlx( x, w, y diff --git a/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_hstu_attention.py b/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_hstu_attention.py index 36080561fc..946a6350f6 100644 --- a/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_hstu_attention.py +++ b/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_hstu_attention.py @@ -276,7 +276,8 @@ def _get_fw_configs() -> List[triton.Config]: # noqa: C901 ), ] - # Add 'USE_TLX' : False, 'NUM_BUFFERS': 1, 'NUM_MMA_WARPS_PER_GROUP': 1, 'NUM_MMA_GROUPS': 1 to non-TLX configs + # Add 'USE_TLX' : False, 'NUM_BUFFERS': 1, 'NUM_MMA_WARPS_PER_GROUP': + # 1, 'NUM_MMA_GROUPS': 1 to non-TLX configs for config in configs: if not config.kwargs.get("USE_TLX", False): config.kwargs["USE_TLX"] = False @@ -486,7 +487,9 @@ def _hstu_attn_fwd_compute( # noqa C901 block_shape=(BLOCK_M, BLOCK_D_Q), order=(1, 0), ) - q = tl.load(Q_block_ptr, boundary_check=(0,), padding_option="zero") + q = tl.load( + Q_block_ptr, boundary_check=( + 0,), padding_option="zero") K_block_ptr = tl.make_block_ptr( base=K + off_h * stride_kh + seq_start * stride_kn, @@ -633,7 +636,8 @@ def _hstu_attn_fwd_compute( # noqa C901 offs_m_delta = start_m_delta + tl.arange(0, BLOCK_M) offs_v_d = tl.arange(0, BLOCK_D_V) off_o = Out + off_z * DeltaSize * stride_om + off_h * stride_oh - out_ptrs = off_o + offs_m_delta[:, None] * stride_om + offs_v_d[None, :] + out_ptrs = off_o + offs_m_delta[:, + None] * stride_om + offs_v_d[None, :] tl.store(out_ptrs, acc, mask=(offs_m_delta < DeltaSize)[:, None]) else: # rematerialize offsets to save registers @@ -805,18 +809,21 @@ def _hstu_attn_fwd_compute_main_loop_tlx_pipelined( # noqa C901 # Pingpong if cid == 0: - # Consumer 0 waits for Consumer 1 to reach synchronization point at barrier 9. + # Consumer 0 waits for Consumer 1 to reach synchronization point at + # barrier 9. tlx.named_barrier_wait(9, 256) else: # Consumer 1 signals its arrival at barrier 9. tlx.named_barrier_arrive(9, 256) - # Then waits at barrier 10 until Consumer 0 finishes issuing its async_dot. + # Then waits at barrier 10 until Consumer 0 finishes issuing its + # async_dot. tlx.named_barrier_wait(10, 256) qk = tlx.async_dot(q_tile, k_tile) if cid == 0: - # After issuing async_dot, Consumer 0 signals barrier 10 to unblock Consumer 1. + # After issuing async_dot, Consumer 0 signals barrier 10 to unblock + # Consumer 1. tlx.named_barrier_arrive(10, 256) # wait for the MMA using to complete @@ -1157,7 +1164,8 @@ def _hstu_attn_fwd_load_Q_K_V( BLOCK_N, ) - for cid in tl.range(1, NUM_MMA_GROUPS, loop_unroll_factor=NUM_MMA_GROUPS - 1): + for cid in tl.range(1, NUM_MMA_GROUPS, + loop_unroll_factor=NUM_MMA_GROUPS - 1): _hstu_attn_fwd_load_Q( Q, q_tiles, @@ -1233,7 +1241,8 @@ def _hstu_attn_fwd_load_Q_K_V( if uih_end < start_m: low_delta = start_m high_delta = start_m + BLOCK_M - for start_delta in tl.range(low_delta, high_delta, BLOCK_N, num_stages=0): + for start_delta in tl.range( + low_delta, high_delta, BLOCK_N, num_stages=0): # pyre-ignore[58] buf_id = loop_trip_cnt % NUM_BUFFERS # buffers in a row share the same phase @@ -1333,8 +1342,10 @@ def _hstu_attn_fwd_compute_tlx( # noqa C901 q_tiles = tlx.local_alloc( (BLOCK_M_SPLIT, BLOCK_D_Q), tlx.dtype_of(Q), NUM_MMA_GROUPS ) - k_tiles = tlx.local_alloc((BLOCK_N, BLOCK_D_Q), tlx.dtype_of(K), NUM_BUFFERS) - v_tiles = tlx.local_alloc((BLOCK_N, BLOCK_D_V), tlx.dtype_of(V), NUM_BUFFERS) + k_tiles = tlx.local_alloc( + (BLOCK_N, BLOCK_D_Q), tlx.dtype_of(K), NUM_BUFFERS) + v_tiles = tlx.local_alloc( + (BLOCK_N, BLOCK_D_V), tlx.dtype_of(V), NUM_BUFFERS) # allocate barriers q_fulls = tlx.alloc_barriers(num_barriers=NUM_MMA_GROUPS, arrive_count=1) @@ -1393,7 +1404,8 @@ def _hstu_attn_fwd_compute_tlx( # noqa C901 cid = tlx.async_task_replica_id() acc = tl.zeros([BLOCK_M_SPLIT, BLOCK_D_V], dtype=tl.float32) # initialize offsets - offs_m = start_m + tl.arange(0, BLOCK_M_SPLIT) + cid * BLOCK_M_SPLIT + offs_m = start_m + tl.arange(0, + BLOCK_M_SPLIT) + cid * BLOCK_M_SPLIT offs_n = tl.arange(0, BLOCK_N) low, high, uih_end = _hstu_attn_fwd_caculate_range( @@ -1485,15 +1497,20 @@ def _hstu_attn_fwd_compute_tlx( # noqa C901 offs_m_delta = start_m_delta + tl.arange(0, BLOCK_M_SPLIT) offs_v_d = tl.arange(0, BLOCK_D_V) off_o = Out + off_z * DeltaSize * stride_om + off_h * stride_oh - out_ptrs = off_o + offs_m_delta[:, None] * stride_om + offs_v_d[None, :] - tl.store(out_ptrs, acc, mask=(offs_m_delta < DeltaSize)[:, None]) + out_ptrs = off_o + \ + offs_m_delta[:, None] * stride_om + offs_v_d[None, :] + tl.store( + out_ptrs, acc, mask=( + offs_m_delta < DeltaSize)[ + :, None]) else: # rematerialize offsets to save registers start_m = pid * BLOCK_M + cid * BLOCK_M_SPLIT offs_m = start_m + tl.arange(0, BLOCK_M_SPLIT) offs_v_d = tl.arange(0, BLOCK_D_V) off_o = Out + seq_start * stride_om + off_h * stride_oh - out_ptrs = off_o + offs_m[:, None] * stride_om + offs_v_d[None, :] + out_ptrs = off_o + offs_m[:, None] * \ + stride_om + offs_v_d[None, :] tl.store(out_ptrs, acc, mask=(offs_m < seq_len)[:, None]) @@ -1851,12 +1868,14 @@ def _hstu_attn_bwd_one_block( # noqa C901 # compute dk and dq dqk_trans = tl.dot(v, tl.trans(do), allow_tf32=ALLOW_TF32) dqk_trans = ( - dqk_trans * sig_trans * (1 + qk_trans * (1 - sig_trans)) * (1.0 / MAX_SEQ_LEN) + dqk_trans * sig_trans * + (1 + qk_trans * (1 - sig_trans)) * (1.0 / MAX_SEQ_LEN) ) dqk_trans = tl.where(invalid_mask_trans, dqk_trans, 0) dqk_trans = dqk_trans.to(k.dtype) - # Note: the factor `alpha` is delayed until the end of the function to reduce the cost + # Note: the factor `alpha` is delayed until the end of the function to + # reduce the cost dk += tl.dot(dqk_trans, tl.trans(q_trans), allow_tf32=ALLOW_TF32) acc_dq( dq_ptrs_trans=dq_ptrs_trans, @@ -2080,8 +2099,10 @@ def _hstu_attn_bwd_one_col_block( # noqa C901 else: dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_v_d[None, :]) dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_qk_d[None, :]) - tl.store(dv_ptrs, dv.to(k.dtype), mask=mask_n[:, None]) # pyre-ignore[61] - tl.store(dk_ptrs, dk.to(k.dtype), mask=mask_n[:, None]) # pyre-ignore[61] + tl.store(dv_ptrs, dv.to(k.dtype), + mask=mask_n[:, None]) # pyre-ignore[61] + tl.store(dk_ptrs, dk.to(k.dtype), + mask=mask_n[:, None]) # pyre-ignore[61] def _bwd_pre_hook(nargs): @@ -2242,43 +2263,56 @@ def _get_bw_configs() -> List[triton.Config]: if torch.cuda.is_available() and torch.version.cuda < "12.8": configs += [ triton.Config( - {"BLOCK_M": 16, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False, "UNROLL": 1}, + {"BLOCK_M": 16, + "BLOCK_N": 64, + "SEQUENCE_PARALLEL": False, + "UNROLL": 1}, num_stages=1, num_warps=4, pre_hook=_bwd_pre_hook, ), triton.Config( - {"BLOCK_M": 32, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False, "UNROLL": 1}, + {"BLOCK_M": 32, + "BLOCK_N": 64, + "SEQUENCE_PARALLEL": False, + "UNROLL": 1}, num_stages=1, num_warps=4, pre_hook=_bwd_pre_hook, ), triton.Config( - {"BLOCK_M": 32, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False, "UNROLL": 1}, + {"BLOCK_M": 32, + "BLOCK_N": 64, + "SEQUENCE_PARALLEL": False, + "UNROLL": 1}, num_stages=1, num_warps=8, pre_hook=_bwd_pre_hook, ), triton.Config( - {"BLOCK_M": 32, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True, "UNROLL": 1}, + {"BLOCK_M": 32, "BLOCK_N": 64, + "SEQUENCE_PARALLEL": True, "UNROLL": 1}, num_stages=1, num_warps=8, pre_hook=_bwd_pre_hook, ), triton.Config( - {"BLOCK_M": 32, "BLOCK_N": 128, "SEQUENCE_PARALLEL": True, "UNROLL": 1}, + {"BLOCK_M": 32, "BLOCK_N": 128, + "SEQUENCE_PARALLEL": True, "UNROLL": 1}, num_stages=3, num_warps=8, pre_hook=_bwd_pre_hook, ), triton.Config( - {"BLOCK_M": 32, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True, "UNROLL": 1}, + {"BLOCK_M": 32, "BLOCK_N": 64, + "SEQUENCE_PARALLEL": True, "UNROLL": 1}, num_stages=1, num_warps=4, pre_hook=_bwd_pre_hook, ), triton.Config( - {"BLOCK_M": 32, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True, "UNROLL": 1}, + {"BLOCK_M": 32, "BLOCK_N": 64, + "SEQUENCE_PARALLEL": True, "UNROLL": 1}, num_stages=2, num_warps=4, pre_hook=_bwd_pre_hook, @@ -2616,7 +2650,8 @@ def alloc_fn(size: int, align: int, stream: Optional[int]): # pyre-ignore [6] triton.set_allocator(alloc_fn) - grid = lambda meta: ( # noqa E731 + + def grid(meta): return ( # noqa E731 triton.cdiv(N, meta["BLOCK_M"]), Z * H, ) @@ -2689,7 +2724,8 @@ def triton_hstu_attention_bwd( Z = seq_offsets.numel() - 1 _, H, DimQ = q.shape _, _, DimV = v.shape - grid = lambda meta: ( # noqa E731 + + def grid(meta): return ( # noqa E731 Z * H, (triton.cdiv(N, meta["BLOCK_N"]) if meta["SEQUENCE_PARALLEL"] else 1), ) @@ -2928,7 +2964,12 @@ def triton_cached_hstu_mha( DELTA_L, H, DimQ = delta_q.shape DeltaSize = DELTA_L // Z L, _, DimV = v.shape - out = torch.empty((DELTA_L, H, DimV), dtype=delta_q.dtype, device=delta_q.device) + out = torch.empty( + (DELTA_L, + H, + DimV), + dtype=delta_q.dtype, + device=delta_q.device) TMA_DESC_SIZE = 128 desc_q = delta_q @@ -2962,7 +3003,8 @@ def alloc_fn(size: int, align: int, stream: Optional[int]): # pyre-ignore [6] triton.set_allocator(alloc_fn) - grid = lambda meta: ( # noqa E731 + + def grid(meta): return ( # noqa E731 triton.cdiv(DeltaSize, meta["BLOCK_M"]), Z * H, ) diff --git a/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_hstu_linear.py b/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_hstu_linear.py index 8b0c288696..c155f3b7b9 100644 --- a/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_hstu_linear.py +++ b/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_hstu_linear.py @@ -15,6 +15,8 @@ #!/usr/bin/env python3 +from triton.language.extra import libdevice +from generative_recommenders.ops.utils import is_sm100 from typing import List, Optional, Tuple import torch @@ -46,11 +48,7 @@ def _get_layer_norm_mul_dropout_fwd_multirow_configs() -> List[triton.Config]: return configs -from generative_recommenders.ops.utils import is_sm100 - # @manual=//triton:triton -from triton.language.extra import libdevice - try: # @manual=//triton:triton from triton.language.extra.libdevice import fast_dividef @@ -100,7 +98,8 @@ def _generate_random_mask( STRIDE: tl.constexpr, BLOCK_D: tl.constexpr, ): - # NOTE: This function appears to be incomplete/unused - kept for compatibility + # NOTE: This function appears to be incomplete/unused - kept for + # compatibility pid = tl.program_id(0) cols = tl.arange(0, BLOCK_D) col_mask = cols < D @@ -222,8 +221,10 @@ def _ln_mul_dropout_fwd_rng( col_offsets = tl.arange(0, BLOCK_D) # Load precomputed random masks for u, x, y - u_offsets = row_offsets[:, None] * stride_mask + col_offsets[None, :] - x_offsets = (row_offsets[:, None] + N) * stride_mask + col_offsets[None, :] + u_offsets = row_offsets[:, None] * \ + stride_mask + col_offsets[None, :] + x_offsets = (row_offsets[:, None] + N) * \ + stride_mask + col_offsets[None, :] y_offsets = (row_offsets[:, None] + 2 * N) * stride_mask + col_offsets[ None, : ] @@ -242,7 +243,8 @@ def _ln_mul_dropout_fwd_rng( col_offsets = tl.arange(0, BLOCK_D) # Load precomputed random mask for y - y_offsets = row_offsets[:, None] * stride_mask + col_offsets[None, :] + y_offsets = row_offsets[:, None] * \ + stride_mask + col_offsets[None, :] mask = (row_offsets[:, None] < N) & (col_offsets[None, :] < D) y_keep = tl.load(RANDOM_MASK + y_offsets, mask=mask, other=True) @@ -276,9 +278,18 @@ def _ln_mul_dropout_fwd_rng( order=(1, 0), ) - tl.store(Y_block_ptr_u, u_block.to(Y.dtype.element_ty), boundary_check=(0, 1)) - tl.store(Y_block_ptr_x, x_block.to(Y.dtype.element_ty), boundary_check=(0, 1)) - tl.store(Y_block_ptr_y, y.to(Y.dtype.element_ty), boundary_check=(0, 1)) + tl.store( + Y_block_ptr_u, u_block.to( + Y.dtype.element_ty), boundary_check=( + 0, 1)) + tl.store( + Y_block_ptr_x, x_block.to( + Y.dtype.element_ty), boundary_check=( + 0, 1)) + tl.store( + Y_block_ptr_y, y.to( + Y.dtype.element_ty), boundary_check=( + 0, 1)) else: Y_block_ptr = tl.make_block_ptr( base=Y, @@ -470,7 +481,8 @@ def _ln_mul_dropout_bwd_dx_du_rng( dx = tl.where(dx_keep, dx / (1.0 - dropout_ratio), 0.0) dy = tl.where(dy_keep, dy / (1.0 - dropout_ratio), 0.0) else: - # Load dropout mask directly instead of generating random numbers + # Load dropout mask directly instead of generating random + # numbers dy_keep = tl.load(RANDOM_MASK + cols, mask=mask, other=True) dy = tl.where(dy_keep, dy / (1.0 - dropout_ratio), 0.0) @@ -620,7 +632,8 @@ def _ln_mul_dropout_bwd_dx_du( if CONCAT_UX: # apply dropout on du if FAST_DROPOUT: - random_du, random_dx, random_dy = rand3x(seed, random_offsets) + random_du, random_dx, random_dy = rand3x( + seed, random_offsets) else: random_du = tl.rand(seed, random_offsets) du_keep = random_du > dropout_ratio @@ -754,8 +767,16 @@ def _ln_mul_dropout_bwd_dwdb( sum_dw = tl.sum(dw, axis=0) sum_db = tl.sum(db, axis=0) - tl.store(FINAL_DW + cols, sum_dw.to(FINAL_DW.dtype.element_ty), mask=cols < D) - tl.store(FINAL_DB + cols, sum_db.to(FINAL_DB.dtype.element_ty), mask=cols < D) + tl.store( + FINAL_DW + cols, + sum_dw.to( + FINAL_DW.dtype.element_ty), + mask=cols < D) + tl.store( + FINAL_DB + cols, + sum_db.to( + FINAL_DB.dtype.element_ty), + mask=cols < D) def triton_layer_norm_mul_dropout_fwd( @@ -792,10 +813,12 @@ def triton_layer_norm_mul_dropout_fwd( MAX_FUSED_SIZE = 65536 // x.element_size() BLOCK_D: int = min(MAX_FUSED_SIZE, triton.next_power_of_2(D)) if D > BLOCK_D: - raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.") + raise RuntimeError( + "This layer norm doesn't support feature dim >= 64KB.") if seed is None: - seed = torch.randint(low=0, high=2**62, size=(1,), dtype=torch.int64).item() + seed = torch.randint(low=0, high=2**62, size=(1,), + dtype=torch.int64).item() num_warps: int = min(max(BLOCK_D // 256, 1), 8) sms = torch.cuda.get_device_properties("cuda").multi_processor_count # Benchmark shows separating RNG from ln_mul_dropout kernel only benefits on @@ -1174,7 +1197,11 @@ def _group_norm_mul_dropout_fwd( if CONCAT_UX: tl.store(Y + offsets, u.to(Y.dtype.element_ty), mask=mask) tl.store(Y + Heads * D + offsets, x.to(Y.dtype.element_ty), mask=mask) - tl.store(Y + 2 * Heads * D + offsets, y.to(Y.dtype.element_ty), mask=mask) + tl.store( + Y + 2 * Heads * D + offsets, + y.to( + Y.dtype.element_ty), + mask=mask) else: tl.store(Y + offsets, y.to(Y.dtype.element_ty), mask=mask) @@ -1229,8 +1256,23 @@ def _group_norm_mul_dropout_bwd_dx_du( x = tl.load(X + offsets, mask=mask, other=0).to(tl.float32) if CONCAT_UX: du = tl.load(DY + offsets, mask=mask, other=0).to(tl.float32) - dx = tl.load(DY + Heads * D + offsets, mask=mask, other=0).to(tl.float32) - dy = tl.load(DY + 2 * Heads * D + offsets, mask=mask, other=0).to(tl.float32) + dx = tl.load( + DY + + Heads * + D + + offsets, + mask=mask, + other=0).to( + tl.float32) + dy = tl.load( + DY + + 2 * + Heads * + D + + offsets, + mask=mask, + other=0).to( + tl.float32) else: du = tl.zeros([BLOCK_H, BLOCK_D], dtype=tl.float32) dx = tl.zeros([BLOCK_H, BLOCK_D], dtype=tl.float32) @@ -1303,8 +1345,16 @@ def _group_norm_mul_dropout_bwd_dx_du( ) if CONCAT_UX: tl.store(Y + offsets, u.to(Y.dtype.element_ty), mask=mask) - tl.store(Y + Heads * D + offsets, x.to(Y.dtype.element_ty), mask=mask) - tl.store(Y + 2 * Heads * D + offsets, y.to(Y.dtype.element_ty), mask=mask) + tl.store( + Y + Heads * D + offsets, + x.to( + Y.dtype.element_ty), + mask=mask) + tl.store( + Y + 2 * Heads * D + offsets, + y.to( + Y.dtype.element_ty), + mask=mask) else: tl.store(Y + offsets, y.to(Y.dtype.element_ty), mask=mask) @@ -1367,9 +1417,11 @@ def triton_group_norm_mul_dropout_fwd( assert bias.numel() == num_heads if concat_ux: - y = torch.empty((N, 3 * num_heads * linear_dim), dtype=x.dtype, device=x.device) + y = torch.empty((N, 3 * num_heads * linear_dim), + dtype=x.dtype, device=x.device) else: - y = torch.empty((N, num_heads * linear_dim), dtype=x.dtype, device=x.device) + y = torch.empty((N, num_heads * linear_dim), + dtype=x.dtype, device=x.device) mean = torch.empty((N * num_heads,), dtype=torch.float32, device=x.device) rstd = torch.empty((N * num_heads,), dtype=torch.float32, device=x.device) if N == 0: @@ -1384,7 +1436,8 @@ def triton_group_norm_mul_dropout_fwd( ) if seed is None: - seed = torch.randint(low=0, high=2**62, size=(1,), dtype=torch.int64).item() + seed = torch.randint(low=0, high=2**62, size=(1,), + dtype=torch.int64).item() num_warps: int = min(max(BLOCK_D * BLOCK_H // 256, 1), 8) # pyre-ignore[28] _group_norm_mul_dropout_fwd[(N,)]( @@ -1444,7 +1497,8 @@ def triton_group_norm_mul_dropout_bwd( (N, 3 * num_heads * linear_dim), dtype=x.dtype, device=x.device ) else: - y = torch.empty((N, num_heads * linear_dim), dtype=x.dtype, device=x.device) + y = torch.empty((N, num_heads * linear_dim), + dtype=x.dtype, device=x.device) if N == 0: return ( torch.zeros_like(x), @@ -1464,8 +1518,16 @@ def triton_group_norm_mul_dropout_bwd( else: GROUP_N = 64 * 8 GROUP_N = N if GROUP_N > N else GROUP_N - _dweight = torch.zeros((GROUP_N, num_heads), dtype=torch.float32, device=x.device) - _dbias = torch.zeros((GROUP_N, num_heads), dtype=torch.float32, device=x.device) + _dweight = torch.zeros( + (GROUP_N, + num_heads), + dtype=torch.float32, + device=x.device) + _dbias = torch.zeros( + (GROUP_N, + num_heads), + dtype=torch.float32, + device=x.device) dweight = torch.empty((num_heads,), dtype=weight.dtype, device=x.device) dbias = torch.empty((num_heads,), dtype=weight.dtype, device=x.device) # pyre-ignore[28] @@ -1713,7 +1775,14 @@ def forward( out = maybe_triton_addmm_fwd(x=y, w=output_weight, y=x) - saved_tensors = [attn, u, norm_weight, norm_bias, mean, rstd, output_weight] + saved_tensors = [ + attn, + u, + norm_weight, + norm_bias, + mean, + rstd, + output_weight] if not recompute_y_in_backward: saved_tensors.append(y) ctx.save_for_backward(*saved_tensors) @@ -1953,7 +2022,8 @@ def helion_layer_norm_mul_dropout_fwd( N, D = x.shape if seed is None: - seed = torch.randint(low=0, high=2**62, size=(1,), dtype=torch.int64).item() + seed = torch.randint(low=0, high=2**62, size=(1,), + dtype=torch.int64).item() if concat_ux: y = torch.empty([N, 3 * D], dtype=x.dtype, device=x.device) diff --git a/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_hstu_preprocess_and_attention.py b/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_hstu_preprocess_and_attention.py index 85e60db3c7..c1514b4ffc 100644 --- a/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_hstu_preprocess_and_attention.py +++ b/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_hstu_preprocess_and_attention.py @@ -189,7 +189,8 @@ def backward( idx += 1 if ctx.recompute_uvqk_in_backward: uvqk_bias = ctx.saved_tensors[idx] - uvqk = maybe_triton_addmm_fwd(x=normed_x, w=uvqk_weight, y=uvqk_bias) + uvqk = maybe_triton_addmm_fwd( + x=normed_x, w=uvqk_weight, y=uvqk_bias) idx += 1 else: uvqk = ctx.saved_tensors[idx] diff --git a/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_jagged.py b/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_jagged.py index 46884a63d0..877da3193b 100644 --- a/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_jagged.py +++ b/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_jagged.py @@ -222,7 +222,8 @@ def jagged_dense_bmm_broadcast_add_kernel( jg = tl.load( jg_ptrs, # pyre-fixme[16]: `int` has no attribute `__getitem__`. - mask=(offs_m[:, None] < (seq_len - start_m)) & ((k + offs_k)[None, :] < K), + mask=(offs_m[:, None] < (seq_len - start_m) + ) & ((k + offs_k)[None, :] < K), other=0.0, ) dn = tl.load( @@ -237,10 +238,12 @@ def jagged_dense_bmm_broadcast_add_kernel( if HAS_BIAS: if ELEMENTWISE: Bias += (seq_start + start_m) * stride_bias_b - bias_ptrs = Bias + offs_m[:, None] * stride_bias_b + offs_n[None, :] + bias_ptrs = Bias + offs_m[:, None] * \ + stride_bias_b + offs_n[None, :] bias = tl.load( bias_ptrs, - mask=(offs_m[:, None] < (seq_len - start_m)) & (offs_n[None, :] < N), + mask=(offs_m[:, None] < (seq_len - start_m) + ) & (offs_n[None, :] < N), other=0.0, ) accumulator += bias.to(tl.float32) @@ -357,14 +360,16 @@ def _jagged_jagged_bmm_reduce_sum( JaggedA += seq_start * stride_ak JaggedB += seq_start * stride_bk offs_k = tl.arange(0, BLOCK_K) - jg_a_ptrs = JaggedA + offs_k[None, :] * stride_ak + (start_m + offs_m)[:, None] + jg_a_ptrs = JaggedA + offs_k[None, :] * \ + stride_ak + (start_m + offs_m)[:, None] jg_b_ptrs = JaggedB + offs_k[:, None] * stride_bk + offs_n[None, :] for k in range(0, seq_len, BLOCK_K): jg_a = tl.load( jg_a_ptrs, # pyre-fixme[16]: `int` has no attribute `__getitem__`. - mask=(offs_m[:, None] < (M - start_m)) & ((k + offs_k)[None, :] < seq_len), + mask=(offs_m[:, None] < (M - start_m) + ) & ((k + offs_k)[None, :] < seq_len), other=0.0, ) jg_b = tl.load( @@ -411,7 +416,7 @@ def forward( B, _, K = dense.shape bmm_out = torch.empty((L, K), dtype=jagged.dtype, device=jagged.device) - grid = lambda meta: ( # noqa E731 + def grid(meta): return ( # noqa E731 triton.cdiv(K, meta["BLOCK_N"]), triton.cdiv(max_seq_len, meta["BLOCK_M"]), B, @@ -453,7 +458,7 @@ def backward( d_jagged = torch.empty_like(jagged) d_dense = torch.empty_like(dense) - grid = lambda meta: ( # noqa E731 + def grid(meta): return ( # noqa E731 triton.cdiv(ctx.D, meta["BLOCK_N"]), triton.cdiv(ctx.max_seq_len, meta["BLOCK_M"]), ctx.B, @@ -478,7 +483,7 @@ def backward( ELEMENTWISE=False, ) - grid = lambda meta: ( # noqa E731 + def grid(meta): return ( # noqa E731 triton.cdiv(ctx.D, meta["BLOCK_M"]), triton.cdiv(ctx.K, meta["BLOCK_N"]), ctx.B, @@ -636,7 +641,7 @@ def forward( B, _ = dense.shape out = torch.empty_like(jagged) - grid = lambda meta: ( # noqa E731 + def grid(meta): return ( # noqa E731 B, triton.cdiv(max_seq_len, meta["BLOCK_N"]), ) @@ -666,7 +671,8 @@ def backward( ctx, d_out: torch.Tensor ) -> Tuple[None, None, torch.Tensor, torch.Tensor]: seq_offsets = ctx.saved_tensors[0] - d_dense = torch.empty((ctx.B, ctx.D), device=d_out.device, dtype=d_out.dtype) + d_dense = torch.empty( + (ctx.B, ctx.D), device=d_out.device, dtype=d_out.dtype) BLOCK_D = triton.next_power_of_2(ctx.D) if ctx.D < 64 else 64 jagged_reduce_sum[(ctx.B, triton.cdiv(ctx.D, BLOCK_D))]( seq_offsets=seq_offsets, @@ -694,7 +700,7 @@ def triton_jagged_dense_bmm_add_fwd( B, _, N = dense.shape out = torch.empty((L, N), dtype=jagged.dtype, device=jagged.device) - grid = lambda meta: ( # noqa E731 + def grid(meta): return ( # noqa E731 triton.cdiv(N, meta["BLOCK_N"]), triton.cdiv(max_seq_len, meta["BLOCK_M"]), B, @@ -733,7 +739,7 @@ def triton_jagged_dense_bmm_add_bwd_jagged( B: int, N: int, ) -> torch.Tensor: - grid = lambda meta: ( # noqa E731 + def grid(meta): return ( # noqa E731 triton.cdiv(K, meta["BLOCK_N"]), triton.cdiv(max_seq_len, meta["BLOCK_M"]), B, @@ -774,7 +780,7 @@ def triton_jagged_dense_bmm_add_bwd_dense_bias( ) -> Tuple[torch.Tensor, torch.Tensor]: d_bias = torch.empty((B, N), device=d_out.device, dtype=d_out.dtype) - grid = lambda meta: ( # noqa E731 + def grid(meta): return ( # noqa E731 triton.cdiv(K, meta["BLOCK_M"]), triton.cdiv(N, meta["BLOCK_N"]), B, @@ -937,7 +943,8 @@ def concat_2D_jagged_w_prefix( + offs_d ) else: - in_ptrs = ValuesA + (off_a + seq_start_a).to(tl.int64) * stride_ad + offs_d + in_ptrs = ValuesA + \ + (off_a + seq_start_a).to(tl.int64) * stride_ad + offs_d else: off_b = off_n - out_seq_b_start + n_prefix_from_B if off_n < n_prefix_from_B: @@ -950,7 +957,8 @@ def concat_2D_jagged_w_prefix( + offs_d ) else: - in_ptrs = ValuesB + (off_b + seq_start_b).to(tl.int64) * stride_bd + offs_d + in_ptrs = ValuesB + \ + (off_b + seq_start_b).to(tl.int64) * stride_bd + offs_d v = tl.load(in_ptrs, mask=offs_d < D) tl.store(out_ptrs, v, mask=offs_d < D) @@ -1084,12 +1092,14 @@ def split_2D_jagged_w_prefix( in_ptrs = JaggedIn + (seq_start + off_n).to(tl.int64) * stride_id + offs_d if off_n < out_seq_b_start and off_n >= n_prefix_to_B: off_a = off_n - n_prefix_to_B - out_ptrs = OutA + (off_a + seq_start_a).to(tl.int64) * stride_ad + offs_d + out_ptrs = OutA + (off_a + seq_start_a).to(tl.int64) * \ + stride_ad + offs_d else: off_b = off_n - out_seq_b_start + n_prefix_to_B if off_n < n_prefix_to_B: off_b += out_seq_b_start - n_prefix_to_B - out_ptrs = OutB + (off_b + seq_start_b).to(tl.int64) * stride_bd + offs_d + out_ptrs = OutB + (off_b + seq_start_b).to(tl.int64) * \ + stride_bd + offs_d v = tl.load(in_ptrs, mask=offs_d < D) tl.store(out_ptrs, v, mask=offs_d < D) @@ -1425,9 +1435,11 @@ def forward( offsets_b.device, non_blocking=True ) if seq_len_a is None: - seq_len_a = offsets_a.index_select(dim=0, index=offsets_a_last_idx) + seq_len_a = offsets_a.index_select( + dim=0, index=offsets_a_last_idx) if seq_len_b is None: - seq_len_b = offsets_b.index_select(dim=0, index=offsets_b_last_idx) + seq_len_b = offsets_b.index_select( + dim=0, index=offsets_b_last_idx) else: if seq_len_a is None: seq_len_a = int(offsets_a[-1].item()) @@ -1436,9 +1448,11 @@ def forward( _, D = values.shape BLOCK_D = triton.next_power_of_2(D) # pyre-ignore[6] Incompatible parameter type - values_a = torch.empty((seq_len_a, D), device=values.device, dtype=values.dtype) + values_a = torch.empty( + (seq_len_a, D), device=values.device, dtype=values.dtype) # pyre-ignore[6] Incompatible parameter type - values_b = torch.empty((seq_len_b, D), device=values.device, dtype=values.dtype) + values_b = torch.empty( + (seq_len_b, D), device=values.device, dtype=values.dtype) _triton_split_2D_jagged_internal( jagged_in=values, max_seq_len=max_seq_len, @@ -1599,7 +1613,8 @@ def triton_jagged_dense_bmm( jagged: torch.Tensor, dense: torch.Tensor, ) -> torch.Tensor: - return _JaggedDenseBmmFunction.apply(max_seq_len, seq_offsets, jagged, dense) + return _JaggedDenseBmmFunction.apply( + max_seq_len, seq_offsets, jagged, dense) @torch.jit.unused @@ -1690,7 +1705,8 @@ def concat_2D_jagged_w_prefix_multirow( + offs_d[None, :] ) - to_a_mask = (offs_n < out_seq_b_start) & (offs_n >= n_prefix_from_B) & valid_mask + to_a_mask = (offs_n < out_seq_b_start) & ( + offs_n >= n_prefix_from_B) & valid_mask to_b_mask = ~to_a_mask & valid_mask off_a = offs_n - n_prefix_from_B @@ -1708,12 +1724,18 @@ def concat_2D_jagged_w_prefix_multirow( + offs_d[None, :] ) - v_a = tl.load(in_a_ptrs, mask=to_a_mask[:, None] & (offs_d[None, :] < D), other=0.0) + v_a = tl.load(in_a_ptrs, mask=to_a_mask[:, None] & ( + offs_d[None, :] < D), other=0.0) tl.store(out_ptrs, v_a, mask=to_a_mask[:, None] & (offs_d[None, :] < D)) prefix_mask = offs_n < n_prefix_from_B - off_b = tl.where(prefix_mask, offs_n, offs_n - out_seq_b_start + n_prefix_from_B) + off_b = tl.where( + prefix_mask, + offs_n, + offs_n - + out_seq_b_start + + n_prefix_from_B) if IS_DENSE_B: in_b_ptrs = ( ValuesB @@ -1728,7 +1750,8 @@ def concat_2D_jagged_w_prefix_multirow( + offs_d[None, :] ) - v_b = tl.load(in_b_ptrs, mask=to_b_mask[:, None] & (offs_d[None, :] < D), other=0.0) + v_b = tl.load(in_b_ptrs, mask=to_b_mask[:, None] & ( + offs_d[None, :] < D), other=0.0) tl.store(out_ptrs, v_b, mask=to_b_mask[:, None] & (offs_d[None, :] < D)) @@ -1883,22 +1906,31 @@ def split_2D_jagged_w_prefix_multirow( + offs_d[None, :] ) - v = tl.load(in_ptrs, mask=valid_mask[:, None] & (offs_d[None, :] < D), other=0.0) + v = tl.load(in_ptrs, mask=valid_mask[:, None] & ( + offs_d[None, :] < D), other=0.0) - to_a_mask = (offs_n < out_seq_b_start) & (offs_n >= n_prefix_to_B) & valid_mask + to_a_mask = (offs_n < out_seq_b_start) & ( + offs_n >= n_prefix_to_B) & valid_mask to_b_mask = ~to_a_mask & valid_mask off_a = offs_n - n_prefix_to_B out_a_ptrs = ( - OutA + (off_a[:, None] + seq_start_a).to(tl.int64) * stride_ad + offs_d[None, :] + OutA + (off_a[:, None] + seq_start_a).to(tl.int64) * + stride_ad + offs_d[None, :] ) tl.store(out_a_ptrs, v, mask=to_a_mask[:, None] & (offs_d[None, :] < D)) prefix_mask = offs_n < n_prefix_to_B - off_b = tl.where(prefix_mask, offs_n, offs_n - out_seq_b_start + n_prefix_to_B) + off_b = tl.where( + prefix_mask, + offs_n, + offs_n - + out_seq_b_start + + n_prefix_to_B) out_b_ptrs = ( - OutB + (off_b[:, None] + seq_start_b).to(tl.int64) * stride_bd + offs_d[None, :] + OutB + (off_b[:, None] + seq_start_b).to(tl.int64) * + stride_bd + offs_d[None, :] ) tl.store(out_b_ptrs, v, mask=to_b_mask[:, None] & (offs_d[None, :] < D)) @@ -2023,7 +2055,10 @@ def _helion_split_2d_jagged_kernel( ), ) # Load output boundaries for part A - out_a_start = tl.load(offsets_a + batch_id * 1, None, eviction_policy="evict_last") + out_a_start = tl.load( + offsets_a + batch_id * 1, + None, + eviction_policy="evict_last") batch_id_plus_1 = 1 + triton_helpers.div_floor_integer( flat_program_id, triton_helpers.div_floor_integer( @@ -2079,7 +2114,8 @@ def _helion_split_2d_jagged_kernel( disallow_acc_multi_buffer=True, flatten=True, ): - feature_indices = feature_offset + tl.arange(0, _BLOCK_SIZE_1).to(tl.int32) + feature_indices = feature_offset + \ + tl.arange(0, _BLOCK_SIZE_1).to(tl.int32) # Compute D constant and feature mask once per feature iteration D_const = tl.full([], tl.cast(D, tl.int32), tl.int32) @@ -2090,15 +2126,18 @@ def _helion_split_2d_jagged_kernel( row_subscript = row_indices[:, None] input_row_a = input_start_i32 + row_subscript input_idx_a = ( - tl.cast(input_row_a * D_const, tl.int32) + feature_indices[None, :] + tl.cast(input_row_a * D_const, tl.int32) + + feature_indices[None, :] ) out_a_row = out_a_start_i32 + row_subscript out_a_idx = ( - tl.cast(out_a_row * D_const, tl.int32) + feature_indices[None, :] + tl.cast(out_a_row * D_const, tl.int32) + + feature_indices[None, :] ) - mask_a = is_part_a[:, None] & valid_mask[:, None] & feature_mask[None, :] + mask_a = is_part_a[:, None] & valid_mask[:, + None] & feature_mask[None, :] # Load and store part A data slice_a = tl.load( @@ -2118,7 +2157,8 @@ def _helion_split_2d_jagged_kernel( row_minus_len_a = row_subscript - len_a_i32 out_b_row = out_b_start_i32 + row_minus_len_a out_b_idx = ( - tl.cast(out_b_row * D_const, tl.int32) + feature_indices[None, :] + tl.cast(out_b_row * D_const, tl.int32) + + feature_indices[None, :] ) mask_b = is_part_b[:, None] & feature_mask[None, :] @@ -2185,8 +2225,10 @@ def _helion_split_2d_jagged( num_seq_blocks = (max_seq_len + block_size_0 - 1) // block_size_0 total_len_a = int(offsets_a[-1].item()) total_len_b = int(offsets_b[-1].item()) - out_a = torch.empty([total_len_a, D], dtype=values.dtype, device=values.device) - out_b = torch.empty([total_len_b, D], dtype=values.dtype, device=values.device) + out_a = torch.empty( + [total_len_a, D], dtype=values.dtype, device=values.device) + out_b = torch.empty( + [total_len_b, D], dtype=values.dtype, device=values.device) values_flat = values.view(-1) out_a_flat = out_a.view(-1) out_b_flat = out_b.view(-1) diff --git a/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_jagged_tensors.py b/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_jagged_tensors.py index 7fd79ad99d..f117e7ceba 100644 --- a/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_jagged_tensors.py +++ b/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_jagged_tensors.py @@ -206,7 +206,8 @@ def _concat_2D_jagged_multirow( valid_mask = offs_n < seq_len out_seq_start = seq_start_a + seq_start_b + offs_n - out_ptrs = Out + out_seq_start[:, None].to(tl.int64) * stride_od + offs_d[None, :] + out_ptrs = Out + \ + out_seq_start[:, None].to(tl.int64) * stride_od + offs_d[None, :] from_prefix_b_mask = (offs_n < n_prefix_from_B) & valid_mask from_a_mask = ( @@ -224,7 +225,8 @@ def _concat_2D_jagged_multirow( v_b1 = tl.load( in_b1_ptrs, mask=from_prefix_b_mask[:, None] & (offs_d[None, :] < D), other=0.0 ) - tl.store(out_ptrs, v_b1, mask=from_prefix_b_mask[:, None] & (offs_d[None, :] < D)) + tl.store(out_ptrs, v_b1, mask=from_prefix_b_mask[:, None] & ( + offs_d[None, :] < D)) off_a = offs_n - n_prefix_from_B in_a_ptrs = ( @@ -246,7 +248,8 @@ def _concat_2D_jagged_multirow( v_b2 = tl.load( in_b2_ptrs, mask=from_suffix_b_mask[:, None] & (offs_d[None, :] < D), other=0.0 ) - tl.store(out_ptrs, v_b2, mask=from_suffix_b_mask[:, None] & (offs_d[None, :] < D)) + tl.store(out_ptrs, v_b2, mask=from_suffix_b_mask[:, None] & ( + offs_d[None, :] < D)) @triton_autotune( @@ -343,11 +346,13 @@ def _split_2D_jagged_multirow( + offs_d[None, :] ) - v = tl.load(in_ptrs, mask=valid_mask[:, None] & (offs_d[None, :] < D), other=0.0) + v = tl.load(in_ptrs, mask=valid_mask[:, None] & ( + offs_d[None, :] < D), other=0.0) to_prefix_b_mask = (offs_n < n_prefix_to_B) & valid_mask to_a_mask = ( - (offs_n >= n_prefix_to_B) & (offs_n < seq_len_a + n_prefix_to_B) & valid_mask + (offs_n >= n_prefix_to_B) & ( + offs_n < seq_len_a + n_prefix_to_B) & valid_mask ) to_suffix_b_mask = (offs_n >= seq_len_a + n_prefix_to_B) & valid_mask @@ -356,19 +361,23 @@ def _split_2D_jagged_multirow( + (offs_n[:, None] + seq_start_b).to(tl.int64) * stride_bd + offs_d[None, :] ) - tl.store(out_b1_ptrs, v, mask=to_prefix_b_mask[:, None] & (offs_d[None, :] < D)) + tl.store(out_b1_ptrs, v, mask=to_prefix_b_mask[:, None] & ( + offs_d[None, :] < D)) off_a = offs_n - n_prefix_to_B out_a_ptrs = ( - OutA + (off_a[:, None] + seq_start_a).to(tl.int64) * stride_ad + offs_d[None, :] + OutA + (off_a[:, None] + seq_start_a).to(tl.int64) * + stride_ad + offs_d[None, :] ) tl.store(out_a_ptrs, v, mask=to_a_mask[:, None] & (offs_d[None, :] < D)) off_b = offs_n - seq_len_a out_b2_ptrs = ( - OutB + (off_b[:, None] + seq_start_b).to(tl.int64) * stride_bd + offs_d[None, :] + OutB + (off_b[:, None] + seq_start_b).to(tl.int64) * + stride_bd + offs_d[None, :] ) - tl.store(out_b2_ptrs, v, mask=to_suffix_b_mask[:, None] & (offs_d[None, :] < D)) + tl.store(out_b2_ptrs, v, mask=to_suffix_b_mask[:, None] & ( + offs_d[None, :] < D)) @triton_autotune( @@ -455,7 +464,8 @@ def _concat_2D_jagged( out_seq_start = seq_start_a + seq_start_b + off_n out_ptrs = Out + out_seq_start.to(tl.int64) * stride_od + offs_d if off_n < n_prefix_from_B: - in_ptrs = ValuesB + (off_n + seq_start_b).to(tl.int64) * stride_bd + offs_d + in_ptrs = ValuesB + \ + (off_n + seq_start_b).to(tl.int64) * stride_bd + offs_d elif off_n < seq_len_a + n_prefix_from_B: in_ptrs = ( ValuesA @@ -513,7 +523,8 @@ def _split_2D_jagged( offs_d = tl.arange(0, BLOCK_D) in_ptrs = JaggedIn + (seq_start + off_n).to(tl.int64) * stride_id + offs_d if off_n < n_prefix_to_B: - out_ptrs = OutB + (off_n + seq_start_b).to(tl.int64) * stride_bd + offs_d + out_ptrs = OutB + (off_n + seq_start_b).to(tl.int64) * \ + stride_bd + offs_d elif off_n < seq_len_a + n_prefix_to_B: out_ptrs = ( OutA @@ -522,7 +533,8 @@ def _split_2D_jagged( ) else: out_ptrs = ( - OutB + (off_n - seq_len_a + seq_start_b).to(tl.int64) * stride_bd + offs_d + OutB + (off_n - seq_len_a + seq_start_b).to(tl.int64) * + stride_bd + offs_d ) v = tl.load(in_ptrs, mask=offs_d < D) tl.store(out_ptrs, v, mask=offs_d < D) diff --git a/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_layer_norm.py b/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_layer_norm.py index 2327ab14c6..736bffcb79 100644 --- a/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_layer_norm.py +++ b/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_layer_norm.py @@ -400,8 +400,16 @@ def _weighted_layer_norm_bwd_dx( dx = dy_block * sigmoid_layer_norm + dx # Write dx - tl.store(DX_block_ptr, dx.to(DX.dtype.element_ty), boundary_check=(0, 1)) - partial_dw = tl.sum(dy_block * x_block * xhat * sigmoid_deriv, axis=0) + tl.store( + DX_block_ptr, dx.to( + DX.dtype.element_ty), boundary_check=( + 0, 1)) + partial_dw = tl.sum( + dy_block * + x_block * + xhat * + sigmoid_deriv, + axis=0) partial_db = tl.sum(dy_block * x_block * sigmoid_deriv, axis=0) else: c1 = tl.sum(xhat * wdy, axis=1) / D @@ -410,7 +418,10 @@ def _weighted_layer_norm_bwd_dx( c2 = tl.expand_dims(c2, 1) dx = (wdy - (xhat * c1 + c2)) * rstd # Write dx - tl.store(DX_block_ptr, dx.to(DX.dtype.element_ty), boundary_check=(0, 1)) + tl.store( + DX_block_ptr, dx.to( + DX.dtype.element_ty), boundary_check=( + 0, 1)) partial_dw = tl.sum(dy_block * xhat, axis=0) partial_db = tl.sum(dy_block, axis=0) @@ -471,8 +482,16 @@ def _layer_norm_bwd_dwdb( sum_dw = tl.sum(dw, axis=0) sum_db = tl.sum(db, axis=0) - tl.store(FINAL_DW + cols, sum_dw.to(FINAL_DW.dtype.element_ty), mask=cols < D) - tl.store(FINAL_DB + cols, sum_db.to(FINAL_DB.dtype.element_ty), mask=cols < D) + tl.store( + FINAL_DW + cols, + sum_dw.to( + FINAL_DW.dtype.element_ty), + mask=cols < D) + tl.store( + FINAL_DB + cols, + sum_db.to( + FINAL_DB.dtype.element_ty), + mask=cols < D) def triton_weighted_layer_norm_fwd( @@ -505,13 +524,14 @@ def triton_weighted_layer_norm_fwd( MAX_FUSED_SIZE = 65536 // x.element_size() BLOCK_D: int = min(MAX_FUSED_SIZE, triton.next_power_of_2(D)) if D > BLOCK_D: - raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.") + raise RuntimeError( + "This layer norm doesn't support feature dim >= 64KB.") if N == 0: return y, mean, rstd, BLOCK_D # pyre-ignore[28] - grid = lambda meta: ( # noqa E731 + def grid(meta): return ( # noqa E731 triton.cdiv(N, meta["BLOCK_N"]), ) if learnable: @@ -569,8 +589,10 @@ def triton_weighted_layer_norm_bwd( dx = torch.empty_like(x) sms = torch.cuda.get_device_properties(x.device).multi_processor_count tile_num = max(1, min(sms * 8, N // 4)) - _dweight = torch.empty((tile_num, D), dtype=torch.float32, device=x.device) - _dbias = torch.empty((tile_num, D), dtype=torch.float32, device=x.device) + _dweight = torch.empty( + (tile_num, D), dtype=torch.float32, device=x.device) + _dbias = torch.empty( + (tile_num, D), dtype=torch.float32, device=x.device) dweight = torch.empty((D,), dtype=weight.dtype, device=x.device) dbias = torch.empty((D,), dtype=weight.dtype, device=x.device) if N == 0: @@ -768,7 +790,8 @@ def _weighted_rms_norm_fwd( y = y * w[None, :] if SILU: - # pyre-ignore[16]: Module `triton.language.math` has no attribute `fast_dividef` + # pyre-ignore[16]: Module `triton.language.math` has no attribute + # `fast_dividef` y = fast_dividef(y, 1.0 + tl.exp(-y)) tl.store(Y_block_ptr, y.to(Y.dtype.element_ty), boundary_check=(0, 1)) @@ -928,7 +951,8 @@ def _weighted_rms_norm_bwd( # pyre-fixme[16] sig_y = fast_dividef(1.0, 1.0 + tl.exp(-y_before_silu)) # SILU derivative: sigmoid(y) + y * sigmoid(y) * (1 - sigmoid(y)) - dy_block = dy_block * (sig_y + y_before_silu * sig_y * (1.0 - sig_y)) + dy_block = dy_block * \ + (sig_y + y_before_silu * sig_y * (1.0 - sig_y)) wdy = w[None, :] * dy_block @@ -937,7 +961,10 @@ def _weighted_rms_norm_bwd( dx = (wdy - (xhat * c1)) * rstd # Write dx - tl.store(DX_block_ptr, dx.to(DX.dtype.element_ty), boundary_check=(0, 1)) + tl.store( + DX_block_ptr, dx.to( + DX.dtype.element_ty), boundary_check=( + 0, 1)) # Accumulate partial sums for dw # Compute dw for all rows, then sum locally before atomic operation @@ -975,7 +1002,11 @@ def _rms_norm_bwd_dwdb( dw += tl.load(DW + offs, mask=mask, other=0.0) sum_dw = tl.sum(dw, axis=0) - tl.store(FINAL_DW + cols, sum_dw.to(FINAL_DW.dtype.element_ty), mask=cols < D) + tl.store( + FINAL_DW + cols, + sum_dw.to( + FINAL_DW.dtype.element_ty), + mask=cols < D) class RMSNormFunction(torch.autograd.Function): @@ -1001,7 +1032,8 @@ def forward( MAX_FUSED_SIZE = 65536 // x.element_size() BLOCK_D = min(MAX_FUSED_SIZE, triton.next_power_of_2(D)) if D > BLOCK_D: - raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.") + raise RuntimeError( + "This layer norm doesn't support feature dim >= 64KB.") ctx.save_for_backward(x, weight, rstd) ctx.silu = silu @@ -1009,7 +1041,7 @@ def forward( return y # pyre-ignore[28] - grid = lambda meta: ( # noqa E731 + def grid(meta): return ( # noqa E731 triton.cdiv(N, meta["BLOCK_N"]), ) _weighted_rms_norm_fwd[grid]( @@ -1110,7 +1142,7 @@ def forward( return y # pyre-ignore[28] - grid = lambda meta: ( # noqa E731 + def grid(meta): return ( # noqa E731 triton.cdiv(N, meta["BLOCK_N"]), ) _weighted_layer_norm_fwd[grid]( @@ -1143,8 +1175,10 @@ def backward( dx = torch.empty_like(x) sms = torch.cuda.get_device_properties(x.device).multi_processor_count tile_num = max(1, min(sms * 8, N // 4)) - _dweight = torch.empty((tile_num, D), dtype=torch.float32, device=x.device) - _dbias = torch.empty((tile_num, D), dtype=torch.float32, device=x.device) + _dweight = torch.empty( + (tile_num, D), dtype=torch.float32, device=x.device) + _dbias = torch.empty( + (tile_num, D), dtype=torch.float32, device=x.device) dweight = torch.empty((D,), dtype=weight.dtype, device=x.device) dbias = torch.empty((D,), dtype=weight.dtype, device=x.device) if N == 0: diff --git a/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_position.py b/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_position.py index 793b61f5e0..63aa54c732 100644 --- a/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_position.py +++ b/recommendation/dlrm_v3/generative_recommenders/ops/triton/triton_position.py @@ -150,7 +150,8 @@ def _add_timestamp_position_embeddings_kernel( seq_emb = tl.load(SeqEmb + seq_emb_offsets, mask=mask) pos_emb = tl.load(PosEmb + pos_emb_offsets, mask=mask) ts_emb = tl.load(TsEmb + ts_emb_offsets, mask=mask) - tl.store(out_offsets, seq_emb + (pos_emb + ts_emb).to(seq_emb.dtype), mask=mask) + tl.store(out_offsets, seq_emb + (pos_emb + + ts_emb).to(seq_emb.dtype), mask=mask) seq_emb_offsets += BLOCK_D pos_emb_offsets += BLOCK_D ts_emb_offsets += BLOCK_D @@ -274,7 +275,7 @@ def forward( pos_inds = torch.empty_like(seq_embeddings[:, 0], dtype=torch.int32) ts_emb_size = ts_embeddings.shape[0] - grid = lambda meta: ( # noqa E731 + def grid(meta): return ( # noqa E731 B, triton.cdiv(max_seq_len, meta["BLOCK_N"]), ) @@ -309,7 +310,8 @@ def forward( BLOCK_D=BLOCK_D, ) try: - values = torch.arange(0, N, dtype=torch.int32, device=timestamps.device) + values = torch.arange( + 0, N, dtype=torch.int32, device=timestamps.device) sorted_ts_key_inds, sorted_ts_value_inds = torch.ops.hammer.sort_kv_pairs( ts_inds, values ) @@ -363,7 +365,7 @@ def backward( d_ts_embeddings = torch.empty( (ctx.ts_emb_size, ctx.D), device=d_out.device, dtype=torch.float32 ) - grid = lambda meta: (triton.cdiv(d_out.shape[0], meta["BLOCK"]),) # noqa E731 + def grid(meta): return (triton.cdiv(d_out.shape[0], meta["BLOCK"]),) # noqa E731 AUTOTUNE_B = prev_power_of_2(ctx.B) _add_embeddings_bwd_kernel[grid]( In=d_out, diff --git a/recommendation/dlrm_v3/inference_modules.py b/recommendation/dlrm_v3/inference_modules.py index 6bc78694f2..2a0cae495b 100644 --- a/recommendation/dlrm_v3/inference_modules.py +++ b/recommendation/dlrm_v3/inference_modules.py @@ -195,7 +195,8 @@ def move_sparse_output_to_device( seq_embeddings = { k: SequenceEmbedding( lengths=seq_embeddings[k].lengths.to(device), - embedding=seq_embeddings[k].embedding.to(device).to(torch.bfloat16), + embedding=seq_embeddings[k].embedding.to( + device).to(torch.bfloat16), ) for k in seq_embeddings.keys() } diff --git a/recommendation/dlrm_v3/main.py b/recommendation/dlrm_v3/main.py index b5dbbe3169..b04dfea7ee 100644 --- a/recommendation/dlrm_v3/main.py +++ b/recommendation/dlrm_v3/main.py @@ -17,6 +17,31 @@ mlperf dlrm_v3 inference benchmarking tool. """ +from utils import ( + get_dataset, + profiler_or_nullcontext, + SUPPORTED_DATASETS, +) +from model_family import HSTUModelFamily +from inference_modules import set_is_inference +from data_producer import ( + MultiThreadDataProducer, + QueryItem, + SingleThreadDataProducer, +) +from datasets.synthetic_streaming import ( + DLRMv3SyntheticStreamingDataset, +) +from datasets.dataset import Dataset, Samples +from configs import get_embedding_table_config, get_hstu_configs +from generative_recommenders.common import set_dev_mode, set_verbose_level +import torch +import numpy as np +import mlperf_loadgen as lg # @manual +from typing import Any, Dict, List, Optional, Union +import time +import sys +import os import argparse import array import logging @@ -24,33 +49,8 @@ import threading logging.basicConfig(level=logging.INFO) -import os -import sys -import time -from typing import Any, Dict, List, Optional, Union # pyre-ignore [21] -import mlperf_loadgen as lg # @manual -import numpy as np -import torch -from generative_recommenders.common import set_dev_mode, set_verbose_level -from configs import get_embedding_table_config, get_hstu_configs -from datasets.dataset import Dataset, Samples -from datasets.synthetic_streaming import ( - DLRMv3SyntheticStreamingDataset, -) -from data_producer import ( - MultiThreadDataProducer, - QueryItem, - SingleThreadDataProducer, -) -from inference_modules import set_is_inference -from model_family import HSTUModelFamily -from utils import ( - get_dataset, - profiler_or_nullcontext, - SUPPORTED_DATASETS, -) logger: logging.Logger = logging.getLogger("main") @@ -221,15 +221,17 @@ def run_one_item(self, qitem: QueryItem) -> None: query_mt_target_preds = ( mt_target_preds[ # pyre-ignore [61] 0, - candidate_size * i : candidate_size * (i + 1), + candidate_size * i: candidate_size * (i + 1), ] .view(-1) .float() .numpy() ) - response_array = array.array("B", query_mt_target_preds.tobytes()) + response_array = array.array( + "B", query_mt_target_preds.tobytes()) bi = response_array.buffer_info() - # since we send buffer to loadgen, needs `response_array` in memory during send + # since we send buffer to loadgen, needs `response_array` + # in memory during send lg.QuerySamplesComplete( [lg.QuerySampleResponse(query_id, bi[0], bi[1])] ) @@ -237,7 +239,7 @@ def run_one_item(self, qitem: QueryItem) -> None: for i, query_id in enumerate(qitem.query_ids): query_mt_target_preds = ( mt_target_preds[ # pyre-ignore [61] - 0, candidate_size * i : candidate_size * (i + 1) + 0, candidate_size * i: candidate_size * (i + 1) ] .view(-1) .float() @@ -245,7 +247,7 @@ def run_one_item(self, qitem: QueryItem) -> None: ) query_mt_target_labels = ( mt_target_labels[ # pyre-ignore [16,61] - 0, candidate_size * i : candidate_size * (i + 1) + 0, candidate_size * i: candidate_size * (i + 1) ] .view(-1) .float() @@ -253,7 +255,7 @@ def run_one_item(self, qitem: QueryItem) -> None: ) query_mt_target_weights = ( mt_target_weights[ # pyre-ignore [61] - 0, candidate_size * i : candidate_size * (i + 1) + 0, candidate_size * i: candidate_size * (i + 1) ] .view(-1) .float() @@ -269,7 +271,8 @@ def run_one_item(self, qitem: QueryItem) -> None: ) response_array = array.array("B", np_array.tobytes()) bi = response_array.buffer_info() - # since we send buffer to loadgen, needs `response_array` in memory during send + # since we send buffer to loadgen, needs `response_array` + # in memory during send lg.QuerySamplesComplete( [lg.QuerySampleResponse(query_id, bi[0], bi[1])] ) @@ -297,10 +300,10 @@ def enqueue(self, query_samples, t0: float) -> None: # pyre-ignore [2] for i in range(len(self.current_query_ids) // self.batchsize): self.data_producer.enqueue( query_ids=self.current_query_ids[ - i * self.batchsize : (i + 1) * self.batchsize + i * self.batchsize: (i + 1) * self.batchsize ], content_ids=self.current_content_ids[ - i * self.batchsize : (i + 1) * self.batchsize + i * self.batchsize: (i + 1) * self.batchsize ], t0=t0, dt_queue=dt_queue, @@ -345,7 +348,8 @@ def add_results( timing: list[float] = [result[key] for result in result_timing] buckets: List[float] = np.percentile(timing, percentiles).tolist() buckets_str: str = ",".join( - ["| {}:{:.4f}| ".format(p, b) for p, b in zip(percentiles, buckets)] + ["| {}:{:.4f}| ".format(p, b) + for p, b in zip(percentiles, buckets)] ) buckets_dict[key] = buckets buckets_str_dict[key] = buckets_str @@ -397,7 +401,8 @@ def get_num_queries( Number of queries to execute in the benchmark run. """ if scenario_name == "Offline": - # consistent with https://github.com/mlcommons/inference/blob/8999c4d686f6e4a180da14597c97063fce7c9f33/loadgen/test_settings_internal.cc#L147 + # consistent with + # https://github.com/mlcommons/inference/blob/8999c4d686f6e4a180da14597c97063fce7c9f33/loadgen/test_settings_internal.cc#L147 return int(1.1 * target_duration / 1000 * offline_target_qps) else: if input_size is None: @@ -547,7 +552,8 @@ def get_samples(self, id_list: List[int]) -> List[Samples]: if curr_ts_idx == self.inference_ts - 1: curr_ts_queries += self.remaining_queries begin_query_idx: int = self.ts_processed_cnt - end_query_idx: int = min(begin_query_idx + batch_size, curr_ts_queries) + end_query_idx: int = min( + begin_query_idx + batch_size, curr_ts_queries) begin_request_idx: int = begin_query_idx % curr_ts_unique_requests end_request_idx: int = end_query_idx % curr_ts_unique_requests if begin_query_idx + batch_size >= curr_ts_queries: @@ -639,7 +645,8 @@ def run( """ set_dev_mode(False) if scenario_name not in SCENARIO_MAP: - raise NotImplementedError("valid scanarios:" + str(list(SCENARIO_MAP.keys()))) + raise NotImplementedError( + "valid scanarios:" + str(list(SCENARIO_MAP.keys()))) scenario = SCENARIO_MAP[scenario_name] np.random.seed(numpy_rand_seed) random.seed(numpy_rand_seed) @@ -773,7 +780,8 @@ def flush_queries() -> None: ds.unload_query_samples, ) with profiler_or_nullcontext(enabled=output_trace, with_stack=False): - logger.info(f"starting warmup {scenario} with {warmup_count} queries") + logger.info( + f"starting warmup {scenario} with {warmup_count} queries") lg.StartTest(sut, qsl, settings) lg.DestroyQSL(qsl) lg.DestroySUT(sut) diff --git a/recommendation/dlrm_v3/model_family.py b/recommendation/dlrm_v3/model_family.py index e40b8d4c02..8d4b81da6a 100644 --- a/recommendation/dlrm_v3/model_family.py +++ b/recommendation/dlrm_v3/model_family.py @@ -258,7 +258,8 @@ def load(self, model_path: str) -> None: activation=quant.PlaceholderObserver.with_args( dtype=torch.float ), - weight=quant.PlaceholderObserver.with_args(dtype=torch.int8), + weight=quant.PlaceholderObserver.with_args( + dtype=torch.int8), ), }, mapping={ @@ -361,8 +362,10 @@ def __init__( self.dist_backend = "nccl" ctx = mp.get_context("spawn") - self.samples_q: List[mp.Queue] = [ctx.Queue() for _ in range(self.world_size)] - self.result_q: List[mp.Queue] = [ctx.Queue() for _ in range(self.world_size)] + self.samples_q: List[mp.Queue] = [ctx.Queue() + for _ in range(self.world_size)] + self.result_q: List[mp.Queue] = [ctx.Queue() + for _ in range(self.world_size)] def load(self, model_path: str) -> None: """ @@ -387,7 +390,8 @@ def load(self, model_path: str) -> None: p.start() processes.append(p) - def distributed_setup(self, rank: int, world_size: int, model_path: str) -> None: + def distributed_setup(self, rank: int, world_size: int, + model_path: str) -> None: """ Initialize and run a dense worker process. @@ -431,7 +435,8 @@ def distributed_setup(self, rank: int, world_size: int, model_path: str) -> None assert profiler is not None profiler.step() with torch.profiler.record_function("get_item_from_queue"): - # Copy here to release data in the producer to avoid invalid cuda caching allocator release. + # Copy here to release data in the producer to avoid + # invalid cuda caching allocator release. item = copy.deepcopy(item) ( id, @@ -510,7 +515,8 @@ def predict( max_num_candidates: int, num_candidates: Optional[torch.Tensor], ) -> Optional[ - Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], float] + Tuple[torch.Tensor, Optional[torch.Tensor], + Optional[torch.Tensor], float] ]: """ Run distributed dense forward pass. diff --git a/recommendation/dlrm_v3/streaming_synthetic_data.py b/recommendation/dlrm_v3/streaming_synthetic_data.py index 8046909e00..d0f96e56aa 100644 --- a/recommendation/dlrm_v3/streaming_synthetic_data.py +++ b/recommendation/dlrm_v3/streaming_synthetic_data.py @@ -133,7 +133,8 @@ def generate_one_timestamp( total_cnt = sum(category_to_cnt.values()) p = np.array( [ - (alpha / len(categories) + category_to_cnt[c]) / (alpha + total_cnt) + (alpha / len(categories) + + category_to_cnt[c]) / (alpha + total_cnt) for c in categories ] ) @@ -146,7 +147,8 @@ def generate_one_timestamp( ) sample_inds = np.random.randint(0, sample_end_idx, size=seq_len) offsets = np.array( - [self.category_to_start_end_item_idx[cat][0] for cat in item_categories] + [self.category_to_start_end_item_idx[cat][0] + for cat in item_categories] ) sample_inds = sample_inds + offsets num_categories = len(categories) @@ -205,7 +207,8 @@ def generate_one_timestamp( for i in range(seq_len) ] if not inference: - sub_indices = random.sample(range(seq_len), self.num_eval_candidates) + sub_indices = random.sample( + range(seq_len), self.num_eval_candidates) sample_candidate_inds = [sample_inds[i] for i in sub_indices] sample_candidate_ratings = [sample_ratings[i] for i in sub_indices] sample_uih_inds = sample_inds @@ -281,7 +284,10 @@ def generate_one_user( Returns: List of CSV row values for this user's data. """ - categories = random.sample(range(self.num_categories), self.categories_per_user) + categories = random.sample( + range( + self.num_categories), + self.categories_per_user) category_to_cnt = {c: 0 for c in categories} out_list: List[str] = [] # t = -1 as base UIH @@ -304,7 +310,8 @@ def generate_one_user( ts_buffers=ts_buffers, ) out_list.append(",".join([str(ind) for ind in sample_candidate_inds])) - out_list.append(",".join([str(rat) for rat in sample_candidate_ratings])) + out_list.append(",".join([str(rat) + for rat in sample_candidate_ratings])) out_list.append(",".join([str(ind) for ind in sample_inds])) out_list.append(",".join([str(rat) for rat in sample_ratings])) # train @@ -328,7 +335,8 @@ def generate_one_user( file_idx=file_idx, ts_buffers=ts_buffers, ) - out_list.append(",".join([str(ind) for ind in sample_candidate_inds])) + out_list.append(",".join([str(ind) + for ind in sample_candidate_inds])) out_list.append( ",".join([str(rat) for rat in sample_candidate_ratings]) ) @@ -356,7 +364,8 @@ def generate_one_user( ts_buffers=ts_buffers, ) out_list.append(",".join([str(ind) for ind in sample_candidate_inds])) - out_list.append(",".join([str(rat) for rat in sample_candidate_ratings])) + out_list.append(",".join([str(rat) + for rat in sample_candidate_ratings])) out_list.append(",".join([str(ind) for ind in sample_inds])) out_list.append(",".join([str(rat) for rat in sample_ratings])) # inference @@ -382,7 +391,8 @@ def generate_one_user( file_idx=file_idx, ts_buffers=ts_buffers, ) - out_list.append(",".join([str(ind) for ind in sample_candidate_inds])) + out_list.append(",".join([str(ind) + for ind in sample_candidate_inds])) out_list.append( ",".join([str(rat) for rat in sample_candidate_ratings]) ) @@ -491,7 +501,10 @@ def worker( rank=rank, ) num_files_per_rank = num_files // world_size - file_indices = [i + rank * num_files_per_rank for i in range(num_files_per_rank)] + file_indices = [ + i + + rank * + num_files_per_rank for i in range(num_files_per_rank)] for file_idx in file_indices: logger.warning(f"rank {rank}: start generating file {file_idx}") generator.write_dataset( @@ -534,7 +547,8 @@ def write_offset(output_folder: str, num_files: int, num_users: int) -> None: writer.writerow([",".join([str(offset) for offset in offsets])]) -def write_ts_metadata(output_folder: str, total_ts: int, num_files: int) -> None: +def write_ts_metadata(output_folder: str, total_ts: int, + num_files: int) -> None: """ Write timestamp metadata for streaming simulation. @@ -563,7 +577,8 @@ def write_ts_metadata(output_folder: str, total_ts: int, num_files: int) -> None num_users_per_file.append(size) cumsum = np.cumsum(num_users_per_file).tolist() assert cumsum[-1] == len(requests) - requests_writer.writerow([",".join([str(r) for r in requests])]) + requests_writer.writerow( + [",".join([str(r) for r in requests])]) cumsum_writer.writerow([",".join([str(s) for s in cumsum])]) logger.warning(f"ts {ts} finished") with open( @@ -579,7 +594,8 @@ def write_ts_metadata(output_folder: str, total_ts: int, num_files: int) -> None if not line: break offsets.append(offset) - assert len(offsets) == total_ts, f"total_ts {total_ts} != {len(offsets)}" + assert len( + offsets) == total_ts, f"total_ts {total_ts} != {len(offsets)}" logger.warning("offsets for file requests_per_ts.csv finished") writer.writerow([",".join([str(offset) for offset in offsets])]) diff --git a/recommendation/dlrm_v3/utils.py b/recommendation/dlrm_v3/utils.py index 4d18d360d1..11743d219d 100644 --- a/recommendation/dlrm_v3/utils.py +++ b/recommendation/dlrm_v3/utils.py @@ -34,7 +34,8 @@ MultitaskTaskType, TaskConfig, ) -from torch.profiler import profile, profiler, ProfilerActivity # pyre-ignore [21] +# pyre-ignore [21] +from torch.profiler import profile, profiler, ProfilerActivity from torch.utils.tensorboard import SummaryWriter from torchrec.metrics.accuracy import AccuracyMetricComputation from torchrec.metrics.gauc import GAUCMetricComputation @@ -94,7 +95,8 @@ def profiler_or_nullcontext(enabled: bool, with_stack: bool): """ return ( profile( - # pyre-fixme[16]: Module `profiler` has no attribute `ProfilerActivity`. + # pyre-fixme[16]: Module `profiler` has no attribute + # `ProfilerActivity`. activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], on_trace_ready=_on_trace_ready_fn(), with_stack=with_stack, @@ -126,7 +128,8 @@ def __init__(self, rank, active: int = 50) -> None: repeat=1, ), on_trace_ready=_on_trace_ready_fn(self.rank), - # pyre-fixme[16]: Module `profiler` has no attribute `ProfilerActivity`. + # pyre-fixme[16]: Module `profiler` has no attribute + # `ProfilerActivity`. activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, profile_memory=False, @@ -180,7 +183,8 @@ def __init__( assert all_classification_tasks + all_regression_tasks == [ task.task_name for task in multitask_configs ] - self.task_names: List[str] = all_classification_tasks + all_regression_tasks + self.task_names: List[str] = all_classification_tasks + \ + all_regression_tasks self.class_metrics: Dict[str, List[RecMetricComputation]] = { "train": [], @@ -239,7 +243,8 @@ def __init__( self.global_step: Dict[str, int] = {"train": 0, "eval": 0} self.tb_logger: Optional[SummaryWriter] = None if tensorboard_log_path != "": - self.tb_logger = SummaryWriter(log_dir=tensorboard_log_path, purge_step=0) + self.tb_logger = SummaryWriter( + log_dir=tensorboard_log_path, purge_step=0) self.tb_logger.flush() @property From de2362097c2fd9cfebb8459b418ed193b0c0945a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 7 Jan 2026 20:00:05 +0000 Subject: [PATCH 40/64] [Automated Commit] Format Codebase --- multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/evaluation.py | 2 +- tools/submission/submission_checker.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/evaluation.py b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/evaluation.py index 2c629b7d26..d1701a19ff 100644 --- a/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/evaluation.py +++ b/multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/evaluation.py @@ -408,4 +408,4 @@ def run_evaluation(random_seed: int, filename: FilePath, with open("accuracy.txt", "w") as f: f.write("Results\n\n") f.write(f"{data_string}\n\n") - f.write(f"hash={file_hash}") \ No newline at end of file + f.write(f"hash={file_hash}") diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index f5b8aa8753..62bd6a87bb 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -1514,7 +1514,9 @@ def check_accuracy_dir(config, model, path, verbose): is_valid = False else: if os.stat(fname).st_size > MAX_ACCURACY_LOG_SIZE: - log.error("Max expected file size is: %s bytes", MAX_ACCURACY_LOG_SIZE) + log.error( + "Max expected file size is: %s bytes", + MAX_ACCURACY_LOG_SIZE) log.error("%s is not truncated", fname) is_valid = False From cff487940e430ad9f46f0baf17ef793db8b7ebb8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 12 Jan 2026 16:08:11 +0000 Subject: [PATCH 41/64] [Automated Commit] Format Codebase --- compliance/TEST07/run_verification.py | 42 +++++++++++++++------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/compliance/TEST07/run_verification.py b/compliance/TEST07/run_verification.py index 44353aa31a..906cbd56a3 100755 --- a/compliance/TEST07/run_verification.py +++ b/compliance/TEST07/run_verification.py @@ -49,15 +49,15 @@ def parse_audit_config(config_path): """ Parse audit.config file and extract TEST07-specific settings. - + Returns: dict: Parsed configuration values """ config = {} - + if not os.path.isfile(config_path): return config - + try: with open(config_path, 'r') as f: for line in f: @@ -65,27 +65,29 @@ def parse_audit_config(config_path): # Skip comments and empty lines if not line or line.startswith('#'): continue - + # Parse key = value if '=' in line: key, value = line.split('=', 1) key = key.strip() value = value.strip() - - # Extract the setting name (last part of key like *.*.setting_name) + + # Extract the setting name (last part of key like + # *.*.setting_name) parts = key.split('.') if len(parts) >= 3: setting_name = parts[-1] - + # Parse test07_accuracy_threshold if setting_name == 'test07_accuracy_threshold': try: config['accuracy_threshold'] = float(value) except ValueError: - print(f"Warning: Invalid threshold value in audit.config: {value}") + print( + f"Warning: Invalid threshold value in audit.config: {value}") except Exception as e: print(f"Warning: Error parsing audit.config: {e}") - + return config @@ -149,28 +151,29 @@ def main(): # Determine accuracy threshold accuracy_threshold = args.accuracy_threshold audit_config_path = args.audit_config - + # Try to read threshold from audit.config if provided if audit_config_path: print(f"Reading audit.config from: {audit_config_path}") audit_config = parse_audit_config(audit_config_path) - + if 'accuracy_threshold' in audit_config: config_threshold = audit_config['accuracy_threshold'] print(f"Found threshold in audit.config: {config_threshold}") - + # CLI argument overrides config file if accuracy_threshold is None: accuracy_threshold = config_threshold else: - print(f"CLI threshold ({accuracy_threshold}) overrides audit.config ({config_threshold})") - + print( + f"CLI threshold ({accuracy_threshold}) overrides audit.config ({config_threshold})") + # Validate we have a threshold if accuracy_threshold is None: print("Error: No accuracy threshold specified.") print("Provide --accuracy-threshold or --audit-config with test07_accuracy_threshold field.") sys.exit(1) - + print(f"Using accuracy threshold: {accuracy_threshold}") # Build accuracy script command with placeholder substitution @@ -264,19 +267,22 @@ def main(): try: shutil.copy2(accuracy_file, output_accuracy_dir) except Exception: - print(f"Exception occurred trying to copy {accuracy_file} to {output_accuracy_dir}") + print( + f"Exception occurred trying to copy {accuracy_file} to {output_accuracy_dir}") try: if os.path.exists(summary_file): shutil.copy2(summary_file, output_performance_dir) except Exception: - print(f"Exception occurred trying to copy {summary_file} to {output_performance_dir}") + print( + f"Exception occurred trying to copy {summary_file} to {output_performance_dir}") try: if os.path.exists(detail_file): shutil.copy2(detail_file, output_performance_dir) except Exception: - print(f"Exception occurred trying to copy {detail_file} to {output_performance_dir}") + print( + f"Exception occurred trying to copy {detail_file} to {output_performance_dir}") print(f"\nAccuracy check pass: {accuracy_pass}") print("TEST07 verification complete") From 057305a45f3550d14549e9c01a5d64ff5db580c8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 27 Jan 2026 17:47:11 +0000 Subject: [PATCH 42/64] [Automated Commit] Format Codebase --- recommendation/dlrm_v3/accuracy.py | 3 ++- recommendation/dlrm_v3/main.py | 15 ++++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/recommendation/dlrm_v3/accuracy.py b/recommendation/dlrm_v3/accuracy.py index 244fe11000..94087e12dc 100644 --- a/recommendation/dlrm_v3/accuracy.py +++ b/recommendation/dlrm_v3/accuracy.py @@ -64,7 +64,8 @@ def main() -> None: logger.warning(f"results have {len(results)} entries") for result in results: data = np.frombuffer(bytes.fromhex(result["data"]), np.float32) - # Format: [ts_idx, query_idx, predictions..., labels..., weights..., candidate_size] + # Format: [ts_idx, query_idx, predictions..., labels..., weights..., + # candidate_size] num_candidates = data[-1].astype(int) assert len(data) == 3 + num_candidates * 3 mt_target_preds = torch.from_numpy(data[2:2 + num_candidates]) diff --git a/recommendation/dlrm_v3/main.py b/recommendation/dlrm_v3/main.py index 12e429ec03..a0dbd45991 100644 --- a/recommendation/dlrm_v3/main.py +++ b/recommendation/dlrm_v3/main.py @@ -227,8 +227,10 @@ def run_one_item(self, qitem: QueryItem) -> None: .float() .numpy() ) - ts_idx_val = float(qitem.ts_idx) if qitem.ts_idx is not None else -1.0 - query_idx_val = float(qitem.query_idx[i]) if qitem.query_idx is not None else -1.0 + ts_idx_val = float( + qitem.ts_idx) if qitem.ts_idx is not None else -1.0 + query_idx_val = float( + qitem.query_idx[i]) if qitem.query_idx is not None else -1.0 np_array = np.concatenate( [ np.array([ts_idx_val]).astype(np.float32), @@ -269,8 +271,10 @@ def run_one_item(self, qitem: QueryItem) -> None: .float() .numpy() ) - ts_idx_val = float(qitem.ts_idx) if qitem.ts_idx is not None else -1.0 - query_idx_val = float(qitem.query_idx[i]) if qitem.query_idx is not None else -1.0 + ts_idx_val = float( + qitem.ts_idx) if qitem.ts_idx is not None else -1.0 + query_idx_val = float( + qitem.query_idx[i]) if qitem.query_idx is not None else -1.0 np_array = np.concatenate( [ np.array([ts_idx_val]).astype(np.float32), @@ -731,7 +735,8 @@ def run( if is_streaming: ds.init_sut() # pyre-ignore [16] result = ds.get_samples(warmup_ids) - if isinstance(result, list) and len(result) > 0 and isinstance(result[0], tuple): + if isinstance(result, list) and len( + result) > 0 and isinstance(result[0], tuple): for sample, _, _ in result: model_family.predict(sample) elif isinstance(result, Samples): From 81ffcf0645e0528489d814bf058ff934891e6c80 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 27 Jan 2026 18:08:57 +0000 Subject: [PATCH 43/64] [Automated Commit] Format Codebase --- main.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index e0f9a8787b..17d01fe11f 100755 --- a/main.py +++ b/main.py @@ -654,7 +654,13 @@ def get_readme_suffix(spaces, model, implementation, extra_variation_tags): if implementation == "reference" and not extra_variation_tags: if not model.endswith("-99"): - model_base_name = model.replace("-99.9", "").replace("-99", "").replace("-95", "") + model_base_name = model.replace( + "-99.9", + "").replace( + "-99", + "").replace( + "-95", + "") readme_suffix += f"{pre_space}* If you want to download the official MLPerf model and dataset for {model} you can follow [this README](get-{model_base_name}-data.md).\n" if model == "resnet50": readme_suffix += f"{pre_space}* Please see [mobilenets.md](mobilenets.md) for running mobilenet models for Image Classification." From 9b0ab2b7e8bfbbb60fd7be81aa5fddc7b7745f85 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 28 Jan 2026 07:17:02 +0000 Subject: [PATCH 44/64] [Automated Commit] Format Codebase --- text_to_video/wan2.2-t2v-14b/run_mlperf.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/text_to_video/wan2.2-t2v-14b/run_mlperf.py b/text_to_video/wan2.2-t2v-14b/run_mlperf.py index 7b6d7a8604..170c8db176 100644 --- a/text_to_video/wan2.2-t2v-14b/run_mlperf.py +++ b/text_to_video/wan2.2-t2v-14b/run_mlperf.py @@ -101,10 +101,12 @@ def issue_queries(self, query_samples): output = self.pipe(**pipeline_kwargs).frames[0] # Save to video to reduce mlperf_log_accuracy.json size - output_path = Path(self.video_output_path, f"{self.prompts[i]}-0.mp4") + output_path = Path( + self.video_output_path, + f"{self.prompts[i]}-0.mp4") logging.info(f"Saving {q} to {output_path}") export_to_video(output[0], str(output_path), fps=self.fps) - + with open(output_path, "rb") as f: resp = f.read() @@ -276,7 +278,14 @@ def run_mlperf(args, config): logging.info("No fixed latent provided - using random initial latents") # Loading model - model = Model(args.model_path, args.video_output_path, device, config, dataset, fixed_latent, rank) + model = Model( + args.model_path, + args.video_output_path, + device, + config, + dataset, + fixed_latent, + rank) # model = DebugModel(args.model_path, device, config, dataset, fixed_latent, rank) logging.info("Model loaded successfully!") From 1c30072caf50d0a65577160659546d4e5b09c233 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 29 Jan 2026 07:18:04 +0000 Subject: [PATCH 45/64] [Automated Commit] Format Codebase --- compliance/TEST09/run_verification.py | 42 ++++++++++++------- .../checks/compliance_check.py | 3 +- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/compliance/TEST09/run_verification.py b/compliance/TEST09/run_verification.py index 224b044684..33d8c598e3 100644 --- a/compliance/TEST09/run_verification.py +++ b/compliance/TEST09/run_verification.py @@ -122,14 +122,16 @@ def parse_mlperf_log(log_path: str) -> List[Dict[str, Any]]: parsed_count += 1 except json.JSONDecodeError: if parsed_count == 0: - print(f"Warning: Line {line_num}: Could not parse JSON") + print( + f"Warning: Line {line_num}: Could not parse JSON") break print(f"Loaded {len(entries)} entries from MLPerf log") return entries -def compute_output_token_lengths(entries: List[Dict[str, Any]]) -> Tuple[List[int], float, int, int]: +def compute_output_token_lengths( + entries: List[Dict[str, Any]]) -> Tuple[List[int], float, int, int]: """Compute output token lengths from MLPerf log entries. Args: @@ -147,7 +149,8 @@ def compute_output_token_lengths(entries: List[Dict[str, Any]]) -> Tuple[List[in token_ids = decode_hex_to_tokens(hex_data) token_lengths.append(len(token_ids)) except Exception as e: - print(f"Warning: Error decoding entry {entry.get('qsl_idx')}: {e}") + print( + f"Warning: Error decoding entry {entry.get('qsl_idx')}: {e}") token_lengths.append(0) else: token_lengths.append(0) @@ -194,13 +197,15 @@ def parse_audit_config(config_path: str) -> Dict[str, Any]: try: config['min_output_tokens'] = float(value) except ValueError: - print(f"Warning: Invalid min_output_tokens value: {value}") + print( + f"Warning: Invalid min_output_tokens value: {value}") elif setting_name == 'test09_max_output_tokens': try: config['max_output_tokens'] = float(value) except ValueError: - print(f"Warning: Invalid max_output_tokens value: {value}") + print( + f"Warning: Invalid max_output_tokens value: {value}") except Exception as e: print(f"Warning: Error parsing audit.config: {e}") @@ -275,7 +280,8 @@ def main(): if min_output_tokens is None: min_output_tokens = config_min else: - print(f"CLI min ({min_output_tokens}) overrides audit.config ({config_min})") + print( + f"CLI min ({min_output_tokens}) overrides audit.config ({config_min})") if 'max_output_tokens' in audit_config: config_max = audit_config['max_output_tokens'] @@ -283,7 +289,8 @@ def main(): if max_output_tokens is None: max_output_tokens = config_max else: - print(f"CLI max ({max_output_tokens}) overrides audit.config ({config_max})") + print( + f"CLI max ({max_output_tokens}) overrides audit.config ({config_max})") # Validate we have thresholds if min_output_tokens is None or max_output_tokens is None: @@ -309,7 +316,8 @@ def main(): sys.exit(1) print(f"\nComputing output token lengths for {len(entries)} samples...") - token_lengths, mean_length, min_length, max_length = compute_output_token_lengths(entries) + token_lengths, mean_length, min_length, max_length = compute_output_token_lengths( + entries) # Print statistics print("\n" + "=" * 80) @@ -322,7 +330,8 @@ def main(): # Compute standard deviation if token_lengths: - variance = sum((x - mean_length) ** 2 for x in token_lengths) / len(token_lengths) + variance = sum((x - mean_length) ** + 2 for x in token_lengths) / len(token_lengths) std_dev = variance ** 0.5 print(f"Std deviation: {std_dev:.2f}") @@ -336,8 +345,10 @@ def main(): overall_pass = min_check_pass and max_check_pass print(f"Mean output tokens: {mean_length:.2f}") - print(f"Min threshold: {min_output_tokens} -> {'PASS' if min_check_pass else 'FAIL'}") - print(f"Max threshold: {max_output_tokens} -> {'PASS' if max_check_pass else 'FAIL'}") + print( + f"Min threshold: {min_output_tokens} -> {'PASS' if min_check_pass else 'FAIL'}") + print( + f"Max threshold: {max_output_tokens} -> {'PASS' if max_check_pass else 'FAIL'}") print(f"\nOverall: {'TEST PASS' if overall_pass else 'TEST FAIL'}") # Write verification results @@ -391,19 +402,22 @@ def main(): try: shutil.copy2(accuracy_file, output_accuracy_dir) except Exception: - print(f"Exception occurred trying to copy {accuracy_file} to {output_accuracy_dir}") + print( + f"Exception occurred trying to copy {accuracy_file} to {output_accuracy_dir}") try: if os.path.exists(summary_file): shutil.copy2(summary_file, output_performance_dir) except Exception: - print(f"Exception occurred trying to copy {summary_file} to {output_performance_dir}") + print( + f"Exception occurred trying to copy {summary_file} to {output_performance_dir}") try: if os.path.exists(detail_file): shutil.copy2(detail_file, output_performance_dir) except Exception: - print(f"Exception occurred trying to copy {detail_file} to {output_performance_dir}") + print( + f"Exception occurred trying to copy {detail_file} to {output_performance_dir}") print("\n" + "=" * 80) print("TEST09 verification complete") diff --git a/tools/submission/submission_checker/checks/compliance_check.py b/tools/submission/submission_checker/checks/compliance_check.py index 7e09ac1644..477c17a856 100644 --- a/tools/submission/submission_checker/checks/compliance_check.py +++ b/tools/submission/submission_checker/checks/compliance_check.py @@ -335,7 +335,8 @@ def accuracy_check(self): elif test == "TEST09": # TEST09: Verify output token length in performance mode # Check verify_output_len.txt for TEST PASS - output_len_path = os.path.join(test_dir, "verify_output_len.txt") + output_len_path = os.path.join( + test_dir, "verify_output_len.txt") if os.path.exists(output_len_path): with open(output_len_path, "r", encoding="utf-8") as f: content = f.read() From 779dafcc7d227b9d17703d6d8b7b41acf5dbda98 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 29 Jan 2026 18:13:46 +0000 Subject: [PATCH 46/64] [Automated Commit] Format Codebase --- .../submission_checker/checks/compliance_check.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/submission/submission_checker/checks/compliance_check.py b/tools/submission/submission_checker/checks/compliance_check.py index fb0212daa7..a152383cbe 100644 --- a/tools/submission/submission_checker/checks/compliance_check.py +++ b/tools/submission/submission_checker/checks/compliance_check.py @@ -360,8 +360,10 @@ def accuracy_check(self): is_valid = False elif test == "TEST08": # TEST08 is used for dlrm-v3 streaming dataset compliance - # It verifies that NE values match between accuracy and performance runs - lines = self.submission_logs.loader_data.get(f"{test}_acc_result") + # It verifies that NE values match between accuracy and + # performance runs + lines = self.submission_logs.loader_data.get( + f"{test}_acc_result") if lines is None: self.log.error( "TEST08 accuracy result file not found for %s", test_dir) @@ -401,7 +403,8 @@ def accuracy_check(self): elif test == "TEST09": # TEST09: Verify output token length in performance mode # Check verify_output_len.txt for TEST PASS - output_len_path = os.path.join(test_dir, "verify_output_len.txt") + output_len_path = os.path.join( + test_dir, "verify_output_len.txt") if os.path.exists(output_len_path): with open(output_len_path, "r", encoding="utf-8") as f: content = f.read() From 9adcf480e37b54b5b407f585f651b50e59287275 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 6 Feb 2026 19:56:54 +0000 Subject: [PATCH 47/64] [Automated Commit] Format Codebase --- text_to_video/wan-2.2-t2v-a14b/run_mlperf.py | 28 ++++++++++++++------ 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/text_to_video/wan-2.2-t2v-a14b/run_mlperf.py b/text_to_video/wan-2.2-t2v-a14b/run_mlperf.py index 147624b340..ab73c25966 100644 --- a/text_to_video/wan-2.2-t2v-a14b/run_mlperf.py +++ b/text_to_video/wan-2.2-t2v-a14b/run_mlperf.py @@ -46,7 +46,8 @@ def load_prompts(dataset_path): class Model: - def __init__(self, model_path, device, config, prompts, fixed_latent=None, rank=0): + def __init__(self, model_path, device, config, + prompts, fixed_latent=None, rank=0): self.device = device self.rank = rank self.height = config["height"] @@ -106,7 +107,8 @@ def flush_queries(self): class DebugModel: - def __init__(self, model_path, device, config, prompts, fixed_latent=None, rank=0): + def __init__(self, model_path, device, config, + prompts, fixed_latent=None, rank=0): self.prompts = prompts def issue_queries(self, query_samples): @@ -186,7 +188,8 @@ def get_args(): parser.add_argument( "--scenario", default="SingleStream", - help="mlperf benchmark scenario, one of " + str(list(SCENARIO_MAP.keys())), + help="mlperf benchmark scenario, one of " + + str(list(SCENARIO_MAP.keys())), ) parser.add_argument( "--user_conf", @@ -202,7 +205,10 @@ def get_args(): help="performance sample count", default=5000, ) - parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass") + parser.add_argument( + "--accuracy", + action="store_true", + help="enable accuracy pass") # Dont overwrite these for official submission parser.add_argument("--count", type=int, help="dataset items to use") parser.add_argument("--time", type=int, help="time to scan in seconds") @@ -271,7 +277,10 @@ def run_mlperf(args, config): audit_config = os.path.abspath(args.audit_conf) if os.path.exists(audit_config): - settings.FromConfig(audit_config, "wan-2.2-t2v-a14b", args.scenario) + settings.FromConfig( + audit_config, + "wan-2.2-t2v-a14b", + args.scenario) settings.scenario = SCENARIO_MAP[args.scenario] settings.mode = lg.TestMode.PerformanceOnly @@ -297,8 +306,10 @@ def run_mlperf(args, config): if args.samples_per_query: settings.multi_stream_samples_per_query = args.samples_per_query if args.max_latency: - settings.server_target_latency_ns = int(args.max_latency * NANO_SEC) - settings.multi_stream_expected_latency_ns = int(args.max_latency * NANO_SEC) + settings.server_target_latency_ns = int( + args.max_latency * NANO_SEC) + settings.multi_stream_expected_latency_ns = int( + args.max_latency * NANO_SEC) performance_sample_count = ( args.performance_sample_count @@ -311,7 +322,8 @@ def run_mlperf(args, config): count, performance_sample_count, load_query_samples, unload_query_samples ) - lg.StartTestWithLogSettings(sut, qsl, settings, log_settings, audit_config) + lg.StartTestWithLogSettings( + sut, qsl, settings, log_settings, audit_config) lg.DestroyQSL(qsl) lg.DestroySUT(sut) From 8da62b80dbb3bb0fcb7280e84550b58c67ae959f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 11 Feb 2026 23:03:43 +0000 Subject: [PATCH 48/64] [Automated Commit] Format Codebase --- tools/submission/generate_final_report.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/submission/generate_final_report.py b/tools/submission/generate_final_report.py index 4b0e055290..f191bdbf5f 100644 --- a/tools/submission/generate_final_report.py +++ b/tools/submission/generate_final_report.py @@ -101,11 +101,11 @@ def main(): "singlestream": "SingleStream", "multistream": "MultiStream", "server": "Server", - "interactive":"Interactive", + "interactive": "Interactive", "offline": "Offline", } - df["Scenario"] = df["Scenario"].apply(lambda x: scenario_map.get(str(x).lower(), x)) - + df["Scenario"] = df["Scenario"].apply( + lambda x: scenario_map.get(str(x).lower(), x)) output = args.input[:-4] writer = pd.ExcelWriter(output + ".xlsx", engine="xlsxwriter") From 406bb396f7494f92726eb2cb19fe5b45fde36fc3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 12 Feb 2026 16:48:35 +0000 Subject: [PATCH 49/64] [Automated Commit] Format Codebase --- .../checks/performance_check.py | 16 +++++++++------- tools/submission/submission_checker/loader.py | 12 +++++++----- tools/submission/submission_checker/utils.py | 8 +++++--- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/tools/submission/submission_checker/checks/performance_check.py b/tools/submission/submission_checker/checks/performance_check.py index 915abd04cf..a2c34394c4 100644 --- a/tools/submission/submission_checker/checks/performance_check.py +++ b/tools/submission/submission_checker/checks/performance_check.py @@ -85,13 +85,15 @@ def missing_check(self): self.log.error("Performance log missing at %s", self.path) return False return True - + def scenarios_check(self): if self.submission_logs.loader_data.get("check_scenarios", False): return True else: - missing_scenarios = self.submission_logs.loader_data.get("missing_scenarios", []) - unknown_scenarios = self.submission_logs.loader_data.get("unknown_scenarios", []) + missing_scenarios = self.submission_logs.loader_data.get( + "missing_scenarios", []) + unknown_scenarios = self.submission_logs.loader_data.get( + "unknown_scenarios", []) if len(missing_scenarios) > 0: self.log.error( "%s does not have all required scenarios, missing %s", @@ -445,7 +447,7 @@ def inferred_check(self): ("singlestream", "offline") ] if (self.scenario.lower(), self.scenario_fixed.lower() - ) not in list_inferred: + ) not in list_inferred: self.log.error( "Result for scenario %s can not be inferred from %s for: %s", self.scenario_fixed, @@ -529,12 +531,12 @@ def get_inferred_result(self, res): res = qps_wo_loadgen_overhead if (self.scenario_fixed in ["Offline"] - ) and self.scenario in ["MultiStream"]: + ) and self.scenario in ["MultiStream"]: inferred = True res = samples_per_query * S_TO_MS / (latency_mean / MS_TO_NS) if (self.scenario_fixed in ["MultiStream"] - ) and self.scenario in ["SingleStream"]: + ) and self.scenario in ["SingleStream"]: inferred = True # samples_per_query does not match with the one reported in the logs # when inferring MultiStream from SingleStream @@ -551,6 +553,6 @@ def get_inferred_result(self, res): else: res = (latency_99_percentile * samples_per_query) / MS_TO_NS if (self.scenario_fixed in ["Interactive"] - ) and self.scenario not in ["Server"]: + ) and self.scenario not in ["Server"]: is_valid = False return res, is_valid diff --git a/tools/submission/submission_checker/loader.py b/tools/submission/submission_checker/loader.py index 323ed2a078..89c8bf08ce 100644 --- a/tools/submission/submission_checker/loader.py +++ b/tools/submission/submission_checker/loader.py @@ -212,8 +212,9 @@ def load_single_log(self, path, log_type: Literal["Performance", "Accuracy", log_type, path) return log - - def check_scenarios(self, benchmark, model_mapping, system_type, scenarios): + + def check_scenarios(self, benchmark, model_mapping, + system_type, scenarios): self.config.set_type(system_type) mlperf_model = self.config.get_mlperf_model(benchmark, model_mapping) required_scenarios = lower_list(self.config.get_required(mlperf_model)) @@ -230,13 +231,13 @@ def check_scenarios(self, benchmark, model_mapping, system_type, scenarios): unknown, passed = contains_list(set(all_senarios), scenarios) if not passed: check = False - if contains_list(set(optional_scenarios), ["interactive", "server"])[1]: + if contains_list(set(optional_scenarios), [ + "interactive", "server"])[1]: if "interactive" not in scenarios and "server" not in scenarios: check = False missing.append("(one of) Interactive or Server") return missing, unknown, check - def load(self) -> Generator[SubmissionLogs, None, None]: """Traverse submissions directory and yield parsed log containers. @@ -270,7 +271,8 @@ def load(self) -> Generator[SubmissionLogs, None, None]: for benchmark in list_dir(system_path): benchmark_path = os.path.join(system_path, benchmark) if division.lower() in ["closed", "network"]: - missing_scenarios, unknown_scenarios, check_scenarios = self.check_scenarios(benchmark, model_mapping, system_type, list_dir(benchmark_path)) + missing_scenarios, unknown_scenarios, check_scenarios = self.check_scenarios( + benchmark, model_mapping, system_type, list_dir(benchmark_path)) else: missing_scenarios, unknown_scenarios, check_scenarios = [], [], True for scenario in list_dir(benchmark_path): diff --git a/tools/submission/submission_checker/utils.py b/tools/submission/submission_checker/utils.py index 7d1daf0e72..7ff4fd020a 100644 --- a/tools/submission/submission_checker/utils.py +++ b/tools/submission/submission_checker/utils.py @@ -107,17 +107,19 @@ def is_number(s): return True except ValueError: return False - + + def lower_list(l): return [str(e).lower() for e in l] + def contains_list(l1, l2): # Check if l1 contains all elements of l2 missing = [] for e in l2: if e not in l1: missing.append(e) - return missing, len(missing) == 0 + return missing, len(missing) == 0 def get_performance_metric( @@ -317,7 +319,7 @@ def get_power_metric(config, scenario_fixed, log_path, is_valid, res): samples_per_query = 8 if (scenario_fixed in ["MultiStream"] - ) and scenario in ["SingleStream"]: + ) and scenario in ["SingleStream"]: power_metric = ( avg_power * power_duration * samples_per_query * 1000 / num_queries ) From 96407c342297d800752bad8049610edba79cd12c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 17 Feb 2026 22:40:01 +0000 Subject: [PATCH 50/64] [Automated Commit] Format Codebase --- .../submission_checker/checks/performance_check.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tools/submission/submission_checker/checks/performance_check.py b/tools/submission/submission_checker/checks/performance_check.py index 21d0f1c718..29203d6de1 100644 --- a/tools/submission/submission_checker/checks/performance_check.py +++ b/tools/submission/submission_checker/checks/performance_check.py @@ -118,7 +118,8 @@ def loadgen_errors_check(self): bool: True if no blocking Loadgen errors are present, False otherwise. """ - compliance_skip = self.submission_logs.loader_data.get("compliance_skip", False) + compliance_skip = self.submission_logs.loader_data.get( + "compliance_skip", False) if self.mlperf_log.has_error(): has_critical_errors = False if self.config.ignore_uncommited: @@ -129,7 +130,7 @@ def loadgen_errors_check(self): ): has_critical_errors = True if ( - not compliance_skip + not compliance_skip and "Multiple conf files are used" in error["value"] ): has_critical_errors = True @@ -456,7 +457,7 @@ def inferred_check(self): ("singlestream", "offline") ] if (self.scenario.lower(), self.scenario_fixed.lower() - ) not in list_inferred: + ) not in list_inferred: self.log.error( "Result for scenario %s can not be inferred from %s for: %s", self.scenario_fixed, @@ -540,12 +541,12 @@ def get_inferred_result(self, res): res = qps_wo_loadgen_overhead if (self.scenario_fixed in ["Offline"] - ) and self.scenario in ["MultiStream"]: + ) and self.scenario in ["MultiStream"]: inferred = True res = samples_per_query * S_TO_MS / (latency_mean / MS_TO_NS) if (self.scenario_fixed in ["MultiStream"] - ) and self.scenario in ["SingleStream"]: + ) and self.scenario in ["SingleStream"]: inferred = True # samples_per_query does not match with the one reported in the logs # when inferring MultiStream from SingleStream @@ -562,6 +563,6 @@ def get_inferred_result(self, res): else: res = (latency_99_percentile * samples_per_query) / MS_TO_NS if (self.scenario_fixed in ["Interactive"] - ) and self.scenario not in ["Server"]: + ) and self.scenario not in ["Server"]: is_valid = False return res, is_valid From 6015c374a606081ce977f5289b6a609826a86c24 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 4 Mar 2026 20:50:37 +0000 Subject: [PATCH 51/64] [Automated Commit] Format Codebase --- .../submission/submission_checker/checks/performance_check.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/submission/submission_checker/checks/performance_check.py b/tools/submission/submission_checker/checks/performance_check.py index f65d2e4173..a895aa20ca 100644 --- a/tools/submission/submission_checker/checks/performance_check.py +++ b/tools/submission/submission_checker/checks/performance_check.py @@ -526,7 +526,8 @@ def get_inferred_result(self, res): # Check if current scenario (and version) uses early stopping uses_early_stopping = self.config.uses_early_stopping(self.scenario) scenario = SCENARIO_MAPPING.get(self.scenario, self.scenario) - scenario_fixed = SCENARIO_MAPPING.get(self.scenario_fixed, self.scenario_fixed) + scenario_fixed = SCENARIO_MAPPING.get( + self.scenario_fixed, self.scenario_fixed) latency_mean = self.mlperf_log["result_mean_latency_ns"] if scenario in ["MultiStream"]: From 9496775de94390fd8a18f9ba83a0360969f0ae9f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 17 Mar 2026 21:29:46 +0000 Subject: [PATCH 52/64] [Automated Commit] Format Codebase --- tools/submission/submission_checker/results.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/submission/submission_checker/results.py b/tools/submission/submission_checker/results.py index 9f39519f45..990172b989 100644 --- a/tools/submission/submission_checker/results.py +++ b/tools/submission/submission_checker/results.py @@ -99,7 +99,8 @@ def add_result(self, submission_logs: SubmissionLogs): row["host_processor_core_count"] = submission_logs.system_json["host_processor_core_count"] row["accelerator_model_name"] = submission_logs.system_json["accelerator_model_name"] row["accelerators_per_node"] = submission_logs.system_json["accelerators_per_node"] - row["total_accelerators"] = int(row["number_of_nodes"]) * int(row["accelerators_per_node"]) + row["total_accelerators"] = int( + row["number_of_nodes"]) * int(row["accelerators_per_node"]) row["Location"] = os.path.dirname( submission_logs.loader_data["perf_path"]) row["framework"] = submission_logs.system_json["framework"] From 607d82131399a41698a03acaa14c6edbf11aedfa Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 27 Mar 2026 15:37:51 +0000 Subject: [PATCH 53/64] [Automated Commit] Format Codebase --- vision/medical_imaging/3d-unet-kits19/global_vars.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision/medical_imaging/3d-unet-kits19/global_vars.py b/vision/medical_imaging/3d-unet-kits19/global_vars.py index 1ac539e6c5..b5fa192d9d 100644 --- a/vision/medical_imaging/3d-unet-kits19/global_vars.py +++ b/vision/medical_imaging/3d-unet-kits19/global_vars.py @@ -65,7 +65,7 @@ TARGET_CASES = json.load(f) with open(CALIBRATION_CASE_FILE, "r") as f: CALIB_CASES = json.load(f) - + # constants used preprocessing images as well as sliding window inference MEAN_VAL = 101.0 STDDEV_VAL = 76.9 From 5a683b75a4a53c45d388e992bb55a04356367fe9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 27 Mar 2026 16:28:06 +0000 Subject: [PATCH 54/64] [Automated Commit] Format Codebase --- loadgen/version_generator.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/loadgen/version_generator.py b/loadgen/version_generator.py index e0de87c281..b88ae8e1e0 100644 --- a/loadgen/version_generator.py +++ b/loadgen/version_generator.py @@ -101,7 +101,8 @@ def generate_loadgen_version_definitions(cc_filename, loadgen_root): raise with open(cc_filename, "w") as ofile: - ofile.write("// DO NOT EDIT: Autogenerated by version_generator.py.\n\n") + ofile.write( + "// DO NOT EDIT: Autogenerated by version_generator.py.\n\n") ofile.write("#include \n\n") ofile.write("namespace mlperf {\n\n") # Open and read the VERSION.txt file @@ -114,7 +115,12 @@ def generate_loadgen_version_definitions(cc_filename, loadgen_root): date_time_now_local = datetime.datetime.now().isoformat() date_time_now_utc = datetime.datetime.utcnow().isoformat() - ofile.write(func_def("BuildDateLocal", '"' + date_time_now_local + '"')) + ofile.write( + func_def( + "BuildDateLocal", + '"' + + date_time_now_local + + '"')) ofile.write(func_def("BuildDateUtc", '"' + date_time_now_utc + '"')) git_dir = '--git-dir="' + loadgen_root + '/../.git" ' From 0551a63014b16b4f78b59f7b2c6ea6a37721db25 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 27 Mar 2026 17:09:27 +0000 Subject: [PATCH 55/64] [Automated Commit] Format Codebase --- .../never_adopted/language/gpt3/megatron/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/retired_benchmarks/never_adopted/language/gpt3/megatron/utils.py b/retired_benchmarks/never_adopted/language/gpt3/megatron/utils.py index d7f64bc261..9b9bfbe776 100644 --- a/retired_benchmarks/never_adopted/language/gpt3/megatron/utils.py +++ b/retired_benchmarks/never_adopted/language/gpt3/megatron/utils.py @@ -1,10 +1,11 @@ import json import io + def jload(f, mode="r"): """Load a .json file into a dictionary.""" if not isinstance(f, io.IOBase): with open(f, mode=mode) as f: return json.load(f) else: - return json.load(f) \ No newline at end of file + return json.load(f) From d64c9a09af7b96d7a0c01dad811061b54daa9c8d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 30 Mar 2026 15:51:47 +0000 Subject: [PATCH 56/64] [Automated Commit] Format Codebase --- compliance/TEST04/verify_performance.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/compliance/TEST04/verify_performance.py b/compliance/TEST04/verify_performance.py index 3a4edd3268..15ea64abc1 100644 --- a/compliance/TEST04/verify_performance.py +++ b/compliance/TEST04/verify_performance.py @@ -40,8 +40,7 @@ def main(): args = parser.parse_args() print("Verifying performance.") - - + ref_score = 0 test_score = 0 ref_mode = "" @@ -87,7 +86,10 @@ def main(): if re.match("\\d+ ERROR", line): error = line.split(" ", 1)[0].strip() - print("WARNING: " + error + " ERROR reported in reference results") + print( + "WARNING: " + + error + + " ERROR reported in reference results") with open(args.test_summary, "r") as test_file: for line in test_file: From 4300693a84993b79b37b68870f6d59a34aeae0ad Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 30 Mar 2026 16:06:25 +0000 Subject: [PATCH 57/64] [Automated Commit] Format Codebase --- language/gpt-j/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/language/gpt-j/utils.py b/language/gpt-j/utils.py index d7f64bc261..9b9bfbe776 100644 --- a/language/gpt-j/utils.py +++ b/language/gpt-j/utils.py @@ -1,10 +1,11 @@ import json import io + def jload(f, mode="r"): """Load a .json file into a dictionary.""" if not isinstance(f, io.IOBase): with open(f, mode=mode) as f: return json.load(f) else: - return json.load(f) \ No newline at end of file + return json.load(f) From 1ab6e6d21d5c6bdc224d576632bcb5a8ac981bd3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 30 Mar 2026 16:17:10 +0000 Subject: [PATCH 58/64] [Automated Commit] Format Codebase --- recommendation/dlrm_v2/pytorch/python/main.py | 2 -- recommendation/dlrm_v2/pytorch/tools/accuracy-dlrm.py | 2 -- retired_benchmarks/recommendation/dlrm/pytorch/python/main.py | 2 -- .../recommendation/dlrm/pytorch/tools/accuracy-dlrm.py | 2 -- retired_benchmarks/recommendation/dlrm/tf/mlp_log.py | 2 -- .../recommendation/dlrm/tf/train_and_eval_runner.py | 1 - .../translation/gnmt/tensorflow/nmt/inference_test.py | 1 - .../translation/gnmt/tensorflow/nmt/model_test.py | 1 - retired_benchmarks/translation/gnmt/tensorflow/nmt/nmt_test.py | 1 - .../translation/gnmt/tensorflow/nmt/scripts/rouge.py | 2 -- .../translation/gnmt/tensorflow/nmt/utils/common_test_utils.py | 1 - .../gnmt/tensorflow/nmt/utils/evaluation_utils_test.py | 1 - .../gnmt/tensorflow/nmt/utils/iterator_utils_test.py | 1 - .../translation/gnmt/tensorflow/nmt/utils/misc_utils_test.py | 1 - .../gnmt/tensorflow/nmt/utils/standard_hparams_utils.py | 1 - .../translation/gnmt/tensorflow/nmt/utils/vocab_utils.py | 1 - .../translation/gnmt/tensorflow/nmt/utils/vocab_utils_test.py | 1 - .../vision/classification_and_detection/python/main.py | 2 -- .../vision/classification_and_detection/tools/accuracy-coco.py | 2 -- .../classification_and_detection/tools/accuracy-imagenet.py | 2 -- .../vision/classification_and_detection/tools/coco-analyze.py | 2 -- .../vision/classification_and_detection/tools/lglog2csv.py | 2 -- .../vision/classification_and_detection/tools/resnet_save.py | 1 - text_to_image/main.py | 2 -- tools/submission/filter_errors.py | 2 -- vision/classification_and_detection/python/main.py | 2 -- vision/classification_and_detection/tools/accuracy-coco.py | 2 -- vision/classification_and_detection/tools/accuracy-imagenet.py | 2 -- .../classification_and_detection/tools/accuracy-openimages.py | 2 -- vision/classification_and_detection/tools/coco-analyze.py | 2 -- vision/classification_and_detection/tools/lglog2csv.py | 2 -- 31 files changed, 50 deletions(-) diff --git a/recommendation/dlrm_v2/pytorch/python/main.py b/recommendation/dlrm_v2/pytorch/python/main.py index e97de7f9dc..859a2f5b7c 100755 --- a/recommendation/dlrm_v2/pytorch/python/main.py +++ b/recommendation/dlrm_v2/pytorch/python/main.py @@ -3,8 +3,6 @@ """ - - import argparse import array import collections diff --git a/recommendation/dlrm_v2/pytorch/tools/accuracy-dlrm.py b/recommendation/dlrm_v2/pytorch/tools/accuracy-dlrm.py index b62104cc5e..0a64710313 100644 --- a/recommendation/dlrm_v2/pytorch/tools/accuracy-dlrm.py +++ b/recommendation/dlrm_v2/pytorch/tools/accuracy-dlrm.py @@ -5,8 +5,6 @@ """ - - import argparse import json diff --git a/retired_benchmarks/recommendation/dlrm/pytorch/python/main.py b/retired_benchmarks/recommendation/dlrm/pytorch/python/main.py index 8d3f8c0d9f..d3547d4058 100755 --- a/retired_benchmarks/recommendation/dlrm/pytorch/python/main.py +++ b/retired_benchmarks/recommendation/dlrm/pytorch/python/main.py @@ -3,8 +3,6 @@ """ - - import argparse import array import collections diff --git a/retired_benchmarks/recommendation/dlrm/pytorch/tools/accuracy-dlrm.py b/retired_benchmarks/recommendation/dlrm/pytorch/tools/accuracy-dlrm.py index bf7837575f..4125e7fd64 100644 --- a/retired_benchmarks/recommendation/dlrm/pytorch/tools/accuracy-dlrm.py +++ b/retired_benchmarks/recommendation/dlrm/pytorch/tools/accuracy-dlrm.py @@ -5,8 +5,6 @@ """ - - import argparse import json diff --git a/retired_benchmarks/recommendation/dlrm/tf/mlp_log.py b/retired_benchmarks/recommendation/dlrm/tf/mlp_log.py index 361b3bea1c..409416619e 100644 --- a/retired_benchmarks/recommendation/dlrm/tf/mlp_log.py +++ b/retired_benchmarks/recommendation/dlrm/tf/mlp_log.py @@ -16,8 +16,6 @@ """ - - import inspect import json import logging diff --git a/retired_benchmarks/recommendation/dlrm/tf/train_and_eval_runner.py b/retired_benchmarks/recommendation/dlrm/tf/train_and_eval_runner.py index dd7783a35c..4d55e2bcf6 100644 --- a/retired_benchmarks/recommendation/dlrm/tf/train_and_eval_runner.py +++ b/retired_benchmarks/recommendation/dlrm/tf/train_and_eval_runner.py @@ -15,7 +15,6 @@ """Bypass TPUEstimator for ResNet-50 Train.""" - import functools import math import operator diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/inference_test.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/inference_test.py index bc6041295a..71ee1217af 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/inference_test.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/inference_test.py @@ -16,7 +16,6 @@ """Tests for model inference.""" - import os import numpy as np import tensorflow as tf diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/model_test.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/model_test.py index d4a9af6bae..6c8804435f 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/model_test.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/model_test.py @@ -15,7 +15,6 @@ """Tests for model.py.""" - import pprint import sys import numpy as np diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/nmt_test.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/nmt_test.py index 149b4c90b2..ea587ed40c 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/nmt_test.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/nmt_test.py @@ -15,7 +15,6 @@ """Tests for nmt.py, train.py and inference.py.""" - import argparse import os diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/scripts/rouge.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/scripts/rouge.py index 2afd3fc825..18096aa0da 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/scripts/rouge.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/scripts/rouge.py @@ -6,8 +6,6 @@ """ - - import itertools import numpy as np diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/common_test_utils.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/common_test_utils.py index 5d68e4ede2..960be680f0 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/common_test_utils.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/common_test_utils.py @@ -16,7 +16,6 @@ """Common utility functions for tests.""" - import tensorflow as tf from tensorflow.python.ops import lookup_ops diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/evaluation_utils_test.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/evaluation_utils_test.py index 5012b69646..1e0ca62eaf 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/evaluation_utils_test.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/evaluation_utils_test.py @@ -16,7 +16,6 @@ """Tests for evaluation_utils.py.""" - import tensorflow as tf from ..utils import evaluation_utils diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/iterator_utils_test.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/iterator_utils_test.py index b39233051f..7a10d256c1 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/iterator_utils_test.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/iterator_utils_test.py @@ -16,7 +16,6 @@ """Tests for iterator_utils.py""" - import tensorflow as tf from tensorflow.python.ops import lookup_ops diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/misc_utils_test.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/misc_utils_test.py index a63b531345..29649dd807 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/misc_utils_test.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/misc_utils_test.py @@ -16,7 +16,6 @@ """Tests for vocab_utils.""" - import tensorflow as tf from ..utils import misc_utils diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/standard_hparams_utils.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/standard_hparams_utils.py index 077122208c..84f1760006 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/standard_hparams_utils.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/standard_hparams_utils.py @@ -16,7 +16,6 @@ """standard hparams utils.""" - import tensorflow as tf diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/vocab_utils.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/vocab_utils.py index 0e2b6ee339..32425d2664 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/vocab_utils.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/vocab_utils.py @@ -16,7 +16,6 @@ """Utility to handle vocabularies.""" - import codecs import os import tensorflow as tf diff --git a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/vocab_utils_test.py b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/vocab_utils_test.py index c2af64628a..c6588c8e37 100644 --- a/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/vocab_utils_test.py +++ b/retired_benchmarks/translation/gnmt/tensorflow/nmt/utils/vocab_utils_test.py @@ -16,7 +16,6 @@ """Tests for vocab_utils.""" - import codecs import os import tensorflow as tf diff --git a/retired_benchmarks/vision/classification_and_detection/python/main.py b/retired_benchmarks/vision/classification_and_detection/python/main.py index ff29e3887e..de1a547647 100755 --- a/retired_benchmarks/vision/classification_and_detection/python/main.py +++ b/retired_benchmarks/vision/classification_and_detection/python/main.py @@ -3,8 +3,6 @@ """ - - import argparse import array import collections diff --git a/retired_benchmarks/vision/classification_and_detection/tools/accuracy-coco.py b/retired_benchmarks/vision/classification_and_detection/tools/accuracy-coco.py index c50853631c..88709706c6 100644 --- a/retired_benchmarks/vision/classification_and_detection/tools/accuracy-coco.py +++ b/retired_benchmarks/vision/classification_and_detection/tools/accuracy-coco.py @@ -5,8 +5,6 @@ """ - - import argparse import json import os diff --git a/retired_benchmarks/vision/classification_and_detection/tools/accuracy-imagenet.py b/retired_benchmarks/vision/classification_and_detection/tools/accuracy-imagenet.py index 45a5e025f0..71ef9d5636 100644 --- a/retired_benchmarks/vision/classification_and_detection/tools/accuracy-imagenet.py +++ b/retired_benchmarks/vision/classification_and_detection/tools/accuracy-imagenet.py @@ -4,8 +4,6 @@ """ - - import argparse import json diff --git a/retired_benchmarks/vision/classification_and_detection/tools/coco-analyze.py b/retired_benchmarks/vision/classification_and_detection/tools/coco-analyze.py index 8761b00786..49bfc19a4d 100755 --- a/retired_benchmarks/vision/classification_and_detection/tools/coco-analyze.py +++ b/retired_benchmarks/vision/classification_and_detection/tools/coco-analyze.py @@ -3,8 +3,6 @@ """ - - import argparse import collections import json diff --git a/retired_benchmarks/vision/classification_and_detection/tools/lglog2csv.py b/retired_benchmarks/vision/classification_and_detection/tools/lglog2csv.py index 73e023e53d..1b968a251c 100644 --- a/retired_benchmarks/vision/classification_and_detection/tools/lglog2csv.py +++ b/retired_benchmarks/vision/classification_and_detection/tools/lglog2csv.py @@ -3,8 +3,6 @@ """ - - import argparse import re import time diff --git a/retired_benchmarks/vision/classification_and_detection/tools/resnet_save.py b/retired_benchmarks/vision/classification_and_detection/tools/resnet_save.py index 67374d7c22..ed6a9a5a69 100755 --- a/retired_benchmarks/vision/classification_and_detection/tools/resnet_save.py +++ b/retired_benchmarks/vision/classification_and_detection/tools/resnet_save.py @@ -20,7 +20,6 @@ """ - import functools import math import multiprocessing diff --git a/text_to_image/main.py b/text_to_image/main.py index 3b4aecf97f..1935c2d1e0 100644 --- a/text_to_image/main.py +++ b/text_to_image/main.py @@ -3,8 +3,6 @@ """ - - import argparse import array import collections diff --git a/tools/submission/filter_errors.py b/tools/submission/filter_errors.py index 874dfc05f3..ce037184d8 100644 --- a/tools/submission/filter_errors.py +++ b/tools/submission/filter_errors.py @@ -3,8 +3,6 @@ """ - - import argparse import sys diff --git a/vision/classification_and_detection/python/main.py b/vision/classification_and_detection/python/main.py index ba2fbb5f85..5f1ef39429 100755 --- a/vision/classification_and_detection/python/main.py +++ b/vision/classification_and_detection/python/main.py @@ -3,8 +3,6 @@ """ - - import argparse import array import collections diff --git a/vision/classification_and_detection/tools/accuracy-coco.py b/vision/classification_and_detection/tools/accuracy-coco.py index c50853631c..88709706c6 100644 --- a/vision/classification_and_detection/tools/accuracy-coco.py +++ b/vision/classification_and_detection/tools/accuracy-coco.py @@ -5,8 +5,6 @@ """ - - import argparse import json import os diff --git a/vision/classification_and_detection/tools/accuracy-imagenet.py b/vision/classification_and_detection/tools/accuracy-imagenet.py index 45a5e025f0..71ef9d5636 100644 --- a/vision/classification_and_detection/tools/accuracy-imagenet.py +++ b/vision/classification_and_detection/tools/accuracy-imagenet.py @@ -4,8 +4,6 @@ """ - - import argparse import json diff --git a/vision/classification_and_detection/tools/accuracy-openimages.py b/vision/classification_and_detection/tools/accuracy-openimages.py index fc20ecb648..2e631623fe 100644 --- a/vision/classification_and_detection/tools/accuracy-openimages.py +++ b/vision/classification_and_detection/tools/accuracy-openimages.py @@ -5,8 +5,6 @@ """ - - import argparse import json import os diff --git a/vision/classification_and_detection/tools/coco-analyze.py b/vision/classification_and_detection/tools/coco-analyze.py index 8761b00786..49bfc19a4d 100755 --- a/vision/classification_and_detection/tools/coco-analyze.py +++ b/vision/classification_and_detection/tools/coco-analyze.py @@ -3,8 +3,6 @@ """ - - import argparse import collections import json diff --git a/vision/classification_and_detection/tools/lglog2csv.py b/vision/classification_and_detection/tools/lglog2csv.py index 73e023e53d..1b968a251c 100644 --- a/vision/classification_and_detection/tools/lglog2csv.py +++ b/vision/classification_and_detection/tools/lglog2csv.py @@ -3,8 +3,6 @@ """ - - import argparse import re import time From 604aed1fd611d5e5d9c2d41efa07a82379e178cb Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 30 Mar 2026 16:29:26 +0000 Subject: [PATCH 59/64] [Automated Commit] Format Codebase --- .../tools/calibrate_torchvision_model.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/vision/classification_and_detection/tools/calibrate_torchvision_model.py b/vision/classification_and_detection/tools/calibrate_torchvision_model.py index 875d26c388..87c090e2ee 100644 --- a/vision/classification_and_detection/tools/calibrate_torchvision_model.py +++ b/vision/classification_and_detection/tools/calibrate_torchvision_model.py @@ -74,10 +74,15 @@ def main(): dataloader = DataLoader(dataset, batch_size=1) if not hasattr(torchvision_quantization_models, args.model): - raise ValueError(f"Model {args.model} not found in torchvision quantization models") - - - model = getattr(torchvision_quantization_models, args.model)(pretrained=True, progress=True, quantize=False) + raise ValueError( + f"Model {args.model} not found in torchvision quantization models") + + model = getattr( + torchvision_quantization_models, + args.model)( + pretrained=True, + progress=True, + quantize=False) quantize_model(model, dataloader) print(model) From fa06b0429c46d7d859355b56c359157e9fee15fe Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 30 Mar 2026 16:35:24 +0000 Subject: [PATCH 60/64] [Automated Commit] Format Codebase --- language/mixtral-8x7b/evaluate_mbxp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language/mixtral-8x7b/evaluate_mbxp.py b/language/mixtral-8x7b/evaluate_mbxp.py index 48c39a37bd..57a13d63e0 100644 --- a/language/mixtral-8x7b/evaluate_mbxp.py +++ b/language/mixtral-8x7b/evaluate_mbxp.py @@ -76,7 +76,7 @@ def worker(inp_queue, out_queue): } checker = lang_to_checker_map[problem["lang"]] - + problem["task_id"] = key problem["test"] = problem["test_code"] From 15b5b882572931375a0026c79caa61e4d8623a90 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 30 Mar 2026 16:38:42 +0000 Subject: [PATCH 61/64] [Automated Commit] Format Codebase --- .../tools/coco_generate_calibration.py | 36 ++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/text_to_image/tools/coco_generate_calibration.py b/text_to_image/tools/coco_generate_calibration.py index 3a89c63683..096f5e4079 100644 --- a/text_to_image/tools/coco_generate_calibration.py +++ b/text_to_image/tools/coco_generate_calibration.py @@ -15,6 +15,7 @@ logging.basicConfig(level=logging.INFO) log = logging.getLogger("coco") + def get_args(): """Parse commandline.""" parser = argparse.ArgumentParser() @@ -41,6 +42,7 @@ def get_args(): args = parser.parse_args() return args + def download_file(url: str, output_dir: Path, filename: str | None = None): os.makedirs(str(output_dir), exist_ok=True) @@ -65,6 +67,7 @@ def download_file(url: str, output_dir: Path, filename: str | None = None): return output_path + if __name__ == "__main__": args = get_args() dataset_dir = os.path.abspath(args.dataset_dir) @@ -80,21 +83,22 @@ def download_file(url: str, output_dir: Path, filename: str | None = None): calibration_dir = Path(calibration_dir) # Check if raw annotations file already exist - if not (dataset_dir / "raw" / "annotations" / "captions_train2014.json").exists(): - # Download annotations - os.makedirs(str(dataset_dir / "raw"), exist_ok=True) - os.makedirs(str(dataset_dir / "download_aux"), exist_ok=True) - download_file( - url="http://images.cocodataset.org/annotations/annotations_trainval2014.zip", - output_dir=dataset_dir / "download_aux", - ) - # Unzip file - zipfile_path = dataset_dir / "download_aux" / "annotations_trainval2014.zip" - # Unzip file - with zipfile.ZipFile( - str(zipfile_path), "r" - ) as zip_ref: - zip_ref.extractall(str(dataset_dir / "raw/")) + if not (dataset_dir / "raw" / "annotations" / + "captions_train2014.json").exists(): + # Download annotations + os.makedirs(str(dataset_dir / "raw"), exist_ok=True) + os.makedirs(str(dataset_dir / "download_aux"), exist_ok=True) + download_file( + url="http://images.cocodataset.org/annotations/annotations_trainval2014.zip", + output_dir=dataset_dir / "download_aux", + ) + # Unzip file + zipfile_path = dataset_dir / "download_aux" / "annotations_trainval2014.zip" + # Unzip file + with zipfile.ZipFile( + str(zipfile_path), "r" + ) as zip_ref: + zip_ref.extractall(str(dataset_dir / "raw/")) # Convert to dataframe format and extract the relevant fields with open(dataset_dir / "raw" / "annotations" / "captions_train2014.json") as f: @@ -133,4 +137,4 @@ def download_file(url: str, output_dir: Path, filename: str | None = None): s = "\n".join([str(_) for _ in df_annotations["id"].values]) f.write(s) # Remove Folder - shutil.rmtree(dataset_dir) \ No newline at end of file + shutil.rmtree(dataset_dir) From 59db6b934e7ab179c2fb42f86cc5641b526064b0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 30 Mar 2026 16:58:08 +0000 Subject: [PATCH 62/64] [Automated Commit] Format Codebase --- vision/classification_and_detection/python/backend_ncnn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision/classification_and_detection/python/backend_ncnn.py b/vision/classification_and_detection/python/backend_ncnn.py index 59f6accde1..a5d2fd2adb 100644 --- a/vision/classification_and_detection/python/backend_ncnn.py +++ b/vision/classification_and_detection/python/backend_ncnn.py @@ -24,7 +24,7 @@ def load(self, model_path, inputs=None, outputs=None): if param_file.endswith("resnet50_v1.param"): # download model files if doesn't self.net = Resnet50(param_file, bin_file) - else: + else: print( "please add your ncnn model .param and .bin files to dir named 'resnet'" From 56ba8ea48d133c5d6979e5566154de64da37f46f Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 31 Mar 2026 21:15:48 +0530 Subject: [PATCH 63/64] fix typo --- docs/benchmarks/language/get-llama3_1-8b-data.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/benchmarks/language/get-llama3_1-8b-data.md b/docs/benchmarks/language/get-llama3_1-8b-data.md index 4f14563731..c0be60053a 100644 --- a/docs/benchmarks/language/get-llama3_1-8b-data.md +++ b/docs/benchmarks/language/get-llama3_1-8b-data.md @@ -41,7 +41,7 @@ The benchmark implementation run command will automatically download the require === "Pytorch" - === "From Cloudfare R2" + === "From MLCOMMONS Storage" > **Note:** One has to accept the [MLCommons Llama 3.1 License Confidentiality Notice](http://llama3-1.mlcommons.org/) to access the model files in MLCOMMONS Storage. From f666f163cd8622a255f27c5defff9766c0e28f21 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 31 Mar 2026 17:25:47 +0000 Subject: [PATCH 64/64] [Automated Commit] Format Codebase --- tools/submission/submission_checker/constants.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/submission/submission_checker/constants.py b/tools/submission/submission_checker/constants.py index dc45cd83d2..2f4abd87f8 100644 --- a/tools/submission/submission_checker/constants.py +++ b/tools/submission/submission_checker/constants.py @@ -1132,12 +1132,12 @@ "84", "59", "12", - "31", + "31", "86", - "122", - "233", + "122", + "233", "96", - ] + ] }, } }