-
Notifications
You must be signed in to change notification settings - Fork 47
[Feature Enhancement] Modify auto_fault_bisearcher #644
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,107 @@ | ||
| import sys | ||
| import subprocess | ||
| import time | ||
| from pathlib import Path | ||
| from graph_net.declare_config_mixin import DeclareConfigMixin | ||
|
|
||
|
|
||
class DeviceEvaluator(DeclareConfigMixin):
    """
    Evaluator responsible for comparing model performance and accuracy between
    a reference device (e.g., CPU) and a target device (e.g., CUDA).
    Uses 'default' as the operator library for all target executions.

    Calling an instance with a model path (relative to ``model_path_prefix``)
    runs the reference test first, then the target test, and returns the text
    of the target run's log.
    """

    def __init__(self, config=None):
        self.init_config(config)

    def declare_config(
        self,
        model_path_prefix: str,
        output_dir: str,
        ref_device: str = "cpu",
        target_device: str = "cuda",
        compiler: str = "nope",
    ):
        """
        Configuration schema for cross-device benchmarking.

        Keys as they are consumed by this class:

        * ``model_path_prefix`` -- directory joined with the relative model
          path given to ``__call__``.
        * ``output_dir`` -- root under which the per-model log directory and
          the shared ``reference_data`` directory are created.
        * ``ref_device`` -- passed as ``--device`` to the reference run.
        * ``target_device`` -- passed as ``--device`` to the target run and
          used as a sub-directory name for its logs.
        * ``compiler`` -- passed as ``--compiler`` to the reference run.
        """
        pass

    def __call__(self, rel_model_path: str) -> str:
        """
        Orchestrates the evaluation pipeline:
        1. Generates ground truth data on the reference device.
        2. Validates performance/accuracy on the target device.

        Args:
            rel_model_path: Model path relative to ``model_path_prefix``.

        Returns:
            The combined stdout/stderr of the target run, read back from
            ``<output_dir>/<target_device>/<rel_model_path>/validation.log``.

        Raises:
            subprocess.CalledProcessError: If either child process exits with
                a non-zero status.
        """
        output_path = Path(self.config["output_dir"])
        full_model_path = Path(self.config["model_path_prefix"]) / rel_model_path

        # Define specific workspace for target device logs
        workspace = output_path / self.config["target_device"] / rel_model_path
        workspace.mkdir(parents=True, exist_ok=True)

        # Directory for sharing ground truth data between runs
        reference_dir = output_path / "reference_data"
        reference_dir.mkdir(parents=True, exist_ok=True)

        log_file = workspace / "validation.log"

        # Step 1: Execute reference test to establish baseline
        print(f"Generating reference data on: {self.config['ref_device']}")
        self._run_reference_test(full_model_path, reference_dir)

        # Step 2: Execute target test and return captured logs
        print(f"Running target evaluation on: {self.config['target_device']}")
        return self._run_target_test(full_model_path, reference_dir, log_file)

    def _run_reference_test(self, full_model_path: Path, reference_dir: Path):
        """
        Invokes the reference module to generate expected outputs (Ground Truth).

        Args:
            full_model_path: Path of the model to run.
            reference_dir: Directory handed to the child as ``--reference-dir``.

        Raises:
            subprocess.CalledProcessError: If the reference process exits with
                a non-zero status. Its captured stderr is printed first.
        """
        cmd = [
            sys.executable,
            "-m",
            "graph_net.torch.test_reference_device",
            "--model-path",
            str(full_model_path),
            "--reference-dir",
            str(reference_dir),
            "--compiler",
            self.config["compiler"],
            "--device",
            self.config["ref_device"],
        ]
        # Reference runs are silent on success. errors="replace" keeps a stray
        # undecodable byte in the child's output from raising UnicodeDecodeError
        # in this process.
        try:
            subprocess.run(
                cmd, check=True, capture_output=True, text=True, errors="replace"
            )
        except subprocess.CalledProcessError as err:
            # The output was captured, so surface it here; otherwise the
            # reason for the failure would be lost with the exception.
            if err.stderr:
                print(err.stderr, file=sys.stderr)
            raise

    def _run_target_test(
        self, full_model_path: Path, reference_dir: Path, log_file: Path
    ) -> str:
        """
        Executes the model on the target device using 'default' op_lib
        and captures the full output log.

        Args:
            full_model_path: Path of the model to run.
            reference_dir: Directory handed to the child as ``--reference-dir``.
            log_file: File that receives the child's stdout and stderr.

        Returns:
            The contents of ``log_file`` after the run.

        Raises:
            subprocess.CalledProcessError: If the target process exits with a
                non-zero status; ``log_file`` still holds the output written
                up to that point.
        """
        cmd = [
            sys.executable,
            "-m",
            "graph_net.torch.test_target_device",
            "--model-path",
            str(full_model_path),
            "--reference-dir",
            str(reference_dir),
            "--device",
            self.config["target_device"],
            "--op-lib",
            "default",
        ]

        print(" ".join(cmd))
        # Redirect all output to the log file for persistence and analysis
        with log_file.open("w") as f:
            start_time = time.perf_counter()
            subprocess.run(cmd, stdout=f, stderr=subprocess.STDOUT, check=True)
            end_time = time.perf_counter()
            print(f"Target execution completed in {end_time - start_time:.4f} seconds")

        # The child writes raw bytes to the file, so decode leniently: an
        # undecodable byte must not turn a finished run into an exception.
        return log_file.read_text(errors="replace")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,6 @@ | ||
| import os | ||
| import graph_net | ||
| import shutil | ||
| from pathlib import Path | ||
| from typing import List, Tuple | ||
| from graph_net.sample_pass.sample_pass import SamplePass | ||
|
|
@@ -81,7 +82,10 @@ def __call__(self, rel_model_path: str): | |
| """ | ||
| # 2. Invoke the core binary search algorithm | ||
| # history type: list[tuple[int, bool]] | ||
| history: List[Tuple[int, bool]] = bi_search( | ||
| history: List[Tuple[int, bool]] | ||
| faulty_operator_index: int | ||
| faulty_model_path: str | ||
| history, faulty_operator_index, faulty_model_path = bi_search( | ||
| relative_model_path=rel_model_path, | ||
| truncator=self.truncator, | ||
| evaluator=self.evaluator, | ||
|
|
@@ -100,13 +104,21 @@ def __call__(self, rel_model_path: str): | |
| output_base.mkdir(parents=True, exist_ok=True) | ||
|
|
||
| result_file = output_base / file_name | ||
| test_file = ( | ||
| Path(self.config["truncator_config"]["output_dir"]) / faulty_model_path | ||
| ) | ||
|
|
||
| # Write history entries in the format: {truncate_size} {has_fault} | ||
| with result_file.open("w", encoding="utf-8") as f: | ||
| for trunc_size, has_fault in history: | ||
| f.write(f"{trunc_size} {has_fault}\n") | ||
|
|
||
| save_base = Path(self.config["output_dir"]) / "faulty_test" | ||
| save_base.mkdir(parents=True, exist_ok=True) | ||
| shutil.copytree(test_file, save_base / test_file.name, dirs_exist_ok=True) | ||
|
Comment on lines
+116
to
+118
Contributor
Author
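There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 支持将错误/误差算子子图保存到faulty_test目录下 (Supports saving the subgraph of the faulty / numerically deviating operator under the faulty_test directory.) |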
||
| print( | ||
| f"[AutoFault] Search history for {rel_model_path} saved to: {result_file}" | ||
| ) | ||
| print(f"First faulty operator index: {faulty_operator_index}") | ||
| print(f"Faulty operator model path: {test_file}") | ||
| return history | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,38 @@ | ||
#!/bin/bash
# Runs the AutoFaultBisearcher sample pass over a small list of torch samples,
# using DeviceEvaluator (cpu reference vs. cuda target) as the evaluator.

# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere in a pipeline.
set -euo pipefail

# Resolve the root directory of the project (the parent of the graph_net
# package). Stop here if the package cannot be imported; otherwise every path
# below would silently be rooted at "/".
if ! GRAPH_NET_ROOT=$(python3 -c "import graph_net; import os; print(os.path.dirname(os.path.dirname(graph_net.__file__)))"); then
    echo "error: unable to import graph_net with python3" >&2
    exit 1
fi

# Test Environment Setup
MODEL_LIST="$GRAPH_NET_ROOT/graph_net/test/small10_torch_samples_list.txt"
MODEL_PREFIX="$GRAPH_NET_ROOT"
OUTPUT_DIR="/tmp/workspace_auto_fault_bisearcher"

# Execute the SamplePass via the standard CLI entry point.
# The JSON config is passed base64-encoded on a single line (-w 0 disables
# line wrapping); the substitution is quoted so it is always one argument.
python3 -m graph_net.apply_sample_pass \
    --model-path-list "$MODEL_LIST" \
    --sample-pass-file-path "$GRAPH_NET_ROOT/graph_net/sample_pass/auto_fault_bisearcher.py" \
    --sample-pass-class-name AutoFaultBisearcher \
    --sample-pass-config "$(base64 -w 0 <<EOF
{
    "model_path_prefix": "$MODEL_PREFIX",
    "output_dir": "$OUTPUT_DIR",
    "output_file_name": "truncate_size_has_fault.txt",

    "truncator_config": {
        "model_path_prefix": "$MODEL_PREFIX",
        "output_dir": "$OUTPUT_DIR/workspace_truncator/"
    },
    "evaluator_file_path": "$GRAPH_NET_ROOT/graph_net/fault_locator/torch/device_evaluator.py",
    "evaluator_class_name": "DeviceEvaluator",
    "evaluator_config": {
        "model_path_prefix": "$OUTPUT_DIR/workspace_truncator/",
        "output_dir": "$OUTPUT_DIR/device_evaluator",
        "compiler": "nope",
        "ref_device": "cpu",
        "target_device": "cuda"
    },
    "tolerance": -9
}
EOF
)"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -75,8 +75,9 @@ def test_single_model(args): | |
| target_time_stats = eval_backend_diff.parse_time_stats_from_reference_log( | ||
| target_log | ||
| ) | ||
|
|
||
| eval_backend_diff.compare_correctness(ref_out, target_out, eval_args) | ||
| eval_backend_diff.compare_correctness( | ||
| list(flatten_tensor(ref_out)), list(flatten_tensor(target_out)), eval_args | ||
|
Contributor
Author
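There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 部分中间结果返回是tuple(tensor),需要展开传入 (Some intermediate results are returned as tuple(tensor) and must be flattened before being passed in.) |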
||
| ) | ||
| test_compiler_util.print_times_and_speedup(args, ref_time_stats, target_time_stats) | ||
|
|
||
|
|
||
|
|
@@ -85,6 +86,14 @@ def is_reference_log_exist(reference_dir, model_path): | |
| return os.path.isfile(log_path) | ||
|
|
||
|
|
||
def flatten_tensor(lst):
    """Lazily yield the leaves of an arbitrarily nested list/tuple structure.

    Elements that are lists or tuples are descended into, depth first and in
    order; every other element is yielded unchanged.
    """
    for item in lst:
        if not isinstance(item, (list, tuple)):
            yield item
            continue
        # Nested container: emit its leaves before moving to the next item.
        for leaf in flatten_tensor(item):
            yield leaf
|
|
||
|
|
||
| def test_multi_models(args): | ||
| assert os.path.isdir(args.reference_dir) | ||
|
|
||
|
|
@@ -144,7 +153,11 @@ def main(args): | |
| ) | ||
| else: | ||
| eval_backend_perf.register_op_lib(args.op_lib) | ||
|
|
||
| print( | ||
| f"[Processing] model_path: {args.model_path}", | ||
| file=sys.stderr, | ||
| flush=True, | ||
| ) | ||
| test_single_model(args) | ||
| else: | ||
| test_multi_models(args) | ||
|
|
||
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
增加边界停止条件