From 8533e68d489b9a1ba4fc0caaf3c6d2dfd9dd6e19 Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Wed, 22 Apr 2026 15:13:08 +0300 Subject: [PATCH 01/30] Configurable host and paths via environment Convection for it path = os.getenv("MY_PATH", "/default/path") --- .../CLIRegister/CLIRegister.py | 3 +- .../ConfigValidator/Config/RunnerConfig.py | 6 ++- .../Config/Validation/ConfigValidator.py | 5 ++- .../Plugins/Profilers/WattsUpPro.py | 6 ++- test_env_var.py | 40 +++++++++++++++++++ 5 files changed, 54 insertions(+), 6 deletions(-) create mode 100644 test_env_var.py diff --git a/experiment-runner/ConfigValidator/CLIRegister/CLIRegister.py b/experiment-runner/ConfigValidator/CLIRegister/CLIRegister.py index 3b7026f6f..c995efd70 100644 --- a/experiment-runner/ConfigValidator/CLIRegister/CLIRegister.py +++ b/experiment-runner/ConfigValidator/CLIRegister/CLIRegister.py @@ -31,7 +31,8 @@ def execute(args=None) -> None: if args is None: filepath = __file__.split('/') filepath.pop() - filepath = '/'.join(filepath) + "/../../../examples/" + #filepath = '/'.join(filepath) + "/../../../examples/" + filepath = os.getenv("EXAMPLES_PATH", '/'.join(filepath) + "/../../../examples/") destination = os.path.abspath(filepath) else: if len(args) == 3: diff --git a/experiment-runner/ConfigValidator/Config/RunnerConfig.py b/experiment-runner/ConfigValidator/Config/RunnerConfig.py index a79e0eb4e..4fbef26a9 100644 --- a/experiment-runner/ConfigValidator/Config/RunnerConfig.py +++ b/experiment-runner/ConfigValidator/Config/RunnerConfig.py @@ -1,3 +1,5 @@ +import os + from EventManager.Models.RunnerEvents import RunnerEvents from EventManager.EventSubscriptionController import EventSubscriptionController from ConfigValidator.Config.Models.RunTableModel import RunTableModel @@ -22,7 +24,9 @@ class RunnerConfig: """The path in which Experiment Runner will create a folder with the name `self.name`, in order to store the results from this experiment. 
(Path does not need to exist - it will be created if necessary.) Output path defaults to the config file's path, inside the folder 'experiments'""" - results_output_path: Path = ROOT_DIR / 'experiments' + #results_output_path: Path = ROOT_DIR / 'experiments' + default_output = ROOT_DIR / "experiments" + results_output_path: Path = Path(os.getenv("EXPERIMENT_RUNNER_OUTPUT_PATH", str(default_output))) """Experiment operation type. Unless you manually want to initiate each run, use `OperationType.AUTO`.""" operation_type: OperationType = OperationType.AUTO diff --git a/experiment-runner/ConfigValidator/Config/Validation/ConfigValidator.py b/experiment-runner/ConfigValidator/Config/Validation/ConfigValidator.py index dcd9ff205..81d16bde6 100644 --- a/experiment-runner/ConfigValidator/Config/Validation/ConfigValidator.py +++ b/experiment-runner/ConfigValidator/Config/Validation/ConfigValidator.py @@ -81,8 +81,9 @@ def validate_config(config: RunnerConfig): if config.self_measure: if not hasattr(config, "self_measure_bin"): - config.self_measure_bin = "/usr/local/bin/energibridge" # This is spesific to linux, might work for osx as well - + #config.self_measure_bin = "/usr/local/bin/energibridge" # This is spesific to linux, might work for osx as well + config.self_measure_bin = os.getenv("ENERGIBRIDGE_PATH", "/usr/local/bin/energibridge") + if not hasattr(config, "self_measure_logfile"): config.self_measure_logfile = None diff --git a/experiment-runner/Plugins/Profilers/WattsUpPro.py b/experiment-runner/Plugins/Profilers/WattsUpPro.py index 6cac77f7b..333b9670c 100644 --- a/experiment-runner/Plugins/Profilers/WattsUpPro.py +++ b/experiment-runner/Plugins/Profilers/WattsUpPro.py @@ -15,9 +15,11 @@ def __init__(self, port: str = None, interval=1.0): if port is None: system = uname()[0] if system == 'Darwin': # OS X - port = '/dev/tty.usbserial-A1000wT3' + #port = '/dev/tty.usbserial-A1000wT3' + port = os.getenv("WATTS_UP_PRO_PORT_MACOS", '/dev/tty.usbserial-A1000wT3') elif 
system == 'Linux': - port = '/dev/ttyUSB0' + #port = '/dev/ttyUSB0' + port = os.getenv("WATTS_UP_PRO_PORT_LINUX", '/dev/ttyUSB0') if not os.path.exists(port): print( '') diff --git a/test_env_var.py b/test_env_var.py new file mode 100644 index 000000000..e0722eba9 --- /dev/null +++ b/test_env_var.py @@ -0,0 +1,40 @@ +import os +from pathlib import Path + +print("=" * 70) +print("ENVIRONMENT VARIABLE TEST - Experiment Runner Portability") +print("=" * 70) + +# Test all environment variables +env_vars = { + "EXPERIMENT_RUNNER_OUTPUT_PATH": "/default/experiments", + "ENERGIBRIDGE_PATH": "/usr/local/bin/energibridge", + "WATTS_UP_PRO_PORT_MACOS": "/dev/tty.usbserial-A1000wT3", + "WATTS_UP_PRO_PORT_LINUX": "/dev/ttyUSB0", + "EXAMPLES_PATH": "/default/examples" +} + +print("\n1. WITHOUT environment variables set:") +print("-" * 70) +for var, default in env_vars.items(): + value = os.getenv(var, f"DEFAULT: {default}") + print(f" {var}") + print(f" = {value}\n") + +# Now set environment variables +os.environ["EXPERIMENT_RUNNER_OUTPUT_PATH"] = "C:\\my-experiments" +os.environ["ENERGIBRIDGE_PATH"] = "C:\\tools\\energibridge.exe" +os.environ["WATTS_UP_PRO_PORT_MACOS"] = "COM5" +os.environ["WATTS_UP_PRO_PORT_LINUX"] = "COM3" +os.environ["EXAMPLES_PATH"] = "C:\\my-examples" + +print("\n2. 
WITH environment variables set:") +print("-" * 70) +for var in env_vars.keys(): + value = os.getenv(var, "NOT SET") + print(f" {var}") + print(f" = {value}\n") + +print("=" * 70) +print("SUCCESS - Environment variables are working!") +print("=" * 70) \ No newline at end of file From c1f6c5b08e9bb14353de7242f333662825559402 Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Thu, 23 Apr 2026 13:27:46 +0300 Subject: [PATCH 02/30] Configurable host and paths via environment Update the READme file and also the experiments gave as exxemples --- Exemples.md | 16 +++++ Personal_experiments/RunnerConfig.py | 58 +++++++++++++++++++ README.md | 30 ++++++++++ .../hello-world-fibonacci/RunnerConfig.py | 3 +- examples/hello-world/RunnerConfig.py | 4 +- .../profilers/EnergiBridge/RunnerConfig.py | 4 +- examples/profilers/JoularCore/RunnerConfig.py | 3 +- examples/profilers/NvidiaML/RunnerConfig.py | 4 +- examples/profilers/PicoCM3/RunnerConfig.py | 4 +- .../profilers/PowerJoular/RunnerConfig.py | 4 +- .../profilers/PowerLetrics/RunnerConfig.py | 4 +- .../profilers/PowerMetrics/RunnerConfig.py | 4 +- .../linux-ps-profiling/RunnerConfig.py | 4 +- .../measure-self-profiling/RunnerConfig.py | 7 ++- .../core/arbitrary-objects/RunnerConfig.py | 4 +- .../core/shuffling/RunnerConfig.py | 3 +- .../combined/RunnerConfig.py | 3 +- .../individual/RunnerConfig.py | 3 +- .../plugins/PicoCM3/RunnerConfig.py | 3 +- 19 files changed, 148 insertions(+), 17 deletions(-) create mode 100644 Exemples.md create mode 100644 Personal_experiments/RunnerConfig.py diff --git a/Exemples.md b/Exemples.md new file mode 100644 index 000000000..c266f4310 --- /dev/null +++ b/Exemples.md @@ -0,0 +1,16 @@ +# Domain-Specific Usage Examples + +Experiment Runner is designed to be domain-agnostic. Below are practical examples demonstrating how it can be configured and used across different research domains. + +These examples are meant to help researchers quickly adapt the framework to their own experimental setup. + +## 1. 
Code-level Performance Measurements +This experiment compares the performance of two implementations of summation under different input sizes. + +### Set-Up +- Factors: + - algorithm ∈ {sum_loop, optimized_sum} + - input_size ∈ {10k, 100k, 500k} +- Metric: + - execution_time_ms + diff --git a/Personal_experiments/RunnerConfig.py b/Personal_experiments/RunnerConfig.py new file mode 100644 index 000000000..4d6d0e7b5 --- /dev/null +++ b/Personal_experiments/RunnerConfig.py @@ -0,0 +1,58 @@ +import time +import os + +from ConfigValidator.Config.Models.FactorModel import FactorModel +from ConfigValidator.Config.Models.RunTableModel import RunTableModel +from ConfigValidator.Config.Models.RunnerContext import RunnerContext +from pathlib import Path +from ConfigValidator.Config.Models.OperationType import OperationType + + +def run_experiment(algorithm, input_size): + data = range(input_size) + + start = time.time() + + if algorithm == "sum_loop": + total = 0 + for x in data: + total += x + elif algorithm == "optimized_sum": + total = sum(data) + + end = time.time() + + return (end - start) * 1000 + + +class RunnerConfig: + + name = "code_performance_example" + default_output = Path("experiments") + results_output_path = Path(os.getenv("EXPERIMENT_RUNNER_OUTPUT_PATH", str(default_output))) + + operation_type = OperationType.AUTO + + time_between_runs_in_ms = 1000 + + experiment_path = None + + def create_run_table_model(self): + factor1 = FactorModel("algorithm", ["sum_loop", "optimized_sum"]) + factor2 = FactorModel("input_size", [10000, 100000, 500000]) + + return RunTableModel( + factors=[factor1, factor2], + data_columns=["execution_time_ms"] + ) + + def populate_run_data(self, context: RunnerContext): + + algorithm = context.run_variation["algorithm"] + input_size = context.run_variation["input_size"] + + exec_time = run_experiment(algorithm, input_size) + + return { + "execution_time_ms": exec_time + } \ No newline at end of file diff --git a/README.md
b/README.md index b02699b7c..375b6ed97 100644 --- a/README.md +++ b/README.md @@ -76,6 +76,36 @@ python experiment-runner/ The results of the experiment will be stored in the directory `RunnerConfig.results_output_path/RunnerConfig.name` as defined by your config variables. +### Portability Across Users and Machines + +When sharing experiments across different users or machines, hardcoded paths in configuration files can cause issues. Experiment Runner supports **environment variables** to make your experiments portable without code changes: + +#### Available Environment Variables + +- **`EXPERIMENT_RUNNER_OUTPUT_PATH`**: Directory where experiment results are stored + - Default: `<ROOT_DIR>/experiments` (the `experiments` folder next to the config file) + - Example: `export EXPERIMENT_RUNNER_OUTPUT_PATH="/path/to/results"` + +- **`ENERGIBRIDGE_PATH`**: Path to the EnergiBridge executable (for energy measurements) + - Default: `/usr/local/bin/energibridge` + - Example: `export ENERGIBRIDGE_PATH="/opt/energibridge/bin/energibridge"` + +- **`EXAMPLES_PATH`**: Directory for generating new config templates + - Default: the repository's `examples/` directory + - Example: `export EXAMPLES_PATH="/home/user/my-experiments"` + +#### Using Environment Variables + +Set environment variables before running your experiment: + +```bash +export EXPERIMENT_RUNNER_OUTPUT_PATH="/data/experiments" +export ENERGIBRIDGE_PATH="/opt/energibridge/bin/energibridge" +python experiment-runner/ MyRunnerConfig.py +``` + +Your configuration files automatically use these variables if set, with sensible defaults when they are not. This allows the same experiment to run on different machines without any code modifications.
+ **More information about the profilers and use cases can be found in the [Wiki tab](https://github.com/S2-group/experiment-runner/wiki).** ## How to cite Experiment Runner diff --git a/examples/hello-world-fibonacci/RunnerConfig.py b/examples/hello-world-fibonacci/RunnerConfig.py index 9c5dbc3a8..bd066494b 100644 --- a/examples/hello-world-fibonacci/RunnerConfig.py +++ b/examples/hello-world-fibonacci/RunnerConfig.py @@ -27,7 +27,8 @@ class RunnerConfig: """The path in which Experiment Runner will create a folder with the name `self.name`, in order to store the results from this experiment. (Path does not need to exist - it will be created if necessary.) Output path defaults to the config file's path, inside the folder 'experiments'""" - results_output_path: Path = ROOT_DIR / 'experiments' + default_output = ROOT_DIR / "experiments" + results_output_path: Path = Path(os.getenv("EXPERIMENT_RUNNER_OUTPUT_PATH", str(default_output))) """Experiment operation type. Unless you manually want to initiate each run, use `OperationType.AUTO`.""" operation_type: OperationType = OperationType.AUTO diff --git a/examples/hello-world/RunnerConfig.py b/examples/hello-world/RunnerConfig.py index 61d1411f0..3641052f7 100644 --- a/examples/hello-world/RunnerConfig.py +++ b/examples/hello-world/RunnerConfig.py @@ -9,6 +9,7 @@ from typing import Dict, List, Any, Optional from pathlib import Path from os.path import dirname, realpath +import os class RunnerConfig: @@ -21,7 +22,8 @@ class RunnerConfig: """The path in which Experiment Runner will create a folder with the name `self.name`, in order to store the results from this experiment. (Path does not need to exist - it will be created if necessary.) 
Output path defaults to the config file's path, inside the folder 'experiments'""" - results_output_path: Path = ROOT_DIR / 'experiments' + default_output = ROOT_DIR / "experiments" + results_output_path: Path = Path(os.getenv("EXPERIMENT_RUNNER_OUTPUT_PATH", str(default_output))) """Experiment operation type. Unless you manually want to initiate each run, use `OperationType.AUTO`.""" operation_type: OperationType = OperationType.AUTO diff --git a/examples/profilers/EnergiBridge/RunnerConfig.py b/examples/profilers/EnergiBridge/RunnerConfig.py index 28df866b0..e2b93c3db 100644 --- a/examples/profilers/EnergiBridge/RunnerConfig.py +++ b/examples/profilers/EnergiBridge/RunnerConfig.py @@ -10,6 +10,7 @@ from typing import Dict, List, Any, Optional from pathlib import Path from os.path import dirname, realpath +import os class RunnerConfig: @@ -22,7 +23,8 @@ class RunnerConfig: """The path in which Experiment Runner will create a folder with the name `self.name`, in order to store the results from this experiment. (Path does not need to exist - it will be created if necessary.) Output path defaults to the config file's path, inside the folder 'experiments'""" - results_output_path: Path = ROOT_DIR / 'experiments' + default_output = ROOT_DIR / "experiments" + results_output_path: Path = Path(os.getenv("EXPERIMENT_RUNNER_OUTPUT_PATH", str(default_output))) """Experiment operation type. 
Unless you manually want to initiate each run, use `OperationType.AUTO`.""" operation_type: OperationType = OperationType.AUTO diff --git a/examples/profilers/JoularCore/RunnerConfig.py b/examples/profilers/JoularCore/RunnerConfig.py index 6e1975a8b..03134ca1a 100644 --- a/examples/profilers/JoularCore/RunnerConfig.py +++ b/examples/profilers/JoularCore/RunnerConfig.py @@ -22,7 +22,8 @@ class RunnerConfig: ROOT_DIR = Path(dirname(realpath(__file__))) name: str = "joularcore_example" - results_output_path: Path = ROOT_DIR / "experiments" + default_output = ROOT_DIR / "experiments" + results_output_path: Path = Path(os.getenv("EXPERIMENT_RUNNER_OUTPUT_PATH", str(default_output))) operation_type: OperationType = OperationType.AUTO time_between_runs_in_ms: int = 1000 diff --git a/examples/profilers/NvidiaML/RunnerConfig.py b/examples/profilers/NvidiaML/RunnerConfig.py index fd6242dba..6110833b5 100644 --- a/examples/profilers/NvidiaML/RunnerConfig.py +++ b/examples/profilers/NvidiaML/RunnerConfig.py @@ -9,6 +9,7 @@ from typing import Dict, List, Any, Optional from pathlib import Path +import os import numpy as np import time from os.path import dirname, realpath @@ -24,7 +25,8 @@ class RunnerConfig: """The path in which Experiment Runner will create a folder with the name `self.name`, in order to store the results from this experiment. (Path does not need to exist - it will be created if necessary.) Output path defaults to the config file's path, inside the folder 'experiments'""" - results_output_path: Path = ROOT_DIR / 'experiments' + default_output = ROOT_DIR / "experiments" + results_output_path: Path = Path(os.getenv("EXPERIMENT_RUNNER_OUTPUT_PATH", str(default_output))) """Experiment operation type. 
Unless you manually want to initiate each run, use `OperationType.AUTO`.""" operation_type: OperationType = OperationType.AUTO diff --git a/examples/profilers/PicoCM3/RunnerConfig.py b/examples/profilers/PicoCM3/RunnerConfig.py index 5772b5cc3..4aa7c7202 100644 --- a/examples/profilers/PicoCM3/RunnerConfig.py +++ b/examples/profilers/PicoCM3/RunnerConfig.py @@ -9,6 +9,7 @@ from typing import Dict, Any, Optional from pathlib import Path from os.path import dirname, realpath +import os import subprocess import shlex @@ -26,7 +27,8 @@ class RunnerConfig: """The path in which Experiment Runner will create a folder with the name `self.name`, in order to store the results from this experiment. (Path does not need to exist - it will be created if necessary.) Output path defaults to the config file's path, inside the folder 'experiments'""" - results_output_path: Path = ROOT_DIR / 'experiments' + default_output = ROOT_DIR / "experiments" + results_output_path: Path = Path(os.getenv("EXPERIMENT_RUNNER_OUTPUT_PATH", str(default_output))) """Experiment operation type. Unless you manually want to initiate each run, use `OperationType.AUTO`.""" operation_type: OperationType = OperationType.AUTO diff --git a/examples/profilers/PowerJoular/RunnerConfig.py b/examples/profilers/PowerJoular/RunnerConfig.py index 93219d57b..ab25f0cd3 100644 --- a/examples/profilers/PowerJoular/RunnerConfig.py +++ b/examples/profilers/PowerJoular/RunnerConfig.py @@ -11,6 +11,7 @@ from typing import Dict, List, Any, Optional from pathlib import Path from os.path import dirname, realpath +import os import time import subprocess @@ -26,7 +27,8 @@ class RunnerConfig: """The path in which Experiment Runner will create a folder with the name `self.name`, in order to store the results from this experiment. (Path does not need to exist - it will be created if necessary.) 
Output path defaults to the config file's path, inside the folder 'experiments'""" - results_output_path: Path = ROOT_DIR / 'experiments' + default_output = ROOT_DIR / "experiments" + results_output_path: Path = Path(os.getenv("EXPERIMENT_RUNNER_OUTPUT_PATH", str(default_output))) """Experiment operation type. Unless you manually want to initiate each run, use `OperationType.AUTO`.""" operation_type: OperationType = OperationType.AUTO diff --git a/examples/profilers/PowerLetrics/RunnerConfig.py b/examples/profilers/PowerLetrics/RunnerConfig.py index 026b33549..3902067dd 100644 --- a/examples/profilers/PowerLetrics/RunnerConfig.py +++ b/examples/profilers/PowerLetrics/RunnerConfig.py @@ -12,6 +12,7 @@ import numpy as np from pathlib import Path from os.path import dirname, realpath +import os class RunnerConfig: @@ -24,7 +25,8 @@ class RunnerConfig: """The path in which Experiment Runner will create a folder with the name `self.name`, in order to store the results from this experiment. (Path does not need to exist - it will be created if necessary.) Output path defaults to the config file's path, inside the folder 'experiments'""" - results_output_path: Path = ROOT_DIR / 'experiments' + default_output = ROOT_DIR / "experiments" + results_output_path: Path = Path(os.getenv("EXPERIMENT_RUNNER_OUTPUT_PATH", str(default_output))) """Experiment operation type. 
Unless you manually want to initiate each run, use `OperationType.AUTO`.""" operation_type: OperationType = OperationType.AUTO diff --git a/examples/profilers/PowerMetrics/RunnerConfig.py b/examples/profilers/PowerMetrics/RunnerConfig.py index 417aeccbd..09040547c 100644 --- a/examples/profilers/PowerMetrics/RunnerConfig.py +++ b/examples/profilers/PowerMetrics/RunnerConfig.py @@ -10,6 +10,7 @@ from typing import Dict, List, Any, Optional from pathlib import Path from os.path import dirname, realpath +import os import time import numpy as np @@ -23,7 +24,8 @@ class RunnerConfig: """The path in which Experiment Runner will create a folder with the name `self.name`, in order to store the results from this experiment. (Path does not need to exist - it will be created if necessary.) Output path defaults to the config file's path, inside the folder 'experiments'""" - results_output_path: Path = ROOT_DIR / 'experiments' + default_output = ROOT_DIR / "experiments" + results_output_path: Path = Path(os.getenv("EXPERIMENT_RUNNER_OUTPUT_PATH", str(default_output))) """Experiment operation type. Unless you manually want to initiate each run, use `OperationType.AUTO`.""" operation_type: OperationType = OperationType.AUTO diff --git a/examples/profilers/linux-ps-profiling/RunnerConfig.py b/examples/profilers/linux-ps-profiling/RunnerConfig.py index 96e4d8be8..4b02a5c58 100644 --- a/examples/profilers/linux-ps-profiling/RunnerConfig.py +++ b/examples/profilers/linux-ps-profiling/RunnerConfig.py @@ -10,6 +10,7 @@ from typing import Dict, List, Any, Optional from pathlib import Path from os.path import dirname, realpath +import os import numpy as np import time @@ -27,7 +28,8 @@ class RunnerConfig: """The path in which Experiment Runner will create a folder with the name `self.name`, in order to store the results from this experiment. (Path does not need to exist - it will be created if necessary.) 
Output path defaults to the config file's path, inside the folder 'experiments'""" - results_output_path: Path = ROOT_DIR / 'experiments' + default_output = ROOT_DIR / "experiments" + results_output_path: Path = Path(os.getenv("EXPERIMENT_RUNNER_OUTPUT_PATH", str(default_output))) """Experiment operation type. Unless you manually want to initiate each run, use `OperationType.AUTO`.""" operation_type: OperationType = OperationType.AUTO diff --git a/examples/profilers/measure-self-profiling/RunnerConfig.py b/examples/profilers/measure-self-profiling/RunnerConfig.py index a15754836..298aeaf2d 100644 --- a/examples/profilers/measure-self-profiling/RunnerConfig.py +++ b/examples/profilers/measure-self-profiling/RunnerConfig.py @@ -8,6 +8,7 @@ from typing import Optional, Dict, Any from pathlib import Path from os.path import dirname, realpath +import os import time @@ -21,7 +22,8 @@ class RunnerConfig: """The path in which Experiment Runner will create a folder with the name `self.name`, in order to store the results from this experiment. (Path does not need to exist - it will be created if necessary.) Output path defaults to the config file's path, inside the folder 'experiments'""" - results_output_path: Path = ROOT_DIR / 'experiments' + default_output = ROOT_DIR / "experiments" + results_output_path: Path = Path(os.getenv("EXPERIMENT_RUNNER_OUTPUT_PATH", str(default_output))) """Experiment operation type. Unless you manually want to initiate each run, use `OperationType.AUTO`.""" operation_type: OperationType = OperationType.AUTO @@ -44,7 +46,8 @@ class RunnerConfig: This parameter is optional and defaults to /usr/local/bin/energibridge """ - self_measure_bin: Path = "/usr/local/bin/energibridge" + default_energibridge = "/usr/local/bin/energibridge" + self_measure_bin: Path = Path(os.getenv("ENERGIBRIDGE_PATH", default_energibridge)) """ Where to save the full log files for energibridge. If specified, log files are saved to context.run_dir/. 
diff --git a/test-standalone/core/arbitrary-objects/RunnerConfig.py b/test-standalone/core/arbitrary-objects/RunnerConfig.py index 79ff0b8d9..7dcd316d9 100644 --- a/test-standalone/core/arbitrary-objects/RunnerConfig.py +++ b/test-standalone/core/arbitrary-objects/RunnerConfig.py @@ -10,6 +10,7 @@ from typing import Dict, List, Any, Optional from pathlib import Path from os.path import dirname, realpath +import os ''' Test Description: @@ -37,7 +38,8 @@ class RunnerConfig: # ================================ USER SPECIFIC CONFIG ================================ name: str = "new_runner_experiment" - results_output_path: Path = ROOT_DIR / 'experiments' + default_output = ROOT_DIR / "experiments" + results_output_path: Path = Path(os.getenv("EXPERIMENT_RUNNER_OUTPUT_PATH", str(default_output))) operation_type: OperationType = OperationType.AUTO time_between_runs_in_ms: int = 100 diff --git a/test-standalone/core/shuffling/RunnerConfig.py b/test-standalone/core/shuffling/RunnerConfig.py index 16095eb17..7e0116904 100644 --- a/test-standalone/core/shuffling/RunnerConfig.py +++ b/test-standalone/core/shuffling/RunnerConfig.py @@ -23,7 +23,8 @@ class RunnerConfig: # ================================ USER SPECIFIC CONFIG ================================ name: str = "new_runner_experiment" - results_output_path: Path = ROOT_DIR / 'experiments' + default_output = ROOT_DIR / "experiments" + results_output_path: Path = Path(os.getenv("EXPERIMENT_RUNNER_OUTPUT_PATH", str(default_output))) operation_type: OperationType = OperationType.AUTO time_between_runs_in_ms: int = 100 diff --git a/test-standalone/plugins/CodecarbonWrapper/combined/RunnerConfig.py b/test-standalone/plugins/CodecarbonWrapper/combined/RunnerConfig.py index e626929a2..d3d52db71 100644 --- a/test-standalone/plugins/CodecarbonWrapper/combined/RunnerConfig.py +++ b/test-standalone/plugins/CodecarbonWrapper/combined/RunnerConfig.py @@ -29,7 +29,8 @@ class RunnerConfig: # ================================ USER 
SPECIFIC CONFIG ================================ name: str = "new_runner_experiment" - results_output_path: Path = ROOT_DIR / 'experiments' + default_output = ROOT_DIR / "experiments" + results_output_path: Path = Path(os.getenv("EXPERIMENT_RUNNER_OUTPUT_PATH", str(default_output))) operation_type: OperationType = OperationType.AUTO time_between_runs_in_ms: int = 100 diff --git a/test-standalone/plugins/CodecarbonWrapper/individual/RunnerConfig.py b/test-standalone/plugins/CodecarbonWrapper/individual/RunnerConfig.py index 0d00ff5f8..c00723944 100644 --- a/test-standalone/plugins/CodecarbonWrapper/individual/RunnerConfig.py +++ b/test-standalone/plugins/CodecarbonWrapper/individual/RunnerConfig.py @@ -25,7 +25,8 @@ class RunnerConfig: # ================================ USER SPECIFIC CONFIG ================================ name: str = "new_runner_experiment" - results_output_path: Path = ROOT_DIR / 'experiments' + default_output = ROOT_DIR / "experiments" + results_output_path: Path = Path(os.getenv("EXPERIMENT_RUNNER_OUTPUT_PATH", str(default_output))) operation_type: OperationType = OperationType.AUTO time_between_runs_in_ms: int = 100 diff --git a/test-standalone/plugins/PicoCM3/RunnerConfig.py b/test-standalone/plugins/PicoCM3/RunnerConfig.py index 9b3d580cf..30606f702 100644 --- a/test-standalone/plugins/PicoCM3/RunnerConfig.py +++ b/test-standalone/plugins/PicoCM3/RunnerConfig.py @@ -20,7 +20,8 @@ class RunnerConfig: # ================================ USER SPECIFIC CONFIG ================================ name: str = "new_runner_experiment" - results_output_path: Path = ROOT_DIR / 'experiments' + default_output = ROOT_DIR / "experiments" + results_output_path: Path = Path(os.getenv("EXPERIMENT_RUNNER_OUTPUT_PATH", str(default_output))) operation_type: OperationType = OperationType.AUTO time_between_runs_in_ms: int = 1000 From 6cc66e601fcf38e5c5e4e392805398dc7660d33f Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Thu, 7 May 2026 09:53:01 +0300 Subject: [PATCH 
03/30] h --- .coverage | Bin 0 -> 69632 bytes READe.md | 160 ++++++++ .../DistributedMasterOrchestrator.py | 163 +++++++++ .../DistributedExecution/Worker.py | 145 ++++++++ .../DistributedExecution/__init__.py | 15 + .../Plugins/Profilers/DataSource.py | 2 +- .../Plugins/Profilers/PowerJoular.py | 4 +- experiment-runner/__main__.py | 47 ++- requirements.txt | 2 + test/conftest.py | 168 +++++++++ test/integration/__init__.py | 0 test/system/__init__.py | 0 test/system/base_system_test.py | 345 ++++++++++++++++++ test/system/fixtures/__init__.py | 0 test/system/test_basic_run.py | 292 +++++++++++++++ test/system/validators/__init__.py | 0 test/unit/__init__.py | 0 validate_local_test_setup.py | 176 +++++++++ 18 files changed, 1515 insertions(+), 4 deletions(-) create mode 100644 .coverage create mode 100644 READe.md create mode 100644 experiment-runner/DistributedExecution/DistributedMasterOrchestrator.py create mode 100644 experiment-runner/DistributedExecution/Worker.py create mode 100644 experiment-runner/DistributedExecution/__init__.py create mode 100644 test/conftest.py create mode 100644 test/integration/__init__.py create mode 100644 test/system/__init__.py create mode 100644 test/system/base_system_test.py create mode 100644 test/system/fixtures/__init__.py create mode 100644 test/system/test_basic_run.py create mode 100644 test/system/validators/__init__.py create mode 100644 test/unit/__init__.py create mode 100644 validate_local_test_setup.py diff --git a/.coverage b/.coverage new file mode 100644 index 0000000000000000000000000000000000000000..50437b8b96b9910dc8ddd646c4f4b6ad83bc8bc0 GIT binary patch literal 69632 zcmeI5du$v>9mjX?_TG;H$-JPMn>elvT2-@ zOVpI7v;vhiJ+I|dThpgydxmwuvt0*>glf||^C90*4L&5Bbc&i(&r!=&af6m0iVmGQ9Sx;BI>d)5Pm^lq z;4fB}8gN)n%AHdXQO+>)Onuf=_1p}#hGlgrg*gLs&e?8iO{1vThFs8eo=n}+Y|YSR zI!AM5o95R{;4p*c9RRO3H4xV^mY`#9$m+dn=qv|bBV^TkHSiRO(+v;nazJ)#2+TF> z zJejO!TT3UAxN>y66|t0nb?gw>)#K{lp%m0_V}H*=sg*VY-ugQ1$NqRB?p zBPeZYX=7qRgPbiG+4{1^ON>RKfV 
zvD`{}b)0^ZW~_P=c0y7p-PI*JZjE5=68i+|62V-StH6qm;Oga< zRC`8Vt(`f8@WDVZy{oH{a3&!$#=e7RQ$8;F3w}<&2#r4b`0FGymu1PHrt>4SY5`i4 zlB$_dWuXmrdjOev%&Xv8%^GE!mtSV(#gp|t%X-}imhB#U@HND}L-n3CEK7rKm33Rz zZJw(h zUj2fTVo}xe)&7cg!)~n5_Or^#?&CHh5Ktb3N@+nAT@j}jY4s{=vAp<4OTdK1T2%9H zb!VAU6XVGlx+*%iHyT%MYHLL*GcQ~WFIXp4OU{`TE>`c$PPg( z4MU{b!o`5sFl1Ku4y;9Vvpwsg^L$kYNYSALs1(V90DpKO0VIF~kN^@u0!RP}AOR$R z1dsp{KmwbOfM4{9AvXW_kxK%36Ta|30!RP}AOR$R1dsp{Kmter2_OL^fCN621mZsb zcH;dkgZ79EDW*$m;@moxJm*OdyIz0!RP}AOR$R1dsp{Kmter z2_OL^fCQEZ#QobvZyCTB_4h}rs{jG;!G8Y_M;rbRz!EMzK>|ns2_OL^fCP{L5BOW; zuc#DtTgfR$EqF4b0uNGDvkE-fZ`{;osWMKjjB zmK6rPF%m!mNB{{S0VIF~kN^@u0!RP}Ac6lk0l(-M672u~Me>0Fe|R7PB!C2v01`j~ zNB{{S0VIF~kN^@u0-KP4-=B!E=l}O5pAg8u$s6Ql@)Ef~eoVego*<8qFO$!cQ>07^ zBu^&EM@T!7lK)8lKKWeo7s($czXO5rKmter2_OL^fCP{L54 zD7(@rh9b-{~EFr0LDKe-S*76?g^NKy<2nHT?Oy8>Vicf5ac>PKDgWnTS3MV`DS zAUZ*bU2eap-4EvV;WyuV=;fck@y2Vy-=ja<#rFUE$N>Rn z{_m4Nl5dlrl5;To|0g*>4ilA}C0~R!fLUUY8S*N5g*;7uMSe@(Cij!`Oyd%4lm!;M&)8*Sa(NOf_8baErv!Hsx3 zHxg~!=u2@UNVpM7azjdRqc_ftK#UumQEqfaxX~RBz#@Sd;ub$&DiAvY+}18}Bkbo! 
z+Q*g;B0jeNFQm3wPGNeH01`j~NB{{S0VIF~kN^@u0!RP}Y%Kz8{~zc7TPx2owMYO7 zAOR$R1dsp{Kmter2_OL^u=NPw{{Pm?b4)Q3Kmter2_OL^fCP{L5bn|CV%H`~z_y@mTCqyeOOvPlo>%dMb2X@LVtunG>Il zMx)3h|JmgnGHyK>OT(}JZO0=TNJ89KZJlLElB zvqP%vX|lwLS$JFLK~-0$;ccDVa=4tea;8>d%0RGf8ifKiL0`AiEmf3O>Z^jghre5t zi?u*kS4&=FTFWhy(R9tuWI#-((<@bOZZf0><;>;eUIb|!8?Op1DYWUF4HDa9mjbt} zW2MKakN^@u0-K(|hHTo8sM!MLeKs`hVR*aOwoRW8|6>B3?NViTlLrkGOD}6*J-FYe zS~GW0H4kruhpY=YT~cLaW5}x+YDvyEsj{odvQ`bX%9d>uCrr~YEd{VqehSe1&VW?e z+iG=u0^&_LKBeYh`sP(gpOcU(0}-XD+e%J3YC&Ers-9P~$|0SOo7yZ@#;JAOHcCoa z2b*bW_Plab*D|_cQ3oi7~r zJ4qOhHTnLNrZH_&%kmb0YldTyR!k{Z>Vfq^K8f^3uweJRzz{pyiV1*se-GKAvRx|M zifh2d9L<+aww&MNq@>EV&Cbw5d0Nvg1%ym#uxxKBHCY;S+NH{1%N56UAnWb@pV(Il9~spZE88DiZ~roWoToly2~gRRJIqC+!#g4)wAE= z6)wIpG>xmade|tNIo1dyTCp88xvCM^4?B)I)yx|DeWqH1eMgYi$L9arn_R9Y=4`6x zX&$ybHGR6aaSHPH{|RRVQcAv<97{YJUx*iCe~En}`n_l*ax(na@S)I!&_{z`3rYc1 z`i*p}|C@f%_kj3IaR&ripTp_tlZ496&0P|?y4-RGdO_O$=1RNL65*=-f2Y3(6z*!S z!mI9$9XSo^CYq~ms%wlg72-Rs%f~uhnp=xP_>a-;`LQ$$44LWV{ji9M+)a;NLXu7u5 znOEC{%280%;Vzs7SPXIlW|>~6N4#y{?@-~WeY5s){sA>`Gz6a9XW)#M4sszTpr z|KHi+19f|w3vHFUEA0O}2@wP~xeZ$>uqHd%{9h!n|Bw6p>$94Hu^<5?fCP{L5z^Q&oa85UO!J_5|aQ^``_2=CH literal 0 HcmV?d00001 diff --git a/READe.md b/READe.md new file mode 100644 index 000000000..abf1a6fcb --- /dev/null +++ b/READe.md @@ -0,0 +1,160 @@ +# Experiment-Runner + +[![DOI](https://zenodo.org/badge/505379793.svg)](https://doi.org/10.5281/zenodo.15430328) + +Experiment Runner is a generic framework to automatically execute measurement-based experiments on any platform. The experiments are user-defined, can be completely customized, and expressed in python code! + +The technical details, main features, software architecture, and example experiment using Experiment Runner are presented in our [SCICO 2025 publication](https://www.sciencedirect.com/science/article/pii/S0167642325001546). 
+ +## Features + +- **Run Table Model**: Framework support to easily define an experiment's measurements with Factors, their Treatment levels, exclude certain combinations of Treatments, and add data columns for storing aggregated data. +- **Restarting**: If an experiment was not entirely completed on the last invocation (e.g. some variations crashed), experiment runner can be re-invoked to finish any remaining experiment variations. +- **Persistency**: Raw and aggregated experiment data per variation can be persistently stored. +- **Operational Types**: Two operational types: `AUTO` and `SEMI`, for more fine-grained experiment control. +- **Progress Indicator**: Keeps track of the execution of each run of the experiment. +- **Target and profiler agnostic**: Can be used with any target to measure (e.g. ELF binary, .apk over adb, etc.) and with any profiler (e.g. WattsUpPro, etc.) + +## Requirements + +The framework has been tested with Python3 version 3.8, but should also work with any higher version. + +### Supported Platforms +| Platform | Status | +|----------|---------------| +| Linux | Supported | +| macOS | Supported | +| Windows | Not supported | + +--- + +## Installation +**Clone the repository:** +```bash +git clone https://github.com/S2-group/experiment-runner.git +cd experiment-runner/ +``` + +- *Optional, create a virtual environment:* + ```bash + python3 -m venv venv + source venv/bin/activate + ``` +**Install dependencies:** +```bash +pip install --upgrade pip +pip install -r requirements.txt +``` + +**To verify installation, run the hello-world example:** + +```bash +python experiment-runner/ examples/hello-world/RunnerConfig.py +``` +- The expected output: + - Experiment executes successfully + - Output directory experiments/ is created + - No missing dependency errors + +## Running + +In this section, we assume that the current working directory is the root directory of the project.
+ +### The provided examples + +To run any of the examples provided, run the following command: + +```bash +python experiment-runner/ examples/<example-dir>/<RunnerConfig.py> +``` + +Each example is accompanied by a README for further information. + +Once you successfully run an experiment, the framework will not allow you to run the same experiment again under the same name, giving the message: + +```log +[FAIL]: EXPERIMENT_RUNNER ENCOUNTERED AN ERROR! +The experiment was restarted, but all runs are already completed. +``` + +This is to prevent you from accidentally overwriting the results of a previously run experiment! In order to run the experiment again, either delete any previously generated data (by default the "experiments/" directory), or modify the config's `name` variable to a different name. + +*It is recommended to start with the [hello-world](examples/hello-world) example to also test your installation.* + +### Creating a new experiment + +First, generate a config for your experiment: + +```bash +python experiment-runner/ config-create [directory] +``` + +When running this command, where `[directory]` is an optional argument, a new config file with skeleton code will be generated in the given directory. +- The default location is the `examples/` directory. *This config is similar to the [hello-world](examples/hello-world) example.* + +Feel free to move the generated config to any other directory. + +You can modify its contents and write Python code to define your own measurement-based experiment(s). +- *At this stage, you might find the [linux-ps-profiling](examples/linux-ps-profiling) example useful.* + +Once the experiment has been coded, the experiment can be executed by Experiment Runner. To do this, run the following command: + +```bash +python experiment-runner/ <MyRunnerConfig.py> +``` + +The results of the experiment will be stored in the directory `RunnerConfig.results_output_path/RunnerConfig.name` as defined by your config variables.
+ +### Portability Across Users and Machines + +When sharing experiments across different users or machines, hardcoded paths in configuration files can cause issues. Experiment Runner supports **environment variables** to make your experiments portable without code changes: + +#### Available Environment Variables + +- **`EXPERIMENT_RUNNER_OUTPUT_PATH`**: Directory where experiment results are stored + - Default: `<config-dir>/experiments` (the `experiments` folder next to the config file) + - Example: `export EXPERIMENT_RUNNER_OUTPUT_PATH="/path/to/results"` + +- **`ENERGIBRIDGE_PATH`**: Path to the EnergiBridge executable (for energy measurements) + - Default: `/usr/local/bin/energibridge` + - Example: `export ENERGIBRIDGE_PATH="/opt/energibridge/bin/energibridge"` + +- **`EXAMPLES_PATH`**: Directory for generating new config templates + - Default: `<project-root>/examples` + - Example: `export EXAMPLES_PATH="/home/user/my-experiments"` + +#### Using Environment Variables + +Set environment variables before running your experiment: + +```bash +export EXPERIMENT_RUNNER_OUTPUT_PATH="/data/experiments" +export ENERGIBRIDGE_PATH="/opt/energibridge/bin/energibridge" +python experiment-runner/ MyRunnerConfig.py +``` + +Your configuration files automatically use these variables if set, with sensible defaults when they are not. This allows the same experiment to run on different machines without any code modifications. + +**More information about the profilers and use cases can be found in the [Wiki tab](https://github.com/S2-group/experiment-runner/wiki).** + +## How to cite Experiment Runner + +If Experiment Runner is helping your research, please consider citing it as follows. Thank you!
+ +``` +@article{SCICO_2025, + title = {{Experiment {Runner}: a {Tool} for the {Automatic} {Orchestration} of {Experiments} {Targeting} {Software} {Systems}}}, + issn = {0167-6423}, + journal = {Science of Computer Programming}, + author = {Max Karsten and {Andrei Calin} Dragomir and Radu Apsan and Vincenzo Stoico and Ivano Malavolta}, + year = {2025}, + pages = {103415}, + volume = {1}, + url = {https://www.sciencedirect.com/science/article/pii/S0167642325001546}, + doi = {https://doi.org/10.1016/j.scico.2025.103415} +} +``` + +### Contributing +If you want to develop a new feature or ER, or found some bug you want to report we would love to hear from you! Please refer to our [contribution guidelines](https://github.com/S2-group/experiment-runner/wiki/Contributing-to-ER) for information on how to submit PRs or bug reports. + diff --git a/experiment-runner/DistributedExecution/DistributedMasterOrchestrator.py b/experiment-runner/DistributedExecution/DistributedMasterOrchestrator.py new file mode 100644 index 000000000..f1ae11105 --- /dev/null +++ b/experiment-runner/DistributedExecution/DistributedMasterOrchestrator.py @@ -0,0 +1,163 @@ +from flask import Flask, request, jsonify +import threading +import time +from ProgressManager.RunTable.Models.RunProgress import RunProgress + +import threading +from ProgressManager.RunTable.Models.RunProgress import RunProgress + + +class TaskManager: + + def __init__(self, run_table): + self.run_table = run_table + self.assigned_runs = {} + self.total_runs = len(run_table) + + self.lock = threading.Lock() # โœ… FIXED (you were missing this) + + def get_next_task(self, agent_id): + with self.lock: + for idx, run in enumerate(self.run_table): + if run['__done'] == RunProgress.TODO: + + run['__done'] = "RUNNING" + run['agent_id'] = agent_id + + run['__current_run'] = idx + run['__total_runs'] = self.total_runs + + self.assigned_runs[run['__run_id']] = agent_id + + return run + + return None + + def complete_task(self, run_id, 
data): + with self.lock: + for run in self.run_table: + if run["__run_id"] == run_id: + + run.update(data) + run["__done"] = RunProgress.DONE + + self.assigned_runs.pop(run_id, None) + return + + def reset_tasks_for_agent(self, agent_id): + """๐Ÿ”ฅ IMPORTANT: recovery function""" + with self.lock: + for run in self.run_table: + if run.get("agent_id") == agent_id and run["__done"] == "RUNNING": + run["__done"] = RunProgress.TODO + run["agent_id"] = None + + self.assigned_runs = { + k: v for k, v in self.assigned_runs.items() + if v != agent_id + } + +class APIServer: + """Flask API server for distributed task management""" + + def __init__(self, task_manager, worker_monitor): + self.app = Flask(__name__) + self.task_manager = task_manager + self.monitor = worker_monitor + + # Register endpoints + @self.app.route('/task', methods=['GET']) + def get_task(): + agent_id = request.args.get('agent_id') + self.monitor.heartbeat(agent_id) + + task = self.task_manager.get_next_task(agent_id) + return jsonify({"run": task if task else None}) + + @self.app.route('/result', methods=['POST']) + def submit_result(): + payload = request.get_json() + + run_id = payload.get('run_id') + run_data = payload.get('data', {}) # โœ… extract correctly + status = payload.get('status') + + if status == "FAILED": + print(f"[MASTER] Run {run_id} failed: {payload.get('error')}") + + self.task_manager.complete_task(run_id, run_data) + + return jsonify({"status": "ok"}) + + @self.app.route('/heartbeat', methods=['POST']) + def heartbeat(): + data = request.get_json() + agent_id = data.get('agent_id') + self.monitor.heartbeat(agent_id) + return jsonify({"status": "ok"}) + + @self.app.route('/status', methods=['GET']) + def status(): + total_runs = len(self.task_manager.run_table) + todo_count = sum(1 for r in self.task_manager.run_table if r['__done'] == RunProgress.TODO) + running_count = sum(1 for r in self.task_manager.run_table if r['__done'] == "RUNNING") + done_count = sum(1 for r in 
self.task_manager.run_table if r['__done'] == RunProgress.DONE) + + return jsonify({ + "status": "ok", + "total_runs": total_runs, + "runs": { + "todo": todo_count, + "running": running_count, + "done": done_count + }, + "active_agents": len(self.monitor.heartbeats) + }) + +class WorkerMonitor: + def __init__(self, task_manager): + self.heartbeats = {} + self.task_manager = task_manager + self.timeout = 60 + + def heartbeat(self, agent_id): + self.heartbeats[agent_id] = time.time() + + def monitor(self): + while True: + time.sleep(10) + now = time.time() + + dead = [ + agent for agent, t in self.heartbeats.items() + if now - t > self.timeout + ] + + for agent in dead: + print(f"[MASTER] Worker {agent} dead") + + for run in self.task_manager.run_table: + if run.get("agent_id") == agent and run["__done"] != RunProgress.DONE: + run["__done"] = RunProgress.TODO + run["agent_id"] = None + + del self.heartbeats[agent] + +class DistributedMasterOrchestrator: + + def __init__(self, config, metadata, host="0.0.0.0", port=5000): + self.config = config + self.metadata = metadata + self.host = host + self.port = port + + run_table = config.create_run_table_model().generate_experiment_run_table() + + + self.task_manager = TaskManager(run_table) + self.monitor = WorkerMonitor(self.task_manager) + self.api = APIServer(self.task_manager, self.monitor) + + def start(self): + threading.Thread(target=self.monitor.monitor, daemon=True).start() + self.api.app.run(host=self.host, port=self.port) \ No newline at end of file diff --git a/experiment-runner/DistributedExecution/Worker.py b/experiment-runner/DistributedExecution/Worker.py new file mode 100644 index 000000000..bbe781b4c --- /dev/null +++ b/experiment-runner/DistributedExecution/Worker.py @@ -0,0 +1,145 @@ +import threading +import time +import requests + +from ExperimentOrchestrator.Experiment.Run.RunController import RunController + + +class WorkerRuntime: + + def __init__(self, master_url, heartbeat_interval=40, 
idle_timeout=120): + self.master_url = master_url + self.heartbeat_interval = heartbeat_interval + self.idle_timeout = idle_timeout # Exit after N seconds with no tasks + + self._stop = False + self.current_run = None + self.agent_id = None + self.last_task_time = None + + # ========================= + # MAIN LOOP + # ========================= + def run_loop(self, agent_id, config): + self.agent_id = agent_id + self.last_task_time = time.time() + print(f"[WORKER] Starting with agent_id: {self.agent_id}") + print(f"[WORKER] Master URL: {self.master_url}") + + # start heartbeat thread + threading.Thread(target=self._heartbeat_loop, daemon=True).start() + print(f"[WORKER] Heartbeat thread started") + print(f"[WORKER] Waiting for tasks (will exit after {self.idle_timeout}s of inactivity)...") + + while True: + task = self._get_task() + + if not task: + # Check if we've been idle too long + idle_time = time.time() - self.last_task_time + if idle_time > self.idle_timeout: + print(f"[WORKER] No tasks for {self.idle_timeout}s - exiting") + break + + self.current_run = None + time.sleep(3) + continue + + self.last_task_time = time.time() + self.current_run = task + + try: + result = self._execute(task, config) + self._send_result(task["__run_id"], result) + + except Exception as e: + self._send_failure(task["__run_id"], str(e)) + + finally: + self.current_run = None + + print(f"[WORKER] Worker {self.agent_id} exiting") + + # ========================= + # TASK FETCH + # ========================= + def _get_task(self): + try: + r = requests.get( + self.master_url + "/task", + params={"agent_id": self.agent_id}, + timeout=5 + ) + task = r.json().get("run") + if task: + print(f"[WORKER] Got task: {task.get('__run_id', 'unknown')}") + return task + except requests.exceptions.Timeout: + print(f"[WORKER] Task request timeout (master not responding)") + return None + except Exception as e: + print(f"[WORKER] Error getting task: {e}") + return None + + # ========================= + 
# EXECUTION + # ========================= + def _execute(self, run, config): + print(f"[WORKER] Executing task {run.get('__run_id')}") + current_run = run.get('__current_run', 0) + total_runs = run.get('__total_runs', 1) + + try: + controller = RunController(run, config, current_run, total_runs) + controller.do_run() + print(f"[WORKER] Task {run.get('__run_id')} completed successfully") + return run # updated in-place + except Exception as e: + print(f"[WORKER] Task {run.get('__run_id')} failed with error: {type(e).__name__}: {e}") + import traceback + traceback.print_exc() + raise + + # ========================= + # RESULT + # ========================= + def _send_result(self, run_id, data): + try: + requests.post(self.master_url + "/result", json={ + "run_id": run_id, + "data": data, + "status": "COMPLETED" + }, timeout=5) + print(f"[WORKER] Result sent for task {run_id}") + except Exception as e: + print(f"[WORKER] Error sending result: {e}") + + def _send_failure(self, run_id, error): + try: + requests.post(self.master_url + "/result", json={ + "run_id": run_id, + "status": "FAILED", + "error": error + }, timeout=5) + print(f"[WORKER] Task {run_id} failed: {error}") + except Exception as e: + print(f"[WORKER] Error sending failure: {e}") + + # ========================= + # HEARTBEAT + # ========================= + def _heartbeat_loop(self): + while not self._stop: + try: + requests.post(self.master_url + "/heartbeat", json={ + "agent_id": self.agent_id, + "status": "RUNNING" if self.current_run else "IDLE", + "run_id": self.current_run["__run_id"] if self.current_run else None, + "timestamp": time.time() + }, timeout=5) + except requests.exceptions.Timeout: + print(f"[WORKER] Heartbeat timeout") + except Exception as e: + print(f"[WORKER] Heartbeat error: {e}") + + time.sleep(self.heartbeat_interval) \ No newline at end of file diff --git a/experiment-runner/DistributedExecution/__init__.py b/experiment-runner/DistributedExecution/__init__.py new file mode 
100644 index 000000000..f4364bf12 --- /dev/null +++ b/experiment-runner/DistributedExecution/__init__.py @@ -0,0 +1,15 @@ +""" +Distributed Execution Module + +Simple framework for running experiments across multiple machines. +""" +from .DistributedMasterOrchestrator import DistributedMasterOrchestrator, APIServer, TaskManager, WorkerMonitor +from .Worker import WorkerRuntime + +__all__ = [ + 'WorkerRuntime', + 'APIServer', + 'TaskManager', + 'WorkerMonitor', + 'DistributedMasterOrchestrator', +] diff --git a/experiment-runner/Plugins/Profilers/DataSource.py b/experiment-runner/Plugins/Profilers/DataSource.py index 4d36d4991..92c0fc650 100644 --- a/experiment-runner/Plugins/Profilers/DataSource.py +++ b/experiment-runner/Plugins/Profilers/DataSource.py @@ -203,7 +203,7 @@ def _format_cmd(self): elif isinstance(v, ValueRef): cmd += f" {p} {v.value}" elif isinstance(v, Iterable) and not (isinstance(v, StrEnum) or isinstance(v, str)): - cmd += f" {p} {",".join(map(str, v))}" + cmd += f' {p} {",".join(map(str, v))}' else: cmd += f" {p} {v}" diff --git a/experiment-runner/Plugins/Profilers/PowerJoular.py b/experiment-runner/Plugins/Profilers/PowerJoular.py index 56c5aca30..a11763b99 100644 --- a/experiment-runner/Plugins/Profilers/PowerJoular.py +++ b/experiment-runner/Plugins/Profilers/PowerJoular.py @@ -42,8 +42,8 @@ def __init__(self, @property def target_logfile(self): if "-p" in self.args.keys(): - return f"{self.logfile}-{self.args["-p"]}.csv" - + return f"{self.logfile}-{self.args['-p']}.csv" + return None @staticmethod diff --git a/experiment-runner/__main__.py b/experiment-runner/__main__.py index 3a9c97909..63184f51a 100644 --- a/experiment-runner/__main__.py +++ b/experiment-runner/__main__.py @@ -1,4 +1,5 @@ import sys +import os import traceback import dill as pickle import hashlib @@ -14,6 +15,12 @@ from ConfigValidator.CustomErrors.ConfigErrors import ConfigInvalidClassNameError from ExperimentOrchestrator.Experiment.ExperimentController import 
ExperimentController +from DistributedExecution.DistributedMasterOrchestrator import DistributedMasterOrchestrator +from DistributedExecution.Worker import WorkerRuntime + + + + def is_no_argument_given(args: List[str]): return (len(args) == 1) def is_config_file_given(args: List[str]): return (args[1][-3:] == '.py') def load_and_get_config_file_as_module(args: List[str]): @@ -24,6 +31,13 @@ def load_and_get_config_file_as_module(args: List[str]): spec.loader.exec_module(config_file) return config_file +def get_flag_value(flag: str): + if flag in sys.argv: + idx = sys.argv.index(flag) + if idx + 1 < len(sys.argv): + return sys.argv[idx + 1] + return None + def calc_ast_md5sum(src, name): tree = compile(src, name, 'exec', flags=ast.PyCF_ONLY_AST, optimize=0) @@ -51,6 +65,9 @@ def calc_ast_md5sum(src, name): if __name__ == "__main__": try: + has_distribute_flag = '--distribute' in sys.argv + has_master_url_flag = '--master-url' in sys.argv + if is_no_argument_given(sys.argv): sys.argv.append('help') CLIRegister.parse_command(sys.argv) @@ -66,7 +83,35 @@ def calc_ast_md5sum(src, name): ) ConfigValidator.validate_config(config) # Validate config as a valid RunnerConfig - ExperimentController(config, metadata).do_experiment() # Instantiate controller with config and start experiment + + if '--distribute' in sys.argv: + mode = get_flag_value('--distribute') + + if mode == "master": + master_host = get_flag_value('--host') or "0.0.0.0" + master_port = int(get_flag_value('--port') or 5000) + + orchestrator = DistributedMasterOrchestrator( + config=config, + metadata=metadata, + host=master_host, + port=master_port + ) + orchestrator.start() + + elif mode == "worker": + master_url = get_flag_value('--master') + if not master_url: + raise BaseError("--master URL required for worker") + + agent_id = f"worker_{os.getpid()}" + + worker = WorkerRuntime(master_url) + worker.run_loop(agent_id=agent_id, config=config) + else: + raise BaseError("Invalid --distribute mode (use 
'master' or 'worker')") + else: + ExperimentController(config, metadata).do_experiment() # Instantiate controller with config and start experiment else: raise ConfigInvalidClassNameError else: # Else, a utility command is entered diff --git a/requirements.txt b/requirements.txt index 32607d5be..1eac56f6a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,5 @@ psutil tabulate dill jsonpickle +flask +requests diff --git a/test/conftest.py b/test/conftest.py new file mode 100644 index 000000000..1e789bc16 --- /dev/null +++ b/test/conftest.py @@ -0,0 +1,168 @@ +""" +Global pytest configuration and shared fixtures for ALL tests + +This file is automatically discovered by pytest and runs before any tests. +It contains: +1. Pytest plugins and configuration +2. Shared fixtures (reusable setup/teardown) +3. Hooks for test execution + +WHY THIS MATTERS: +- Fixtures replace traditional setUp()/tearDown() methods +- Fixtures are more flexible: can be scoped (function, class, module, session) +- Shared fixtures prevent code duplication across test files +- conftest.py is the standard pytest way to organize test utilities +""" + +import pytest +import tempfile +import shutil +from pathlib import Path +import sys +import os + +# Add experiment-runner to Python path so tests can import it +PROJECT_ROOT = Path(__file__).parent.parent +sys.path.insert(0, str(PROJECT_ROOT)) +sys.path.insert(0, str(PROJECT_ROOT / "experiment-runner")) + + +# ============================================================================ +# FIXTURES - Reusable setup/teardown for tests +# ============================================================================ +# A fixture is like a setUp() method that runs before each test +# Think of it as: "Here's the environment my test needs" + +@pytest.fixture +def temp_dir(): + """ + Fixture: Create a temporary directory for test files + + SCOPE: "function" means a new temp directory for EACH test function + + WHY: Tests need isolated environments 
so they don't interfere with each other + One test shouldn't modify another test's files + + USAGE in tests: + def test_something(temp_dir): + # temp_dir is a Path object pointing to a fresh temporary directory + config_file = temp_dir / "RunnerConfig.py" + config_file.write_text("...") + + CLEANUP: Automatically deleted after test completes (yield statement does this) + """ + tmpdir = Path(tempfile.mkdtemp()) + yield tmpdir # "yield" = pause here, run test, resume after + # After test completes, cleanup happens below + if tmpdir.exists(): + shutil.rmtree(tmpdir) + + +@pytest.fixture +def experiment_output_dir(temp_dir): + """ + Fixture: Create directory structure expected by Experiment Runner + + This prepares a directory that Experiment Runner can write results to. + + STRUCTURE: + temp_dir/ + โ””โ”€โ”€ experiments/ <- Where results go + โ””โ”€โ”€ my_experiment/ <- One folder per experiment + โ”œโ”€โ”€ run_table.csv + โ”œโ”€โ”€ metadata.json + โ””โ”€โ”€ run_0_repetition_0/ + """ + results_dir = temp_dir / "experiments" + results_dir.mkdir(parents=True, exist_ok=True) + return results_dir + + +@pytest.fixture +def env_vars_clean(): + """ + Fixture: Clean environment variables before/after test + + WHY: Some tests rely on environment variables (like EXPERIMENT_RUNNER_OUTPUT_PATH) + We want a clean state so tests don't affect each other + + This saves original values, provides clean environment, then restores + """ + # Save original environment + original_env = os.environ.copy() + + yield # Run test with clean environment + + # Restore original environment after test + os.environ.clear() + os.environ.update(original_env) + + +# ============================================================================ +# PYTEST CONFIGURATION +# ============================================================================ + +def pytest_configure(config): + """ + Hook: Runs once when pytest starts + + We use this to register custom markers (test categories) + """ + 
config.addinivalue_line( + "markers", + "system: System-level tests (real experiment execution)" + ) + config.addinivalue_line( + "markers", + "integration: Integration tests (multiple components)" + ) + config.addinivalue_line( + "markers", + "unit: Unit tests (single component with mocks)" + ) + config.addinivalue_line( + "markers", + "slow: Tests that take a while to run" + ) + + +def pytest_collection_modifyitems(config, items): + """ + Hook: Runs after tests are discovered, before they run + + This automatically assigns markers based on test location + """ + for item in items: + # If test is in system/, mark it as @pytest.mark.system + if "system" in str(item.fspath): + item.add_marker(pytest.mark.system) + elif "integration" in str(item.fspath): + item.add_marker(pytest.mark.integration) + elif "unit" in str(item.fspath): + item.add_marker(pytest.mark.unit) + + +# ============================================================================ +# PYTEST COMMAND LINE OPTIONS +# ============================================================================ +# These let users run specific test types: +# pytest -m system (only system tests) +# pytest -m unit (only unit tests) +# pytest -k shuffling (only tests with "shuffling" in name) + +def pytest_addoption(parser): + """ + Hook: Allows passing custom command line options to pytest + """ + parser.addoption( + "--real-profilers", + action="store_true", + default=False, + help="Run tests that require real profiler installations" + ) + parser.addoption( + "--skip-slow", + action="store_true", + default=False, + help="Skip slow system tests" + ) diff --git a/test/integration/__init__.py b/test/integration/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/system/__init__.py b/test/system/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/system/base_system_test.py b/test/system/base_system_test.py new file mode 100644 index 000000000..09a47651d --- /dev/null +++ 
b/test/system/base_system_test.py @@ -0,0 +1,345 @@ +""" +Base class for System-Level Tests + +What is a "System Test"? +- Runs the ACTUAL experiment (not mocked) +- Tests real profilers, real data collection +- Validates end-to-end workflow +- Catches integration issues + +This base class provides reusable methods for all system tests so we don't +repeat code in every test file. + +INHERITANCE EXAMPLE: + class TestBasicExperiment(SystemExperimentTest): + def test_hello_world_runs(self, temp_dir): + # Use inherited methods like self.run_experiment() + result = self.run_experiment("hello-world", temp_dir) + assert result.success +""" + +import subprocess +import sys +from pathlib import Path +from typing import Dict, Optional, List +import pytest + + +class ExperimentResult: + """ + Container for experiment execution results + + WHY: Instead of returning a tuple, we return an object with named fields + This is clearer: result.success vs result[0] + """ + def __init__(self, success: bool, stdout: str, stderr: str, + results_dir: Path, config_path: Path): + self.success = success # Did experiment complete? + self.stdout = stdout # Console output + self.stderr = stderr # Error output + self.results_dir = results_dir # Where results were written + self.config_path = config_path # Which config was used + + +class SystemExperimentTest: + """ + Base class for ALL system-level tests + + Think of this as a "helper class" that all system tests inherit from. + It provides common methods so we don't repeat code. 
+ + EXAMPLE USAGE: + class TestProfilers(SystemExperimentTest): + def test_picoCM3_experiment(self, temp_dir): + # Call inherited method from this class + result = self.run_experiment( + config_name="test-standalone/plugins/PicoCM3", + results_dir=temp_dir + ) + assert result.success + self.validate_csv_output(result.results_dir) + """ + + # ======================================================================== + # SETUP METHODS + # ======================================================================== + + def run_experiment( + self, + config_path: str, + results_dir: Path, + timeout: int = 300 + ) -> ExperimentResult: + """ + Execute an actual experiment using Experiment Runner + + This runs: python experiment-runner/ + + PARAMETERS: + config_path: Relative or absolute path to RunnerConfig.py + results_dir: Where to store results + timeout: Maximum seconds to wait (default 5 min) + + RETURNS: + ExperimentResult object with success, stdout, stderr, etc. + + WHY NOT JUST CALL subprocess DIRECTLY? 
+ - Encapsulation: If how we run experiments changes, update here once + - Reusability: All tests use same execution method + - Error handling: Consistent error reporting + + EXAMPLE: + result = self.run_experiment("examples/hello-world", temp_dir) + if not result.success: + print(result.stderr) # Show what went wrong + """ + project_root = Path(__file__).parent.parent.parent + config_file = Path(config_path) + + if not config_file.is_absolute(): + config_file = project_root / config_path / "RunnerConfig.py" + + # Build the command: python experiment-runner/ + cmd = [ + sys.executable, + str(project_root / "experiment-runner" / "__main__.py"), + str(config_file) + ] + + try: + # Run the command and capture output + result = subprocess.run( + cmd, + capture_output=True, # Capture stdout/stderr + text=True, # Return as strings, not bytes + timeout=timeout, + cwd=str(project_root) + ) + + # Experiment was successful if return code is 0 + success = result.returncode == 0 + + return ExperimentResult( + success=success, + stdout=result.stdout, + stderr=result.stderr, + results_dir=results_dir, + config_path=config_file + ) + + except subprocess.TimeoutExpired: + # Experiment took too long + return ExperimentResult( + success=False, + stdout="", + stderr=f"Experiment timed out after {timeout} seconds", + results_dir=results_dir, + config_path=config_file + ) + except Exception as e: + # Something went wrong executing the command + return ExperimentResult( + success=False, + stdout="", + stderr=f"Failed to run experiment: {str(e)}", + results_dir=results_dir, + config_path=config_file + ) + + + # ======================================================================== + # VALIDATION METHODS + # ======================================================================== + + def validate_csv_output(self, experiment_dir: Path) -> bool: + """ + Validate that CSV output exists and is readable + + WHAT IT CHECKS: + - run_table.csv exists + - CSV is readable (valid format) + - At 
least one row of data + + RETURNS: + True if valid, raises AssertionError if not + + WHY: CSV is the main output format, so this is critical + """ + csv_file = experiment_dir / "run_table.csv" + + # Check file exists + assert csv_file.exists(), f"run_table.csv not found in {experiment_dir}" + + # Check file is not empty + content = csv_file.read_text() + assert len(content) > 0, "run_table.csv is empty" + + # Check it has at least a header row + lines = content.strip().split('\n') + assert len(lines) >= 1, "run_table.csv has no header" + + return True + + + def validate_experiment_structure(self, experiment_dir: Path) -> bool: + """ + Validate expected directory structure exists + + EXPECTED STRUCTURE: + experiment_dir/ + โ”œโ”€โ”€ run_table.csv (main results) + โ”œโ”€โ”€ metadata.json (experiment metadata) + โ””โ”€โ”€ run_0_repetition_0/ (per-run data) + โ”œโ”€โ”€ profiler_output + โ””โ”€โ”€ raw_data + + RETURNS: + True if structure is valid + """ + # Check required files + required_files = [ + "run_table.csv", + "metadata.json" + ] + + for filename in required_files: + filepath = experiment_dir / filename + assert filepath.exists(), \ + f"Missing required file: {filename} in {experiment_dir}" + + # Check at least one run directory exists + run_dirs = list(experiment_dir.glob("run_*")) + assert len(run_dirs) > 0, \ + f"No run directories found in {experiment_dir}" + + return True + + + def validate_no_errors_in_output(self, result: ExperimentResult) -> bool: + """ + Check that stderr doesn't contain error keywords + + WHAT IT CHECKS: + - stderr is empty OR doesn't contain [FAIL], "Error", "Exception" + + WHY: The experiment might complete but still have warnings/errors + + RETURNS: + True if no critical errors detected + """ + error_keywords = ["[FAIL]", "[ERROR]", "Exception", "Traceback"] + + for keyword in error_keywords: + assert keyword not in result.stderr, \ + f"Found error keyword '{keyword}' in stderr:\n{result.stderr}" + + return True + + + # 
======================================================================== + # SIMULATION METHODS (for testing failure cases) + # ======================================================================== + + def simulate_run_crash( + self, + experiment_dir: Path, + run_id: int + ) -> None: + """ + Simulate a crash mid-experiment by modifying run_table.csv + + This marks a run as incomplete so when we re-run, the framework + will think it crashed and try to restart it. + + USAGE: + # Run experiment partially + result1 = self.run_experiment(config, temp_dir) + + # Simulate crash on run 1 + self.simulate_run_crash(temp_dir, run_id=1) + + # Re-run and verify it handles the restart correctly + result2 = self.run_experiment(config, temp_dir) + assert result2.success + + WHAT IT DOES: + - Reads run_table.csv + - Finds the row for the specified run + - Sets __done to "TODO" (marks as incomplete) + - Writes it back + + WHY: Tests that restart/recovery logic works correctly + """ + csv_file = experiment_dir / "run_table.csv" + + # Read CSV content + content = csv_file.read_text() + lines = content.strip().split('\n') + + if len(lines) < 2: + raise ValueError("CSV has no data rows to modify") + + # Find and modify the row for this run_id + header = lines[0] + rows = lines[1:] + + modified_rows = [] + for row_idx, row in enumerate(rows): + if row_idx == run_id: + # Set __done to TODO (incomplete) + # This assumes __done is the first column + cols = row.split(',') + cols[0] = 'TODO' + modified_rows.append(','.join(cols)) + else: + modified_rows.append(row) + + # Write back to CSV + new_content = header + '\n' + '\n'.join(modified_rows) + csv_file.write_text(new_content) + + + # ======================================================================== + # HELPER METHODS + # ======================================================================== + + def get_experiment_dir( + self, + results_dir: Path, + experiment_name: str + ) -> Path: + """ + Get the full path to an experiment's 
results directory + + STRUCTURE: + results_dir/ + โ””โ”€โ”€ / <- returned path + โ””โ”€โ”€ run_table.csv + + PARAMETERS: + results_dir: Parent results directory + experiment_name: Name of experiment (from RunnerConfig.name) + + RETURNS: + Path to experiment directory + """ + return results_dir / experiment_name + + + def read_csv_as_dicts(self, csv_path: Path) -> List[Dict]: + """ + Read CSV file and return as list of dictionaries + + WHY: Easier to work with dictionaries than raw CSV strings + Can access columns by name: row['avg_cpu'] + + EXAMPLE: + rows = self.read_csv_as_dicts(Path("run_table.csv")) + for row in rows: + print(row['run_id'], row['avg_cpu']) + """ + import csv + + with open(csv_path, 'r') as f: + reader = csv.DictReader(f) + return list(reader) diff --git a/test/system/fixtures/__init__.py b/test/system/fixtures/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/system/test_basic_run.py b/test/system/test_basic_run.py new file mode 100644 index 000000000..db1607e7c --- /dev/null +++ b/test/system/test_basic_run.py @@ -0,0 +1,292 @@ +""" +EXAMPLE: Basic System Tests + +This file demonstrates how to write system-level tests using the new framework. + +KEY CONCEPTS: +1. Tests inherit from SystemExperimentTest (base class with helper methods) +2. Tests use fixtures from conftest.py (temp_dir, env_vars_clean) +3. Tests run REAL experiments, not mocked versions +4. Each test is independent (isolated temp directories) + +RUN THESE TESTS: + pytest test/system/test_basic_run.py # Run all tests in this file + pytest test/system/test_basic_run.py::TestBasicRuns::test_hello_world + pytest test/system/test_basic_run.py -v # Verbose output + pytest test/system/test_basic_run.py -s # Show print statements +""" + +import pytest +from pathlib import Path +from test.system.base_system_test import SystemExperimentTest + + +class TestBasicRuns(SystemExperimentTest): + """ + Test suite: Basic experiment execution + + Each method is a test. 
Pytest runs them and reports: + - PASSED: test completed successfully + - FAILED: assertion failed + - ERROR: exception raised + """ + + @pytest.mark.system + def test_hello_world_experiment_runs(self, temp_dir): + """ + TEST 1: Can we run the hello-world example? + + SETUP: + - temp_dir: pytest fixture (see conftest.py) provides fresh directory + + WHAT IT DOES: + 1. Run the hello-world experiment + 2. Check that it completes successfully + 3. Verify output directory exists + + HOW PYTEST WORKS: + - Calls fixture: temp_dir is created + - Runs test function + - If any 'assert' fails, test FAILS + - Cleanup: temp_dir is deleted + + EXAMPLE OUTPUT: + PASSED test_hello_world_experiment_runs + + If it fails: + FAILED test_hello_world_experiment_runs + AssertionError: assert False == True + ...stderr output... + """ + # Step 1: Run the actual experiment + result = self.run_experiment( + config_path="examples/hello-world", + results_dir=temp_dir + ) + + # Step 2: Assert it was successful + # If this fails, test fails with clear error + assert result.success, f"Experiment failed!\nStderr: {result.stderr}" + + # Step 3: Verify no errors in output + self.validate_no_errors_in_output(result) + + # Step 4: Verify results directory exists + assert (temp_dir / "experiments").exists(), \ + "Results directory was not created" + + + @pytest.mark.system + def test_hello_world_output_structure(self, temp_dir): + """ + TEST 2: Does hello-world create the expected output structure? 
+ + WHAT IT CHECKS: + - run_table.csv exists and has content + - Directory structure is correct + - All required files are present + """ + # Run experiment + result = self.run_experiment( + config_path="examples/hello-world", + results_dir=temp_dir + ) + + # Get the experiment directory + # (assumes experiment is named "new_runner_experiment" by default) + exp_dir = temp_dir / "experiments" / "new_runner_experiment" + + # Validate structure + self.validate_experiment_structure(exp_dir) + self.validate_csv_output(exp_dir) + + + @pytest.mark.system + def test_fibonacci_experiment_runs(self, temp_dir): + """ + TEST 3: Test a different example (fibonacci) + + WHY: Tests should be specific, not generic + Each example might have different requirements + """ + result = self.run_experiment( + config_path="examples/hello-world-fibonacci", + results_dir=temp_dir + ) + + assert result.success, f"Fibonacci experiment failed:\n{result.stderr}" + self.validate_no_errors_in_output(result) + + + @pytest.mark.system + @pytest.mark.slow + def test_multiple_sequential_runs(self, temp_dir): + """ + TEST 4: Can we run multiple experiments in sequence? + + @pytest.mark.slow decorator means: + - pytest -m slow (run ONLY slow tests) + - pytest --skip-slow (skip slow tests) + + WHY: Some tests are slow. During development, you might skip them. + Use for comprehensive testing before submitting. 
+ """ + # Run first experiment + result1 = self.run_experiment( + config_path="examples/hello-world", + results_dir=temp_dir + ) + assert result1.success, f"First run failed: {result1.stderr}" + + # Run second experiment (different name to avoid conflicts) + # This tests that framework can handle multiple experiments + result2 = self.run_experiment( + config_path="examples/hello-world-fibonacci", + results_dir=temp_dir + ) + assert result2.success, f"Second run failed: {result2.stderr}" + + +class TestRestartRecovery(SystemExperimentTest): + """ + Test suite: Experiment restart/recovery on crash + + These tests verify that if an experiment crashes mid-way, + we can resume it and it completes correctly. + """ + + @pytest.mark.system + @pytest.mark.slow + def test_restart_after_simulated_crash(self, temp_dir): + """ + TEST 5: Can framework recover from a crash? + + SCENARIO: + 1. Run experiment partially + 2. Simulate a crash (mark a run as incomplete) + 3. Re-run and verify it continues from where it left off + + WHY: Real-world experiments can crash. Framework should handle this gracefully. 
+ """ + # Step 1: Run initial experiment + result1 = self.run_experiment( + config_path="test-standalone/core/shuffling", + results_dir=temp_dir + ) + assert result1.success, f"Initial run failed: {result1.stderr}" + + # Step 2: Simulate crash by marking run 1 as incomplete + exp_dir = temp_dir / "experiments" / "new_runner_experiment" + self.simulate_run_crash(exp_dir, run_id=1) + + # Verify the crash was simulated + csv_rows = self.read_csv_as_dicts(exp_dir / "run_table.csv") + assert csv_rows[1]['__done'] == 'TODO', \ + "Crash simulation didn't mark run as incomplete" + + # Step 3: Re-run experiment (should continue from run 1) + result2 = self.run_experiment( + config_path="test-standalone/core/shuffling", + results_dir=temp_dir + ) + assert result2.success, f"Recovery run failed: {result2.stderr}" + + # Step 4: Verify all runs are now complete + csv_rows = self.read_csv_as_dicts(exp_dir / "run_table.csv") + for row in csv_rows: + assert row['__done'] == 'DONE', \ + f"Run not completed: {row}" + + +# ============================================================================ +# DEMONSTRATION: How fixtures work +# ============================================================================ + +class TestFixtureDemonstration: + """ + This class shows HOW FIXTURES WORK in pytest + + Fixtures are like setUp() but more powerful. + They can: + - Provide test data + - Create temporary resources + - Handle cleanup automatically + """ + + def test_temp_dir_fixture(self, temp_dir): + """ + This test receives 'temp_dir' fixture automatically. + + Pytest: + 1. Creates temp directory + 2. Passes it to this function as 'temp_dir' parameter + 3. Runs this test + 4. Cleans up temp directory + 5. Test done! 
+ """ + # temp_dir is a Path object pointing to fresh directory + assert temp_dir.is_dir() + assert len(list(temp_dir.iterdir())) == 0 # Empty + + # Create a file + test_file = temp_dir / "test.txt" + test_file.write_text("Hello!") + + # Verify it exists + assert test_file.exists() + + # After this test ends, temp_dir is automatically deleted + + + def test_experiment_output_dir_fixture(self, experiment_output_dir): + """ + This test receives 'experiment_output_dir' fixture. + + This fixture creates the directory structure that + Experiment Runner expects: + experiments/ + โ””โ”€โ”€ my_experiment/ + """ + # The fixture creates experiments/ directory + assert experiment_output_dir.exists() + assert experiment_output_dir.parent.name == "experiments" + + +# ============================================================================ +# ADVANCED: Parameterized tests +# ============================================================================ + +class TestParameterized(SystemExperimentTest): + """ + Parameterized tests run the same test with different inputs. + + WHY: Avoid writing the same test multiple times with different configs. + One test function runs multiple times with different parameters. + """ + + @pytest.mark.parametrize("example_name", [ + "hello-world", + "hello-world-fibonacci", + ]) + @pytest.mark.system + def test_all_examples_run(self, example_name, temp_dir): + """ + This test runs TWICE: + - Once with example_name="hello-world" + - Once with example_name="hello-world-fibonacci" + + PYTEST PARAMETRIZE SYNTAX: + @pytest.mark.parametrize("param_name", [list of values]) + def test_something(param_name, other_fixtures): + ... 
+ + BENEFIT: + - DRY (Don't Repeat Yourself) + - Easier to add new test cases + - Clear pass/fail for each variant + """ + result = self.run_experiment( + config_path=f"examples/{example_name}", + results_dir=temp_dir + ) + assert result.success, f"{example_name} failed: {result.stderr}" diff --git a/test/system/validators/__init__.py b/test/system/validators/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/unit/__init__.py b/test/unit/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/validate_local_test_setup.py b/validate_local_test_setup.py new file mode 100644 index 000000000..e7b31f5a3 --- /dev/null +++ b/validate_local_test_setup.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 +""" +Validate local SSH setup for distributed testing + +Usage: + python validate_local_test_setup.py + python validate_local_test_setup.py --full +""" +import subprocess +import sys +from pathlib import Path + +def run_cmd(cmd, timeout=5): + """Run command and return (rc, stdout, stderr)""" + try: + result = subprocess.run( + cmd, + shell=True, + capture_output=True, + text=True, + timeout=timeout + ) + return result.returncode, result.stdout, result.stderr + except subprocess.TimeoutExpired: + return -1, "", "Timeout" + except Exception as e: + return -1, "", str(e) + +def check_ssh_server(): + """Check if SSH server is running""" + print("Checking SSH server...") + rc, _, _ = run_cmd("sudo service ssh status") + if rc == 0: + print(" โœ“ SSH server is running") + return True + else: + print(" โœ— SSH server NOT running") + print(" Fix with: sudo service ssh start") + return False + +def check_ssh_keys(): + """Check if SSH keys exist""" + print("Checking SSH keys...") + key_path = Path.home() / ".ssh" / "id_rsa" + if key_path.exists(): + print(" โœ“ SSH key exists") + return True + else: + print(" โœ— SSH key NOT found") + print(" Fix with: ssh-keygen -t rsa -b 4096 -f ~/.ssh/id_rsa -N \"\"") + return False + +def 
check_localhost_ssh(port=22): + """Check SSH to localhost""" + print(f"Checking SSH localhost:{port}...") + rc, stdout, stderr = run_cmd(f"ssh -p {port} -o ConnectTimeout=3 localhost 'echo OK'") + if rc == 0 and "OK" in stdout: + print(f" โœ“ SSH localhost:{port} working") + return True + else: + print(f" โœ— SSH localhost:{port} FAILED") + if "refused" in stderr or "Connection refused" in stderr: + print(f" Port {port} not listening on SSH") + elif "Permission denied" in stderr: + print(f" Permission denied - check SSH keys") + else: + print(f" Error: {stderr}") + return False + +def check_ports_configured(): + """Check if sshd_config has multiple ports""" + print("Checking SSH config for multiple ports...") + rc, stdout, _ = run_cmd("grep -E '^Port ' /etc/ssh/sshd_config || true") + ports = [] + for line in stdout.split('\n'): + if line.strip().startswith('Port '): + port = line.strip().split()[-1] + ports.append(port) + + if len(ports) >= 4: # Should have 22, 2201, 2202, 2203 + print(f" โœ“ Found {len(ports)} ports configured: {', '.join(ports)}") + return ports + else: + print(f" โš  Found {len(ports)} port(s): {', '.join(ports) if ports else 'none'}") + print(" Configure /etc/ssh/sshd_config with:") + print(" Port 22") + print(" Port 2201") + print(" Port 2202") + print(" Port 2203") + print(" Then: sudo service ssh restart") + return ports + +def check_all_test_ports(ports=[2201, 2202, 2203]): + """Check if all test ports are accessible""" + print(f"Checking test ports: {', '.join(map(str, ports))}...") + all_ok = True + for port in ports: + if check_localhost_ssh(port): + pass # Already printed + else: + all_ok = False + return all_ok + +def main(): + print("=" * 50) + print("Distributed Testing - Setup Validation") + print("=" * 50) + print() + + checks = [ + ("SSH Server", check_ssh_server), + ("SSH Keys", check_ssh_keys), + ("Default SSH (port 22)", lambda: check_localhost_ssh(22)), + ] + + results = [] + for name, check_fn in checks: + try: + result = 
check_fn() + results.append((name, result)) + except Exception as e: + print(f" โœ— Error: {e}") + results.append((name, False)) + print() + + # Check ports + ports = check_ports_configured() + print() + + # Full test mode + if "--full" in sys.argv and len(ports) >= 4: + print("Running full port connectivity test...") + print() + if check_all_test_ports(): + print() + print("=" * 50) + print("โœ“ All checks passed!") + print("=" * 50) + print() + print("Ready to test distributed execution:") + print() + print(" python experiment-runner/ test_distributed_config.py \\") + print(" --distribute \"localhost:2201,localhost:2202,localhost:2203\"") + print() + return 0 + + # Summary + print("=" * 50) + passed = sum(1 for _, r in results if r) + total = len(results) + print(f"Setup Status: {passed}/{total} checks passed") + print("=" * 50) + print() + + if passed < total: + print("โš  Some checks failed - follow the fixes above") + return 1 + elif len(ports) < 4: + print("โš  Test ports not configured yet") + print(" Run 'sudo nano /etc/ssh/sshd_config' and add:") + print(" Port 2201") + print(" Port 2202") + print(" Port 2203") + print(" Then: sudo service ssh restart") + print() + print(" After that, run with --full flag:") + print(" python validate_local_test_setup.py --full") + return 1 + else: + print("โœ“ Basic setup looks good") + print(" Run with --full flag to test all ports:") + print(" python validate_local_test_setup.py --full") + return 0 + +if __name__ == "__main__": + sys.exit(main()) From 556263bbdac99467b1eb9648ddd3da2b79430878 Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Fri, 8 May 2026 15:56:48 +0200 Subject: [PATCH 04/30] j --- examples/hello-world-fibonacci/RunnerConfig.py | 4 ++-- experiment-runner/__main__.py | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/examples/hello-world-fibonacci/RunnerConfig.py b/examples/hello-world-fibonacci/RunnerConfig.py index bd066494b..d4266ae73 100644 --- 
a/examples/hello-world-fibonacci/RunnerConfig.py +++ b/examples/hello-world-fibonacci/RunnerConfig.py @@ -16,6 +16,7 @@ import time import subprocess import shlex +import sys class RunnerConfig: ROOT_DIR = Path(dirname(realpath(__file__))) @@ -98,8 +99,7 @@ def start_measurement(self, context: RunnerContext) -> None: --max-execution 20 \ --output {context.run_dir / "energibridge.csv"} \ --summary \ - python examples/hello-world-fibonacci/fibonacci_{fib_type}.py {problem_size}' - + {sys.executable} examples/hello-world-fibonacci/fibonacci_{fib_type}.py {problem_size}' energibridge_log = open(f'{context.run_dir}/energibridge.log', 'w') self.profiler = subprocess.Popen(shlex.split(profiler_cmd), stdout=energibridge_log) diff --git a/experiment-runner/__main__.py b/experiment-runner/__main__.py index 63184f51a..edca44553 100644 --- a/experiment-runner/__main__.py +++ b/experiment-runner/__main__.py @@ -55,11 +55,8 @@ def calc_ast_md5sum(src, name): # Ignore docstring if isinstance(node, (ast.AsyncFunctionDef, ast.FunctionDef, ast.ClassDef, ast.Module)) and ast.get_docstring(node) is not None: docstring_node = node.body[0].value - if isinstance(docstring_node, ast.Str): - docstring_node.s = '' - elif isinstance(docstring_node, ast.Constant) and isinstance(docstring_node.value, str): + if isinstance(docstring_node, ast.Constant) and isinstance(docstring_node.value, str): docstring_node.value = '' - return hashlib.md5(pickle.dumps(tree)).digest() From b91d97c1e0ef805c3ed485feae63a2ee381304c4 Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Tue, 12 May 2026 10:49:00 +0200 Subject: [PATCH 05/30] remote-missing_shuttingdown --- .../hello-world-fibonacci/RunnerConfig.py | 236 +++++++++++++----- .../DistributedMasterOrchestrator.py | 59 +++-- .../DistributedExecution/Worker.py | 154 +++++++----- .../Experiment/Run/RunController.py | 8 +- 4 files changed, 299 insertions(+), 158 deletions(-) diff --git a/examples/hello-world-fibonacci/RunnerConfig.py 
b/examples/hello-world-fibonacci/RunnerConfig.py index d4266ae73..6da601534 100644 --- a/examples/hello-world-fibonacci/RunnerConfig.py +++ b/examples/hello-world-fibonacci/RunnerConfig.py @@ -6,139 +6,239 @@ from ConfigValidator.Config.Models.OperationType import OperationType from ProgressManager.Output.OutputProcedure import OutputProcedure as output -from typing import Dict, List, Any, Optional +from typing import Dict, Any, Optional from pathlib import Path from os.path import dirname, realpath import os -import signal import pandas as pd import time import subprocess import shlex import sys + class RunnerConfig: ROOT_DIR = Path(dirname(realpath(__file__))) - # ================================ USER SPECIFIC CONFIG ================================ - """The name of the experiment.""" - name: str = "new_runner_experiment" + name: str = "new_runner_experiment" - """The path in which Experiment Runner will create a folder with the name `self.name`, in order to store the - results from this experiment. (Path does not need to exist - it will be created if necessary.) - Output path defaults to the config file's path, inside the folder 'experiments'""" default_output = ROOT_DIR / "experiments" - results_output_path: Path = Path(os.getenv("EXPERIMENT_RUNNER_OUTPUT_PATH", str(default_output))) - """Experiment operation type. Unless you manually want to initiate each run, use `OperationType.AUTO`.""" - operation_type: OperationType = OperationType.AUTO + results_output_path: Path = Path( + os.getenv("EXPERIMENT_RUNNER_OUTPUT_PATH", str(default_output)) + ) + + operation_type: OperationType = OperationType.AUTO + + time_between_runs_in_ms: int = 1000 - """The time Experiment Runner will wait after a run completes. 
- This can be essential to accommodate for cooldown periods on some systems.""" - time_between_runs_in_ms: int = 1000 + ENERGIBRIDGE_PATH = "/home/andabarbu/.cargo/bin/energibridge" - # Dynamic configurations can be one-time satisfied here before the program takes the config as-is - # e.g. Setting some variable based on some criteria def __init__(self): - """Executes immediately after program start, on config load""" EventSubscriptionController.subscribe_to_multiple_events([ (RunnerEvents.BEFORE_EXPERIMENT, self.before_experiment), - (RunnerEvents.BEFORE_RUN , self.before_run ), - (RunnerEvents.START_RUN , self.start_run ), + (RunnerEvents.BEFORE_RUN, self.before_run), + (RunnerEvents.START_RUN, self.start_run), (RunnerEvents.START_MEASUREMENT, self.start_measurement), - (RunnerEvents.INTERACT , self.interact ), - (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ), - (RunnerEvents.STOP_RUN , self.stop_run ), + (RunnerEvents.INTERACT, self.interact), + (RunnerEvents.STOP_MEASUREMENT, self.stop_measurement), + (RunnerEvents.STOP_RUN, self.stop_run), (RunnerEvents.POPULATE_RUN_DATA, self.populate_run_data), - (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment ) + (RunnerEvents.AFTER_EXPERIMENT, self.after_experiment) ]) - self.run_table_model = None # Initialized later + + self.run_table_model = None + self.profiler = None + output.console_log("Custom config loaded") def create_run_table_model(self) -> RunTableModel: - """Create and return the run_table model here. 
A run_table is a List (rows) of tuples (columns), - representing each run performed""" + factor1 = FactorModel("fib_type", ['iter', 'mem', 'rec']) factor2 = FactorModel("problem_size", [10, 35, 40, 5000, 10000]) + self.run_table_model = RunTableModel( factors=[factor1, factor2], exclude_combinations=[ - {factor2: [10]}, # all runs having treatment "10" will be excluded + {factor2: [10]}, {factor1: ['rec'], factor2: [5000, 10000]}, - {factor1: ['mem', 'iter'], factor2: [35, 40]}, # all runs having the combination ("iter", 30) will be excluded + {factor1: ['mem', 'iter'], factor2: [35, 40]}, ], - repetitions = 10, - data_columns=["energy", "runtime", "memory"] + repetitions=10, + + # IMPORTANT: + data_columns=[ + "cpu_energy", + "core0_energy", + "core1_energy", + "core2_energy", + "core3_energy", + "core4_energy", + "core5_energy", + "core6_energy", + "core7_energy" + ] ) + return self.run_table_model def before_experiment(self) -> None: - """Perform any activity required before starting the experiment here - Invoked only once during the lifetime of the program.""" pass def before_run(self) -> None: - """Perform any activity required before starting a run. - No context is available here as the run is not yet active (BEFORE RUN)""" pass def start_run(self, context: RunnerContext) -> None: - """Perform any activity required for starting the run here. - For example, starting the target system to measure. 
- Activities after starting the run should also be performed here.""" pass def start_measurement(self, context: RunnerContext) -> None: - """Perform any activity required for starting measurements.""" + fib_type = context.execute_run["fib_type"] problem_size = context.execute_run["problem_size"] - profiler_cmd = f'sudo energibridge \ - --max-execution 20 \ - --output {context.run_dir / "energibridge.csv"} \ - --summary \ - {sys.executable} examples/hello-world-fibonacci/fibonacci_{fib_type}.py {problem_size}' - energibridge_log = open(f'{context.run_dir}/energibridge.log', 'w') - self.profiler = subprocess.Popen(shlex.split(profiler_cmd), stdout=energibridge_log) + output_csv = context.run_dir / "energibridge.csv" + + profiler_cmd = ( + f'{self.ENERGIBRIDGE_PATH} ' + f'--max-execution 20 ' + f'--output {output_csv} ' + f'--summary ' + f'{sys.executable} ' + f'examples/hello-world-fibonacci/fibonacci_{fib_type}.py ' + f'{problem_size}' + ) + + output.console_log(f"Running: {profiler_cmd}") + + energibridge_log = open( + context.run_dir / "energibridge.log", + "w" + ) + + self.profiler = subprocess.Popen( + shlex.split(profiler_cmd), + stdout=energibridge_log, + stderr=energibridge_log, + cwd=str(self.ROOT_DIR.parent.parent) + ) def interact(self, context: RunnerContext) -> None: - """Perform any interaction with the running target system here, or block here until the target finishes.""" - # No interaction. We just run it for XX seconds. - # Another example would be to wait for the target to finish, e.g. via `self.target.wait()` output.console_log("Running program for 20 seconds") + time.sleep(20) def stop_measurement(self, context: RunnerContext) -> None: - """Perform any activity here required for stopping measurements.""" - self.profiler.wait() + + if self.profiler: + self.profiler.wait() def stop_run(self, context: RunnerContext) -> None: - """Perform any activity here required for stopping the run. 
- Activities after stopping the run should also be performed here.""" pass - - def populate_run_data(self, context: RunnerContext) -> Optional[Dict[str, Any]]: - """Parse and process any measurement data here. - You can also store the raw measurement data under `context.run_dir` - Returns a dictionary with keys `self.run_table_model.data_columns` and their values populated""" - - # energibridge.csv - Power consumption of the whole system - df = pd.read_csv(context.run_dir / f"energibridge.csv") + + def populate_run_data( + self, + context: RunnerContext + ) -> Optional[Dict[str, Any]]: + + csv_path = context.run_dir / "energibridge.csv" + + if not csv_path.exists(): + output.console_log(f"CSV missing: {csv_path}") + return None + + if csv_path.stat().st_size == 0: + output.console_log("CSV empty") + return None + + try: + df = pd.read_csv(csv_path) + + except Exception as e: + output.console_log(f"CSV read error: {e}") + return None + + required_columns = [ + "CPU_ENERGY (J)", + "CORE0_ENERGY (J)", + "CORE1_ENERGY (J)", + "CORE2_ENERGY (J)", + "CORE3_ENERGY (J)", + "CORE4_ENERGY (J)", + "CORE5_ENERGY (J)", + "CORE6_ENERGY (J)", + "CORE7_ENERGY (J)" + ] + + for col in required_columns: + if col not in df.columns: + output.console_log(f"Missing column: {col}") + return None + run_data = { - 'dram_energy': round(df['DRAM_ENERGY (J)'].iloc[-1] - df['DRAM_ENERGY (J)'].iloc[0], 3), - 'package_energy': round(df['PACKAGE_ENERGY (J)'].iloc[-1] - df['PACKAGE_ENERGY (J)'].iloc[0], 3), - 'pp0_energy': round(df['PP0_ENERGY (J)'].iloc[-1] - df['PP0_ENERGY (J)'].iloc[0], 3), - 'pp1_energy': round(df['PP1_ENERGY (J)'].iloc[-1] - df['PP1_ENERGY (J)'].iloc[0], 3), + "cpu_energy": round( + df["CPU_ENERGY (J)"].iloc[-1] + - df["CPU_ENERGY (J)"].iloc[0], + 3 + ), + + "core0_energy": round( + df["CORE0_ENERGY (J)"].iloc[-1] + - df["CORE0_ENERGY (J)"].iloc[0], + 3 + ), + + "core1_energy": round( + df["CORE1_ENERGY (J)"].iloc[-1] + - df["CORE1_ENERGY (J)"].iloc[0], + 3 + ), + + 
"core2_energy": round( + df["CORE2_ENERGY (J)"].iloc[-1] + - df["CORE2_ENERGY (J)"].iloc[0], + 3 + ), + + "core3_energy": round( + df["CORE3_ENERGY (J)"].iloc[-1] + - df["CORE3_ENERGY (J)"].iloc[0], + 3 + ), + + "core4_energy": round( + df["CORE4_ENERGY (J)"].iloc[-1] + - df["CORE4_ENERGY (J)"].iloc[0], + 3 + ), + + "core5_energy": round( + df["CORE5_ENERGY (J)"].iloc[-1] + - df["CORE5_ENERGY (J)"].iloc[0], + 3 + ), + + "core6_energy": round( + df["CORE6_ENERGY (J)"].iloc[-1] + - df["CORE6_ENERGY (J)"].iloc[0], + 3 + ), + + "core7_energy": round( + df["CORE7_ENERGY (J)"].iloc[-1] + - df["CORE7_ENERGY (J)"].iloc[0], + 3 + ) } + + output.console_log(f"Run data: {run_data}") + return run_data + def after_experiment(self) -> None: - """Perform any activity required after stopping the experiment here - Invoked only once during the lifetime of the program.""" pass - # ================================ DO NOT ALTER BELOW THIS LINE ================================ - experiment_path: Path = None + experiment_path: Path = None \ No newline at end of file diff --git a/experiment-runner/DistributedExecution/DistributedMasterOrchestrator.py b/experiment-runner/DistributedExecution/DistributedMasterOrchestrator.py index f1ae11105..e43e52c92 100644 --- a/experiment-runner/DistributedExecution/DistributedMasterOrchestrator.py +++ b/experiment-runner/DistributedExecution/DistributedMasterOrchestrator.py @@ -1,33 +1,43 @@ from flask import Flask, request, jsonify import threading import time +from pathlib import Path +import pandas as pd +import os + from ProgressManager.RunTable.Models.RunProgress import RunProgress import threading from ProgressManager.RunTable.Models.RunProgress import RunProgress - class TaskManager: - def __init__(self, run_table): + def __init__(self, run_table, experiment_path: Path): self.run_table = run_table + self.experiment_path = experiment_path self.assigned_runs = {} self.total_runs = len(run_table) - - self.lock = threading.Lock() # โœ… FIXED (you 
were missing this) + self.lock = threading.Lock() def get_next_task(self, agent_id): with self.lock: for idx, run in enumerate(self.run_table): if run['__done'] == RunProgress.TODO: + run_id = run["__run_id"] + + run_dir = self.experiment_path / str(run_id) + run_dir.mkdir(parents=True, exist_ok=True) + run['__done'] = "RUNNING" run['agent_id'] = agent_id run['__current_run'] = idx run['__total_runs'] = self.total_runs - self.assigned_runs[run['__run_id']] = agent_id + run["run_dir"] = str(run_dir) + + self.assigned_runs[run_id] = agent_id return run @@ -37,29 +47,21 @@ def complete_task(self, run_id, data): with self.lock: for run in self.run_table: if run["__run_id"] == run_id: + if data: + for k, v in data.items(): + run[k] = v - run.update(data) run["__done"] = RunProgress.DONE self.assigned_runs.pop(run_id, None) + + pd.DataFrame(self.run_table).to_csv( + self.experiment_path / "run_table.csv", + index=False + ) return - - def reset_tasks_for_agent(self, agent_id): - """๐Ÿ”ฅ IMPORTANT: recovery function""" - with self.lock: - for run in self.run_table: - if run.get("agent_id") == agent_id and run["__done"] == "RUNNING": - run["__done"] = RunProgress.TODO - run["agent_id"] = None - - self.assigned_runs = { - k: v for k, v in self.assigned_runs.items() - if v != agent_id - } - -class APIServer: - """Flask API server for distributed task management""" - + +class APIServer: def __init__(self, task_manager, worker_monitor): self.app = Flask(__name__) self.task_manager = task_manager @@ -79,7 +81,7 @@ def submit_result(): payload = request.get_json() run_id = payload.get('run_id') - run_data = payload.get('data', {}) # โœ… extract correctly + run_data = payload.get('data', {}) status = payload.get('status') if status == "FAILED": @@ -151,10 +153,15 @@ def __init__(self, config, metadata, host="0.0.0.0", port=5000): self.host = host self.port = port + self.experiment_path = config.results_output_path / config.name + self.experiment_path.mkdir(parents=True, 
exist_ok=True) + run_table = config.create_run_table_model().generate_experiment_run_table() - - self.task_manager = TaskManager(run_table) + self.run_table_path = self.experiment_path / "run_table.csv" + pd.DataFrame(run_table).to_csv(self.run_table_path, index=False) + + self.task_manager = TaskManager(run_table, self.experiment_path) self.monitor = WorkerMonitor(self.task_manager) self.api = APIServer(self.task_manager, self.monitor) diff --git a/experiment-runner/DistributedExecution/Worker.py b/experiment-runner/DistributedExecution/Worker.py index bbe781b4c..6d25d0c2e 100644 --- a/experiment-runner/DistributedExecution/Worker.py +++ b/experiment-runner/DistributedExecution/Worker.py @@ -1,46 +1,61 @@ import threading import time import requests +import numpy as np +from enum import Enum from ExperimentOrchestrator.Experiment.Run.RunController import RunController class WorkerRuntime: + @staticmethod + def make_json_safe(obj): + if isinstance(obj, dict): + return {k: WorkerRuntime.make_json_safe(v) for k, v in obj.items()} + + if isinstance(obj, list): + return [WorkerRuntime.make_json_safe(v) for v in obj] + + if isinstance(obj, np.generic): + return obj.item() + + if isinstance(obj, Enum): + return obj.value + + return obj + + def __init__(self, master_url, heartbeat_interval=40, idle_timeout=120): self.master_url = master_url self.heartbeat_interval = heartbeat_interval - self.idle_timeout = idle_timeout # Exit after N seconds with no tasks + self.idle_timeout = idle_timeout self._stop = False self.current_run = None self.agent_id = None self.last_task_time = None - # ========================= - # MAIN LOOP - # ========================= def run_loop(self, agent_id, config): self.agent_id = agent_id self.last_task_time = time.time() + print(f"[WORKER] Starting with agent_id: {self.agent_id}") print(f"[WORKER] Master URL: {self.master_url}") - # start heartbeat thread threading.Thread(target=self._heartbeat_loop, daemon=True).start() - print(f"[WORKER] 
Heartbeat thread started") - print(f"[WORKER] Waiting for tasks (will exit after {self.idle_timeout}s of inactivity)...") + + print("[WORKER] Heartbeat thread started") + print(f"[WORKER] Waiting for tasks (idle timeout {self.idle_timeout}s)") while True: task = self._get_task() if not task: - # Check if we've been idle too long - idle_time = time.time() - self.last_task_time - if idle_time > self.idle_timeout: - print(f"[WORKER] No tasks for {self.idle_timeout}s - exiting") + if time.time() - self.last_task_time > self.idle_timeout: + print("[WORKER] Idle timeout reached - exiting") break - + self.current_run = None time.sleep(3) continue @@ -48,21 +63,20 @@ def run_loop(self, agent_id, config): self.last_task_time = time.time() self.current_run = task + run_id = task["__run_id"] + try: - result = self._execute(task, config) - self._send_result(task["__run_id"], result) + run_data = self._execute(task, config) + self._send_result(run_id, run_data) except Exception as e: - self._send_failure(task["__run_id"], str(e)) + self._send_failure(run_id, str(e)) finally: self.current_run = None - + print(f"[WORKER] Worker {self.agent_id} exiting") - # ========================= - # TASK FETCH - # ========================= def _get_task(self): try: r = requests.get( @@ -70,75 +84,89 @@ def _get_task(self): params={"agent_id": self.agent_id}, timeout=5 ) + task = r.json().get("run") + if task: - print(f"[WORKER] Got task: {task.get('__run_id', 'unknown')}") + print(f"[WORKER] Got task: {task.get('__run_id')}") + return task - except requests.exceptions.Timeout: - print(f"[WORKER] Task request timeout (master not responding)") - return None + except Exception as e: print(f"[WORKER] Error getting task: {e}") return None - # ========================= - # EXECUTION - # ========================= def _execute(self, run, config): print(f"[WORKER] Executing task {run.get('__run_id')}") + current_run = run.get('__current_run', 0) total_runs = run.get('__total_runs', 1) - - try: - 
controller = RunController(run, config, current_run, total_runs) - controller.do_run() - print(f"[WORKER] Task {run.get('__run_id')} completed successfully") - return run # updated in-place - except Exception as e: - print(f"[WORKER] Task {run.get('__run_id')} failed with error: {type(e).__name__}: {e}") - import traceback - traceback.print_exc() - raise - - # ========================= - # RESULT - # ========================= + + controller = RunController(run, config, current_run, total_runs) + controller.data_manager = None + result = controller.do_run() # MUST return dict + + print(f"[WORKER] Task {run.get('__run_id')} completed") + + return result + def _send_result(self, run_id, data): try: - requests.post(self.master_url + "/result", json={ + safe_data = WorkerRuntime.make_json_safe(data) + + payload = { "run_id": run_id, - "data": data, - "status": "COMPLETED" - }, timeout=5) + "data": safe_data, + "status": "DONE" + } + + response = requests.post( + self.master_url + "/result", + json=payload, + timeout=10 + ) + + response.raise_for_status() + print(f"[WORKER] Result sent for task {run_id}") + + except requests.exceptions.RequestException as e: + print(f"[WORKER] Network error sending result: {e}") + except Exception as e: - print(f"[WORKER] Error sending result: {e}") + print(f"[WORKER] Unexpected error: {e}") def _send_failure(self, run_id, error): try: - requests.post(self.master_url + "/result", json={ - "run_id": run_id, - "status": "FAILED", - "error": error - }, timeout=5) - print(f"[WORKER] Task {run_id} failed: {error}") + requests.post( + self.master_url + "/result", + json={ + "run_id": run_id, + "status": "FAILED", + "error": error + }, + timeout=10 + ) + + print(f"[WORKER] Failure sent for {run_id}") + except Exception as e: print(f"[WORKER] Error sending failure: {e}") - # ========================= - # HEARTBEAT - # ========================= def _heartbeat_loop(self): while not self._stop: try: - requests.post(self.master_url + "/heartbeat", 
json={ - "agent_id": self.agent_id, - "status": "RUNNING" if self.current_run else "IDLE", - "run_id": self.current_run["__run_id"] if self.current_run else None, - "timestamp": time.time() - }, timeout=5) - except requests.exceptions.Timeout: - print(f"[WORKER] Heartbeat timeout") + requests.post( + self.master_url + "/heartbeat", + json={ + "agent_id": self.agent_id, + "status": "RUNNING" if self.current_run else "IDLE", + "run_id": self.current_run["__run_id"] if self.current_run else None, + "timestamp": time.time() + }, + timeout=5 + ) + except Exception as e: print(f"[WORKER] Heartbeat error: {e}") diff --git a/experiment-runner/ExperimentOrchestrator/Experiment/Run/RunController.py b/experiment-runner/ExperimentOrchestrator/Experiment/Run/RunController.py index 4e0a8e041..034807cf5 100644 --- a/experiment-runner/ExperimentOrchestrator/Experiment/Run/RunController.py +++ b/experiment-runner/ExperimentOrchestrator/Experiment/Run/RunController.py @@ -91,4 +91,10 @@ def do_run(self): updated_run_data = self.run_context.execute_run updated_run_data['__done'] = RunProgress.DONE - self.data_manager.update_row_data(updated_run_data) + #self.data_manager.update_row_data(updated_run_data) + #return updated_run_data + + if self.data_manager: + self.data_manager.update_row_data(updated_run_data) + + return updated_run_data \ No newline at end of file From f0717f1890dbb89adb20e48038afa8df0721a605 Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Fri, 15 May 2026 14:43:40 +0200 Subject: [PATCH 06/30] remote_mode_before_testing --- .../DistributedMasterOrchestrator.py | 202 +++++++++++++----- .../DistributedExecution/Worker.py | 50 +---- .../Experiment/Run/IRunController.py | 3 +- .../Experiment/Run/RunController.py | 6 +- .../Output/CSVOutputManager.py | 10 +- .../RunTable/Models/RunProgress.py | 3 +- 6 files changed, 173 insertions(+), 101 deletions(-) diff --git a/experiment-runner/DistributedExecution/DistributedMasterOrchestrator.py 
b/experiment-runner/DistributedExecution/DistributedMasterOrchestrator.py index e43e52c92..6d9a24f8e 100644 --- a/experiment-runner/DistributedExecution/DistributedMasterOrchestrator.py +++ b/experiment-runner/DistributedExecution/DistributedMasterOrchestrator.py @@ -1,3 +1,8 @@ +from ProgressManager.RunTable.Models.RunProgress import RunProgress +from ProgressManager.Output.CSVOutputManager import CSVOutputManager +from EventManager.Models.RunnerEvents import RunnerEvents +from EventManager.EventSubscriptionController import EventSubscriptionController + from flask import Flask, request, jsonify import threading import time @@ -5,11 +10,6 @@ import pandas as pd import os -from ProgressManager.RunTable.Models.RunProgress import RunProgress - -import threading -from ProgressManager.RunTable.Models.RunProgress import RunProgress - class TaskManager: def __init__(self, run_table, experiment_path: Path): @@ -18,93 +18,156 @@ def __init__(self, run_table, experiment_path: Path): self.assigned_runs = {} self.total_runs = len(run_table) self.lock = threading.Lock() + self.csv_manager = CSVOutputManager(experiment_path) + self.completed = False def get_next_task(self, agent_id): with self.lock: + + # If experiment already completed + if self.completed: + return None + for idx, run in enumerate(self.run_table): if run['__done'] == RunProgress.TODO: - run_id = run["__run_id"] run_dir = self.experiment_path / str(run_id) run_dir.mkdir(parents=True, exist_ok=True) - run['__done'] = "RUNNING" + run['__done'] = RunProgress.RUNNING run['agent_id'] = agent_id - run['__current_run'] = idx + run['__current_run'] = idx + 1 run['__total_runs'] = self.total_runs - run["run_dir"] = str(run_dir) self.assigned_runs[run_id] = agent_id + self.csv_manager.write_run_table(self.run_table) - return run + task = run.copy() + task['__done'] = task['__done'].name - return None + print(f"[MASTER] Assigned {run_id} -> {agent_id}") + return task + return None def complete_task(self, run_id, data): 
with self.lock: for run in self.run_table: if run["__run_id"] == run_id: + # Merge returned data if data: for k, v in data.items(): run[k] = v - run["__done"] = RunProgress.DONE self.assigned_runs.pop(run_id, None) + self.csv_manager.write_run_table(self.run_table) + print(f"[MASTER] Completed run {run_id}") + break + + # Check if all runs are done + all_done = all( + run['__done'] == RunProgress.DONE + for run in self.run_table + ) + if all_done and not self.completed: + self.completed = True + print("\n[MASTER] ALL RUNS COMPLETED\n") + + # AFTER_EXPERIMENT hook + print("[MASTER] Calling AFTER_EXPERIMENT hook") + EventSubscriptionController.raise_event( + RunnerEvents.AFTER_EXPERIMENT + ) + shutdown_server() + + def restore_crashed_runs(self): + """ + If server restarts and finds RUNNING runs, + restore them to TODO. + """ + changed = False + + for run in self.run_table: + if run['__done'] == RunProgress.RUNNING: + run['__done'] = RunProgress.TODO + run['agent_id'] = None + changed = True + if changed: + print("[MASTER] Restored RUNNING -> TODO after restart") + self.csv_manager.write_run_table(self.run_table) + + def experiment_already_completed(self): + return all( + run['__done'] == RunProgress.DONE + for run in self.run_table + ) + + +class APIServer: - pd.DataFrame(self.run_table).to_csv( - self.experiment_path / "run_table.csv", - index=False - ) - return - -class APIServer: def __init__(self, task_manager, worker_monitor): self.app = Flask(__name__) self.task_manager = task_manager self.monitor = worker_monitor - - # Register endpoints + @self.app.route('/task', methods=['GET']) def get_task(): agent_id = request.args.get('agent_id') self.monitor.heartbeat(agent_id) - task = self.task_manager.get_next_task(agent_id) - return jsonify({"run": task if task else None}) - + return jsonify({ + "run": task if task else None + }) + @self.app.route('/result', methods=['POST']) def submit_result(): payload = request.get_json() - run_id = payload.get('run_id') - 
run_data = payload.get('data', {}) + run_data = payload.get('data', {}) status = payload.get('status') if status == "FAILED": - print(f"[MASTER] Run {run_id} failed: {payload.get('error')}") - - self.task_manager.complete_task(run_id, run_data) + print(f"[MASTER] Run failed: {run_id}") + print(payload.get('error')) + # Return run to TODO + for run in self.task_manager.run_table: + if run['__run_id'] == run_id: + run['__done'] = RunProgress.TODO + run['agent_id'] = None + self.task_manager.csv_manager.write_run_table( + self.task_manager.run_table + ) + else: + self.task_manager.complete_task(run_id, run_data) return jsonify({"status": "ok"}) - + @self.app.route('/heartbeat', methods=['POST']) def heartbeat(): data = request.get_json() agent_id = data.get('agent_id') self.monitor.heartbeat(agent_id) + return jsonify({"status": "ok"}) - + @self.app.route('/status', methods=['GET']) def status(): total_runs = len(self.task_manager.run_table) - todo_count = sum(1 for r in self.task_manager.run_table if r['__done'] == RunProgress.TODO) - running_count = sum(1 for r in self.task_manager.run_table if r['__done'] == "RUNNING") - done_count = sum(1 for r in self.task_manager.run_table if r['__done'] == RunProgress.DONE) - + todo_count = sum( + 1 for r in self.task_manager.run_table + if r['__done'] == RunProgress.TODO + ) + running_count = sum( + 1 for r in self.task_manager.run_table + if r['__done'] == RunProgress.RUNNING + ) + done_count = sum( + 1 for r in self.task_manager.run_table + if r['__done'] == RunProgress.DONE + ) return jsonify({ "status": "ok", "total_runs": total_runs, @@ -117,6 +180,7 @@ def status(): }) class WorkerMonitor: + def __init__(self, task_manager): self.heartbeats = {} self.task_manager = task_manager @@ -126,25 +190,32 @@ def heartbeat(self, agent_id): self.heartbeats[agent_id] = time.time() def monitor(self): - while True: + while not self.task_manager.completed: time.sleep(10) now = time.time() - dead = [ agent for agent, t in 
self.heartbeats.items() if now - t > self.timeout ] - for agent in dead: print(f"[MASTER] Worker {agent} dead") for run in self.task_manager.run_table: - if run.get("agent_id") == agent and run["__done"] != RunProgress.DONE: + if ( + run.get("agent_id") == agent + and run["__done"] != RunProgress.DONE + ): + print(f"[MASTER] Returning run " + f"{run['__run_id']} -> TODO") + run["__done"] = RunProgress.TODO run["agent_id"] = None - + self.task_manager.csv_manager.write_run_table( + self.task_manager.run_table + ) del self.heartbeats[agent] + class DistributedMasterOrchestrator: def __init__(self, config, metadata, host="0.0.0.0", port=5000): @@ -153,18 +224,51 @@ def __init__(self, config, metadata, host="0.0.0.0", port=5000): self.host = host self.port = port - self.experiment_path = config.results_output_path / config.name + self.experiment_path = (config.results_output_path / config.name) self.experiment_path.mkdir(parents=True, exist_ok=True) + self.run_table_path = (self.experiment_path / "run_table.csv") + + if self.run_table_path.exists(): + print("[MASTER] Existing experiment detected") + + csv_manager = CSVOutputManager(self.experiment_path) + run_table = csv_manager.read_run_table() + else: + print("[MASTER] Creating new experiment") + + run_table = (config.create_run_table_model().generate_experiment_run_table()) + + pd.DataFrame(run_table).to_csv(self.run_table_path, index=False) - run_table = config.create_run_table_model().generate_experiment_run_table() - - self.run_table_path = self.experiment_path / "run_table.csv" - pd.DataFrame(run_table).to_csv(self.run_table_path, index=False) - self.task_manager = TaskManager(run_table, self.experiment_path) + self.task_manager.restore_crashed_runs() + + if self.task_manager.experiment_already_completed(): + print("[MASTER] Experiment already completed") + + self.finished_before_start = True + else: + self.finished_before_start = False self.monitor = WorkerMonitor(self.task_manager) + self.api = 
APIServer(self.task_manager, self.monitor) - def start(self): - threading.Thread(target=self.monitor.monitor, daemon=True).start() - self.api.app.run(host=self.host, port=self.port) \ No newline at end of file + def start(self): + if self.finished_before_start: + return + + EventSubscriptionController.raise_event(RunnerEvents.BEFORE_EXPERIMENT) + + threading.Thread(target=self.monitor.monitor, daemon=True).start() + + print(f"[MASTER] Starting server " + f"on {self.host}:{self.port}") + + self.api.app.run(host=self.host, port=self.port, use_reloader=False) + + +def shutdown_server(): + func = request.environ.get('werkzeug.server.shutdown') + if func is None: + os._exit(0) + func() \ No newline at end of file diff --git a/experiment-runner/DistributedExecution/Worker.py b/experiment-runner/DistributedExecution/Worker.py index 6d25d0c2e..15b304be0 100644 --- a/experiment-runner/DistributedExecution/Worker.py +++ b/experiment-runner/DistributedExecution/Worker.py @@ -1,11 +1,11 @@ +from ExperimentOrchestrator.Experiment.Run.RunController import RunController + import threading import time import requests import numpy as np from enum import Enum -from ExperimentOrchestrator.Experiment.Run.RunController import RunController - class WorkerRuntime: @@ -13,19 +13,14 @@ class WorkerRuntime: def make_json_safe(obj): if isinstance(obj, dict): return {k: WorkerRuntime.make_json_safe(v) for k, v in obj.items()} - if isinstance(obj, list): return [WorkerRuntime.make_json_safe(v) for v in obj] - if isinstance(obj, np.generic): return obj.item() - if isinstance(obj, Enum): return obj.value - return obj - def __init__(self, master_url, heartbeat_interval=40, idle_timeout=120): self.master_url = master_url self.heartbeat_interval = heartbeat_interval @@ -50,12 +45,10 @@ def run_loop(self, agent_id, config): while True: task = self._get_task() - if not task: if time.time() - self.last_task_time > self.idle_timeout: print("[WORKER] Idle timeout reached - exiting") break - 
self.current_run = None time.sleep(3) continue @@ -68,23 +61,15 @@ def run_loop(self, agent_id, config): try: run_data = self._execute(task, config) self._send_result(run_id, run_data) - except Exception as e: self._send_failure(run_id, str(e)) - finally: self.current_run = None - print(f"[WORKER] Worker {self.agent_id} exiting") def _get_task(self): try: - r = requests.get( - self.master_url + "/task", - params={"agent_id": self.agent_id}, - timeout=5 - ) - + r = requests.get(self.master_url + "/task", params={"agent_id": self.agent_id}, timeout=5) task = r.json().get("run") if task: @@ -102,37 +87,25 @@ def _execute(self, run, config): current_run = run.get('__current_run', 0) total_runs = run.get('__total_runs', 1) - controller = RunController(run, config, current_run, total_runs) - controller.data_manager = None - result = controller.do_run() # MUST return dict + controller = RunController(run, config, current_run, total_runs, distributed_mode=True) + result = controller.do_run() print(f"[WORKER] Task {run.get('__run_id')} completed") - return result def _send_result(self, run_id, data): try: safe_data = WorkerRuntime.make_json_safe(data) - payload = { - "run_id": run_id, - "data": safe_data, - "status": "DONE" - } - - response = requests.post( - self.master_url + "/result", - json=payload, - timeout=10 - ) + payload = {"run_id": run_id, "data": safe_data, "status": "DONE"} + response = requests.post(self.master_url + "/result", json=payload, timeout=10) response.raise_for_status() print(f"[WORKER] Result sent for task {run_id}") except requests.exceptions.RequestException as e: print(f"[WORKER] Network error sending result: {e}") - except Exception as e: print(f"[WORKER] Unexpected error: {e}") @@ -140,14 +113,9 @@ def _send_failure(self, run_id, error): try: requests.post( self.master_url + "/result", - json={ - "run_id": run_id, - "status": "FAILED", - "error": error - }, + json={"run_id": run_id, "status": "FAILED", "error": error}, timeout=10 ) - 
print(f"[WORKER] Failure sent for {run_id}") except Exception as e: @@ -166,8 +134,6 @@ def _heartbeat_loop(self): }, timeout=5 ) - except Exception as e: print(f"[WORKER] Heartbeat error: {e}") - time.sleep(self.heartbeat_interval) \ No newline at end of file diff --git a/experiment-runner/ExperimentOrchestrator/Experiment/Run/IRunController.py b/experiment-runner/ExperimentOrchestrator/Experiment/Run/IRunController.py index d90cd8de3..144ecbd5e 100644 --- a/experiment-runner/ExperimentOrchestrator/Experiment/Run/IRunController.py +++ b/experiment-runner/ExperimentOrchestrator/Experiment/Run/IRunController.py @@ -16,7 +16,7 @@ class IRunController(ABC): run_context: RunnerContext = None data_manager: CSVOutputManager = None - def __init__(self, variation: Dict, config: RunnerConfig, current_run: int, total_runs: int): + def __init__(self, variation: Dict, config: RunnerConfig, current_run: int, total_runs: int, distributed_mode: bool = False): self.run_dir = config.experiment_path / variation['__run_id'] self.run_dir.mkdir(parents=True, exist_ok=True) @@ -25,6 +25,7 @@ def __init__(self, variation: Dict, config: RunnerConfig, current_run: int, tota self.current_run = current_run self.run_context = RunnerContext(self.variation, self.current_run, self.run_dir) self.data_manager = CSVOutputManager(self.config.experiment_path) + self.distributed_mode = distributed_mode self.run_completed_event = Event() diff --git a/experiment-runner/ExperimentOrchestrator/Experiment/Run/RunController.py b/experiment-runner/ExperimentOrchestrator/Experiment/Run/RunController.py index 034807cf5..ce7b004a3 100644 --- a/experiment-runner/ExperimentOrchestrator/Experiment/Run/RunController.py +++ b/experiment-runner/ExperimentOrchestrator/Experiment/Run/RunController.py @@ -91,10 +91,6 @@ def do_run(self): updated_run_data = self.run_context.execute_run updated_run_data['__done'] = RunProgress.DONE - #self.data_manager.update_row_data(updated_run_data) - #return updated_run_data - - if 
self.data_manager: + if not self.distributed_mode: self.data_manager.update_row_data(updated_run_data) - return updated_run_data \ No newline at end of file diff --git a/experiment-runner/ProgressManager/Output/CSVOutputManager.py b/experiment-runner/ProgressManager/Output/CSVOutputManager.py index 5e3233616..20a85f79a 100644 --- a/experiment-runner/ProgressManager/Output/CSVOutputManager.py +++ b/experiment-runner/ProgressManager/Output/CSVOutputManager.py @@ -38,10 +38,14 @@ def write_run_table(self, run_table: List[Dict]): with open(self._experiment_path / 'run_table.csv', 'w', newline='') as myfile: writer = csv.DictWriter(myfile, fieldnames=list(run_table[0].keys())) writer.writeheader() + for data in run_table: - data['__done'] = data['__done'].name - writer.writerow(data) - except: + row = data.copy() + + if isinstance(row['__done'], RunProgress): + row['__done'] = row['__done'].name + writer.writerow(row) + except Exception as e: raise ExperimentOutputFileDoesNotExistError # TODO: Nice To have diff --git a/experiment-runner/ProgressManager/RunTable/Models/RunProgress.py b/experiment-runner/ProgressManager/RunTable/Models/RunProgress.py index 08231b1a9..9e0fcd5b0 100644 --- a/experiment-runner/ProgressManager/RunTable/Models/RunProgress.py +++ b/experiment-runner/ProgressManager/RunTable/Models/RunProgress.py @@ -2,4 +2,5 @@ class RunProgress(Enum): TODO = 1 - DONE = 2 \ No newline at end of file + RUNNING = 2 + DONE = 3 \ No newline at end of file From 3a2d52e4ea7842f983026ab71ac152e3a1a88cf2 Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Mon, 18 May 2026 10:48:56 +0200 Subject: [PATCH 07/30] add coments to the remote --- Troubleshoating.md | 139 ++++++++++++++++++ ...estrator.py => DistributedOrchestrator.py} | 72 ++++++++- .../DistributedExecution/Worker.py | 26 +++- .../DistributedExecution/__init__.py | 4 +- experiment-runner/__main__.py | 4 +- 5 files changed, 233 insertions(+), 12 deletions(-) create mode 100644 Troubleshoating.md rename 
experiment-runner/DistributedExecution/{DistributedMasterOrchestrator.py => DistributedOrchestrator.py} (70%) diff --git a/Troubleshoating.md b/Troubleshoating.md new file mode 100644 index 000000000..e0cd4bc8b --- /dev/null +++ b/Troubleshoating.md @@ -0,0 +1,139 @@ +# Troubleshooting + +## 1. Python Package Installation Error + +When installing and setting up `experiment-runner`, one common issue is running: + +```bash +pip3 install -r requirments.txt +``` + +and getting the following error: + +```text +error: externally-managed-environment + +ร— This environment is externally managed +โ•ฐโ”€> To install Python packages system-wide, try apt install + python3-xyz +``` + +Some Linux distributions (especially Ubuntu 24+, Debian, and Fedora) protect the system Python installation to avoid breaking system packages. + +### Solution + +Run: + +```bash +pip3 install -r requirments.txt --break-system-packages +``` + +### Alternative + +Use a Python virtual environment: + +```bash +python3 -m venv venv +source venv/bin/activate +pip install -r requirements.txt +``` + +--- + +## 2. EnergiBridge / JoularCore Permission Error + +When using EnergiBridge or JoularCore on Linux systems (especially AMD CPUs), you may encounter the following error when running the experiment: + +```text +thread 'main' (33575) panicked at src/cpu/amd.rs:20:76: +called `Result::unwrap()` on an `Err` value: Os { code: 13, kind: PermissionDenied, message: "Permission denied" } +note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace +``` + +The Rust profiler is trying to access low-level CPU energy counters (MSR / RAPL interfaces), but Linux blocks access for normal users. + +### Solution + +#### 1. Load the MSR Kernel Module + +Run: + +```bash +sudo modprobe msr +``` + +Then verify the device exists: + +```bash +ls /dev/cpu/0/msr +``` + +Expected output: + +```text +/dev/cpu/0/msr +``` + +If the file does not exist, the kernel module did not load correctly. + +--- + +#### 2. 
Check MSR Permissions + +Run: + +```bash +ls -l /dev/cpu/0/msr +``` + +If you see something similar to: + +```text +crw------- 1 root root +``` + +then only the root user can access the CPU energy counters. + +--- + +#### 3. Grant Read Permissions + +Run: + +```bash +sudo chmod o+r /dev/cpu/*/msr +``` + +This temporarily allows non-root users to read the MSR registers. + +--- + +#### If Nothing Works + +Some Linux systems completely block low-level profiling access. + +Run: + +```bash +cat /proc/sys/kernel/perf_event_paranoid +``` + +If the value is: + +```text +2 +3 +4 +``` + +then Linux is blocking low-level performance counters. + +#### Temporary Fix + +Run: + +```bash +echo -1 | sudo tee /proc/sys/kernel/perf_event_paranoid +``` + +This temporarily lowers the kernel restrictions and allows profiling tools to access hardware counters. diff --git a/experiment-runner/DistributedExecution/DistributedMasterOrchestrator.py b/experiment-runner/DistributedExecution/DistributedOrchestrator.py similarity index 70% rename from experiment-runner/DistributedExecution/DistributedMasterOrchestrator.py rename to experiment-runner/DistributedExecution/DistributedOrchestrator.py index 6d9a24f8e..d43fc8173 100644 --- a/experiment-runner/DistributedExecution/DistributedMasterOrchestrator.py +++ b/experiment-runner/DistributedExecution/DistributedOrchestrator.py @@ -9,7 +9,20 @@ from pathlib import Path import pandas as pd import os - +from waitress import serve + +### ========================================================= +### | | +### | TaskManager | +### | - Assign available runs to connected workers | +### | - Update and persist run_table.csv state | +### | - Trigger AFTER_EXPERIMENT lifecycle event | +### | - Detect experiment completion | +### | | +### | *Any state modification to runs should happen | +### | through this class to avoid race conditions | +### | | +### ========================================================= class TaskManager: def __init__(self, run_table, 
experiment_path: Path): @@ -20,6 +33,7 @@ def __init__(self, run_table, experiment_path: Path): self.lock = threading.Lock() self.csv_manager = CSVOutputManager(experiment_path) self.completed = False + self.shutdown = False def get_next_task(self, agent_id): with self.lock: @@ -105,7 +119,21 @@ def experiment_already_completed(self): for run in self.run_table ) - +### ========================================================= +### | | +### | APIServer | +### | - Handles the communication between workers | +### | and the orchestrator | +### | - Handle task distribution requests | +### | - Receive completed experiment results | +### | - Handle worker heartbeat updates | +### | - Receive worker heartbeat updates | +### | - Provide experiment | +### | monitoring/status endpoint | +### | - Trigger orchestrator shutdown | +### | | +### | | +### ========================================================= class APIServer: def __init__(self, task_manager, worker_monitor): @@ -118,7 +146,13 @@ def get_task(): agent_id = request.args.get('agent_id') self.monitor.heartbeat(agent_id) task = self.task_manager.get_next_task(agent_id) + if self.task_manager.shutdown: + return jsonify({ + "shutdown": True, + "run": None + }) return jsonify({ + "shutdown": False, "run": task if task else None }) @@ -178,7 +212,23 @@ def status(): }, "active_agents": len(self.monitor.heartbeats) }) - + + @self.app.route('/shutdown', methods=['POST']) + def shutdown(): + shutdown_server() + return jsonify({"status": "shutting down"}) + +### ========================================================= +### | | +### | WorkerMonitor | +### | - Keeps track of connected workers | +### | - If a worker fails to send a heartbeat | +### | within the timeout period, it is considered | +### | dead | +### | - Return the assigment back to TODO | +### | | +### | | +### ========================================================= class WorkerMonitor: def __init__(self, task_manager): @@ -215,8 +265,19 @@ def monitor(self): 
) del self.heartbeats[agent] +### ========================================================= +### | | +### | DistributedOrchestrator | +### | - Initialize experiment infrastructure | +### | - Load or create run_table.csv | +### | - Restore interrupted experiments | +### | - Start monitoring threads | +### | - Start the API server | +### | | +### | | +### ========================================================= -class DistributedMasterOrchestrator: +class DistributedOrchestrator: def __init__(self, config, metadata, host="0.0.0.0", port=5000): self.config = config @@ -264,7 +325,8 @@ def start(self): print(f"[MASTER] Starting server " f"on {self.host}:{self.port}") - self.api.app.run(host=self.host, port=self.port, use_reloader=False) + #self.api.app.run(host=self.host, port=self.port, use_reloader=False) + serve(self.api.app, host=self.host, port=self.port) def shutdown_server(): diff --git a/experiment-runner/DistributedExecution/Worker.py b/experiment-runner/DistributedExecution/Worker.py index 15b304be0..f35ebf848 100644 --- a/experiment-runner/DistributedExecution/Worker.py +++ b/experiment-runner/DistributedExecution/Worker.py @@ -6,7 +6,18 @@ import numpy as np from enum import Enum - +### ========================================================= +### | | +### | WorkerRuntime | +### | | +### | - Connect to the master orchestrator | +### | - Request experiment runs/tasks | +### | - Execute runs locally | +### | - Send results back to the master | +### | - Send periodic heartbeat updates | +### | - Gracefully shutdown on master request | +### | | +### ========================================================= class WorkerRuntime: @staticmethod @@ -43,8 +54,12 @@ def run_loop(self, agent_id, config): print("[WORKER] Heartbeat thread started") print(f"[WORKER] Waiting for tasks (idle timeout {self.idle_timeout}s)") - while True: + while not self._stop: task = self._get_task() + if task == "SHUTDOWN": + print("[WORKER] Master shutdown acknowledged") + break + if 
not task: if time.time() - self.last_task_time > self.idle_timeout: print("[WORKER] Idle timeout reached - exiting") @@ -70,7 +85,12 @@ def run_loop(self, agent_id, config): def _get_task(self): try: r = requests.get(self.master_url + "/task", params={"agent_id": self.agent_id}, timeout=5) - task = r.json().get("run") + response = r.json() + if response.get("shutdown"): + print("[WORKER] Received shutdown signal from master") + self._stop = True + return "SHUTDOWN" + task = response.get("run") if task: print(f"[WORKER] Got task: {task.get('__run_id')}") diff --git a/experiment-runner/DistributedExecution/__init__.py b/experiment-runner/DistributedExecution/__init__.py index f4364bf12..613f7dc93 100644 --- a/experiment-runner/DistributedExecution/__init__.py +++ b/experiment-runner/DistributedExecution/__init__.py @@ -3,7 +3,7 @@ Simple framework for running experiments across multiple machines. """ -from .DistributedMasterOrchestrator import DistributedMasterOrchestrator, APIServer, TaskManager, WorkerMonitor +from .DistributedOrchestrator import DistributedOrchestrator, APIServer, TaskManager, WorkerMonitor from .Worker import WorkerRuntime __all__ = [ @@ -11,5 +11,5 @@ 'APIServer', 'TaskManager', 'WorkerMonitor', - 'DistributedMasterOrchestrator', + 'DistributedOrchestrator', ] diff --git a/experiment-runner/__main__.py b/experiment-runner/__main__.py index edca44553..8078a71be 100644 --- a/experiment-runner/__main__.py +++ b/experiment-runner/__main__.py @@ -15,7 +15,7 @@ from ConfigValidator.CustomErrors.ConfigErrors import ConfigInvalidClassNameError from ExperimentOrchestrator.Experiment.ExperimentController import ExperimentController -from DistributedExecution.DistributedMasterOrchestrator import DistributedMasterOrchestrator +from DistributedExecution.DistributedOrchestrator import DistributedOrchestrator from DistributedExecution.Worker import WorkerRuntime @@ -88,7 +88,7 @@ def calc_ast_md5sum(src, name): master_host = get_flag_value('--host') or 
"0.0.0.0" master_port = int(get_flag_value('--port') or 5000) - orchestrator = DistributedMasterOrchestrator( + orchestrator = DistributedOrchestrator( config=config, metadata=metadata, host=master_host, From 98c100dfdbbb0fae6b001db29dc771a684fd219e Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Mon, 18 May 2026 13:05:03 +0200 Subject: [PATCH 08/30] README file change I added the remote distribution segment and added two more libraries in the requirements.txt --- README.md | 32 ++++++++++++++++++++++++++++++++ requirements.txt | 1 + 2 files changed, 33 insertions(+) diff --git a/README.md b/README.md index 375b6ed97..a8d61337e 100644 --- a/README.md +++ b/README.md @@ -108,6 +108,38 @@ Your configuration files automatically use these variables if set, with sensible **More information about the profilers and use cases can be found in the [Wiki tab](https://github.com/S2-group/experiment-runner/wiki).** +--- +## Remote distribution + +Experiment Runner supports **distributed execution across multiple machines** using a masterโ€“worker architecture. + +### Architecture Overview + +- One machine acts as the **Master (Orchestrator)** + - Owns the experiment `run_table` + - Assigns runs to workers via a REST API + - Tracks progress and persists experiment state + - Triggers lifecycle events (e.g. 
`AFTER_EXPERIMENT`) when finished + +- Multiple machines act as **Workers** + - Request tasks from the master + - Execute runs locally using the configured experiment + - Submit results back to the master + +- Communication between master and workers is handled via a lightweight **Flask-based HTTP API** + +### How to run it +Start the orchestrator on the master machine: + ```bash +python experiment-runner/ examples// --distribute master --host host_nr --port port_nr +``` +On each worker machine, connect to the master: +```bash +experiment-runner/ examples// --distribute worker --master orchestor_adress +``` +When the experiment finish it, the master would close automatically, the rest of the workers would need manually closing, they would close after 120s + + ## How to cite Experiment Runner If Experiment Runner is helping your research, consider to cite it as follows, thank you! diff --git a/requirements.txt b/requirements.txt index 1eac56f6a..d608c6a84 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ dill jsonpickle flask requests +waitress \ No newline at end of file From bcd146fdaa04abfbe3ea60b4e570902638239c59 Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Mon, 25 May 2026 17:52:04 +0200 Subject: [PATCH 09/30] Update DitributeOrgestor.py --- .../DistributedOrchestrator.py | 41 +++++++++++++++---- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/experiment-runner/DistributedExecution/DistributedOrchestrator.py b/experiment-runner/DistributedExecution/DistributedOrchestrator.py index d43fc8173..9bc0d5470 100644 --- a/experiment-runner/DistributedExecution/DistributedOrchestrator.py +++ b/experiment-runner/DistributedExecution/DistributedOrchestrator.py @@ -88,6 +88,7 @@ def complete_task(self, run_id, data): ) if all_done and not self.completed: self.completed = True + self.shutdown = True print("\n[MASTER] ALL RUNS COMPLETED\n") # AFTER_EXPERIMENT hook @@ -95,7 +96,8 @@ def complete_task(self, run_id, data): 
EventSubscriptionController.raise_event( RunnerEvents.AFTER_EXPERIMENT ) - shutdown_server() + #time.sleep(5) + #shutdown_server() def restore_crashed_runs(self): """ @@ -145,12 +147,16 @@ def __init__(self, task_manager, worker_monitor): def get_task(): agent_id = request.args.get('agent_id') self.monitor.heartbeat(agent_id) - task = self.task_manager.get_next_task(agent_id) + #task = self.task_manager.get_next_task(agent_id) + if self.task_manager.shutdown: return jsonify({ "shutdown": True, "run": None }) + + task = self.task_manager.get_next_task(agent_id) + return jsonify({ "shutdown": False, "run": task if task else None @@ -318,17 +324,38 @@ def start(self): if self.finished_before_start: return - EventSubscriptionController.raise_event(RunnerEvents.BEFORE_EXPERIMENT) + EventSubscriptionController.raise_event( + RunnerEvents.BEFORE_EXPERIMENT + ) - threading.Thread(target=self.monitor.monitor, daemon=True).start() + threading.Thread( + target=self.monitor.monitor, + daemon=True + ).start() print(f"[MASTER] Starting server " - f"on {self.host}:{self.port}") + f"on {self.host}:{self.port}") + + server_thread = threading.Thread( + target=lambda: serve( + self.api.app, + host=self.host, + port=self.port + ), + daemon=True + ) + + server_thread.start() - #self.api.app.run(host=self.host, port=self.port, use_reloader=False) - serve(self.api.app, host=self.host, port=self.port) + while not self.task_manager.shutdown: + time.sleep(1) + print("[MASTER] Waiting for workers to shutdown...") + time.sleep(10) + print("[MASTER] Shutting down") + os._exit(0) + def shutdown_server(): func = request.environ.get('werkzeug.server.shutdown') if func is None: From 547760e14833abbffeb5054fc999d0f424714ee9 Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Thu, 11 Jun 2026 10:12:59 +0200 Subject: [PATCH 10/30] test remote_distribution --- test/ExperimentOrchestrator/__init__.py | 0 .../test_RemoteDistribution.py | 493 ++++++++++++++++++ 2 files changed, 493 insertions(+) create mode 
100644 test/ExperimentOrchestrator/__init__.py create mode 100644 test/ExperimentOrchestrator/test_RemoteDistribution.py diff --git a/test/ExperimentOrchestrator/__init__.py b/test/ExperimentOrchestrator/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/ExperimentOrchestrator/test_RemoteDistribution.py b/test/ExperimentOrchestrator/test_RemoteDistribution.py new file mode 100644 index 000000000..7bfb93d18 --- /dev/null +++ b/test/ExperimentOrchestrator/test_RemoteDistribution.py @@ -0,0 +1,493 @@ +import unittest +import tempfile +import shutil +import sys +from pathlib import Path +from typing import AnyStr, List, Dict, Any + +sys.path.insert(0, "experiment-runner") + +from ConfigValidator.Config.Models.RunnerContext import RunnerContext +from ConfigValidator.Config.Models.FactorModel import FactorModel +from ConfigValidator.Config.Models.RunTableModel import RunTableModel +from ConfigValidator.Config.RunnerConfig import RunnerConfig +from ProgressManager.Output.OutputProcedure import OutputProcedure as output + + +class RemoteAgent: + """Mock remote agent for testing distributed execution""" + def __init__(self, agent_id: str, host: str, port: int): + self.agent_id = agent_id + self.host = host + self.port = port + self.is_connected = False + self.assigned_runs: List[Dict] = [] + self.completed_runs: List[Dict] = [] + self.failed_runs: List[str] = [] + + def connect(self) -> bool: + """Simulate connection to remote agent""" + if not self.host or self.port <= 0: + return False + self.is_connected = True + return True + + def disconnect(self) -> bool: + """Disconnect from remote agent""" + self.is_connected = False + return True + + def send_run(self, run_data: Dict) -> bool: + """Send a run to the remote agent for execution""" + if not self.is_connected: + return False + self.assigned_runs.append(run_data) + return True + + def retrieve_results(self) -> List[Dict]: + """Retrieve completed run results from remote agent""" + return 
self.completed_runs.copy() + + def mark_run_complete(self, run_id: str, result_data: Dict) -> bool: + """Mark a run as completed on the remote agent""" + result_data['__run_id'] = run_id + self.completed_runs.append(result_data) + self.assigned_runs = [r for r in self.assigned_runs if r.get('__run_id') != run_id] + return True + + def mark_run_failed(self, run_id: str, error_message: str) -> bool: + """Mark a run as failed""" + self.failed_runs.append(run_id) + self.assigned_runs = [r for r in self.assigned_runs if r.get('__run_id') != run_id] + return True + + +class RemoteDistributionManager: + """Manages distribution of experiments across remote agents""" + def __init__(self): + self.agents: Dict[str, RemoteAgent] = {} + self.pending_runs: List[Dict] = [] + self.completed_runs: List[Dict] = [] + self.failed_runs: Dict[str, str] = {} + + def register_agent(self, agent: RemoteAgent) -> bool: + """Register a new remote agent""" + if not isinstance(agent, RemoteAgent): + return False + self.agents[agent.agent_id] = agent + return True + + def connect_all_agents(self) -> Dict[str, bool]: + """Connect to all registered agents""" + results = {} + for agent_id, agent in self.agents.items(): + results[agent_id] = agent.connect() + return results + + def disconnect_all_agents(self) -> Dict[str, bool]: + """Disconnect from all agents""" + results = {} + for agent_id, agent in self.agents.items(): + results[agent_id] = agent.disconnect() + return results + + def distribute_runs(self, runs: List[Dict]) -> Dict[str, int]: + """Distribute runs across available agents using round-robin""" + self.pending_runs = runs.copy() + agent_ids = list(self.agents.keys()) + + if not agent_ids: + self.failed_runs.update({r.get('__run_id'): 'No agents available' for r in runs}) + return {'distributed': 0, 'failed': len(runs)} + + distributed = 0 + failed = 0 + + for idx, run in enumerate(runs): + agent_id = agent_ids[idx % len(agent_ids)] + agent = self.agents[agent_id] + + if 
agent.send_run(run): + distributed += 1 + else: + self.failed_runs[run.get('__run_id')] = f'Failed to send to agent {agent_id}' + failed += 1 + + return {'distributed': distributed, 'failed': failed} + + def collect_results(self) -> Dict[str, Any]: + """Collect results from all agents""" + for agent in self.agents.values(): + self.completed_runs.extend(agent.retrieve_results()) + + return { + 'total_completed': len(self.completed_runs), + 'total_failed': len(self.failed_runs), + 'results': self.completed_runs + } + + def get_agent_status(self) -> Dict[str, Dict]: + """Get status of all agents""" + status = {} + for agent_id, agent in self.agents.items(): + status[agent_id] = { + 'connected': agent.is_connected, + 'assigned_runs': len(agent.assigned_runs), + 'completed_runs': len(agent.completed_runs), + 'failed_runs': len(agent.failed_runs), + 'host': agent.host, + 'port': agent.port + } + return status + + +class TestRemoteAgentBasic(unittest.TestCase): + """Test basic remote agent functionality""" + + def setUp(self): + self.agent = RemoteAgent( + agent_id="test_agent_1", + host="localhost", + port=8000 + ) + + def test_agent_initialization(self): + """Test that agent is properly initialized""" + self.assertEqual(self.agent.agent_id, "test_agent_1") + self.assertEqual(self.agent.host, "localhost") + self.assertEqual(self.agent.port, 8000) + self.assertFalse(self.agent.is_connected) + + def test_agent_connection(self): + """Test connecting to remote agent""" + self.assertFalse(self.agent.is_connected) + connected = self.agent.connect() + self.assertTrue(connected) + self.assertTrue(self.agent.is_connected) + + def test_agent_disconnection(self): + """Test disconnecting from remote agent""" + self.agent.connect() + self.assertTrue(self.agent.is_connected) + disconnected = self.agent.disconnect() + self.assertTrue(disconnected) + self.assertFalse(self.agent.is_connected) + + def test_invalid_agent_connection(self): + """Test connection failure with invalid 
parameters""" + invalid_agent = RemoteAgent("invalid", "", -1) + connected = invalid_agent.connect() + self.assertFalse(connected) + self.assertFalse(invalid_agent.is_connected) + + +class TestRemoteAgentRunManagement(unittest.TestCase): + """Test run management on remote agents""" + + def setUp(self): + self.agent = RemoteAgent("test_agent", "localhost", 8000) + self.agent.connect() + self.test_run = { + '__run_id': 'run_1', + 'factor1': 'treatment1', + 'factor2': 'value1' + } + + def tearDown(self): + self.agent.disconnect() + + def test_send_run_when_connected(self): + """Test sending a run to connected agent""" + result = self.agent.send_run(self.test_run) + self.assertTrue(result) + self.assertEqual(len(self.agent.assigned_runs), 1) + self.assertEqual(self.agent.assigned_runs[0]['__run_id'], 'run_1') + + def test_send_run_when_disconnected(self): + """Test that runs cannot be sent to disconnected agent""" + self.agent.disconnect() + result = self.agent.send_run(self.test_run) + self.assertFalse(result) + self.assertEqual(len(self.agent.assigned_runs), 0) + + def test_mark_run_complete(self): + """Test marking a run as completed""" + self.agent.send_run(self.test_run) + result_data = {'result_col': 42.5} + + success = self.agent.mark_run_complete('run_1', result_data) + self.assertTrue(success) + self.assertEqual(len(self.agent.completed_runs), 1) + self.assertEqual(self.agent.completed_runs[0]['result_col'], 42.5) + self.assertEqual(self.agent.completed_runs[0]['__run_id'], 'run_1') + self.assertEqual(len(self.agent.assigned_runs), 0) + + def test_mark_run_failed(self): + """Test marking a run as failed""" + self.agent.send_run(self.test_run) + + success = self.agent.mark_run_failed('run_1', 'Timeout error') + self.assertTrue(success) + self.assertEqual(len(self.agent.failed_runs), 1) + self.assertIn('run_1', self.agent.failed_runs) + self.assertEqual(len(self.agent.assigned_runs), 0) + + def test_retrieve_results(self): + """Test retrieving results from 
agent""" + self.agent.send_run(self.test_run) + self.agent.mark_run_complete('run_1', {'data': 100}) + + results = self.agent.retrieve_results() + self.assertEqual(len(results), 1) + self.assertEqual(results[0]['data'], 100) + + +class TestRemoteDistributionManagerBasic(unittest.TestCase): + """Test basic distribution manager functionality""" + + def setUp(self): + self.manager = RemoteDistributionManager() + self.agent1 = RemoteAgent("agent_1", "host1.local", 8000) + self.agent2 = RemoteAgent("agent_2", "host2.local", 8001) + + def test_manager_initialization(self): + """Test distribution manager initialization""" + self.assertEqual(len(self.manager.agents), 0) + self.assertEqual(len(self.manager.pending_runs), 0) + + def test_register_agents(self): + """Test registering remote agents""" + success1 = self.manager.register_agent(self.agent1) + success2 = self.manager.register_agent(self.agent2) + + self.assertTrue(success1) + self.assertTrue(success2) + self.assertEqual(len(self.manager.agents), 2) + + def test_register_invalid_agent(self): + """Test that invalid objects cannot be registered""" + result = self.manager.register_agent("not_an_agent") + self.assertFalse(result) + + def test_connect_all_agents(self): + """Test connecting all registered agents""" + self.manager.register_agent(self.agent1) + self.manager.register_agent(self.agent2) + + results = self.manager.connect_all_agents() + + self.assertEqual(results['agent_1'], True) + self.assertEqual(results['agent_2'], True) + self.assertTrue(self.agent1.is_connected) + self.assertTrue(self.agent2.is_connected) + + def test_disconnect_all_agents(self): + """Test disconnecting all agents""" + self.manager.register_agent(self.agent1) + self.manager.register_agent(self.agent2) + self.manager.connect_all_agents() + + results = self.manager.disconnect_all_agents() + + self.assertEqual(results['agent_1'], True) + self.assertEqual(results['agent_2'], True) + self.assertFalse(self.agent1.is_connected) + 
self.assertFalse(self.agent2.is_connected) + + +class TestDistributionAlgorithms(unittest.TestCase): + """Test distribution algorithms""" + + def setUp(self): + self.manager = RemoteDistributionManager() + self.agent1 = RemoteAgent("agent_1", "host1.local", 8000) + self.agent2 = RemoteAgent("agent_2", "host2.local", 8001) + + self.manager.register_agent(self.agent1) + self.manager.register_agent(self.agent2) + self.manager.connect_all_agents() + + def test_round_robin_distribution(self): + """Test round-robin distribution across agents""" + runs = [ + {'__run_id': f'run_{i}', 'factor': i} + for i in range(6) + ] + + distribution = self.manager.distribute_runs(runs) + + self.assertEqual(distribution['distributed'], 6) + self.assertEqual(distribution['failed'], 0) + self.assertEqual(len(self.agent1.assigned_runs), 3) + self.assertEqual(len(self.agent2.assigned_runs), 3) + + def test_distribution_to_single_agent(self): + """Test distribution when only one agent is available""" + single_manager = RemoteDistributionManager() + single_manager.register_agent(self.agent1) + single_manager.connect_all_agents() + + runs = [ + {'__run_id': f'run_{i}', 'factor': i} + for i in range(4) + ] + + distribution = single_manager.distribute_runs(runs) + + self.assertEqual(distribution['distributed'], 4) + self.assertEqual(len(self.agent1.assigned_runs), 4) + + def test_distribution_with_no_agents(self): + """Test distribution fails gracefully with no agents""" + empty_manager = RemoteDistributionManager() + + runs = [{'__run_id': 'run_1', 'factor': 1}] + distribution = empty_manager.distribute_runs(runs) + + self.assertEqual(distribution['distributed'], 0) + self.assertEqual(distribution['failed'], 1) + + +class TestResultAggregation(unittest.TestCase): + """Test result aggregation from multiple agents""" + + def setUp(self): + self.manager = RemoteDistributionManager() + self.agent1 = RemoteAgent("agent_1", "host1.local", 8000) + self.agent2 = RemoteAgent("agent_2", "host2.local", 
8001) + + self.manager.register_agent(self.agent1) + self.manager.register_agent(self.agent2) + self.manager.connect_all_agents() + + def test_collect_results_from_multiple_agents(self): + """Test collecting results from all agents""" + runs = [ + {'__run_id': f'run_{i}', 'factor': i} + for i in range(4) + ] + + self.manager.distribute_runs(runs) + + self.agent1.mark_run_complete('run_0', {'result': 100}) + self.agent1.mark_run_complete('run_2', {'result': 150}) + self.agent2.mark_run_complete('run_1', {'result': 120}) + self.agent2.mark_run_complete('run_3', {'result': 180}) + + aggregation = self.manager.collect_results() + + self.assertEqual(aggregation['total_completed'], 4) + self.assertEqual(aggregation['total_failed'], 0) + self.assertEqual(len(aggregation['results']), 4) + + def test_agent_status_reporting(self): + """Test getting status of all agents""" + runs = [ + {'__run_id': f'run_{i}', 'factor': i} + for i in range(4) + ] + + self.manager.distribute_runs(runs) + self.agent1.mark_run_complete('run_0', {'result': 100}) + + status = self.manager.get_agent_status() + + self.assertEqual(status['agent_1']['assigned_runs'], 1) + self.assertEqual(status['agent_1']['completed_runs'], 1) + self.assertEqual(status['agent_2']['assigned_runs'], 2) + self.assertEqual(status['agent_2']['completed_runs'], 0) + + +class RemoteDistributionTestConfig(RunnerConfig): + """Test configuration for remote distribution experiments""" + + tmpdir: AnyStr = tempfile.mkdtemp() + + def clear(self): + if Path(self.__class__.tmpdir).exists(): + shutil.rmtree(self.__class__.tmpdir) + + def create_run_table_model(self): + return RunTableModel( + factors=[ + FactorModel("algorithm", ["quicksort", "mergesort", "heapsort"]), + FactorModel("data_size", [100, 1000, 10000]), + ], + data_columns=['execution_time', 'memory_used'] + ) + + def start_measurement(self, context: RunnerContext): + output.console_log("RemoteDistribution: Starting measurement") + pass + + def interact(self, context: 
RunnerContext): + output.console_log("RemoteDistribution: Executing on remote agent") + pass + + def stop_measurement(self, context: RunnerContext): + output.console_log("RemoteDistribution: Stopping measurement") + pass + + def populate_run_data(self, context: RunnerContext): + output.console_log("RemoteDistribution: Populating run data") + return { + 'execution_time': 1.5, + 'memory_used': 512 + } + + +class TestRemoteDistributionIntegration(unittest.TestCase): + """Integration tests for remote distribution with RunnerConfig""" + + def setUp(self): + self.config = RemoteDistributionTestConfig() + self.run_table = self.config.create_run_table_model().generate_experiment_run_table() + + def tearDown(self): + self.config.clear() + + def test_config_with_remote_distribution(self): + """Test that config works with remote distribution""" + self.config.start_measurement(None) + self.config.interact(None) + self.config.stop_measurement(None) + run_data = self.config.populate_run_data(None) + + self.assertIsNotNone(run_data) + self.assertEqual(run_data['execution_time'], 1.5) + self.assertEqual(run_data['memory_used'], 512) + + def test_run_table_generation_for_distribution(self): + """Test that run table can be properly distributed""" + self.assertGreater(len(self.run_table), 0) + + for run in self.run_table: + self.assertIn('__run_id', run) + self.assertIn('algorithm', run) + self.assertIn('data_size', run) + self.assertIn('execution_time', run) + self.assertIn('memory_used', run) + + def test_distributed_execution_simulation(self): + """Test simulating distributed execution of experiment""" + manager = RemoteDistributionManager() + agent1 = RemoteAgent("agent_1", "localhost", 8000) + agent2 = RemoteAgent("agent_2", "localhost", 8001) + + manager.register_agent(agent1) + manager.register_agent(agent2) + manager.connect_all_agents() + + distribution = manager.distribute_runs(self.run_table) + self.assertEqual(distribution['distributed'], len(self.run_table)) + + for run 
in self.run_table: + run_id = run['__run_id'] + manager.agents['agent_1'].mark_run_complete(run_id, self.config.populate_run_data(None)) + + aggregation = manager.collect_results() + self.assertGreater(aggregation['total_completed'], 0) + + +if __name__ == '__main__': + unittest.main() From bb067cd990f4c1e3d224fc337a179c3e5a354b1f Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Sun, 14 Jun 2026 19:16:41 +0200 Subject: [PATCH 11/30] Validation_of_the_setUp --- .../hello-world-fibonacci/RunnerConfig.py | 4 + .../EventSubscriptionController.py | 13 +- .../EventManager/Models/RunnerEvents.py | 21 +- .../Experiment/ExperimentController.py | 9 + .../Validation/RequirementsValidator.py | 313 ++++++++++++++++++ .../ProgressManager/Validation/__init__.py | 0 6 files changed, 345 insertions(+), 15 deletions(-) create mode 100644 experiment-runner/ProgressManager/Validation/RequirementsValidator.py create mode 100644 experiment-runner/ProgressManager/Validation/__init__.py diff --git a/examples/hello-world-fibonacci/RunnerConfig.py b/examples/hello-world-fibonacci/RunnerConfig.py index 6da601534..e43649be6 100644 --- a/examples/hello-world-fibonacci/RunnerConfig.py +++ b/examples/hello-world-fibonacci/RunnerConfig.py @@ -5,6 +5,7 @@ from ConfigValidator.Config.Models.RunnerContext import RunnerContext from ConfigValidator.Config.Models.OperationType import OperationType from ProgressManager.Output.OutputProcedure import OutputProcedure as output +from ProgressManager.Validation.RequirementsValidator import (validate_experiment_requirements) from typing import Dict, Any, Optional from pathlib import Path @@ -38,6 +39,7 @@ class RunnerConfig: def __init__(self): EventSubscriptionController.subscribe_to_multiple_events([ + (RunnerEvents.VALIDATE_EXPERIMENT, self.validate_experiment), (RunnerEvents.BEFORE_EXPERIMENT, self.before_experiment), (RunnerEvents.BEFORE_RUN, self.before_run), (RunnerEvents.START_RUN, self.start_run), @@ -83,6 +85,8 @@ def create_run_table_model(self) -> 
RunTableModel: ) return self.run_table_model + def validate_experiment(self) -> None: + validate_experiment_requirements(Path(__file__)) def before_experiment(self) -> None: pass diff --git a/experiment-runner/EventManager/EventSubscriptionController.py b/experiment-runner/EventManager/EventSubscriptionController.py index 113fe1457..285ea6215 100644 --- a/experiment-runner/EventManager/EventSubscriptionController.py +++ b/experiment-runner/EventManager/EventSubscriptionController.py @@ -1,5 +1,6 @@ from typing import Callable, List, Tuple from EventManager.Models.RunnerEvents import RunnerEvents +from ConfigValidator.CustomErrors.BaseError import BaseError class EventSubscriptionController: __call_back_register: dict = dict() @@ -20,11 +21,13 @@ def raise_event(event: RunnerEvents, runner_context=None): event_callback = EventSubscriptionController.__call_back_register[event] except KeyError: return None - - if runner_context: - return event_callback(runner_context) - else: - return event_callback() + try: + if runner_context: + return event_callback(runner_context) + else: + return event_callback() + except Exception as e: + raise BaseError(f"Error in event handler for {event.name}: {str(e)}") @staticmethod def get_event_callback(event: RunnerEvents): diff --git a/experiment-runner/EventManager/Models/RunnerEvents.py b/experiment-runner/EventManager/Models/RunnerEvents.py index 9ae200bc5..f6dd36699 100644 --- a/experiment-runner/EventManager/Models/RunnerEvents.py +++ b/experiment-runner/EventManager/Models/RunnerEvents.py @@ -1,13 +1,14 @@ from enum import Enum, auto class RunnerEvents(Enum): - BEFORE_EXPERIMENT = auto() - BEFORE_RUN = auto() - START_RUN = auto() - START_MEASUREMENT = auto() - INTERACT = auto() - CONTINUE = auto() - STOP_MEASUREMENT = auto() - STOP_RUN = auto() - POPULATE_RUN_DATA = auto() - AFTER_EXPERIMENT = auto() + VALIDATE_EXPERIMENT = auto() + BEFORE_EXPERIMENT = auto() + BEFORE_RUN = auto() + START_RUN = auto() + START_MEASUREMENT = auto() 
+ INTERACT = auto() + CONTINUE = auto() + STOP_MEASUREMENT = auto() + STOP_RUN = auto() + POPULATE_RUN_DATA = auto() + AFTER_EXPERIMENT = auto() diff --git a/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py b/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py index a851747e5..fbdd4322c 100644 --- a/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py +++ b/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py @@ -117,6 +117,15 @@ def __init__(self, config: RunnerConfig, metadata: Metadata): output.console_log_WARNING("Experiment run table created...") def do_experiment(self): + # -- Validate experiment setup + # TODO: From the user perspective, it would be nice to know if are any possible issues with the experiment before staring the experiment runs. For example, if the config hooks are not properly defined, or if there are any issues with the config file itself + output.console_log_WARNING("Calling validate_experiment config hook") + try: + EventSubscriptionController.raise_event(RunnerEvents.VALIDATE_EXPERIMENT) + except BaseError as e: + output.console_log_FAIL(f"Experiment validation failed: {e}") + raise + output.console_log_OK("Experiment setup completed...") # -- Before experiment diff --git a/experiment-runner/ProgressManager/Validation/RequirementsValidator.py b/experiment-runner/ProgressManager/Validation/RequirementsValidator.py new file mode 100644 index 000000000..2e0e2dbd3 --- /dev/null +++ b/experiment-runner/ProgressManager/Validation/RequirementsValidator.py @@ -0,0 +1,313 @@ +import sys +import ast +import os +import shutil +import importlib +import importlib.util +from pathlib import Path +from typing import List, Dict, Tuple, Optional,Set +from ConfigValidator.CustomErrors.BaseError import BaseError +from ProgressManager.Output.OutputProcedure import OutputProcedure as output + +class RequirementCheckResult: + def __init__(self, name: str, 
requirement_type: str): + self.name = name + self.requirement_type = requirement_type + self.installed = False + self.error_message = "" + self.version = None + + def mark_failure(self, error: str): + self.installed = False + self.error_message = error + +### ========================================================= +### | | +### | RequirementsValidator: | +### | | +### | - Checks the following requirements: | +### | - Framework requirements (python versions | +### | and packages from requirements.txt) | +### | - External tools availability in Path | +### | - Experiment-specific requirements | +### | | +### | *Validates all requirements for an | +### | experiment before execution. | +### | | +### ========================================================= +PROFILER_DEPS = { + "JoularCore": { + "tools": ["java"], + "python_modules": ["jpype"], + }, + "PowerJoular": { + "tools": ["java"], + "python_modules": [], + }, + "EnergiBridge": { + "tools": ["energibridge"], + "python_modules": [], + }, + "NvidiaML": { + "tools": ["nvidia-smi"], + "python_modules": ["pynvml"], + }, + "PowerMetrics": { + "tools": ["powermetrics"], + "python_modules": [], + }, + "PowerLetrics": { + "tools": ["powermetrics"], + "python_modules": [], + }, + "Ps": { + "tools": ["ps"], + "python_modules": [], + }, + "PicoCM3": { + "tools": [], + "python_modules": ["picosdk"], + }, + "CodecarbonWrapper": { + "tools": [], + "python_modules": ["codecarbon"], + }, + "WattsUpPro": { + "tools": [], + "python_modules": ["serial"], + }, +} + +class RequirementsValidator: + + def __init__(self, config_file_path: Path): + self.config_file_path = config_file_path + self.config_dir = config_file_path.parent + self.framework_root = self._find_framework_root() + self.results: List[RequirementCheckResult] = [] + self.failed_checks: List[RequirementCheckResult] = [] + + @staticmethod + def _find_framework_root() -> Path: + """Find the root of the experiment-runner framework""" + cwd = Path.cwd() + + if (cwd / 
'experiment-runner').exists(): + return cwd + if (cwd / 'requirements.txt').exists(): + return cwd + + for parent in cwd.parents: + if (parent / 'experiment-runner').exists(): + return parent + if (parent / 'requirements.txt').exists(): + return parent + + return cwd + + def validate_all(self) -> bool: + """ + Run all validation checks. Returns True if all pass, False otherwise. + Raises BaseError with details if any critical checks fail. + """ + try: + # Check Python version + self._validate_python_version() + # Check requirements.txt + self._validate_framework_requirements() + # Check experiment-specific requirements + self._validate_plugin_requirements_file() + self._check_profiler_external_deps() + # Check MSR module and permissions + self._validate_msr_module() + self._validate_msr_permissions() + self._validate_perf_permissions() + + # Results + return self._report_results() + + except BaseError: + raise + except Exception as e: + raise BaseError(f"Validation error: {str(e)}") + + def _validate_perf_permissions(self): + """Check if the user has permission to access performance counters""" + + result = RequirementCheckResult("perf_event_paranoid", "system") + perf_file = Path("/proc/sys/kernel/perf_event_paranoid") + + if not perf_file.exists(): + return + + value = int(perf_file.read_text().strip()) + + if value >= 2: + result.mark_failure( + "Check Troubleshooting.md: perf_event_paranoid is too restrictive.\n" + f"perf_event_paranoid={value}\n" + "Hardware performance counters are restricted.\n" + ) + self.failed_checks.append(result) + self.results.append(result) + + def _validate_msr_module(self): + """Check if the MSR kernel module is loaded""" + + msr_path = Path("/dev/cpu/0/msr") + result = RequirementCheckResult("MSR module", "system") + if not msr_path.exists(): + result.mark_failure( + "Check Troubleshooting.md: MSR kernel module not loaded.\n" + "MSR kernel module not loaded.\n" + ) + self.failed_checks.append(result) + self.results.append(result) + 
+ def _validate_msr_permissions(self): + """Check if the user has permission to read MSR registers""" + + result = RequirementCheckResult("MSR permissions","system") + msr_path = "/dev/cpu/0/msr" + + if not os.access(msr_path, os.R_OK): + result.mark_failure( + "Check Troubleshooting.md: No permission to read MSR registers.\n" + "No permission to read MSR registers.\n" + ) + self.failed_checks.append(result) + self.results.append(result) + + def _validate_python_version(self): + """Check Python version compatibility""" + + python_version = sys.version_info + result = RequirementCheckResult(f"Python {python_version.major}.{python_version.minor}", "system") + + # Framework requires Python 3.8+ + if python_version.major < 3 or (python_version.major == 3 and python_version.minor < 8): + result.mark_failure( + f"Python 3.8+ required. Current: {python_version.major}.{python_version.minor}" + ) + self.results.append(result) + + def _validate_framework_requirements(self): + """Check framework dependencies from requirements.txt""" + requirements_file = self.framework_root / "requirements.txt" + result = RequirementCheckResult("Framework requirements", "python_module") + + if not requirements_file.exists(): + output.console_log_WARNING(" requirements.txt not found") + return + with open(requirements_file) as f: + for line in f: + line = line.strip() + if not line or line.startswith('#'): + continue + # Parse requirement: package_name or package_name==version + package_spec = line.split('==')[0].split('>=')[0].split('<=')[0].split('>')[0].split('<')[0] + package_name = package_spec.strip() + + result = RequirementCheckResult(package_name, "python_module") + try: + # Try to import the module + module = importlib.import_module(package_name) + version = getattr(module, '__version__', 'unknown') + except ImportError as e: + result.mark_failure(f"Cannot import: {str(e)}") + self.failed_checks.append(result) + self.results.append(result) + + def _check_profiler_external_deps(self): 
+ """Check experiment-specific plugins required""" + used_profilers = self.extract_used_profilers(self.config_file_path) + + for profiler in used_profilers: + if profiler not in PROFILER_DEPS: + output.console_log_WARNING(f"Unknown profiler '{profiler}'") + continue + + dependencies = PROFILER_DEPS[profiler] + + for tool in dependencies["tools"]: + result = RequirementCheckResult(f"{profiler}:{tool}", "system_tool") + + if not shutil.which(tool): + result.mark_failure(f"Missing tool '{tool}'") + self.failed_checks.append(result) + self.results.append(result) + + for module in dependencies["python_modules"]: + result = RequirementCheckResult(f"{profiler}:{module}", "python_module") + try: + importlib.import_module(module) + except ImportError: + result.mark_failure(f"Missing Python module '{module}'") + self.failed_checks.append(result) + self.results.append(result) + + @staticmethod + def extract_used_profilers(config_file: Path) -> list[str]: + """Extract the names of profilers used in the config file by parsing import statements.""" + profilers = [] + + with open(config_file, "r") as f: + for line in f: + line = line.strip() + + if line.startswith("from Plugins.Profilers."): + profiler = ( + line.split("from Plugins.Profilers.")[1] + .split(" import ")[0] + .strip() + ) + profilers.append(profiler) + + return profilers + + def _validate_plugin_requirements_file(self): + """Check experiment-specific dependencies from experiment's requirements.txt""" + for requirements_file in self.framework_root.rglob("requirements.txt"): + if requirements_file == self.framework_root / "requirements.txt": + continue + with open(requirements_file) as f: + for line in f: + line = line.strip() + if not line or line.startswith("#"): + continue + package_name = ( + line.split("==")[0] + .split(">=")[0] + .split("<=")[0] + .split(">")[0] + .split("<")[0] + .strip() + ) + result = RequirementCheckResult(package_name,"plugin_requirement") + + try: + importlib.import_module(package_name) + 
except ImportError: + result.mark_failure(f"'{package_name}' required by "f"{requirements_file} is not installed") + self.failed_checks.append(result) + self.results.append(result) + + def _report_results(self): + if self.failed_checks: + message = [] + message.append("=" * 50) + message.append("EXPERIMENT VALIDATION FAILED") + message.append("=" * 50) + + for idx, check in enumerate(self.failed_checks,start=1): + message.append("") + message.append(f"[{idx}] {check.name}") + message.append(check.error_message) + raise BaseError( + "\n".join(message) + ) + return True + +def validate_experiment_requirements(config_file_path: Path) -> bool: + validator = RequirementsValidator(config_file_path) + return validator.validate_all() \ No newline at end of file diff --git a/experiment-runner/ProgressManager/Validation/__init__.py b/experiment-runner/ProgressManager/Validation/__init__.py new file mode 100644 index 000000000..e69de29bb From f2886b21e8c1d1739f3edfb20fd0360b78b42aef Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Mon, 15 Jun 2026 11:03:07 +0200 Subject: [PATCH 12/30] energyvalidator --- .../hello-world-fibonacci/RunnerConfig.py | 9 + .../Experiment/ExperimentController.py | 13 ++ .../Validation/EnergyValidator.py | 106 ++++++++++++ test/system/test_EnergyValidator.py | 159 ++++++++++++++++++ 4 files changed, 287 insertions(+) create mode 100644 experiment-runner/ProgressManager/Validation/EnergyValidator.py create mode 100644 test/system/test_EnergyValidator.py diff --git a/examples/hello-world-fibonacci/RunnerConfig.py b/examples/hello-world-fibonacci/RunnerConfig.py index e43649be6..fe33f7496 100644 --- a/examples/hello-world-fibonacci/RunnerConfig.py +++ b/examples/hello-world-fibonacci/RunnerConfig.py @@ -35,6 +35,15 @@ class RunnerConfig: time_between_runs_in_ms: int = 1000 ENERGIBRIDGE_PATH = "/home/andabarbu/.cargo/bin/energibridge" + + enable_energy_validation: bool = True + + """Path to log file for energy validation report. 
Relative to experiment output directory.""" + energy_validation_log_file: str = "energy_validation_report.log" + + """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']). + Only used if enable_energy_validation is True.""" + energy_validation_columns: List[str] = [] def __init__(self): diff --git a/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py b/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py index fbdd4322c..678297faf 100644 --- a/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py +++ b/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py @@ -13,6 +13,8 @@ from ProgressManager.Output.OutputProcedure import OutputProcedure as output from EventManager.EventSubscriptionController import EventSubscriptionController from ConfigValidator.CustomErrors.ProgressErrors import AllRunsCompletedOnRestartError +from ProgressManager.Validation.EnergyValidator import EnergyValidator +from pathlib import Path ### ========================================================= @@ -162,3 +164,14 @@ def do_experiment(self): # -- After experiment output.console_log_WARNING("Calling after_experiment config hook") EventSubscriptionController.raise_event(RunnerEvents.AFTER_EXPERIMENT) + + # -- Energy validation + if self.config.enable_energy_validation and self.config.energy_validation_columns: + updated_run_table = self.csv_data_manager.read_run_table() + energy_report = EnergyValidator.validate_run_table(updated_run_table, self.config.energy_validation_columns) + + if energy_report.has_anomalies(): + output.console_log_WARNING(f"Energy anomalies detected. 
Report saved to {log_file_path}") + + log_file_path = self.config.experiment_path / self.config.energy_validation_log_file + EnergyValidator.save_report_to_file(energy_report, self.config.energy_validation_columns, log_file_path) diff --git a/experiment-runner/ProgressManager/Validation/EnergyValidator.py b/experiment-runner/ProgressManager/Validation/EnergyValidator.py new file mode 100644 index 000000000..b3f2e2207 --- /dev/null +++ b/experiment-runner/ProgressManager/Validation/EnergyValidator.py @@ -0,0 +1,106 @@ +from typing import Dict, List, Tuple, Any +from pathlib import Path +from ProgressManager.Output.OutputProcedure import OutputProcedure as output + + +class EnergyAnomalyReport: + """Represents energy measurement anomalies found during validation.""" + + def __init__(self): + self.anomalies: List[Dict[str, Any]] = [] + + def add_anomaly(self, run_id: str, treatment_levels: Dict[str, Any], column_name: str, value: Any): + """Add an anomaly to the report. + The anomaly fallowes the structure: + run_id: The run identifier + treatment_levels: Dictionary of factor names to treatment levels for this run + column_name: The energy column name where anomaly was detected + value: The anomalous value + """ + self.anomalies.append({ + 'run_id': run_id, + 'treatment_levels': treatment_levels, + 'column_name': column_name, + 'value': value, + }) + + def has_anomalies(self) -> bool: + """Check if any anomalies were found.""" + return len(self.anomalies) > 0 + + +class EnergyValidator: + """Validates energy measurements for anomalies (zero or negative values).""" + + @staticmethod + def validate_run_table(run_table: List[Dict[str, Any]], energy_columns: List[str]) -> EnergyAnomalyReport: + """Validate energy measurements in a run table.""" + report = EnergyAnomalyReport() + + if not energy_columns: + return report + + for run in run_table: + run_id = run.get('__run_id', 'unknown') + # Extract treatment levels + treatment_levels = { + k: v for k, v in run.items() + if 
not k.startswith('__') + } + + for column_name in energy_columns: + if column_name not in run: + continue + value = run[column_name] + + # Check for None or missing values + if value is None: + report.add_anomaly(run_id, treatment_levels, column_name, value) + continue + try: + numeric_value = float(value) + if numeric_value < 0: + report.add_anomaly(run_id, treatment_levels, column_name, numeric_value) + elif numeric_value == 0: + report.add_anomaly(run_id, treatment_levels, column_name, numeric_value) + except (ValueError, TypeError): + report.add_anomaly(run_id, treatment_levels, column_name, value) + return report + + @staticmethod + def generate_report_text(report: EnergyAnomalyReport, energy_columns: List[str]) -> str: + """ Generate the report text.""" + lines = [] + lines.append("=" * 80) + lines.append("ENERGY MEASUREMENT VALIDATION REPORT") + lines.append("=" * 80) + lines.append("") + + if report.has_anomalies(): + lines.append(f"Found {len(report.anomalies)} anomalous energy measurements") + lines.append("-" * 80) + + for anomaly in report.anomalies: + lines.append(f"Run ID: {anomaly['run_id']}") + lines.append(f"Column: {anomaly['column_name']}") + lines.append(f"Value: {anomaly['value']}") + lines.append(f"Treatment levels: {anomaly['treatment_levels']}") + lines.append("") + + lines.append("=" * 80) + lines.append("") + + return "\n".join(lines) + + @staticmethod + def save_report_to_file(report: EnergyAnomalyReport, energy_columns: List[str],log_file: Path) -> None: + """Save validation report to a file.""" + + report_text = EnergyValidator.generate_report_text(report, energy_columns) + try: + log_file.parent.mkdir(parents=True, exist_ok=True) + with open(log_file, 'w') as f: + f.write(report_text) + output.console_log_OK(f"Energy validation report saved to: {log_file}") + except Exception as e: + output.console_log_FAIL(f"Failed to write energy validation report: {e}") diff --git a/test/system/test_EnergyValidator.py 
b/test/system/test_EnergyValidator.py new file mode 100644 index 000000000..f9760cfe1 --- /dev/null +++ b/test/system/test_EnergyValidator.py @@ -0,0 +1,159 @@ +import unittest +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) + +from ProgressManager.Validation.EnergyValidator import EnergyValidator, EnergyAnomalyReport + + +class TestEnergyAnomalyReport(unittest.TestCase): + def test_report_creation(self): + report = EnergyAnomalyReport() + self.assertFalse(report.has_anomalies()) + self.assertFalse(report.has_errors()) + + def test_add_error_anomaly(self): + report = EnergyAnomalyReport() + report.add_anomaly('run_1', {'factor_a': 'value1'}, 'energy', -5.0, 'error') + + self.assertTrue(report.has_anomalies()) + self.assertTrue(report.has_errors()) + self.assertEqual(len(report.anomalies), 1) + self.assertEqual(report.anomalies[0]['value'], -5.0) + + def test_add_warning_anomaly(self): + report = EnergyAnomalyReport() + report.add_anomaly('run_1', {'factor_a': 'value1'}, 'energy', 0, 'warning') + + self.assertTrue(report.has_anomalies()) + self.assertFalse(report.has_errors()) + + def test_mixed_anomalies(self): + report = EnergyAnomalyReport() + report.add_anomaly('run_1', {'factor_a': 'value1'}, 'energy', -5.0, 'error') + report.add_anomaly('run_2', {'factor_a': 'value2'}, 'energy', 0, 'warning') + + self.assertTrue(report.has_anomalies()) + self.assertTrue(report.has_errors()) + self.assertEqual(len(report.anomalies), 2) + + +class TestEnergyValidator(unittest.TestCase): + def test_validate_empty_run_table(self): + report = EnergyValidator.validate_run_table([], ['energy']) + self.assertFalse(report.has_anomalies()) + + def test_validate_no_energy_columns(self): + run_table = [ + {'__run_id': 'run_1', 'factor_a': 'value1', 'energy': 10.5} + ] + report = EnergyValidator.validate_run_table(run_table, []) + self.assertFalse(report.has_anomalies()) + + def test_validate_positive_energy(self): + run_table = 
[ + {'__run_id': 'run_1', 'factor_a': 'value1', 'energy': 10.5}, + {'__run_id': 'run_2', 'factor_a': 'value2', 'energy': 25.0} + ] + report = EnergyValidator.validate_run_table(run_table, ['energy']) + self.assertFalse(report.has_anomalies()) + + def test_validate_zero_energy(self): + run_table = [ + {'__run_id': 'run_1', 'factor_a': 'value1', 'energy': 0} + ] + report = EnergyValidator.validate_run_table(run_table, ['energy']) + + self.assertTrue(report.has_anomalies()) + self.assertFalse(report.has_errors()) # Zero is a warning, not an error + self.assertEqual(report.anomalies[0]['severity'], 'warning') + + def test_validate_negative_energy(self): + run_table = [ + {'__run_id': 'run_1', 'factor_a': 'value1', 'energy': -5.0} + ] + report = EnergyValidator.validate_run_table(run_table, ['energy']) + + self.assertTrue(report.has_anomalies()) + self.assertTrue(report.has_errors()) + self.assertEqual(report.anomalies[0]['severity'], 'error') + + def test_validate_none_energy(self): + run_table = [ + {'__run_id': 'run_1', 'factor_a': 'value1', 'energy': None} + ] + report = EnergyValidator.validate_run_table(run_table, ['energy']) + + self.assertTrue(report.has_anomalies()) + self.assertEqual(report.anomalies[0]['severity'], 'warning') + + def test_validate_multiple_energy_columns(self): + run_table = [ + {'__run_id': 'run_1', 'factor_a': 'value1', 'energy': 10.0, 'power': 5.0}, + {'__run_id': 'run_2', 'factor_a': 'value2', 'energy': -5.0, 'power': 0} + ] + report = EnergyValidator.validate_run_table(run_table, ['energy', 'power']) + + self.assertTrue(report.has_anomalies()) + self.assertTrue(report.has_errors()) + # Should have 2 anomalies: one negative energy, one zero power + self.assertEqual(len(report.anomalies), 2) + + def test_validate_mixed_valid_invalid(self): + run_table = [ + {'__run_id': 'run_1', 'factor_a': 'value1', 'energy': 10.5}, + {'__run_id': 'run_2', 'factor_a': 'value2', 'energy': -2.0}, + {'__run_id': 'run_3', 'factor_a': 'value3', 'energy': 25.0} 
+ ] + report = EnergyValidator.validate_run_table(run_table, ['energy']) + + self.assertTrue(report.has_anomalies()) + self.assertTrue(report.has_errors()) + self.assertEqual(len(report.anomalies), 1) # Only run_2 has anomaly + self.assertEqual(report.anomalies[0]['run_id'], 'run_2') + + def test_generate_report_text_no_anomalies(self): + report = EnergyAnomalyReport() + text = EnergyValidator.generate_report_text(report, ['energy']) + + self.assertIn("No anomalies detected", text) + self.assertIn("โœ“", text) + + def test_generate_report_text_with_errors(self): + report = EnergyAnomalyReport() + report.add_anomaly('run_1', {'factor_a': 'value1'}, 'energy', -5.0, 'error') + text = EnergyValidator.generate_report_text(report, ['energy']) + + self.assertIn("CRITICAL ERRORS", text) + self.assertIn("run_1", text) + self.assertIn("-5.0", text) + + def test_extract_treatment_levels(self): + run_table = [ + { + '__run_id': 'run_1', + '__done': 'DONE', + 'factor_a': 'value1', + 'factor_b': 'value2', + 'energy': 10.0 + } + ] + report = EnergyValidator.validate_run_table(run_table, ['energy']) + + # Energy is positive, so should be no anomalies and no treatment levels extracted + # Now test with negative energy to get anomaly with treatment levels + run_table[0]['energy'] = -5.0 + report = EnergyValidator.validate_run_table(run_table, ['energy']) + + self.assertTrue(report.has_anomalies()) + anomaly = report.anomalies[0] + self.assertEqual(anomaly['treatment_levels']['factor_a'], 'value1') + self.assertEqual(anomaly['treatment_levels']['factor_b'], 'value2') + # __run_id and __done should not be in treatment_levels + self.assertNotIn('__run_id', anomaly['treatment_levels']) + self.assertNotIn('__done', anomaly['treatment_levels']) + + +if __name__ == '__main__': + unittest.main() From d258f41f85a7df983361135b52e7572204b63c14 Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Mon, 15 Jun 2026 14:59:31 +0200 Subject: [PATCH 13/30] test_energyValidator --- 
.../Validation/test_EnergyValidator.py | 125 ++++++++++++++ test/ProgressManager/__init__.py | 0 test/ProgressManager/test_EnergyValidator.py | 81 +++++++++ test/system/test_EnergyValidator.py | 159 ------------------ 4 files changed, 206 insertions(+), 159 deletions(-) create mode 100644 experiment-runner/ProgressManager/Validation/test_EnergyValidator.py create mode 100644 test/ProgressManager/__init__.py create mode 100644 test/ProgressManager/test_EnergyValidator.py delete mode 100644 test/system/test_EnergyValidator.py diff --git a/experiment-runner/ProgressManager/Validation/test_EnergyValidator.py b/experiment-runner/ProgressManager/Validation/test_EnergyValidator.py new file mode 100644 index 000000000..e8f7919d4 --- /dev/null +++ b/experiment-runner/ProgressManager/Validation/test_EnergyValidator.py @@ -0,0 +1,125 @@ +import unittest +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) + +from ProgressManager.Validation.EnergyValidator import ( + EnergyValidator, + EnergyAnomalyReport +) + + +class TestEnergyValidator(unittest.TestCase): + + def test_positive_energy(self): + run_table = [ + {"__run_id": "run_1", + "cpu_energy": 10.5 + } + ] + + report = EnergyValidator.validate_run_table( + run_table, + ["cpu_energy"] + ) + + self.assertFalse(report.has_anomalies()) + + def test_zero_energy(self): + run_table = [ + { + "__run_id": "run_1", + "cpu_energy": 0 + } + ] + + report = EnergyValidator.validate_run_table( + run_table, + ["cpu_energy"] + ) + + self.assertTrue(report.has_anomalies()) + self.assertEqual(len(report.anomalies), 1) + + def test_negative_energy(self): + run_table = [ + { + "__run_id": "run_1", + "cpu_energy": -1 + } + ] + + report = EnergyValidator.validate_run_table( + run_table, + ["cpu_energy"] + ) + + self.assertTrue(report.has_anomalies()) + self.assertEqual(len(report.anomalies), 1) + + def test_mixed_values(self): + run_table = [ + { + "__run_id": "run_1", + "cpu_energy": 10 + 
}, + { + "__run_id": "run_2", + "cpu_energy": 0 + }, + { + "__run_id": "run_3", + "cpu_energy": -1 + } + ] + + report = EnergyValidator.validate_run_table( + run_table, + ["cpu_energy"] + ) + + self.assertTrue(report.has_anomalies()) + self.assertEqual(len(report.anomalies), 2) + + def test_treatment_levels_saved(self): + run_table = [ + { + "__run_id": "run_1", + "__done": "DONE", + "fib_type": "iter", + "problem_size": 1000, + "cpu_energy": -1 + } + ] + + report = EnergyValidator.validate_run_table( + run_table, + ["cpu_energy"] + ) + + anomaly = report.anomalies[0] + + self.assertEqual( + anomaly["treatment_levels"]["fib_type"], + "iter" + ) + + self.assertEqual( + anomaly["treatment_levels"]["problem_size"], + 1000 + ) + + self.assertNotIn( + "__run_id", + anomaly["treatment_levels"] + ) + + self.assertNotIn( + "__done", + anomaly["treatment_levels"] + ) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/test/ProgressManager/__init__.py b/test/ProgressManager/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/ProgressManager/test_EnergyValidator.py b/test/ProgressManager/test_EnergyValidator.py new file mode 100644 index 000000000..ee9be7e05 --- /dev/null +++ b/test/ProgressManager/test_EnergyValidator.py @@ -0,0 +1,81 @@ +import unittest +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) + +from ProgressManager.Validation.EnergyValidator import (EnergyValidator,EnergyAnomalyReport) + + +class TestEnergyValidator(unittest.TestCase): + def test_positive_energy(self): + run_table = [ + { + "__run_id": "run_1", + "cpu_energy": 10.5 + }] + + report = EnergyValidator.validate_run_table(run_table,["cpu_energy"]) + self.assertFalse(report.has_anomalies()) + + def test_zero_energy(self): + run_table = [ + { + "__run_id": "run_1", + "cpu_energy": 0 + }] + + report = EnergyValidator.validate_run_table(run_table,["cpu_energy"]) + 
self.assertTrue(report.has_anomalies()) + self.assertEqual(len(report.anomalies), 1) + + def test_negative_energy(self): + run_table = [ + { + "__run_id": "run_1", + "cpu_energy": -1 + }] + + report = EnergyValidator.validate_run_table(run_table,["cpu_energy"]) + self.assertTrue(report.has_anomalies()) + self.assertEqual(len(report.anomalies), 1) + + def test_mixed_values(self): + run_table = [ + { + "__run_id": "run_1", + "cpu_energy": 10 + }, + { + "__run_id": "run_2", + "cpu_energy": 0 + }, + { + "__run_id": "run_3", + "cpu_energy": -1 + }] + + report = EnergyValidator.validate_run_table(run_table, ["cpu_energy"]) + self.assertTrue(report.has_anomalies()) + self.assertEqual(len(report.anomalies), 2) + + def test_treatment_levels_saved(self): + run_table = [ + { + "__run_id": "run_1", + "__done": "DONE", + "fib_type": "iter", + "problem_size": 1000, + "cpu_energy": -1 + }] + + report = EnergyValidator.validate_run_table(run_table,["cpu_energy"]) + anomaly = report.anomalies[0] + + self.assertEqual(anomaly["treatment_levels"]["fib_type"],"iter") + self.assertEqual(anomaly["treatment_levels"]["problem_size"], 1000) + self.assertNotIn("__run_id", anomaly["treatment_levels"]) + self.assertNotIn("__done", anomaly["treatment_levels"]) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/test/system/test_EnergyValidator.py b/test/system/test_EnergyValidator.py deleted file mode 100644 index f9760cfe1..000000000 --- a/test/system/test_EnergyValidator.py +++ /dev/null @@ -1,159 +0,0 @@ -import unittest -import sys -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) - -from ProgressManager.Validation.EnergyValidator import EnergyValidator, EnergyAnomalyReport - - -class TestEnergyAnomalyReport(unittest.TestCase): - def test_report_creation(self): - report = EnergyAnomalyReport() - self.assertFalse(report.has_anomalies()) - self.assertFalse(report.has_errors()) - - def 
test_add_error_anomaly(self): - report = EnergyAnomalyReport() - report.add_anomaly('run_1', {'factor_a': 'value1'}, 'energy', -5.0, 'error') - - self.assertTrue(report.has_anomalies()) - self.assertTrue(report.has_errors()) - self.assertEqual(len(report.anomalies), 1) - self.assertEqual(report.anomalies[0]['value'], -5.0) - - def test_add_warning_anomaly(self): - report = EnergyAnomalyReport() - report.add_anomaly('run_1', {'factor_a': 'value1'}, 'energy', 0, 'warning') - - self.assertTrue(report.has_anomalies()) - self.assertFalse(report.has_errors()) - - def test_mixed_anomalies(self): - report = EnergyAnomalyReport() - report.add_anomaly('run_1', {'factor_a': 'value1'}, 'energy', -5.0, 'error') - report.add_anomaly('run_2', {'factor_a': 'value2'}, 'energy', 0, 'warning') - - self.assertTrue(report.has_anomalies()) - self.assertTrue(report.has_errors()) - self.assertEqual(len(report.anomalies), 2) - - -class TestEnergyValidator(unittest.TestCase): - def test_validate_empty_run_table(self): - report = EnergyValidator.validate_run_table([], ['energy']) - self.assertFalse(report.has_anomalies()) - - def test_validate_no_energy_columns(self): - run_table = [ - {'__run_id': 'run_1', 'factor_a': 'value1', 'energy': 10.5} - ] - report = EnergyValidator.validate_run_table(run_table, []) - self.assertFalse(report.has_anomalies()) - - def test_validate_positive_energy(self): - run_table = [ - {'__run_id': 'run_1', 'factor_a': 'value1', 'energy': 10.5}, - {'__run_id': 'run_2', 'factor_a': 'value2', 'energy': 25.0} - ] - report = EnergyValidator.validate_run_table(run_table, ['energy']) - self.assertFalse(report.has_anomalies()) - - def test_validate_zero_energy(self): - run_table = [ - {'__run_id': 'run_1', 'factor_a': 'value1', 'energy': 0} - ] - report = EnergyValidator.validate_run_table(run_table, ['energy']) - - self.assertTrue(report.has_anomalies()) - self.assertFalse(report.has_errors()) # Zero is a warning, not an error - 
self.assertEqual(report.anomalies[0]['severity'], 'warning') - - def test_validate_negative_energy(self): - run_table = [ - {'__run_id': 'run_1', 'factor_a': 'value1', 'energy': -5.0} - ] - report = EnergyValidator.validate_run_table(run_table, ['energy']) - - self.assertTrue(report.has_anomalies()) - self.assertTrue(report.has_errors()) - self.assertEqual(report.anomalies[0]['severity'], 'error') - - def test_validate_none_energy(self): - run_table = [ - {'__run_id': 'run_1', 'factor_a': 'value1', 'energy': None} - ] - report = EnergyValidator.validate_run_table(run_table, ['energy']) - - self.assertTrue(report.has_anomalies()) - self.assertEqual(report.anomalies[0]['severity'], 'warning') - - def test_validate_multiple_energy_columns(self): - run_table = [ - {'__run_id': 'run_1', 'factor_a': 'value1', 'energy': 10.0, 'power': 5.0}, - {'__run_id': 'run_2', 'factor_a': 'value2', 'energy': -5.0, 'power': 0} - ] - report = EnergyValidator.validate_run_table(run_table, ['energy', 'power']) - - self.assertTrue(report.has_anomalies()) - self.assertTrue(report.has_errors()) - # Should have 2 anomalies: one negative energy, one zero power - self.assertEqual(len(report.anomalies), 2) - - def test_validate_mixed_valid_invalid(self): - run_table = [ - {'__run_id': 'run_1', 'factor_a': 'value1', 'energy': 10.5}, - {'__run_id': 'run_2', 'factor_a': 'value2', 'energy': -2.0}, - {'__run_id': 'run_3', 'factor_a': 'value3', 'energy': 25.0} - ] - report = EnergyValidator.validate_run_table(run_table, ['energy']) - - self.assertTrue(report.has_anomalies()) - self.assertTrue(report.has_errors()) - self.assertEqual(len(report.anomalies), 1) # Only run_2 has anomaly - self.assertEqual(report.anomalies[0]['run_id'], 'run_2') - - def test_generate_report_text_no_anomalies(self): - report = EnergyAnomalyReport() - text = EnergyValidator.generate_report_text(report, ['energy']) - - self.assertIn("No anomalies detected", text) - self.assertIn("โœ“", text) - - def 
test_generate_report_text_with_errors(self): - report = EnergyAnomalyReport() - report.add_anomaly('run_1', {'factor_a': 'value1'}, 'energy', -5.0, 'error') - text = EnergyValidator.generate_report_text(report, ['energy']) - - self.assertIn("CRITICAL ERRORS", text) - self.assertIn("run_1", text) - self.assertIn("-5.0", text) - - def test_extract_treatment_levels(self): - run_table = [ - { - '__run_id': 'run_1', - '__done': 'DONE', - 'factor_a': 'value1', - 'factor_b': 'value2', - 'energy': 10.0 - } - ] - report = EnergyValidator.validate_run_table(run_table, ['energy']) - - # Energy is positive, so should be no anomalies and no treatment levels extracted - # Now test with negative energy to get anomaly with treatment levels - run_table[0]['energy'] = -5.0 - report = EnergyValidator.validate_run_table(run_table, ['energy']) - - self.assertTrue(report.has_anomalies()) - anomaly = report.anomalies[0] - self.assertEqual(anomaly['treatment_levels']['factor_a'], 'value1') - self.assertEqual(anomaly['treatment_levels']['factor_b'], 'value2') - # __run_id and __done should not be in treatment_levels - self.assertNotIn('__run_id', anomaly['treatment_levels']) - self.assertNotIn('__done', anomaly['treatment_levels']) - - -if __name__ == '__main__': - unittest.main() From bc96684c87bbd0e5e3070719b85be00171099471 Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Tue, 16 Jun 2026 08:28:26 +0200 Subject: [PATCH 14/30] clean --- test_env_var.py | 40 -------- validate_local_test_setup.py | 176 ----------------------------------- 2 files changed, 216 deletions(-) delete mode 100644 test_env_var.py delete mode 100644 validate_local_test_setup.py diff --git a/test_env_var.py b/test_env_var.py deleted file mode 100644 index e0722eba9..000000000 --- a/test_env_var.py +++ /dev/null @@ -1,40 +0,0 @@ -import os -from pathlib import Path - -print("=" * 70) -print("ENVIRONMENT VARIABLE TEST - Experiment Runner Portability") -print("=" * 70) - -# Test all environment variables -env_vars = { - 
"EXPERIMENT_RUNNER_OUTPUT_PATH": "/default/experiments", - "ENERGIBRIDGE_PATH": "/usr/local/bin/energibridge", - "WATTS_UP_PRO_PORT_MACOS": "/dev/tty.usbserial-A1000wT3", - "WATTS_UP_PRO_PORT_LINUX": "/dev/ttyUSB0", - "EXAMPLES_PATH": "/default/examples" -} - -print("\n1. WITHOUT environment variables set:") -print("-" * 70) -for var, default in env_vars.items(): - value = os.getenv(var, f"DEFAULT: {default}") - print(f" {var}") - print(f" = {value}\n") - -# Now set environment variables -os.environ["EXPERIMENT_RUNNER_OUTPUT_PATH"] = "C:\\my-experiments" -os.environ["ENERGIBRIDGE_PATH"] = "C:\\tools\\energibridge.exe" -os.environ["WATTS_UP_PRO_PORT_MACOS"] = "COM5" -os.environ["WATTS_UP_PRO_PORT_LINUX"] = "COM3" -os.environ["EXAMPLES_PATH"] = "C:\\my-examples" - -print("\n2. WITH environment variables set:") -print("-" * 70) -for var in env_vars.keys(): - value = os.getenv(var, "NOT SET") - print(f" {var}") - print(f" = {value}\n") - -print("=" * 70) -print("SUCCESS - Environment variables are working!") -print("=" * 70) \ No newline at end of file diff --git a/validate_local_test_setup.py b/validate_local_test_setup.py deleted file mode 100644 index e7b31f5a3..000000000 --- a/validate_local_test_setup.py +++ /dev/null @@ -1,176 +0,0 @@ -#!/usr/bin/env python3 -""" -Validate local SSH setup for distributed testing - -Usage: - python validate_local_test_setup.py - python validate_local_test_setup.py --full -""" -import subprocess -import sys -from pathlib import Path - -def run_cmd(cmd, timeout=5): - """Run command and return (rc, stdout, stderr)""" - try: - result = subprocess.run( - cmd, - shell=True, - capture_output=True, - text=True, - timeout=timeout - ) - return result.returncode, result.stdout, result.stderr - except subprocess.TimeoutExpired: - return -1, "", "Timeout" - except Exception as e: - return -1, "", str(e) - -def check_ssh_server(): - """Check if SSH server is running""" - print("Checking SSH server...") - rc, _, _ = run_cmd("sudo service ssh 
status") - if rc == 0: - print(" โœ“ SSH server is running") - return True - else: - print(" โœ— SSH server NOT running") - print(" Fix with: sudo service ssh start") - return False - -def check_ssh_keys(): - """Check if SSH keys exist""" - print("Checking SSH keys...") - key_path = Path.home() / ".ssh" / "id_rsa" - if key_path.exists(): - print(" โœ“ SSH key exists") - return True - else: - print(" โœ— SSH key NOT found") - print(" Fix with: ssh-keygen -t rsa -b 4096 -f ~/.ssh/id_rsa -N \"\"") - return False - -def check_localhost_ssh(port=22): - """Check SSH to localhost""" - print(f"Checking SSH localhost:{port}...") - rc, stdout, stderr = run_cmd(f"ssh -p {port} -o ConnectTimeout=3 localhost 'echo OK'") - if rc == 0 and "OK" in stdout: - print(f" โœ“ SSH localhost:{port} working") - return True - else: - print(f" โœ— SSH localhost:{port} FAILED") - if "refused" in stderr or "Connection refused" in stderr: - print(f" Port {port} not listening on SSH") - elif "Permission denied" in stderr: - print(f" Permission denied - check SSH keys") - else: - print(f" Error: {stderr}") - return False - -def check_ports_configured(): - """Check if sshd_config has multiple ports""" - print("Checking SSH config for multiple ports...") - rc, stdout, _ = run_cmd("grep -E '^Port ' /etc/ssh/sshd_config || true") - ports = [] - for line in stdout.split('\n'): - if line.strip().startswith('Port '): - port = line.strip().split()[-1] - ports.append(port) - - if len(ports) >= 4: # Should have 22, 2201, 2202, 2203 - print(f" โœ“ Found {len(ports)} ports configured: {', '.join(ports)}") - return ports - else: - print(f" โš  Found {len(ports)} port(s): {', '.join(ports) if ports else 'none'}") - print(" Configure /etc/ssh/sshd_config with:") - print(" Port 22") - print(" Port 2201") - print(" Port 2202") - print(" Port 2203") - print(" Then: sudo service ssh restart") - return ports - -def check_all_test_ports(ports=[2201, 2202, 2203]): - """Check if all test ports are accessible""" - 
print(f"Checking test ports: {', '.join(map(str, ports))}...") - all_ok = True - for port in ports: - if check_localhost_ssh(port): - pass # Already printed - else: - all_ok = False - return all_ok - -def main(): - print("=" * 50) - print("Distributed Testing - Setup Validation") - print("=" * 50) - print() - - checks = [ - ("SSH Server", check_ssh_server), - ("SSH Keys", check_ssh_keys), - ("Default SSH (port 22)", lambda: check_localhost_ssh(22)), - ] - - results = [] - for name, check_fn in checks: - try: - result = check_fn() - results.append((name, result)) - except Exception as e: - print(f" โœ— Error: {e}") - results.append((name, False)) - print() - - # Check ports - ports = check_ports_configured() - print() - - # Full test mode - if "--full" in sys.argv and len(ports) >= 4: - print("Running full port connectivity test...") - print() - if check_all_test_ports(): - print() - print("=" * 50) - print("โœ“ All checks passed!") - print("=" * 50) - print() - print("Ready to test distributed execution:") - print() - print(" python experiment-runner/ test_distributed_config.py \\") - print(" --distribute \"localhost:2201,localhost:2202,localhost:2203\"") - print() - return 0 - - # Summary - print("=" * 50) - passed = sum(1 for _, r in results if r) - total = len(results) - print(f"Setup Status: {passed}/{total} checks passed") - print("=" * 50) - print() - - if passed < total: - print("โš  Some checks failed - follow the fixes above") - return 1 - elif len(ports) < 4: - print("โš  Test ports not configured yet") - print(" Run 'sudo nano /etc/ssh/sshd_config' and add:") - print(" Port 2201") - print(" Port 2202") - print(" Port 2203") - print(" Then: sudo service ssh restart") - print() - print(" After that, run with --full flag:") - print(" python validate_local_test_setup.py --full") - return 1 - else: - print("โœ“ Basic setup looks good") - print(" Run with --full flag to test all ports:") - print(" python validate_local_test_setup.py --full") - return 0 - -if 
__name__ == "__main__": - sys.exit(main()) From 72d033a073ecb6bd063f6cd7b2d61bab32bc9631 Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Tue, 16 Jun 2026 08:30:24 +0200 Subject: [PATCH 15/30] clean --- Exemples.md | 16 ------ READe.md | 160 ---------------------------------------------------- 2 files changed, 176 deletions(-) delete mode 100644 Exemples.md delete mode 100644 READe.md diff --git a/Exemples.md b/Exemples.md deleted file mode 100644 index c266f4310..000000000 --- a/Exemples.md +++ /dev/null @@ -1,16 +0,0 @@ -# Domain-Specific Usage Examples - -Experiment Runner is designed to be domain-agnostic. Below are practical examples demonstrating how it can be configured and used across different research domains. - -These examples are meant to help researchers quickly adapt the framework to their own experimental setup. - -## 1. Code-level Performance Measurements -This experiment compares the performance of two implementations of summation under different input sizes. - -### Set-Up -- Factors: - - algorithm โˆˆ {sum_loop, optimized_sum} - - input_size โˆˆ {10k, 100k, 500k} -- Metric: - - execution_time_ms - diff --git a/READe.md b/READe.md deleted file mode 100644 index abf1a6fcb..000000000 --- a/READe.md +++ /dev/null @@ -1,160 +0,0 @@ -# Experiment-Runner - -[![DOI](https://zenodo.org/badge/505379793.svg)](https://doi.org/10.5281/zenodo.15430328) - -Experiment Runner is a generic framework to automatically execute measurement-based experiments on any platform. The experiments are user-defined, can be completely customized, and expressed in python code! - -The technical details, main features, software architecture, and example experiment using Experiment Runner are presented in our [SCICO 2025 publication](https://www.sciencedirect.com/science/article/pii/S0167642325001546). 
- -## Features - -- **Run Table Model**: Framework support to easily define an experiment's measurements with Factors, their Treatment levels, exclude certain combinations of Treatments, and add data columns for storing aggregated data. -- **Restarting**: If an experiment was not entirely completed on the last invocation (e.g. some variations crashes), experiment runner can be re-invoked to finish any remaining experiment variations. -- **Persistency**: Raw and aggregated experiment data per variation can be persistently stored. -- **Operational Types**: Two operational types: `AUTO` and `SEMI`, for more fine-grained experiment control. -- **Progress Indicator**: Keeps track of the execution of each run of the experiment -- **Target and profiler agnostic**: Can be used with any target to measure (e.g. ELF binary, .apk over adb, etc.) and with any profiler (e.g. WattsUpPro, etc.) - -## Requirements - -The framework has been tested with Python3 version 3.8, but should also work with any higher version. - -### Supported Platforms -| Platform | Status | -|----------|---------------| -| Linux | Supported | -| macOS | Supported | -| Windows | Not supported | - ---- - -## Installation -**Clone the repository:** -```bash -git clone https://github.com/S2-group/experiment-runner.git -cd experiment-runner/ -``` - -- *Optional, create a virtual envoirment:* - ```bash - python3 -m venv venv - source venv/bin/activate - ``` -**Install dependencies:** -```bash -pip install --upgrade pip -pip install -r requirements.txt -``` - -**To verify installation, run the hello-world exemple:** - -```bash -python experiment-runner/ examples/hello-world/RunnerConfig.py -``` -- The expected output: - - Experiment executes successfully - - Output directory experiments/ is created - - No missing dependency errors - -## Running - -In this section, we assume as the current working directory, the root directory of the project. 
- -### The provided examples - -To run any of the examples provided, run the following command: - -```bash -python experiment-runner/ examples// -``` - -Each example is accompanied with a README for further information. - -Once you successfully run an experiment, the framework will not allow you to run the same experiment again under, giving the message: - -```log -[FAIL]: EXPERIMENT_RUNNER ENCOUNTERED AN ERROR! -The experiment was restarted, but all runs are already completed. -``` - -This is to prevent you from accidentally overwriting the results of a previously run experiment! In order to run again the experiment, either delete any previously generated data (by default "experiments/" directory), or modify the config's `name` variable to a different name. - -*It is recommended to start with the [hello-world](examples/hello-world) example to also test your installation.* - -### Creating a new experiment - -First, generate a config for your experiment: - -```bash -python experiment-runner/ config-create [directory] -``` - -When running this command, where `[directory]` is an optional argument, a new config file with skeleton code will be generated in the given directory. -- The default location is the `examples/` directory. *This config is similar to the [hello-world](examples/hello-world) example.* - -Feel free to move the generated config to any other directory. - -You can modify its contents and write python code to define your own measurement-based experiment(s). -- *At this stage, you might find useful the [linux-ps-profiling](examples/linux-ps-profiling) example.* - -Once the experiment has been coded, the experiment can be executed by Experiment Runner. To do this, run the following command: - -```bash -python experiment-runner/ -``` - -The results of the experiment will be stored in the directory `RunnerConfig.results_output_path/RunnerConfig.name` as defined by your config variables. 
- -### Portability Across Users and Machines - -When sharing experiments across different users or machines, hardcoded paths in configuration files can cause issues. Experiment Runner supports **environment variables** to make your experiments portable without code changes: - -#### Available Environment Variables - -- **`EXPERIMENT_RUNNER_OUTPUT_PATH`**: Directory where experiment results are stored - - Default: `/experiments` - - Example: `export EXPERIMENT_RUNNER_OUTPUT_PATH="/path/to/results"` - -- **`ENERGIBRIDGE_PATH`**: Path to the EnergiBridge executable (for energy measurements) - - Default: `/usr/local/bin/energibridge` - - Example: `export ENERGIBRIDGE_PATH="/usr/local/bin/energibridge"` - -- **`EXAMPLES_PATH`**: Directory for generating new config templates - - Default: `/examples` - - Example: `export EXAMPLES_PATH="/home/user/my-experiments"` - -#### Using Environment Variables - -Set environment variables before running your experiment: - -```bash -export EXPERIMENT_RUNNER_OUTPUT_PATH="/data/experiments" -export ENERGIBRIDGE_PATH="/opt/energibridge/bin/energibridge" -python experiment-runner/ MyRunnerConfig.py -``` - -Your configuration files automatically use these variables if set, with sensible defaults when they are not. This allows the same experiment to run on different machines without any code modifications. - -**More information about the profilers and use cases can be found in the [Wiki tab](https://github.com/S2-group/experiment-runner/wiki).** - -## How to cite Experiment Runner - -If Experiment Runner is helping your research, consider to cite it as follows, thank you! 
- -``` -@article{SCICO_2025, - title = {{Experiment {Runner}: a {Tool} for the {Automatic} {Orchestration} of {Experiments} {Targeting} {Software} {Systems}}}, - issn = {0167-6423}, - journal = {Science of Computer Programming}, - author = {Max Karsten and {Andrei Calin} Dragomir and Radu Apsan and Vincenzo Stoico and Ivano Malavolta}, - year = {2025}, - pages = {103415}, - volume = {1}, - url = {https://www.sciencedirect.com/science/article/pii/S0167642325001546}, - doi = {https://doi.org/10.1016/j.scico.2025.103415} -} -``` - -### Contributing -If you want to develop a new feature or ER, or found some bug you want to report we would love to hear from you! Please refer to our [contribution guidelines](https://github.com/S2-group/experiment-runner/wiki/Contributing-to-ER) for information on how to submit PRs or bug reports. - From 4febc502877368d97f33600001472ad1b8ed7ff9 Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Tue, 16 Jun 2026 11:08:14 +0200 Subject: [PATCH 16/30] clean --- Personal_experiments/RunnerConfig.py | 58 ---------------------------- 1 file changed, 58 deletions(-) delete mode 100644 Personal_experiments/RunnerConfig.py diff --git a/Personal_experiments/RunnerConfig.py b/Personal_experiments/RunnerConfig.py deleted file mode 100644 index 4d6d0e7b5..000000000 --- a/Personal_experiments/RunnerConfig.py +++ /dev/null @@ -1,58 +0,0 @@ -import time -import os - -from ConfigValidator.Config.Models.FactorModel import FactorModel -from ConfigValidator.Config.Models.RunTableModel import RunTableModel -from ConfigValidator.Config.Models.RunnerContext import RunnerContext -from pathlib import Path -from ConfigValidator.Config.Models.OperationType import OperationType - - -def run_experiment(algorithm, input_size): - data = range(input_size) - - start = time.time() - - if algorithm == "sum_loop": - total = 0 - for x in data: - total += x - elif algorithm == "optimized_sum": - total = sum(data) - - end = time.time() - - return (end - start) * 1000 - - -class 
RunnerConfig: - - name = "code_performance_example" - default_output = Path("experiments") - results_output_path = Path(os.getenv("EXPERIMENT_RUNNER_OUTPUT_PATH", str(default_output))) - - operation_type = OperationType.AUTO - - time_between_runs_in_ms = 1000 - - experiment_path = None - - def create_run_table_model(self): - factor1 = FactorModel("algorithm", ["sum_loop", "optimized_sum"]) - factor2 = FactorModel("input_size", [10000, 100000, 500000]) - - return RunTableModel( - factors=[factor1, factor2], - data_columns=["execution_time_ms"] - ) - - def populate_run_data(self, context: RunnerContext): - - algorithm = context.run_variation["algorithm"] - input_size = context.run_variation["input_size"] - - exec_time = run_experiment(algorithm, input_size) - - return { - "execution_time_ms": exec_time - } \ No newline at end of file From 6d486e93f16aaa63e9e14125d756dbfa343b4613 Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Thu, 18 Jun 2026 18:16:17 +0200 Subject: [PATCH 17/30] ADB_VALIDATION_REQUIRMENTSCHECKING --- examples/hello-world-fibonacci/README.md | 2 +- .../hello-world-fibonacci/RunnerConfig.py | 24 +- examples/hello-world/README.md | 1 + examples/hello-world/RunnerConfig.py | 36 +- examples/profilers/ADB/README.md | 28 ++ examples/profilers/ADB/RunnerConfig.py | 176 ++++++++++ examples/profilers/EnergiBridge/README.md | 2 +- .../profilers/EnergiBridge/RunnerConfig.py | 31 +- examples/profilers/JoularCore/README.md | 1 + examples/profilers/JoularCore/RunnerConfig.py | 32 +- examples/profilers/NvidiaML/README.md | 2 + examples/profilers/NvidiaML/RunnerConfig.py | 26 +- examples/profilers/PicoCM3/README.md | 2 + examples/profilers/PicoCM3/RunnerConfig.py | 26 ++ examples/profilers/PowerJoular/README.md | 1 + .../profilers/PowerJoular/RunnerConfig.py | 29 +- examples/profilers/PowerLetrics/README.md | 2 + .../profilers/PowerLetrics/RunnerConfig.py | 29 +- examples/profilers/PowerMetrics/README.md | 2 + .../profilers/PowerMetrics/RunnerConfig.py | 30 +- 
.../profilers/linux-ps-profiling/README.md | 1 + .../linux-ps-profiling/RunnerConfig.py | 29 +- .../measure-self-profiling/README.md | 2 + .../measure-self-profiling/RunnerConfig.py | 26 +- .../DistributedOrchestrator.py | 5 +- .../DistributedExecution/Worker.py | 7 + .../Experiment/ExperimentController.py | 8 +- .../Plugins/Profilers/AndroidDebugBridge.py | 309 ++++++++++++++++++ .../Validation/EnergyValidator.py | 4 +- .../Validation/test_EnergyValidator.py | 125 ------- .../Profilers/test_AndroidDebugBridge.py | 112 +++++++ test/ProgressManager/test_EnergyValidator.py | 3 +- 32 files changed, 888 insertions(+), 225 deletions(-) create mode 100644 examples/profilers/ADB/README.md create mode 100644 examples/profilers/ADB/RunnerConfig.py create mode 100644 experiment-runner/Plugins/Profilers/AndroidDebugBridge.py delete mode 100644 experiment-runner/ProgressManager/Validation/test_EnergyValidator.py create mode 100644 test/Plugins/Profilers/test_AndroidDebugBridge.py diff --git a/examples/hello-world-fibonacci/README.md b/examples/hello-world-fibonacci/README.md index 33763119c..d83f45ad3 100644 --- a/examples/hello-world-fibonacci/README.md +++ b/examples/hello-world-fibonacci/README.md @@ -18,6 +18,6 @@ python experiment-runner/ examples/hello-world-fibonacci/RunnerConfig.py ## Results The results are generated in the `examples/hello-world-fibonacci/experiments` folder. - +In case there are anomalies such as null, absent, or negative values, a report will be generated in the `examples/hello-world-fibonacci/experiments` folder. **!!! WARNING !!!**: COLUMNS IN THE `energibridge.csv` FILES CAN BE DIFFERENT ACROSS MACHINES. ADJUST THE DATAFRAME COLUMN NAMES ACCORDINGLY. 
diff --git a/examples/hello-world-fibonacci/RunnerConfig.py b/examples/hello-world-fibonacci/RunnerConfig.py index fe33f7496..5bf9cc929 100644 --- a/examples/hello-world-fibonacci/RunnerConfig.py +++ b/examples/hello-world-fibonacci/RunnerConfig.py @@ -7,7 +7,7 @@ from ProgressManager.Output.OutputProcedure import OutputProcedure as output from ProgressManager.Validation.RequirementsValidator import (validate_experiment_requirements) -from typing import Dict, Any, Optional +from typing import Dict, Any, Optional, List from pathlib import Path from os.path import dirname, realpath @@ -36,14 +36,21 @@ class RunnerConfig: ENERGIBRIDGE_PATH = "/home/andabarbu/.cargo/bin/energibridge" - enable_energy_validation: bool = True - """Path to log file for energy validation report. Relative to experiment output directory.""" energy_validation_log_file: str = "energy_validation_report.log" - """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']). - Only used if enable_energy_validation is True.""" - energy_validation_columns: List[str] = [] + """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']).""" + energy_validation_columns: List[str] = [ + "cpu_energy", + "core0_energy", + "core1_energy", + "core2_energy", + "core3_energy", + "core4_energy", + "core5_energy", + "core6_energy", + "core7_energy" + ] def __init__(self): @@ -65,6 +72,9 @@ def __init__(self): output.console_log("Custom config loaded") + def validate_experiment(self) -> None: + validate_experiment_requirements(Path(__file__)) + def create_run_table_model(self) -> RunTableModel: factor1 = FactorModel("fib_type", ['iter', 'mem', 'rec']) @@ -94,8 +104,6 @@ def create_run_table_model(self) -> RunTableModel: ) return self.run_table_model - def validate_experiment(self) -> None: - validate_experiment_requirements(Path(__file__)) def before_experiment(self) -> None: pass diff --git a/examples/hello-world/README.md 
b/examples/hello-world/README.md index a5ff6e013..406f630db 100644 --- a/examples/hello-world/README.md +++ b/examples/hello-world/README.md @@ -14,3 +14,4 @@ python experiment-runner/ examples/hello-world/RunnerConfig.py ## Results The results are generated in the `examples/hello-world/experiments` folder. +In case there are anomalies such as null, absent, or negative values, a report will be generated in the `examples/hello-world/experiments` folder. \ No newline at end of file diff --git a/examples/hello-world/RunnerConfig.py b/examples/hello-world/RunnerConfig.py index 3641052f7..6b9f0ffaa 100644 --- a/examples/hello-world/RunnerConfig.py +++ b/examples/hello-world/RunnerConfig.py @@ -5,6 +5,7 @@ from ConfigValidator.Config.Models.RunnerContext import RunnerContext from ConfigValidator.Config.Models.OperationType import OperationType from ProgressManager.Output.OutputProcedure import OutputProcedure as output +from ProgressManager.Validation.RequirementsValidator import (validate_experiment_requirements) from typing import Dict, List, Any, Optional from pathlib import Path @@ -32,21 +33,31 @@ class RunnerConfig: This can be essential to accommodate for cooldown periods on some systems.""" time_between_runs_in_ms: int = 1000 + """Path to log file for energy validation report. Relative to experiment output directory.""" + energy_validation_log_file: str = "energy_validation_report.log" + + """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']).""" + energy_validation_columns = [ + "avg_cpu", + "avg_mem" + ] + # Dynamic configurations can be one-time satisfied here before the program takes the config as-is # e.g. 
Setting some variable based on some criteria def __init__(self): """Executes immediately after program start, on config load""" EventSubscriptionController.subscribe_to_multiple_events([ - (RunnerEvents.BEFORE_EXPERIMENT, self.before_experiment), - (RunnerEvents.BEFORE_RUN , self.before_run ), - (RunnerEvents.START_RUN , self.start_run ), - (RunnerEvents.START_MEASUREMENT, self.start_measurement), - (RunnerEvents.INTERACT , self.interact ), - (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ), - (RunnerEvents.STOP_RUN , self.stop_run ), - (RunnerEvents.POPULATE_RUN_DATA, self.populate_run_data), - (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment ) + (RunnerEvents.VALIDATE_EXPERIMENT, self.validate_experiment), + (RunnerEvents.BEFORE_EXPERIMENT , self.before_experiment), + (RunnerEvents.BEFORE_RUN , self.before_run ), + (RunnerEvents.START_RUN , self.start_run ), + (RunnerEvents.START_MEASUREMENT , self.start_measurement), + (RunnerEvents.INTERACT , self.interact ), + (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ), + (RunnerEvents.STOP_RUN , self.stop_run ), + (RunnerEvents.POPULATE_RUN_DATA , self.populate_run_data), + (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment ) ]) self.run_table_model = None # Initialized later @@ -67,6 +78,11 @@ def create_run_table_model(self) -> RunTableModel: data_columns=['avg_cpu', 'avg_mem'] ) return self.run_table_model + + def validate_experiment(self) -> None: + """Perform any experiment validation here. 
If any validation fails, raise an exception with details on the failure.""" + validate_experiment_requirements(Path(__file__)) + output.console_log("Config.validate_experiment() called!") def before_experiment(self) -> None: """Perform any activity required before starting the experiment here @@ -122,4 +138,4 @@ def after_experiment(self) -> None: output.console_log("Config.after_experiment() called!") # ================================ DO NOT ALTER BELOW THIS LINE ================================ - experiment_path: Path = None + experiment_path: Path = None \ No newline at end of file diff --git a/examples/profilers/ADB/README.md b/examples/profilers/ADB/README.md new file mode 100644 index 000000000..5c445df6c --- /dev/null +++ b/examples/profilers/ADB/README.md @@ -0,0 +1,28 @@ +# `Android Debug Bridge` Profiler + +This example shows how to automatically collect battery and energy metrics from +Android devices during experiment execution using ADB. + +## Requirements + - Android SDK Platform Tools installed + - Linux: + ```bash + sudo apt install android-tools-adb android-tools-fastboot + ``` + - macOS: + ```bash + brew install android-platform-tools + ``` + - Android device connected via USB or emulator running + - USB debugging enabled on device + +## Running +From the root directory of the repo, run the following command: + ```bash + python experiment-runner/ examples/profilers/ADB/RunnerConfig.py + ``` + +## Results +The results are generated in the `examples/profilers/ADB/experiments` folder. + +In case there are anomalies such as null, absent, or negative values, a report will be generated in the `examples/profilers/ADB/experiments` folder. 
\ No newline at end of file diff --git a/examples/profilers/ADB/RunnerConfig.py b/examples/profilers/ADB/RunnerConfig.py new file mode 100644 index 000000000..eff52c8d7 --- /dev/null +++ b/examples/profilers/ADB/RunnerConfig.py @@ -0,0 +1,176 @@ +from EventManager.Models.RunnerEvents import RunnerEvents +from EventManager.EventSubscriptionController import EventSubscriptionController +from ConfigValidator.Config.Models.RunTableModel import RunTableModel +from ConfigValidator.Config.Models.FactorModel import FactorModel +from ConfigValidator.Config.Models.RunnerContext import RunnerContext +from ConfigValidator.Config.Models.OperationType import OperationType +from ProgressManager.Output.OutputProcedure import OutputProcedure as output +from ProgressManager.Validation.RequirementsValidator import (validate_experiment_requirements) + +from Plugins.Profilers.AndroidDebugBridge import AndroidBatteryMonitor, battery_monitor +from typing import Dict, List, Any, Optional +from pathlib import Path +from os.path import dirname, realpath +import time + +@battery_monitor( + device_serial=None, + poll_interval=1, + data_columns=[ + 'battery_percentage', + 'battery_temperature', + 'battery_voltage', + 'charge_rate', + 'power_draw' + ] +) +class RunnerConfig: + ROOT_DIR = Path(dirname(realpath(__file__))) + + # ================================ USER SPECIFIC CONFIG ================================ + """The name of the experiment.""" + name: str = "android_energy_monitoring_experiment" + + """The path in which Experiment Runner will create a folder with the name `self.name`""" + results_output_path: Path = ROOT_DIR / 'experiments' + + """Experiment operation type""" + operation_type: OperationType = OperationType.AUTO + + """Time between runs (cooldown period)""" + time_between_runs_in_ms: int = 3000 + + + """Path to log file for energy validation report. 
Relative to experiment output directory.""" + energy_validation_log_file: str = "energy_validation_report.log" + + """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']). + Only used if enable_energy_validation is True.""" + energy_validation_columns = [ + "android_battery__percentage", + "android_battery__temperature", + "android_battery__voltage", + "android_battery__current_now", + "android_battery__power_draw" + ] + + # Dynamic configurations can be one-time satisfied here before the program takes the config as-is + # e.g. Setting some variable based on some criteria + def __init__(self): + """Executes immediately after program start, on config load""" + + EventSubscriptionController.subscribe_to_multiple_events([ + (RunnerEvents.VALIDATE_EXPERIMENT, self.validate_experiment), + (RunnerEvents.BEFORE_EXPERIMENT , self.before_experiment), + (RunnerEvents.BEFORE_RUN , self.before_run ), + (RunnerEvents.START_RUN , self.start_run ), + (RunnerEvents.START_MEASUREMENT , self.start_measurement), + (RunnerEvents.INTERACT , self.interact ), + (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ), + (RunnerEvents.STOP_RUN , self.stop_run ), + (RunnerEvents.POPULATE_RUN_DATA , self.populate_run_data), + (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment ) + ]) + self.run_table_model = None + + output.console_log("Android Energy Monitoring config loaded") + + def create_run_table_model(self) -> RunTableModel: + """Define the experimental design with factors and data columns. + + Note: The @AndroidEnergyMonitor.energy_monitor decorator automatically + adds energy data columns to this model. 
+ """ + # Define experimental factors + workload_factor = FactorModel("workload", ['light', 'medium', 'heavy']) + screen_factor = FactorModel("screen_brightness", ['low', 'high']) + + self.run_table_model = RunTableModel( + factors=[workload_factor, screen_factor], + repetitions=3, + # Add custom data columns (energy columns are added by decorator) + data_columns=['workload_duration_ms', 'task_completion_status'] + ) + return self.run_table_model + + def validate_experiment(self) -> None: + """Perform any experiment validation here. If any validation fails, raise an exception with details on the failure.""" + validate_experiment_requirements(Path(__file__)) + output.console_log("Config.validate_experiment() called!") + + def before_experiment(self) -> None: + """Called before experiment starts.""" + output.console_log("Starting Android energy monitoring experiment...") + output.console_log("Ensure your Android device is connected via USB or emulator is running") + + def before_run(self) -> None: + """Called before each run.""" + output.console_log(f"Preparing device for run...") + + def start_run(self, context: RunnerContext) -> None: + """Start a single experiment run. + + In a real scenario, this would start your Android app or workload. + For this example, we just wait a bit. 
+ """ + output.console_log("Config.start_run() called!") + + def start_measurement(self, context: RunnerContext) -> None: + """Start measurement - energy monitoring begins here automatically.""" + output.console_log("Energy monitoring started (battery metrics being collected)") + + def interact(self, context: RunnerContext): + workload = context.execute_run['workload'] + brightness = context.execute_run['screen_brightness'] + + duration_ms = { + 'light': 5000, + 'medium': 10000, + 'heavy': 15000 + }[workload] + + output.console_log( + f"Running {workload} workload " + f"for {duration_ms}ms " + f"(brightness: {brightness})" + ) + + time.sleep(duration_ms / 1000) + + output.console_log("Workload completed") + + def stop_measurement(self, context: RunnerContext) -> None: + """Stop measurement - energy monitoring ends here automatically.""" + output.console_log("Energy monitoring stopped") + + def stop_run(self, context: RunnerContext) -> None: + """Stop the current run.""" + output.console_log(f"Stopped run: {context.execute_run['__run_id']}") + + def populate_run_data(self, context: RunnerContext) -> Optional[Dict[str, Any]]: + """Populate data columns for this run. + + The @AndroidEnergyMonitor.energy_monitor decorator automatically + populates energy-related columns. This method can add custom data. 
+ """ + # In a real scenario, you would parse workload results here + workload = context.execute_run['workload'] + + duration_ms = { + 'light': 5000, + 'medium': 10000, + 'heavy': 15000 + }.get(workload, 5000) + + return { + 'workload_duration_ms': duration_ms, + 'task_completion_status': 'success' + } + + def after_experiment(self) -> None: + """Called after experiment completes.""" + output.console_log("Android energy monitoring experiment completed!") + output.console_log("Results stored in experiments/android_energy_monitoring_experiment/") + + # ================================ DO NOT ALTER BELOW THIS LINE ================================ + experiment_path: Path = None \ No newline at end of file diff --git a/examples/profilers/EnergiBridge/README.md b/examples/profilers/EnergiBridge/README.md index 57eca022e..1b4db1a75 100644 --- a/examples/profilers/EnergiBridge/README.md +++ b/examples/profilers/EnergiBridge/README.md @@ -20,7 +20,7 @@ python3 experiment-runner/ examples/profilers/EnergiBridge/RunnerConfig.py ## Results The results are generated in the `examples/profilers/EnergiBridge/experiments` folder. - +In case there are anomalies such as null, absent, or negative values, a report will be generated in the `examples/profilers/EnergiBridge/experiments` folder. **!!! WARNING !!!**: COLUMNS IN THE `energibridge.csv` FILES CAN BE DIFFERENT ACROSS MACHINES. ADJUST THE DATAFRAME COLUMN NAMES ACCORDINGLY. 
diff --git a/examples/profilers/EnergiBridge/RunnerConfig.py b/examples/profilers/EnergiBridge/RunnerConfig.py index e2b93c3db..120879ff5 100644 --- a/examples/profilers/EnergiBridge/RunnerConfig.py +++ b/examples/profilers/EnergiBridge/RunnerConfig.py @@ -5,6 +5,7 @@ from ConfigValidator.Config.Models.RunnerContext import RunnerContext from ConfigValidator.Config.Models.OperationType import OperationType from ProgressManager.Output.OutputProcedure import OutputProcedure as output +from ProgressManager.Validation.RequirementsValidator import (validate_experiment_requirements) from Plugins.Profilers.EnergiBridge import EnergiBridge from typing import Dict, List, Any, Optional @@ -32,6 +33,12 @@ class RunnerConfig: """The time Experiment Runner will wait after a run completes. This can be essential to accommodate for cooldown periods on some systems.""" time_between_runs_in_ms: int = 1000 + + """Path to log file for energy validation report. Relative to experiment output directory.""" + energy_validation_log_file: str = "energy_validation_report.log" + + """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']).""" + energy_validation_columns: List[str] = [] # Dynamic configurations can be one-time satisfied here before the program takes the config as-is # e.g. 
Setting some variable based on some criteria @@ -39,15 +46,16 @@ def __init__(self): """Executes immediately after program start, on config load""" EventSubscriptionController.subscribe_to_multiple_events([ - (RunnerEvents.BEFORE_EXPERIMENT, self.before_experiment), - (RunnerEvents.BEFORE_RUN , self.before_run ), - (RunnerEvents.START_RUN , self.start_run ), - (RunnerEvents.START_MEASUREMENT, self.start_measurement), - (RunnerEvents.INTERACT , self.interact ), - (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ), - (RunnerEvents.STOP_RUN , self.stop_run ), - (RunnerEvents.POPULATE_RUN_DATA, self.populate_run_data), - (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment ) + (RunnerEvents.VALIDATE_EXPERIMENT, self.validate_experiment), + (RunnerEvents.BEFORE_EXPERIMENT , self.before_experiment), + (RunnerEvents.BEFORE_RUN , self.before_run ), + (RunnerEvents.START_RUN , self.start_run ), + (RunnerEvents.START_MEASUREMENT , self.start_measurement), + (RunnerEvents.INTERACT , self.interact ), + (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ), + (RunnerEvents.STOP_RUN , self.stop_run ), + (RunnerEvents.POPULATE_RUN_DATA , self.populate_run_data), + (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment ) ]) self.run_table_model = None # Initialized later @@ -63,6 +71,11 @@ def create_run_table_model(self) -> RunTableModel: ) return self.run_table_model + + def validate_experiment(self) -> None: + """Perform any experiment validation here. 
If any validation fails, raise an exception with details on the failure.""" + validate_experiment_requirements(Path(__file__)) + output.console_log("Config.validate_experiment() called!") def before_experiment(self) -> None: """Perform any activity required before starting the experiment here diff --git a/examples/profilers/JoularCore/README.md b/examples/profilers/JoularCore/README.md index b92775876..0931b82cc 100644 --- a/examples/profilers/JoularCore/README.md +++ b/examples/profilers/JoularCore/README.md @@ -22,3 +22,4 @@ sudo python3 experiment-runner/ examples/joularcore-profiling/RunnerConfig.py The results are generated in the `examples/joularcore-profiling/experiments` folder. +In case there are anomalies such as null, absent, or negative values, a report will be generated in the `examples/joularcore-profiling/experiments` folder diff --git a/examples/profilers/JoularCore/RunnerConfig.py b/examples/profilers/JoularCore/RunnerConfig.py index 03134ca1a..5b8a852d3 100644 --- a/examples/profilers/JoularCore/RunnerConfig.py +++ b/examples/profilers/JoularCore/RunnerConfig.py @@ -5,6 +5,7 @@ from ConfigValidator.Config.Models.RunnerContext import RunnerContext from ConfigValidator.Config.Models.OperationType import OperationType from ProgressManager.Output.OutputProcedure import OutputProcedure as output +from ProgressManager.Validation.RequirementsValidator import (validate_experiment_requirements) import shlex from typing import Dict, Any, Optional @@ -26,18 +27,29 @@ class RunnerConfig: results_output_path: Path = Path(os.getenv("EXPERIMENT_RUNNER_OUTPUT_PATH", str(default_output))) operation_type: OperationType = OperationType.AUTO time_between_runs_in_ms: int = 1000 + + """Path to log file for energy validation report. 
Relative to experiment output directory.""" + energy_validation_log_file: str = "energy_validation_report.log" + """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']).""" + energy_validation_columns: List[str] = [] + + # Dynamic configurations can be one-time satisfied here before the program takes the config as-is + # e.g. Setting some variable based on some criteria def __init__(self): + """Executes immediately after program start, on config load""" + EventSubscriptionController.subscribe_to_multiple_events([ - (RunnerEvents.BEFORE_EXPERIMENT, self.before_experiment), - (RunnerEvents.BEFORE_RUN , self.before_run), - (RunnerEvents.START_RUN , self.start_run), - (RunnerEvents.START_MEASUREMENT, self.start_measurement), - (RunnerEvents.INTERACT , self.interact), - (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement), - (RunnerEvents.STOP_RUN , self.stop_run), - (RunnerEvents.POPULATE_RUN_DATA, self.populate_run_data), - (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment) + (RunnerEvents.VALIDATE_EXPERIMENT, self.validate_experiment), + (RunnerEvents.BEFORE_EXPERIMENT , self.before_experiment), + (RunnerEvents.BEFORE_RUN , self.before_run ), + (RunnerEvents.START_RUN , self.start_run ), + (RunnerEvents.START_MEASUREMENT , self.start_measurement), + (RunnerEvents.INTERACT , self.interact ), + (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ), + (RunnerEvents.STOP_RUN , self.stop_run ), + (RunnerEvents.POPULATE_RUN_DATA , self.populate_run_data), + (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment ) ]) self.run_table_model = None @@ -57,6 +69,8 @@ def create_run_table_model(self) -> RunTableModel: data_columns=["avg_process_power", "avg_cpu_usage", "avg_cpu_power"] ) return self.run_table_model + def validate_experiment(self) -> None: + output.console_log("Config.validate_experiment() called!") def before_experiment(self) -> None: output.console_log("Config.before_experiment() called!") diff --git 
a/examples/profilers/NvidiaML/README.md b/examples/profilers/NvidiaML/README.md index b7acab138..c6854aa05 100644 --- a/examples/profilers/NvidiaML/README.md +++ b/examples/profilers/NvidiaML/README.md @@ -30,3 +30,5 @@ python experiment-runner/ examples/nvml-profiling/RunnerConfig.py ## Results The results are generated in the `examples/nvml-profiling/experiments` folder, in json format. + +In case there are anomalies such as null, absent, or negative values, a report will be generated in the `examples/nvml-profiling/experiments` folder. diff --git a/examples/profilers/NvidiaML/RunnerConfig.py b/examples/profilers/NvidiaML/RunnerConfig.py index 6110833b5..95603b379 100644 --- a/examples/profilers/NvidiaML/RunnerConfig.py +++ b/examples/profilers/NvidiaML/RunnerConfig.py @@ -5,6 +5,7 @@ from ConfigValidator.Config.Models.RunnerContext import RunnerContext from ConfigValidator.Config.Models.OperationType import OperationType from ProgressManager.Output.OutputProcedure import OutputProcedure as output +from ProgressManager.Validation.RequirementsValidator import (validate_experiment_requirements) from Plugins.Profilers.NvidiaML import NvidiaML, NVML_Sample, NVML_Field, NVML_GPU_Operation_Mode, NVML_IDs, NVML_Dynamic_Query from typing import Dict, List, Any, Optional @@ -35,21 +36,25 @@ class RunnerConfig: This can be essential to accommodate for cooldown periods on some systems.""" time_between_runs_in_ms: int = 1000 - # Dynamic configurations can be one-time satisfied here before the program takes the config as-is - # e.g. Setting some variable based on some criteria + """Path to log file for energy validation report. 
Relative to experiment output directory.""" + energy_validation_log_file: str = "energy_validation_report.log" + + """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']).""" + energy_validation_columns: List[str] = [] + def __init__(self): - """Executes immediately after program start, on config load""" EventSubscriptionController.subscribe_to_multiple_events([ + (RunnerEvents.VALIDATE_EXPERIMENT, self.validate_experiment), (RunnerEvents.BEFORE_EXPERIMENT, self.before_experiment), - (RunnerEvents.BEFORE_RUN , self.before_run ), - (RunnerEvents.START_RUN , self.start_run ), + (RunnerEvents.BEFORE_RUN, self.before_run), + (RunnerEvents.START_RUN, self.start_run), (RunnerEvents.START_MEASUREMENT, self.start_measurement), - (RunnerEvents.INTERACT , self.interact ), - (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ), - (RunnerEvents.STOP_RUN , self.stop_run ), + (RunnerEvents.INTERACT, self.interact), + (RunnerEvents.STOP_MEASUREMENT, self.stop_measurement), + (RunnerEvents.STOP_RUN, self.stop_run), (RunnerEvents.POPULATE_RUN_DATA, self.populate_run_data), - (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment ) + (RunnerEvents.AFTER_EXPERIMENT, self.after_experiment) ]) self.run_table_model = None # Initialized later @@ -65,6 +70,9 @@ def create_run_table_model(self) -> RunTableModel: data_columns=["avg_enc", "avg_dec", "avg_pstate"]) return self.run_table_model + + def validate_experiment(self) -> None: + validate_experiment_requirements(Path(__file__)) def before_experiment(self) -> None: """Perform any activity required before starting the experiment here diff --git a/examples/profilers/PicoCM3/README.md b/examples/profilers/PicoCM3/README.md index f9bfec0b2..b95c8cd83 100644 --- a/examples/profilers/PicoCM3/README.md +++ b/examples/profilers/PicoCM3/README.md @@ -36,3 +36,5 @@ python experiment-runner/ examples/picocm3-profiling/RunnerConfig.py The results are generated in the 
`examples/picocm3-profiling/experiments` folder. There should be a unique log file for each variation in the experiment, as well as a run_table.csv file summarizing these log files. + +In case there are anomalies such as null, absent, or negative values, a report will be generated in the `examples/picocm3-profiling/experiments` folder. \ No newline at end of file diff --git a/examples/profilers/PicoCM3/RunnerConfig.py b/examples/profilers/PicoCM3/RunnerConfig.py index 4aa7c7202..79ec44dc5 100644 --- a/examples/profilers/PicoCM3/RunnerConfig.py +++ b/examples/profilers/PicoCM3/RunnerConfig.py @@ -5,6 +5,7 @@ from ConfigValidator.Config.Models.RunnerContext import RunnerContext from ConfigValidator.Config.Models.OperationType import OperationType from ProgressManager.Output.OutputProcedure import OutputProcedure as output +from ProgressManager.Validation.RequirementsValidator import (validate_experiment_requirements) from typing import Dict, Any, Optional from pathlib import Path @@ -52,6 +53,28 @@ def __init__(self): (RunnerEvents.STOP_RUN , self.stop_run ), (RunnerEvents.POPULATE_RUN_DATA, self.populate_run_data), (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment ) + ])"""Path to log file for energy validation report. Relative to experiment output directory.""" + energy_validation_log_file: str = "energy_validation_report.log" + + """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']).""" + energy_validation_columns: List[str] = [] + + # Dynamic configurations can be one-time satisfied here before the program takes the config as-is + # e.g. 
Setting some variable based on some criteria + def __init__(self): + """Executes immediately after program start, on config load""" + + EventSubscriptionController.subscribe_to_multiple_events([ + (RunnerEvents.VALIDATE_EXPERIMENT, self.validate_experiment), + (RunnerEvents.BEFORE_EXPERIMENT , self.before_experiment), + (RunnerEvents.BEFORE_RUN , self.before_run ), + (RunnerEvents.START_RUN , self.start_run ), + (RunnerEvents.START_MEASUREMENT , self.start_measurement), + (RunnerEvents.INTERACT , self.interact ), + (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ), + (RunnerEvents.STOP_RUN , self.stop_run ), + (RunnerEvents.POPULATE_RUN_DATA , self.populate_run_data), + (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment ) ]) self.latest_log = None @@ -69,6 +92,9 @@ def create_run_table_model(self) -> RunTableModel: data_columns=['timestamp', 'channel_1(avg)', 'channel_2(off)', 'channel_3(off)']) # Channel 1 is in Amps return self.run_table_model + + def validate_experiment(self) -> None: + validate_experiment_requirements(Path(__file__)) def before_experiment(self) -> None: """Perform any activity required before starting the experiment here diff --git a/examples/profilers/PowerJoular/README.md b/examples/profilers/PowerJoular/README.md index aa1c17961..dfbe75bf3 100644 --- a/examples/profilers/PowerJoular/README.md +++ b/examples/profilers/PowerJoular/README.md @@ -31,3 +31,4 @@ python experiment-runner/ examples/PowerJoular/RunnerConfig.py The results are generated in the `examples/linux-powerjoular-profiling/experiments` folder. +In case there are anomalies such as null, absent, or negative values, a report will be generated in the `examples/linux-powerjoular-profiling/experiments` folder. 
\ No newline at end of file diff --git a/examples/profilers/PowerJoular/RunnerConfig.py b/examples/profilers/PowerJoular/RunnerConfig.py index ab25f0cd3..5794b9fc2 100644 --- a/examples/profilers/PowerJoular/RunnerConfig.py +++ b/examples/profilers/PowerJoular/RunnerConfig.py @@ -5,6 +5,7 @@ from ConfigValidator.Config.Models.RunnerContext import RunnerContext from ConfigValidator.Config.Models.OperationType import OperationType from ProgressManager.Output.OutputProcedure import OutputProcedure as output +from ProgressManager.Validation.RequirementsValidator import (validate_experiment_requirements) from Plugins.Profilers.PowerJoular import PowerJoular @@ -37,21 +38,28 @@ class RunnerConfig: This can be essential to accommodate for cooldown periods on some systems.""" time_between_runs_in_ms: int = 1000 + """Path to log file for energy validation report. Relative to experiment output directory.""" + energy_validation_log_file: str = "energy_validation_report.log" + + """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']).""" + energy_validation_columns: List[str] = [] + # Dynamic configurations can be one-time satisfied here before the program takes the config as-is # e.g. 
Setting some variable based on some criteria def __init__(self): """Executes immediately after program start, on config load""" EventSubscriptionController.subscribe_to_multiple_events([ - (RunnerEvents.BEFORE_EXPERIMENT, self.before_experiment), - (RunnerEvents.BEFORE_RUN , self.before_run ), - (RunnerEvents.START_RUN , self.start_run ), - (RunnerEvents.START_MEASUREMENT, self.start_measurement), - (RunnerEvents.INTERACT , self.interact ), - (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ), - (RunnerEvents.STOP_RUN , self.stop_run ), - (RunnerEvents.POPULATE_RUN_DATA, self.populate_run_data), - (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment ) + (RunnerEvents.VALIDATE_EXPERIMENT, self.validate_experiment), + (RunnerEvents.BEFORE_EXPERIMENT , self.before_experiment), + (RunnerEvents.BEFORE_RUN , self.before_run ), + (RunnerEvents.START_RUN , self.start_run ), + (RunnerEvents.START_MEASUREMENT , self.start_measurement), + (RunnerEvents.INTERACT , self.interact ), + (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ), + (RunnerEvents.STOP_RUN , self.stop_run ), + (RunnerEvents.POPULATE_RUN_DATA , self.populate_run_data), + (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment ) ]) self.run_table_model = None # Initialized later output.console_log("Custom config loaded") @@ -65,6 +73,9 @@ def create_run_table_model(self) -> RunTableModel: data_columns=['avg_cpu', 'total_energy'] ) return self.run_table_model + + def validate_experiment(self) -> None: + validate_experiment_requirements(Path(__file__)) def before_experiment(self) -> None: """Perform any activity required before starting the experiment here diff --git a/examples/profilers/PowerLetrics/README.md b/examples/profilers/PowerLetrics/README.md index 0b357870b..8cef83c13 100644 --- a/examples/profilers/PowerLetrics/README.md +++ b/examples/profilers/PowerLetrics/README.md @@ -19,3 +19,5 @@ python experiment-runner/ examples/powerletrics-profiling/RunnerConfig.py ## Results The results are 
generated in the `examples/powerletrics-profiling/experiments` folder. + +In case there are anomalies such as null, absent, or negative values, a report will be generated in the `examples/powerletrics-profiling/experiments` folder. \ No newline at end of file diff --git a/examples/profilers/PowerLetrics/RunnerConfig.py b/examples/profilers/PowerLetrics/RunnerConfig.py index 3902067dd..dec09e797 100644 --- a/examples/profilers/PowerLetrics/RunnerConfig.py +++ b/examples/profilers/PowerLetrics/RunnerConfig.py @@ -5,6 +5,7 @@ from ConfigValidator.Config.Models.RunnerContext import RunnerContext from ConfigValidator.Config.Models.OperationType import OperationType from ProgressManager.Output.OutputProcedure import OutputProcedure as output +from ProgressManager.Validation.RequirementsValidator import (validate_experiment_requirements) from Plugins.Profilers.PowerLetrics import PowerLetrics from typing import Dict, List, Any, Optional @@ -35,21 +36,28 @@ class RunnerConfig: This can be essential to accommodate for cooldown periods on some systems.""" time_between_runs_in_ms: int = 1000 + """Path to log file for energy validation report. Relative to experiment output directory.""" + energy_validation_log_file: str = "energy_validation_report.log" + + """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']).""" + energy_validation_columns: List[str] = [] + # Dynamic configurations can be one-time satisfied here before the program takes the config as-is # e.g. 
Setting some variable based on some criteria def __init__(self): """Executes immediately after program start, on config load""" EventSubscriptionController.subscribe_to_multiple_events([ - (RunnerEvents.BEFORE_EXPERIMENT, self.before_experiment), - (RunnerEvents.BEFORE_RUN , self.before_run ), - (RunnerEvents.START_RUN , self.start_run ), - (RunnerEvents.START_MEASUREMENT, self.start_measurement), - (RunnerEvents.INTERACT , self.interact ), - (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ), - (RunnerEvents.STOP_RUN , self.stop_run ), - (RunnerEvents.POPULATE_RUN_DATA, self.populate_run_data), - (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment ) + (RunnerEvents.VALIDATE_EXPERIMENT, self.validate_experiment), + (RunnerEvents.BEFORE_EXPERIMENT , self.before_experiment), + (RunnerEvents.BEFORE_RUN , self.before_run ), + (RunnerEvents.START_RUN , self.start_run ), + (RunnerEvents.START_MEASUREMENT , self.start_measurement), + (RunnerEvents.INTERACT , self.interact ), + (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ), + (RunnerEvents.STOP_RUN , self.stop_run ), + (RunnerEvents.POPULATE_RUN_DATA , self.populate_run_data), + (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment ) ]) self.run_table_model = None # Initialized later @@ -65,6 +73,9 @@ def create_run_table_model(self) -> RunTableModel: data_columns=["energy_footprint", "cpu_utilization", "process_name"]) return self.run_table_model + + def validate_experiment(self) -> None: + validate_experiment_requirements(realpath(__file__)) def before_experiment(self) -> None: """Perform any activity required before starting the experiment here diff --git a/examples/profilers/PowerMetrics/README.md b/examples/profilers/PowerMetrics/README.md index c63c0ab5f..df62e5143 100644 --- a/examples/profilers/PowerMetrics/README.md +++ b/examples/profilers/PowerMetrics/README.md @@ -23,3 +23,5 @@ sudo python experiment-runner/ examples/powermetrics-profiling/RunnerConfig.py ## Results The results are 
generated in the `examples/powermetrics-profiling/experiments` folder. + +In case there are anomalies such as null, absent, or negative values, a report will be generated in the `examples/powermetrics-profiling/experiments` folder. \ No newline at end of file diff --git a/examples/profilers/PowerMetrics/RunnerConfig.py b/examples/profilers/PowerMetrics/RunnerConfig.py index 09040547c..05e50b13f 100644 --- a/examples/profilers/PowerMetrics/RunnerConfig.py +++ b/examples/profilers/PowerMetrics/RunnerConfig.py @@ -5,6 +5,7 @@ from ConfigValidator.Config.Models.RunnerContext import RunnerContext from ConfigValidator.Config.Models.OperationType import OperationType from ProgressManager.Output.OutputProcedure import OutputProcedure as output +from ProgressManager.Validation.RequirementsValidator import (validate_experiment_requirements) from Plugins.Profilers.PowerMetrics import PowerMetrics from typing import Dict, List, Any, Optional @@ -34,21 +35,28 @@ class RunnerConfig: This can be essential to accommodate for cooldown periods on some systems.""" time_between_runs_in_ms: int = 1000 + """Path to log file for energy validation report. Relative to experiment output directory.""" + energy_validation_log_file: str = "energy_validation_report.log" + + """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']).""" + energy_validation_columns: List[str] = [] + # Dynamic configurations can be one-time satisfied here before the program takes the config as-is # e.g. 
Setting some variable based on some criteria def __init__(self): """Executes immediately after program start, on config load""" EventSubscriptionController.subscribe_to_multiple_events([ - (RunnerEvents.BEFORE_EXPERIMENT, self.before_experiment), - (RunnerEvents.BEFORE_RUN , self.before_run ), - (RunnerEvents.START_RUN , self.start_run ), - (RunnerEvents.START_MEASUREMENT, self.start_measurement), - (RunnerEvents.INTERACT , self.interact ), - (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ), - (RunnerEvents.STOP_RUN , self.stop_run ), - (RunnerEvents.POPULATE_RUN_DATA, self.populate_run_data), - (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment ) + (RunnerEvents.VALIDATE_EXPERIMENT, self.validate_experiment), + (RunnerEvents.BEFORE_EXPERIMENT , self.before_experiment), + (RunnerEvents.BEFORE_RUN , self.before_run ), + (RunnerEvents.START_RUN , self.start_run ), + (RunnerEvents.START_MEASUREMENT , self.start_measurement), + (RunnerEvents.INTERACT , self.interact ), + (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ), + (RunnerEvents.STOP_RUN , self.stop_run ), + (RunnerEvents.POPULATE_RUN_DATA , self.populate_run_data), + (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment ) ]) self.run_table_model = None # Initialized later output.console_log("Custom config loaded") @@ -64,6 +72,10 @@ def create_run_table_model(self) -> RunTableModel: data_columns=["joules", "avg_cpu", "avg_gpu"]) return self.run_table_model + + def validate_experiment(self) -> None: + """Perform any experiment validation here. 
If any validation fails, raise an exception with details on the failure.""" + validate_experiment_requirements(Path(__file__)) def before_experiment(self) -> None: """Perform any activity required before starting the experiment here diff --git a/examples/profilers/linux-ps-profiling/README.md b/examples/profilers/linux-ps-profiling/README.md index 17c30877a..3d593f2dc 100644 --- a/examples/profilers/linux-ps-profiling/README.md +++ b/examples/profilers/linux-ps-profiling/README.md @@ -26,3 +26,4 @@ python experiment-runner/ examples/linux-ps-profiling/RunnerConfig.py The results are generated in the `examples/linux-ps-profiling/experiments` folder. +In case there are anomalies such as null, absent, or negative values, a report will be generated in the `examples/linux-ps-profiling/experiments` folder. diff --git a/examples/profilers/linux-ps-profiling/RunnerConfig.py b/examples/profilers/linux-ps-profiling/RunnerConfig.py index 4b02a5c58..d601349f6 100644 --- a/examples/profilers/linux-ps-profiling/RunnerConfig.py +++ b/examples/profilers/linux-ps-profiling/RunnerConfig.py @@ -5,6 +5,7 @@ from ConfigValidator.Config.Models.RunnerContext import RunnerContext from ConfigValidator.Config.Models.OperationType import OperationType from ProgressManager.Output.OutputProcedure import OutputProcedure as output +from ProgressManager.Validation.RequirementsValidator import (validate_experiment_requirements) from Plugins.Profilers.Ps import Ps from typing import Dict, List, Any, Optional @@ -37,6 +38,12 @@ class RunnerConfig: """The time Experiment Runner will wait after a run completes. This can be essential to accommodate for cooldown periods on some systems.""" time_between_runs_in_ms: int = 1000 + + """Path to log file for energy validation report. 
Relative to experiment output directory.""" + energy_validation_log_file: str = "energy_validation_report.log" + + """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']).""" + energy_validation_columns: List[str] = [] # Dynamic configurations can be one-time satisfied here before the program takes the config as-is # e.g. Setting some variable based on some criteria @@ -44,15 +51,16 @@ def __init__(self): """Executes immediately after program start, on config load""" EventSubscriptionController.subscribe_to_multiple_events([ - (RunnerEvents.BEFORE_EXPERIMENT, self.before_experiment), - (RunnerEvents.BEFORE_RUN , self.before_run ), - (RunnerEvents.START_RUN , self.start_run ), - (RunnerEvents.START_MEASUREMENT, self.start_measurement), - (RunnerEvents.INTERACT , self.interact ), - (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ), - (RunnerEvents.STOP_RUN , self.stop_run ), - (RunnerEvents.POPULATE_RUN_DATA, self.populate_run_data), - (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment ) + (RunnerEvents.VALIDATE_EXPERIMENT, self.validate_experiment), + (RunnerEvents.BEFORE_EXPERIMENT , self.before_experiment), + (RunnerEvents.BEFORE_RUN , self.before_run ), + (RunnerEvents.START_RUN , self.start_run ), + (RunnerEvents.START_MEASUREMENT , self.start_measurement), + (RunnerEvents.INTERACT , self.interact ), + (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ), + (RunnerEvents.STOP_RUN , self.stop_run ), + (RunnerEvents.POPULATE_RUN_DATA , self.populate_run_data), + (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment ) ]) self.run_table_model = None # Initialized later output.console_log("Custom config loaded") @@ -70,6 +78,9 @@ def create_run_table_model(self) -> RunTableModel: data_columns=["avg_cpu", "avg_mem"] ) return self.run_table_model + + def validate_experiment(self) -> None: + validate_experiment_requirements(Path(__file__)) def before_experiment(self) -> None: """Perform any activity required 
before starting the experiment here diff --git a/examples/profilers/measure-self-profiling/README.md b/examples/profilers/measure-self-profiling/README.md index 34c1cf2d7..3c851ae0e 100644 --- a/examples/profilers/measure-self-profiling/README.md +++ b/examples/profilers/measure-self-profiling/README.md @@ -31,6 +31,8 @@ python experiment-runner/ examples/measure-self-profiling/RunnerConfig.py The results are generated in the `examples/measure-self-profiling/experiments` folder, and are added to your run table model. A log file can be specified to additionally save the full energibridge logs to a separate file. +In case there are anomalies such as null, absent, or negative values, a report will be generated in the `examples/measure-self-profiling/experiments` folder. + **!!! WARNING !!!**: COLUMNS IN THE `energibridge.log` FILES CAN BE DIFFERENT ACROSS MACHINES. ADJUST YOUR ANALYSIS OF THE RESULTS ACCORDINGLY. diff --git a/examples/profilers/measure-self-profiling/RunnerConfig.py b/examples/profilers/measure-self-profiling/RunnerConfig.py index 298aeaf2d..54db75237 100644 --- a/examples/profilers/measure-self-profiling/RunnerConfig.py +++ b/examples/profilers/measure-self-profiling/RunnerConfig.py @@ -4,6 +4,7 @@ from ConfigValidator.Config.Models.RunnerContext import RunnerContext from ConfigValidator.Config.Models.OperationType import OperationType from ProgressManager.Output.OutputProcedure import OutputProcedure as output +from ProgressManager.Validation.RequirementsValidator import (validate_experiment_requirements) from typing import Optional, Dict, Any from pathlib import Path @@ -57,21 +58,25 @@ class RunnerConfig: """ self_measure_logfile: Path = "energibridge.log" - # Dynamic configurations can be one-time satisfied here before the program takes the config as-is - # e.g. Setting some variable based on some criteria + """Path to log file for energy validation report. 
Relative to experiment output directory.""" + energy_validation_log_file: str = "energy_validation_report.log" + + """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']).""" + energy_validation_columns: List[str] = [] + def __init__(self): - """Executes immediately after program start, on config load""" EventSubscriptionController.subscribe_to_multiple_events([ + (RunnerEvents.VALIDATE_EXPERIMENT, self.validate_experiment), (RunnerEvents.BEFORE_EXPERIMENT, self.before_experiment), - (RunnerEvents.BEFORE_RUN , self.before_run ), - (RunnerEvents.START_RUN , self.start_run ), + (RunnerEvents.BEFORE_RUN, self.before_run), + (RunnerEvents.START_RUN, self.start_run), (RunnerEvents.START_MEASUREMENT, self.start_measurement), - (RunnerEvents.INTERACT , self.interact ), - (RunnerEvents.STOP_MEASUREMENT , self.stop_measurement ), - (RunnerEvents.STOP_RUN , self.stop_run ), + (RunnerEvents.INTERACT, self.interact), + (RunnerEvents.STOP_MEASUREMENT, self.stop_measurement), + (RunnerEvents.STOP_RUN, self.stop_run), (RunnerEvents.POPULATE_RUN_DATA, self.populate_run_data), - (RunnerEvents.AFTER_EXPERIMENT , self.after_experiment ) + (RunnerEvents.AFTER_EXPERIMENT, self.after_experiment) ]) self.run_table_model = None # Initialized later output.console_log("Custom config loaded") @@ -85,6 +90,9 @@ def create_run_table_model(self) -> RunTableModel: ) return self.run_table_model + def validate_experiment(self) -> None: + validate_experiment_requirements(Path(__file__)) + def before_experiment(self) -> None: """Perform any activity required before starting the experiment here Invoked only once during the lifetime of the program.""" diff --git a/experiment-runner/DistributedExecution/DistributedOrchestrator.py b/experiment-runner/DistributedExecution/DistributedOrchestrator.py index 9bc0d5470..d4acf8fe4 100644 --- a/experiment-runner/DistributedExecution/DistributedOrchestrator.py +++ 
b/experiment-runner/DistributedExecution/DistributedOrchestrator.py @@ -304,7 +304,6 @@ def __init__(self, config, metadata, host="0.0.0.0", port=5000): print("[MASTER] Creating new experiment") run_table = (config.create_run_table_model().generate_experiment_run_table()) - pd.DataFrame(run_table).to_csv(self.run_table_path, index=False) self.task_manager = TaskManager(run_table, self.experiment_path) @@ -323,6 +322,10 @@ def __init__(self, config, metadata, host="0.0.0.0", port=5000): def start(self): if self.finished_before_start: return + + EventSubscriptionController.raise_event( + RunnerEvents.VALIDATE_EXPERIMENT + ) EventSubscriptionController.raise_event( RunnerEvents.BEFORE_EXPERIMENT diff --git a/experiment-runner/DistributedExecution/Worker.py b/experiment-runner/DistributedExecution/Worker.py index f35ebf848..28af536f6 100644 --- a/experiment-runner/DistributedExecution/Worker.py +++ b/experiment-runner/DistributedExecution/Worker.py @@ -1,4 +1,6 @@ from ExperimentOrchestrator.Experiment.Run.RunController import RunController +from EventManager.EventSubscriptionController import EventSubscriptionController +from EventManager.Models.RunnerEvents import RunnerEvents import threading import time @@ -49,6 +51,11 @@ def run_loop(self, agent_id, config): print(f"[WORKER] Starting with agent_id: {self.agent_id}") print(f"[WORKER] Master URL: {self.master_url}") + print("[WORKER] Validating experiment setup") + EventSubscriptionController.raise_event( + RunnerEvents.VALIDATE_EXPERIMENT + ) + threading.Thread(target=self._heartbeat_loop, daemon=True).start() print("[WORKER] Heartbeat thread started") diff --git a/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py b/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py index 678297faf..02da2dbf8 100644 --- a/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py +++ b/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py @@ 
-166,12 +166,12 @@ def do_experiment(self): EventSubscriptionController.raise_event(RunnerEvents.AFTER_EXPERIMENT) # -- Energy validation - if self.config.enable_energy_validation and self.config.energy_validation_columns: + if self.config.energy_validation_columns: updated_run_table = self.csv_data_manager.read_run_table() energy_report = EnergyValidator.validate_run_table(updated_run_table, self.config.energy_validation_columns) if energy_report.has_anomalies(): + log_file_path = (self.config.experiment_path / self.config.energy_validation_log_file) + output.console_log_WARNING(f"Energy anomalies detected. Report saved to {log_file_path}") - - log_file_path = self.config.experiment_path / self.config.energy_validation_log_file - EnergyValidator.save_report_to_file(energy_report, self.config.energy_validation_columns, log_file_path) + EnergyValidator.save_report_to_file(energy_report, self.config.energy_validation_columns, log_file_path) \ No newline at end of file diff --git a/experiment-runner/Plugins/Profilers/AndroidDebugBridge.py b/experiment-runner/Plugins/Profilers/AndroidDebugBridge.py new file mode 100644 index 000000000..2ef218745 --- /dev/null +++ b/experiment-runner/Plugins/Profilers/AndroidDebugBridge.py @@ -0,0 +1,309 @@ +from __future__ import annotations +from pathlib import Path +from typing import Iterable, Optional, Dict, Any +from enum import Enum, auto +import re +import subprocess +import threading +import csv +from datetime import datetime +import pandas as pd + +from Plugins.Profilers.DataSource import CLISource, ParameterDict +from ConfigValidator.Config.Models.RunnerContext import RunnerContext +from ConfigValidator.Config.RunnerConfig import RunnerConfig + + +class DataColumns(Enum): + """Battery metrics that can be collected from Android devices via ADB dumpsys battery""" + BATTERY_PERCENTAGE = auto() + BATTERY_TEMPERATURE = auto() + BATTERY_VOLTAGE = auto() + CURRENT_NOW = auto() + CHARGE_COUNTER = auto() + BATTERY_HEALTH = auto() + 
CHARGING_STATUS = auto() + POWER_DRAW = auto() + + _PATTERN = re.compile(r'(android_battery__)(.+)') + + @property + def name(self) -> str: + return f'android_battery__{super().name.lower()}' + +class AndroidBatteryMonitor(CLISource): + """Monitor battery and energy metrics from Android devices via ADB during experiment execution. + This plugin connects to Android devices via ADB and periodically collects battery statistics.""" + source_name = "adb" + supported_platforms = ["Linux", "Darwin"] + + ANDROID_BATTERY_PARAMETERS = {} + + def __init__(self, device_serial: Optional[str] = None, poll_interval: int = 2, out_file: Path = "android_battery.csv", data_columns: Optional[Iterable[str]] = None): + super().__init__() + + self.device_serial = device_serial + self.poll_interval = poll_interval + self.logfile = out_file + self.stop_monitoring = threading.Event() + self.monitoring_thread = None + self.monitor_error: Optional[Exception] = None + + # Validate ADB availability + self._validate_adb_available() + + @property + def parameters(self) -> ParameterDict: + return ParameterDict(self.ANDROID_BATTERY_PARAMETERS) + + def _validate_adb_available(self): + """Verify ADB is installed and accessible.""" + try: + result = subprocess.run(['adb', 'version'], capture_output=True, timeout=5) + if result.returncode != 0: + raise RuntimeError("ADB version check failed.") + except FileNotFoundError: + raise RuntimeError("ADB not found.") + except subprocess.TimeoutExpired: + raise RuntimeError("ADB timeout - check ADB installation") + + def _get_device_serial(self) -> str: + if self.device_serial: + return self.device_serial + + result = subprocess.run(["adb", "devices"], capture_output=True, text=True, timeout=5) + devices = [] + + for line in result.stdout.splitlines(): + line = line.strip() + if not line: + continue + if line.startswith("List of devices"): + continue + if "\tdevice" in line: + devices.append(line.split()[0]) + if not devices: + raise RuntimeError("No ADB 
devices found") + + return devices[0] + + def _parse_battery_data(self, dumpsys_output: str) -> Dict[str, Any]: + """Parse dumpsys battery output and extract metrics.""" + data = {} + if not dumpsys_output: + return data + + patterns = { + 'percentage' : r'^\s*level:\s+(\d+)', + 'temperature' : r'^\s*temperature:\s+(\d+)', + 'voltage' : r'^\s*voltage:\s+(\d+)', + 'health' : r'^\s*health:\s+(\d+)', + 'status' : r'^\s*status:\s+(\d+)', + 'current_now' : r'^\s*current now:\s+(-?\d+)', + 'charge_counter' : r'^\s*charge counter:\s+(\d+)', + } + + for key, pattern in patterns.items(): + match = re.search(pattern, dumpsys_output, re.MULTILINE) + if match: data[key] = match.group(1) + + # Calculate power draw estimate + if 'voltage' in data and 'current_now' in data: + try: + voltage_mv = int(data['voltage']) + current_ua = int(data['current_now']) + voltage_v = voltage_mv / 1000.0 + current_ma = abs(float(data["current_now"])) + power_mw = voltage_v * current_ma + data['power_draw'] = f"{power_mw:.2f}" + except (ValueError, KeyError): + pass + return data + + def start(self): + """Start monitoring battery metrics.""" + if self.monitoring_thread and self.monitoring_thread.is_alive(): + raise RuntimeError("Android energy monitoring is already running") + + self.stop_monitoring.clear() + self.measurements = [] + self.monitor_error = None + try: + self.logfile.parent.mkdir(parents=True, exist_ok=True) + except Exception as e: + raise RuntimeError(f"Failed to create log directory: {e}") + + self._get_device_serial() + self.monitoring_thread = threading.Thread(target=self._monitor_loop, name="AndroidEnergyMonitor", daemon=True) + self.monitoring_thread.start() + + def _monitor_loop(self): + try: + device_serial = self._get_device_serial() + with open(self.logfile, 'w', newline='') as csvfile: + fieldnames = [ + 'timestamp', + 'percentage', + 'temperature', + 'voltage', + 'health', + 'status', + 'current_now', + 'charge_counter', + 'power_draw' + ] + writer = 
csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + while not self.stop_monitoring.is_set(): + result = subprocess.run( + [ + 'adb', + '-s', + device_serial, + 'shell', + 'dumpsys battery' + ], + capture_output=True, text=True, timeout=10) + + if result.returncode != 0: + raise RuntimeError(f"ADB command failed:\n{result.stderr}") + + metrics = self._parse_battery_data(result.stdout) + metrics['timestamp'] = datetime.now().isoformat() + self.measurements.append(metrics) + writer.writerow(metrics) + csvfile.flush() + self.stop_monitoring.wait(self.poll_interval) + except Exception as e: + self.monitor_error = e + self.stop_monitoring.set() + + def stop(self) -> str: + if not self.monitoring_thread: + return "" + self.stop_monitoring.set() + self.monitoring_thread.join() + if self.monitor_error: + raise RuntimeError(f"AndroidBatteryMonitor failed: {self.monitor_error}") + self.monitoring_thread = None + + return str(self.logfile) + + def __del__(self): + """Cleanup on deletion.""" + if self.monitoring_thread and self.monitoring_thread.is_alive(): + self.stop_monitoring.set() + self.monitoring_thread.join(timeout=5) + + @staticmethod + def parse_log(logfile: Path) -> Dict[str, Any]: + """Parse battery metrics CSV log file.""" + try: + df = pd.read_csv(logfile) + return df.to_dict(orient='records') + except Exception as e: + print(f"Could not parse Android battery log: {e}") + return {} + +def battery_monitor(device_serial=None, poll_interval=2, data_columns=None): + def battery_monitor_decorator(cls): + cols = data_columns or [col.name for col in DataColumns] + + cls.create_run_table_model = add_data_columns(cols)(cls.create_run_table_model) + cls.start_measurement = start_battery_monitor(device_serial, poll_interval)(cls.start_measurement) + cls.stop_measurement = stop_battery_monitor(cls.stop_measurement) + cls.populate_run_data = populate_data_columns(cls.populate_run_data) + return cls + + return battery_monitor_decorator + +def 
start_battery_monitor(device_serial: Optional[str] = None, poll_interval: int = 2): + def start_battery_monitor_decorator(func): + def wrapper(*args, **kwargs): + self: RunnerConfig = args[0] + context: RunnerContext = args[1] + logfile = (context.run_dir.resolve()/ "android_battery.csv") + + self.__android_battery_monitor__ = (AndroidBatteryMonitor(device_serial=device_serial, poll_interval=poll_interval, out_file=logfile)) + self.__android_battery_monitor__.start() + return func(*args, **kwargs) + + return wrapper + + return start_battery_monitor_decorator + +def stop_battery_monitor(func): + def wrapper(*args, **kwargs): + self: RunnerConfig = args[0] + ret_val = func(*args, **kwargs) + + if hasattr(self, "__android_battery_monitor__"): + self.__android_battery_monitor__.stop() + return ret_val + + return wrapper + +def add_data_columns(data_cols: Iterable[str]): + """Decorator to add Android battery data columns to run table.""" + def add_data_columns_decorator(func): + def wrapper(*args, **kwargs): + self: RunnerConfig = args[0] + + func(*args, **kwargs) + for dc in data_cols: + col_name = f'android_battery__{dc.lower()}' if not dc.startswith('android_battery__') else dc + if col_name not in self.run_table_model.get_data_columns(): + self.run_table_model.get_data_columns().append(col_name) + return self.run_table_model + + return wrapper + + return add_data_columns_decorator + +def populate_data_columns(func): + def wrapper(*args, **kwargs): + self: RunnerConfig = args[0] + ret_val = func(*args, **kwargs) + + if ret_val is None: + ret_val = {} + if not hasattr(self, "__android_battery_monitor__"): + return ret_val + try: + df = pd.read_csv(self.__android_battery_monitor__.logfile) + if df.empty: + return ret_val + + metric_map = { + "battery_percentage": "percentage", + "battery_temperature": "temperature", + "battery_voltage": "voltage", + "battery_health": "health", + "charging_status": "status", + "charge_rate": "charge_rate", + "current_now": 
"current_now", + "power_draw": "power_draw" + } + + for dc in self.run_table_model.get_data_columns(): + m = DataColumns._PATTERN.value.match(dc) + if not m: + continue + metric_name = m.group(2) + csv_column = metric_map.get(metric_name) + + if csv_column is None: + continue + if csv_column not in df.columns: + continue + + values = pd.to_numeric(df[csv_column], errors="coerce").dropna() + if len(values) == 0: + continue + ret_val[dc] = float(values.mean()) + except Exception as e: + + print(f"Error reading Android battery metrics: {e}") + return ret_val + + return wrapper \ No newline at end of file diff --git a/experiment-runner/ProgressManager/Validation/EnergyValidator.py b/experiment-runner/ProgressManager/Validation/EnergyValidator.py index b3f2e2207..dca110350 100644 --- a/experiment-runner/ProgressManager/Validation/EnergyValidator.py +++ b/experiment-runner/ProgressManager/Validation/EnergyValidator.py @@ -11,7 +11,7 @@ def __init__(self): def add_anomaly(self, run_id: str, treatment_levels: Dict[str, Any], column_name: str, value: Any): """Add an anomaly to the report. 
- The anomaly fallowes the structure: + The anomaly followes the structure: run_id: The run identifier treatment_levels: Dictionary of factor names to treatment levels for this run column_name: The energy column name where anomaly was detected @@ -103,4 +103,4 @@ def save_report_to_file(report: EnergyAnomalyReport, energy_columns: List[str],l f.write(report_text) output.console_log_OK(f"Energy validation report saved to: {log_file}") except Exception as e: - output.console_log_FAIL(f"Failed to write energy validation report: {e}") + output.console_log_FAIL(f"Failed to write energy validation report: {e}") \ No newline at end of file diff --git a/experiment-runner/ProgressManager/Validation/test_EnergyValidator.py b/experiment-runner/ProgressManager/Validation/test_EnergyValidator.py deleted file mode 100644 index e8f7919d4..000000000 --- a/experiment-runner/ProgressManager/Validation/test_EnergyValidator.py +++ /dev/null @@ -1,125 +0,0 @@ -import unittest -import sys -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) - -from ProgressManager.Validation.EnergyValidator import ( - EnergyValidator, - EnergyAnomalyReport -) - - -class TestEnergyValidator(unittest.TestCase): - - def test_positive_energy(self): - run_table = [ - {"__run_id": "run_1", - "cpu_energy": 10.5 - } - ] - - report = EnergyValidator.validate_run_table( - run_table, - ["cpu_energy"] - ) - - self.assertFalse(report.has_anomalies()) - - def test_zero_energy(self): - run_table = [ - { - "__run_id": "run_1", - "cpu_energy": 0 - } - ] - - report = EnergyValidator.validate_run_table( - run_table, - ["cpu_energy"] - ) - - self.assertTrue(report.has_anomalies()) - self.assertEqual(len(report.anomalies), 1) - - def test_negative_energy(self): - run_table = [ - { - "__run_id": "run_1", - "cpu_energy": -1 - } - ] - - report = EnergyValidator.validate_run_table( - run_table, - ["cpu_energy"] - ) - - self.assertTrue(report.has_anomalies()) - 
self.assertEqual(len(report.anomalies), 1) - - def test_mixed_values(self): - run_table = [ - { - "__run_id": "run_1", - "cpu_energy": 10 - }, - { - "__run_id": "run_2", - "cpu_energy": 0 - }, - { - "__run_id": "run_3", - "cpu_energy": -1 - } - ] - - report = EnergyValidator.validate_run_table( - run_table, - ["cpu_energy"] - ) - - self.assertTrue(report.has_anomalies()) - self.assertEqual(len(report.anomalies), 2) - - def test_treatment_levels_saved(self): - run_table = [ - { - "__run_id": "run_1", - "__done": "DONE", - "fib_type": "iter", - "problem_size": 1000, - "cpu_energy": -1 - } - ] - - report = EnergyValidator.validate_run_table( - run_table, - ["cpu_energy"] - ) - - anomaly = report.anomalies[0] - - self.assertEqual( - anomaly["treatment_levels"]["fib_type"], - "iter" - ) - - self.assertEqual( - anomaly["treatment_levels"]["problem_size"], - 1000 - ) - - self.assertNotIn( - "__run_id", - anomaly["treatment_levels"] - ) - - self.assertNotIn( - "__done", - anomaly["treatment_levels"] - ) - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/test/Plugins/Profilers/test_AndroidDebugBridge.py b/test/Plugins/Profilers/test_AndroidDebugBridge.py new file mode 100644 index 000000000..67b6ce93a --- /dev/null +++ b/test/Plugins/Profilers/test_AndroidDebugBridge.py @@ -0,0 +1,112 @@ +import unittest +import shutil +import tempfile +import sys + +from pathlib import Path +from typing import AnyStr + +sys.path.append("experiment-runner") + +from ConfigValidator.Config.Models.RunnerContext import RunnerContext +from ConfigValidator.Config.RunnerConfig import RunnerConfig +from Plugins.Profilers.AndroidDebugBridge import ( + AndroidBatteryMonitor, + battery_monitor, + start_battery_monitor, + stop_battery_monitor, + add_data_columns, + populate_data_columns, + DataColumns +) + +class TestADBIndividual(unittest.TestCase): + class BatteryConfig(RunnerConfig): + tmpdir: AnyStr = tempfile.mkdtemp() + def clear(self): + 
shutil.rmtree(self.__class__.tmpdir) + + @add_data_columns([ + DataColumns.BATTERY_PERCENTAGE.name, + DataColumns.BATTERY_TEMPERATURE.name, + DataColumns.CURRENT_NOW.name + ]) + def create_run_table_model(self): + return super().create_run_table_model() + + @start_battery_monitor( + poll_interval=1 + ) + def start_measurement(self, context: RunnerContext): + super().start_measurement(context) + + def interact(self, context: RunnerContext): + import time + time.sleep(3) + + @stop_battery_monitor + def stop_measurement(self, context: RunnerContext): + super().stop_measurement(context) + + def setUp(self): + self.runner_config = self.__class__.BatteryConfig() + + def tearDown(self): + self.runner_config.clear() + + def test_monitor(self): + + class FakeContext: + run_dir = Path(self.runner_config.tmpdir) + + context = FakeContext() + + self.runner_config.start_measurement(context) + self.runner_config.interact(context) + self.runner_config.stop_measurement(context) + + run_data = self.runner_config.populate_run_data(context) + + self.assertTrue((Path(self.runner_config.tmpdir)/ "android_battery.csv").is_file()) + print(run_data) + +class TestADBCombined(unittest.TestCase): + tmpdir: AnyStr = tempfile.mkdtemp() + @battery_monitor( + poll_interval=1, + data_columns=[ + DataColumns.BATTERY_PERCENTAGE.name, + DataColumns.BATTERY_TEMPERATURE.name, + DataColumns.CURRENT_NOW.name, + DataColumns.POWER_DRAW.name + ] + ) + class BatteryConfig(RunnerConfig): + def clear(self): + shutil.rmtree(TestADBCombined.tmpdir) + + def interact(self, context): + import time + time.sleep(3) + + def setUp(self): + self.runner_config = self.__class__.BatteryConfig() + + def tearDown(self): + self.runner_config.clear() + + def test_monitor(self): + + class FakeContext: + run_dir = Path(TestADBCombined.tmpdir) + + context = FakeContext() + + self.runner_config.start_measurement(context) + self.runner_config.interact(context) + self.runner_config.stop_measurement(context) + + run_data = 
self.runner_config.populate_run_data(context) + + self.assertTrue((Path(TestADBCombined.tmpdir)/ "android_battery.csv").is_file()) + print(run_data) \ No newline at end of file diff --git a/test/ProgressManager/test_EnergyValidator.py b/test/ProgressManager/test_EnergyValidator.py index ee9be7e05..6defd635a 100644 --- a/test/ProgressManager/test_EnergyValidator.py +++ b/test/ProgressManager/test_EnergyValidator.py @@ -1,7 +1,8 @@ import unittest import sys from pathlib import Path -sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) + +sys.path.append("experiment-runner") from ProgressManager.Validation.EnergyValidator import (EnergyValidator,EnergyAnomalyReport) From e8e61ccb84abdf88c9054a9c7992c56dae3770a4 Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Fri, 19 Jun 2026 04:48:15 +0200 Subject: [PATCH 18/30] Old_code --- examples/profilers/ADB/RunnerConfig.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/examples/profilers/ADB/RunnerConfig.py b/examples/profilers/ADB/RunnerConfig.py index eff52c8d7..e7a9494f3 100644 --- a/examples/profilers/ADB/RunnerConfig.py +++ b/examples/profilers/ADB/RunnerConfig.py @@ -6,8 +6,8 @@ from ConfigValidator.Config.Models.OperationType import OperationType from ProgressManager.Output.OutputProcedure import OutputProcedure as output from ProgressManager.Validation.RequirementsValidator import (validate_experiment_requirements) - from Plugins.Profilers.AndroidDebugBridge import AndroidBatteryMonitor, battery_monitor + from typing import Dict, List, Any, Optional from pathlib import Path from os.path import dirname, realpath @@ -15,7 +15,6 @@ @battery_monitor( device_serial=None, - poll_interval=1, data_columns=[ 'battery_percentage', 'battery_temperature', @@ -44,8 +43,7 @@ class RunnerConfig: """Path to log file for energy validation report. 
Relative to experiment output directory.""" energy_validation_log_file: str = "energy_validation_report.log" - """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']). - Only used if enable_energy_validation is True.""" + """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']).""" energy_validation_columns = [ "android_battery__percentage", "android_battery__temperature", @@ -108,11 +106,7 @@ def before_run(self) -> None: output.console_log(f"Preparing device for run...") def start_run(self, context: RunnerContext) -> None: - """Start a single experiment run. - - In a real scenario, this would start your Android app or workload. - For this example, we just wait a bit. - """ + """Start a single experiment run.""" output.console_log("Config.start_run() called!") def start_measurement(self, context: RunnerContext) -> None: @@ -148,12 +142,7 @@ def stop_run(self, context: RunnerContext) -> None: output.console_log(f"Stopped run: {context.execute_run['__run_id']}") def populate_run_data(self, context: RunnerContext) -> Optional[Dict[str, Any]]: - """Populate data columns for this run. - - The @AndroidEnergyMonitor.energy_monitor decorator automatically - populates energy-related columns. This method can add custom data. - """ - # In a real scenario, you would parse workload results here + """Populate data columns for this run. 
""" workload = context.execute_run['workload'] duration_ms = { From d2f0ffb94399905f2df41824027e2cdb33d8fe74 Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Mon, 22 Jun 2026 14:38:21 +0200 Subject: [PATCH 19/30] Checking for anomalies feature *improved* --- examples/profilers/ADB/RunnerConfig.py | 10 +- .../Experiment/ExperimentController.py | 33 ++- .../Plugins/Profilers/AndroidDebugBridge.py | 27 ++- .../Validation/EnergyValidator.py | 192 +++++++++++------- .../Validation/RequirementsValidator.py | 4 + 5 files changed, 161 insertions(+), 105 deletions(-) diff --git a/examples/profilers/ADB/RunnerConfig.py b/examples/profilers/ADB/RunnerConfig.py index e7a9494f3..ed82fe9cc 100644 --- a/examples/profilers/ADB/RunnerConfig.py +++ b/examples/profilers/ADB/RunnerConfig.py @@ -44,13 +44,7 @@ class RunnerConfig: energy_validation_log_file: str = "energy_validation_report.log" """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']).""" - energy_validation_columns = [ - "android_battery__percentage", - "android_battery__temperature", - "android_battery__voltage", - "android_battery__current_now", - "android_battery__power_draw" - ] + # Dynamic configurations can be one-time satisfied here before the program takes the config as-is # e.g. 
Setting some variable based on some criteria @@ -85,7 +79,7 @@ def create_run_table_model(self) -> RunTableModel: self.run_table_model = RunTableModel( factors=[workload_factor, screen_factor], - repetitions=3, + repetitions=1, # Add custom data columns (energy columns are added by decorator) data_columns=['workload_duration_ms', 'task_completion_status'] ) diff --git a/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py b/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py index 02da2dbf8..ffc559006 100644 --- a/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py +++ b/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py @@ -13,7 +13,10 @@ from ProgressManager.Output.OutputProcedure import OutputProcedure as output from EventManager.EventSubscriptionController import EventSubscriptionController from ConfigValidator.CustomErrors.ProgressErrors import AllRunsCompletedOnRestartError -from ProgressManager.Validation.EnergyValidator import EnergyValidator +from ProgressManager.Validation.EnergyValidator import ( + ResultsValidator, + AnomalyReport +) from pathlib import Path @@ -166,12 +169,22 @@ def do_experiment(self): EventSubscriptionController.raise_event(RunnerEvents.AFTER_EXPERIMENT) # -- Energy validation - if self.config.energy_validation_columns: - updated_run_table = self.csv_data_manager.read_run_table() - energy_report = EnergyValidator.validate_run_table(updated_run_table, self.config.energy_validation_columns) - - if energy_report.has_anomalies(): - log_file_path = (self.config.experiment_path / self.config.energy_validation_log_file) - - output.console_log_WARNING(f"Energy anomalies detected. 
Report saved to {log_file_path}") - EnergyValidator.save_report_to_file(energy_report, self.config.energy_validation_columns, log_file_path) \ No newline at end of file + final_report = AnomalyReport() + for run in self.run_table: + run_id = run["__run_id"] + treatment_levels = { + k: v for k, v in run.items() + if not k.startswith("__") + } + run_dir = self.config.experiment_path / run_id + run_report = ResultsValidator.validate_output_log( + run_dir, + run_id, + treatment_levels, + ) + final_report.anomalies.extend(run_report.anomalies) + + if final_report.has_anomalies(): + log_file_path = self.config.experiment_path / self.config.energy_validation_log_file + output.console_log_WARNING(f"Signal anomalies detected. Report saved to {log_file_path}") + ResultsValidator.save_report_to_file(final_report, log_file_path) \ No newline at end of file diff --git a/experiment-runner/Plugins/Profilers/AndroidDebugBridge.py b/experiment-runner/Plugins/Profilers/AndroidDebugBridge.py index 2ef218745..4f00d1d22 100644 --- a/experiment-runner/Plugins/Profilers/AndroidDebugBridge.py +++ b/experiment-runner/Plugins/Profilers/AndroidDebugBridge.py @@ -32,8 +32,7 @@ def name(self) -> str: return f'android_battery__{super().name.lower()}' class AndroidBatteryMonitor(CLISource): - """Monitor battery and energy metrics from Android devices via ADB during experiment execution. 
- This plugin connects to Android devices via ADB and periodically collects battery statistics.""" + """Monitor battery and energy metrics from Android devices via ADB during experiment execution.""" source_name = "adb" supported_platforms = ["Linux", "Darwin"] @@ -178,16 +177,19 @@ def _monitor_loop(self): self.monitor_error = e self.stop_monitoring.set() - def stop(self) -> str: + def stop(self): if not self.monitoring_thread: return "" + + print("STOP: setting event") + self.stop_monitoring.set() + + print("STOP: joining thread") + self.monitoring_thread.join() - if self.monitor_error: - raise RuntimeError(f"AndroidBatteryMonitor failed: {self.monitor_error}") - self.monitoring_thread = None - return str(self.logfile) + print("STOP: thread joined") def __del__(self): """Cleanup on deletion.""" @@ -227,9 +229,7 @@ def wrapper(*args, **kwargs): self.__android_battery_monitor__ = (AndroidBatteryMonitor(device_serial=device_serial, poll_interval=poll_interval, out_file=logfile)) self.__android_battery_monitor__.start() return func(*args, **kwargs) - return wrapper - return start_battery_monitor_decorator def stop_battery_monitor(func): @@ -269,8 +269,15 @@ def wrapper(*args, **kwargs): ret_val = {} if not hasattr(self, "__android_battery_monitor__"): return ret_val + + logfile = self.__android_battery_monitor__.logfile + + if not logfile.exists(): + return ret_val + try: - df = pd.read_csv(self.__android_battery_monitor__.logfile) + df = pd.read_csv(logfile) + if df.empty: return ret_val diff --git a/experiment-runner/ProgressManager/Validation/EnergyValidator.py b/experiment-runner/ProgressManager/Validation/EnergyValidator.py index dca110350..15a2bbc10 100644 --- a/experiment-runner/ProgressManager/Validation/EnergyValidator.py +++ b/experiment-runner/ProgressManager/Validation/EnergyValidator.py @@ -1,106 +1,144 @@ -from typing import Dict, List, Tuple, Any +from typing import Dict, List, Any, Set +import pandas as pd from pathlib import Path from 
ProgressManager.Output.OutputProcedure import OutputProcedure as output +META_COLUMNS = { + "Delta", + "Time", + "timestamp", + "run_id" +} -class EnergyAnomalyReport: - """Represents energy measurement anomalies found during validation.""" - +class AnomalyReport: def __init__(self): self.anomalies: List[Dict[str, Any]] = [] - - def add_anomaly(self, run_id: str, treatment_levels: Dict[str, Any], column_name: str, value: Any): - """Add an anomaly to the report. - The anomaly followes the structure: - run_id: The run identifier - treatment_levels: Dictionary of factor names to treatment levels for this run - column_name: The energy column name where anomaly was detected - value: The anomalous value - """ + + def add_anomaly( + self, + run_id: str, + treatment_levels: Dict[str, Any], + file_path: str, + row_number: int, + column_name: str, + value: Any, + anomaly_type: str + ): self.anomalies.append({ - 'run_id': run_id, - 'treatment_levels': treatment_levels, - 'column_name': column_name, - 'value': value, + "run_id": run_id, + "treatment_levels": treatment_levels, + "file_path": file_path, + "row_number": row_number, + "column_name": column_name, + "value": value, + "anomaly_type": anomaly_type }) - + def has_anomalies(self) -> bool: - """Check if any anomalies were found.""" return len(self.anomalies) > 0 -class EnergyValidator: - """Validates energy measurements for anomalies (zero or negative values).""" - +class ResultsValidator: + @staticmethod - def validate_run_table(run_table: List[Dict[str, Any]], energy_columns: List[str]) -> EnergyAnomalyReport: - """Validate energy measurements in a run table.""" - report = EnergyAnomalyReport() - - if not energy_columns: - return report - - for run in run_table: - run_id = run.get('__run_id', 'unknown') - # Extract treatment levels - treatment_levels = { - k: v for k, v in run.items() - if not k.startswith('__') - } - - for column_name in energy_columns: - if column_name not in run: - continue - value = 
run[column_name] - - # Check for None or missing values - if value is None: - report.add_anomaly(run_id, treatment_levels, column_name, value) - continue - try: - numeric_value = float(value) - if numeric_value < 0: - report.add_anomaly(run_id, treatment_levels, column_name, numeric_value) - elif numeric_value == 0: - report.add_anomaly(run_id, treatment_levels, column_name, numeric_value) - except (ValueError, TypeError): - report.add_anomaly(run_id, treatment_levels, column_name, value) - return report - + def _detect_numeric_columns(df: pd.DataFrame) -> List[str]: + """ + Automatically detect columns that contain numeric signals. + """ + numeric_cols = [] + + for col in df.columns: + if col in META_COLUMNS: + continue + + series = pd.to_numeric(df[col], errors="coerce") + + # keep column if it has at least some numeric values + if series.notna().any(): + numeric_cols.append(col) + return numeric_cols + @staticmethod - def generate_report_text(report: EnergyAnomalyReport, energy_columns: List[str]) -> str: - """ Generate the report text.""" + def generate_report_text(report: AnomalyReport) -> str: lines = [] lines.append("=" * 80) - lines.append("ENERGY MEASUREMENT VALIDATION REPORT") + lines.append("GENERIC MEASUREMENT VALIDATION REPORT") lines.append("=" * 80) lines.append("") - - if report.has_anomalies(): - lines.append(f"Found {len(report.anomalies)} anomalous energy measurements") - lines.append("-" * 80) - for anomaly in report.anomalies: - lines.append(f"Run ID: {anomaly['run_id']}") - lines.append(f"Column: {anomaly['column_name']}") - lines.append(f"Value: {anomaly['value']}") - lines.append(f"Treatment levels: {anomaly['treatment_levels']}") - lines.append("") + if not report.has_anomalies(): + lines.append("No anomalies found.") + return "\n".join(lines) - lines.append("=" * 80) - lines.append("") - + runs: Dict[str, List[Dict[str, Any]]] = {} + + for a in report.anomalies: + runs.setdefault(a["run_id"], []).append(a) + + for run_id, anomalies in 
runs.items(): + treatment = anomalies[0]["treatment_levels"] + + lines.append("-" * 80) + lines.append(f"RUN: {run_id}") + lines.append(f"TREATMENT: {treatment}") + lines.append("-" * 80) + + for a in anomalies: + lines.append( + f"[{a['anomaly_type']}] " + f"{a['column_name']} = {a['value']} " + f"(row {a['row_number']})" + ) + lines.append("") return "\n".join(lines) - + + @staticmethod + def validate_output_log( + run_dir: Path, + run_id: str, + treatment_levels: Dict[str, Any], + ) -> AnomalyReport: + + report = AnomalyReport() + + csv_files = list(run_dir.glob("*.csv")) + if not csv_files: + report.add_anomaly( + run_id, + treatment_levels, + str(run_dir), + -1, + "FILE_MISSING", + None, + "missing_file" + ) + return report + + csv_file = csv_files[0] + df = pd.read_csv(csv_file) + columns_to_check = ResultsValidator._detect_numeric_columns(df) + + for column in columns_to_check: + values = pd.to_numeric(df[column], errors="coerce") + for row_number, value in values.items(): + if pd.isna(value): + report.add_anomaly(run_id, treatment_levels, str(csv_file), row_number, column, value, "NaN") + elif value < 0: + report.add_anomaly(run_id, treatment_levels, str(csv_file), row_number, column, value, "negative") + elif value == 0: + report.add_anomaly(run_id, treatment_levels, str(csv_file), row_number, column, value, "zero") + return report + @staticmethod - def save_report_to_file(report: EnergyAnomalyReport, energy_columns: List[str],log_file: Path) -> None: + def save_report_to_file(report: EnergyAnomalyReport, log_file: Path) -> None: """Save validation report to a file.""" + report_text = ResultsValidator.generate_report_text(report) - report_text = EnergyValidator.generate_report_text(report, energy_columns) try: log_file.parent.mkdir(parents=True, exist_ok=True) with open(log_file, 'w') as f: f.write(report_text) - output.console_log_OK(f"Energy validation report saved to: {log_file}") + output.console_log_OK(f"Results validation report saved to: 
{log_file}") except Exception as e: - output.console_log_FAIL(f"Failed to write energy validation report: {e}") \ No newline at end of file + output.console_log_FAIL(f"Failed to write results validation report: {e}") + \ No newline at end of file diff --git a/experiment-runner/ProgressManager/Validation/RequirementsValidator.py b/experiment-runner/ProgressManager/Validation/RequirementsValidator.py index 2e0e2dbd3..c3023b183 100644 --- a/experiment-runner/ProgressManager/Validation/RequirementsValidator.py +++ b/experiment-runner/ProgressManager/Validation/RequirementsValidator.py @@ -36,6 +36,10 @@ def mark_failure(self, error: str): ### | | ### ========================================================= PROFILER_DEPS = { + "AndroidDebugBridge":{ + "tools": ["adb"], + "python_modules": [], + }, "JoularCore": { "tools": ["java"], "python_modules": ["jpype"], From 84ba8e57644b0811cdb6946f467100d945c090bb Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Mon, 22 Jun 2026 15:02:41 +0200 Subject: [PATCH 20/30] update --- .../Experiment/ExperimentController.py | 45 +++++++++++-------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py b/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py index ffc559006..8e985c0b4 100644 --- a/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py +++ b/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py @@ -38,6 +38,7 @@ class ExperimentController: def __init__(self, config: RunnerConfig, metadata: Metadata): self.config = config self.metadata = metadata + self.validation_results: dict[str, AnomalyReport] = {} self.csv_data_manager = CSVOutputManager(self.config.experiment_path) self.json_data_manager = JSONOutputManager(self.config.experiment_path) @@ -153,7 +154,23 @@ def do_experiment(self): ) perform_run.start() perform_run.join() + + run_id = current_run["__run_id"] + 
treatment_levels = { + k: v + for k, v in current_run.items() + if not k.startswith("__") + } + run_dir = self.config.experiment_path / run_id + run_report = ResultsValidator.validate_output_log( + run_dir, + run_id, + treatment_levels, + ) + if run_report.has_anomalies(): + self.validation_results[run_id] = run_report + time_btwn_runs = self.config.time_between_runs_in_ms if time_btwn_runs > 0: output.console_log_bold(f"Run fully ended, waiting for: {time_btwn_runs}ms == {time_btwn_runs / 1000}s") @@ -168,23 +185,13 @@ def do_experiment(self): output.console_log_WARNING("Calling after_experiment config hook") EventSubscriptionController.raise_event(RunnerEvents.AFTER_EXPERIMENT) - # -- Energy validation - final_report = AnomalyReport() - for run in self.run_table: - run_id = run["__run_id"] - treatment_levels = { - k: v for k, v in run.items() - if not k.startswith("__") - } - run_dir = self.config.experiment_path / run_id - run_report = ResultsValidator.validate_output_log( - run_dir, - run_id, - treatment_levels, - ) - final_report.anomalies.extend(run_report.anomalies) + # -- Validation summary + combined_report = AnomalyReport() + + for report in self.validation_results.values(): + combined_report.anomalies.extend(report.anomalies) - if final_report.has_anomalies(): - log_file_path = self.config.experiment_path / self.config.energy_validation_log_file - output.console_log_WARNING(f"Signal anomalies detected. Report saved to {log_file_path}") - ResultsValidator.save_report_to_file(final_report, log_file_path) \ No newline at end of file + if combined_report.has_anomalies(): + log_file_path = (self.config.experiment_path / self.config.energy_validation_log_file) + output.console_log_WARNING(f"Anomalies detected. 
Report saved to {log_file_path}") + ResultsValidator.save_report_to_file(combined_report, og_file_path) \ No newline at end of file From 44d8f62175d9ced64cf39aab7daad22598e84367 Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Mon, 22 Jun 2026 15:59:49 +0200 Subject: [PATCH 21/30] integrate the result validator into the remote distribution feature --- .../hello-world-fibonacci/RunnerConfig.py | 13 ------- .../DistributedOrchestrator.py | 33 +++++++++++++++-- .../DistributedExecution/Worker.py | 35 +++++++++++++++---- .../Experiment/ExperimentController.py | 2 +- 4 files changed, 59 insertions(+), 24 deletions(-) diff --git a/examples/hello-world-fibonacci/RunnerConfig.py b/examples/hello-world-fibonacci/RunnerConfig.py index 5bf9cc929..fb65490e3 100644 --- a/examples/hello-world-fibonacci/RunnerConfig.py +++ b/examples/hello-world-fibonacci/RunnerConfig.py @@ -39,19 +39,6 @@ class RunnerConfig: """Path to log file for energy validation report. Relative to experiment output directory.""" energy_validation_log_file: str = "energy_validation_report.log" - """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']).""" - energy_validation_columns: List[str] = [ - "cpu_energy", - "core0_energy", - "core1_energy", - "core2_energy", - "core3_energy", - "core4_energy", - "core5_energy", - "core6_energy", - "core7_energy" - ] - def __init__(self): EventSubscriptionController.subscribe_to_multiple_events([ diff --git a/experiment-runner/DistributedExecution/DistributedOrchestrator.py b/experiment-runner/DistributedExecution/DistributedOrchestrator.py index d4acf8fe4..646c203b1 100644 --- a/experiment-runner/DistributedExecution/DistributedOrchestrator.py +++ b/experiment-runner/DistributedExecution/DistributedOrchestrator.py @@ -2,6 +2,10 @@ from ProgressManager.Output.CSVOutputManager import CSVOutputManager from EventManager.Models.RunnerEvents import RunnerEvents from EventManager.EventSubscriptionController import 
EventSubscriptionController +from ProgressManager.Validation.EnergyValidator import ( + ResultsValidator, + AnomalyReport +) from flask import Flask, request, jsonify import threading @@ -34,6 +38,7 @@ def __init__(self, run_table, experiment_path: Path): self.csv_manager = CSVOutputManager(experiment_path) self.completed = False self.shutdown = False + self.validation_results = {} def get_next_task(self, agent_id): with self.lock: @@ -138,11 +143,12 @@ def experiment_already_completed(self): ### ========================================================= class APIServer: - def __init__(self, task_manager, worker_monitor): + def __init__(self, task_manager, worker_monitor, validation_results): self.app = Flask(__name__) self.task_manager = task_manager self.monitor = worker_monitor - + self.validation_results = validation_results + @self.app.route('/task', methods=['GET']) def get_task(): agent_id = request.args.get('agent_id') @@ -168,6 +174,7 @@ def submit_result(): run_id = payload.get('run_id') run_data = payload.get('data', {}) status = payload.get('status') + anomalies = request.json.get("anomalies", []) if status == "FAILED": print(f"[MASTER] Run failed: {run_id}") @@ -183,6 +190,10 @@ def submit_result(): ) else: self.task_manager.complete_task(run_id, run_data) + if anomalies: + report = AnomalyReport() + report.anomalies.extend(anomalies) + self.validation_results[run_id] = report return jsonify({"status": "ok"}) @self.app.route('/heartbeat', methods=['POST']) @@ -290,6 +301,7 @@ def __init__(self, config, metadata, host="0.0.0.0", port=5000): self.metadata = metadata self.host = host self.port = port + self.validation_results = {} self.experiment_path = (config.results_output_path / config.name) self.experiment_path.mkdir(parents=True, exist_ok=True) @@ -317,7 +329,7 @@ def __init__(self, config, metadata, host="0.0.0.0", port=5000): self.finished_before_start = False self.monitor = WorkerMonitor(self.task_manager) - self.api = APIServer(self.task_manager, 
self.monitor) + self.api = APIServer(self.task_manager, self.monitor, self.validation_results) def start(self): if self.finished_before_start: @@ -355,6 +367,21 @@ def start(self): print("[MASTER] Waiting for workers to shutdown...") time.sleep(10) + combined_report = AnomalyReport() + + for report in self.validation_results.values(): + combined_report.anomalies.extend(report.anomalies) + + if combined_report.has_anomalies(): + log_file_path = ( + self.experiment_path + / self.config.energy_validation_log_file + ) + + ResultsValidator.save_report_to_file( + combined_report, + log_file_path + ) print("[MASTER] Shutting down") os._exit(0) diff --git a/experiment-runner/DistributedExecution/Worker.py b/experiment-runner/DistributedExecution/Worker.py index 28af536f6..5f4ecb0a2 100644 --- a/experiment-runner/DistributedExecution/Worker.py +++ b/experiment-runner/DistributedExecution/Worker.py @@ -1,6 +1,7 @@ from ExperimentOrchestrator.Experiment.Run.RunController import RunController from EventManager.EventSubscriptionController import EventSubscriptionController from EventManager.Models.RunnerEvents import RunnerEvents +from ProgressManager.Validation.EnergyValidator import ResultsValidator import threading import time @@ -21,7 +22,6 @@ ### | | ### ========================================================= class WorkerRuntime: - @staticmethod def make_json_safe(obj): if isinstance(obj, dict): @@ -81,8 +81,8 @@ def run_loop(self, agent_id, config): run_id = task["__run_id"] try: - run_data = self._execute(task, config) - self._send_result(run_id, run_data) + run_data, anomaly_report = self._execute(task, config) + self._send_result(run_id, run_data, anomaly_report) except Exception as e: self._send_failure(run_id, str(e)) finally: @@ -115,16 +115,37 @@ def _execute(self, run, config): total_runs = run.get('__total_runs', 1) controller = RunController(run, config, current_run, total_runs, distributed_mode=True) - result = controller.do_run() + run_data = 
controller.do_run() + + run_id = run["__run_id"] + + treatment_levels = { + k: v + for k, v in run.items() + if not k.startswith("__") + } + + run_dir = config.experiment_path / run_id + + anomaly_report = ResultsValidator.validate_output_log( + run_dir, + run_id, + treatment_levels + ) + print(f"[WORKER] Task {run.get('__run_id')} completed") - return result + return run_data, anomaly_report - def _send_result(self, run_id, data): + def _send_result(self, run_id, data, anomaly_report = None): try: safe_data = WorkerRuntime.make_json_safe(data) - payload = {"run_id": run_id, "data": safe_data, "status": "DONE"} + payload = {"run_id": run_id, "data": safe_data, "status": "DONE", "anomalies": ( + anomaly_report.anomalies + if anomaly_report and anomaly_report.has_anomalies() + else [] + )} response = requests.post(self.master_url + "/result", json=payload, timeout=10) response.raise_for_status() diff --git a/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py b/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py index 8e985c0b4..09b6ba40e 100644 --- a/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py +++ b/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py @@ -194,4 +194,4 @@ def do_experiment(self): if combined_report.has_anomalies(): log_file_path = (self.config.experiment_path / self.config.energy_validation_log_file) output.console_log_WARNING(f"Anomalies detected. 
Report saved to {log_file_path}") - ResultsValidator.save_report_to_file(combined_report, og_file_path) \ No newline at end of file + ResultsValidator.save_report_to_file(combined_report, log_file_path) \ No newline at end of file From 4efc4f370c10f11d7fdf6712bd97bc4e6ad66f29 Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Mon, 22 Jun 2026 21:35:24 +0200 Subject: [PATCH 22/30] chanege in the exeperiments exemples --- examples/profilers/JoularCore/RunnerConfig.py | 3 --- examples/profilers/NvidiaML/RunnerConfig.py | 3 --- examples/profilers/PowerJoular/RunnerConfig.py | 3 --- examples/profilers/PowerLetrics/RunnerConfig.py | 3 --- examples/profilers/PowerMetrics/RunnerConfig.py | 3 --- .../linux-ps-profiling/RunnerConfig.py | 3 --- examples/profilers/linux-ps-profiling/primer | Bin 0 -> 16192 bytes .../measure-self-profiling/RunnerConfig.py | 5 +---- 8 files changed, 1 insertion(+), 22 deletions(-) create mode 100755 examples/profilers/linux-ps-profiling/primer diff --git a/examples/profilers/JoularCore/RunnerConfig.py b/examples/profilers/JoularCore/RunnerConfig.py index 5b8a852d3..adb0b13ef 100644 --- a/examples/profilers/JoularCore/RunnerConfig.py +++ b/examples/profilers/JoularCore/RunnerConfig.py @@ -31,9 +31,6 @@ class RunnerConfig: """Path to log file for energy validation report. Relative to experiment output directory.""" energy_validation_log_file: str = "energy_validation_report.log" - """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']).""" - energy_validation_columns: List[str] = [] - # Dynamic configurations can be one-time satisfied here before the program takes the config as-is # e.g. 
Setting some variable based on some criteria def __init__(self): diff --git a/examples/profilers/NvidiaML/RunnerConfig.py b/examples/profilers/NvidiaML/RunnerConfig.py index 95603b379..f11f996f8 100644 --- a/examples/profilers/NvidiaML/RunnerConfig.py +++ b/examples/profilers/NvidiaML/RunnerConfig.py @@ -39,9 +39,6 @@ class RunnerConfig: """Path to log file for energy validation report. Relative to experiment output directory.""" energy_validation_log_file: str = "energy_validation_report.log" - """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']).""" - energy_validation_columns: List[str] = [] - def __init__(self): EventSubscriptionController.subscribe_to_multiple_events([ diff --git a/examples/profilers/PowerJoular/RunnerConfig.py b/examples/profilers/PowerJoular/RunnerConfig.py index 5794b9fc2..cce80c56d 100644 --- a/examples/profilers/PowerJoular/RunnerConfig.py +++ b/examples/profilers/PowerJoular/RunnerConfig.py @@ -41,9 +41,6 @@ class RunnerConfig: """Path to log file for energy validation report. Relative to experiment output directory.""" energy_validation_log_file: str = "energy_validation_report.log" - """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']).""" - energy_validation_columns: List[str] = [] - # Dynamic configurations can be one-time satisfied here before the program takes the config as-is # e.g. Setting some variable based on some criteria def __init__(self): diff --git a/examples/profilers/PowerLetrics/RunnerConfig.py b/examples/profilers/PowerLetrics/RunnerConfig.py index dec09e797..27b5257b9 100644 --- a/examples/profilers/PowerLetrics/RunnerConfig.py +++ b/examples/profilers/PowerLetrics/RunnerConfig.py @@ -39,9 +39,6 @@ class RunnerConfig: """Path to log file for energy validation report. 
Relative to experiment output directory.""" energy_validation_log_file: str = "energy_validation_report.log" - """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']).""" - energy_validation_columns: List[str] = [] - # Dynamic configurations can be one-time satisfied here before the program takes the config as-is # e.g. Setting some variable based on some criteria def __init__(self): diff --git a/examples/profilers/PowerMetrics/RunnerConfig.py b/examples/profilers/PowerMetrics/RunnerConfig.py index 05e50b13f..f7256fa99 100644 --- a/examples/profilers/PowerMetrics/RunnerConfig.py +++ b/examples/profilers/PowerMetrics/RunnerConfig.py @@ -38,9 +38,6 @@ class RunnerConfig: """Path to log file for energy validation report. Relative to experiment output directory.""" energy_validation_log_file: str = "energy_validation_report.log" - """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']).""" - energy_validation_columns: List[str] = [] - # Dynamic configurations can be one-time satisfied here before the program takes the config as-is # e.g. Setting some variable based on some criteria def __init__(self): diff --git a/examples/profilers/linux-ps-profiling/RunnerConfig.py b/examples/profilers/linux-ps-profiling/RunnerConfig.py index d601349f6..f7e291466 100644 --- a/examples/profilers/linux-ps-profiling/RunnerConfig.py +++ b/examples/profilers/linux-ps-profiling/RunnerConfig.py @@ -42,9 +42,6 @@ class RunnerConfig: """Path to log file for energy validation report. Relative to experiment output directory.""" energy_validation_log_file: str = "energy_validation_report.log" - """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']).""" - energy_validation_columns: List[str] = [] - # Dynamic configurations can be one-time satisfied here before the program takes the config as-is # e.g. 
Setting some variable based on some criteria def __init__(self): diff --git a/examples/profilers/linux-ps-profiling/primer b/examples/profilers/linux-ps-profiling/primer new file mode 100755 index 0000000000000000000000000000000000000000..901841a7c8df1b95883f9ffbef8c4dea1c7ea1f6 GIT binary patch literal 16192 zcmeHOeQX@X6`ymy420OB1V~cI1|?OX^~DZJ2nE9>_9g4oNlfCXUxfA9x3;gkuioti zhZbS#fH>T!ZdDNdgUUjc3N3#?i>kCzTl1j=qzE~*D&e0hrGRYlQKJ?xMdbQ>v+u3< zw&zlzs(+vzY5kk`oA=(#yq#Uo+|Jw?@7!4zi6}`6)Qw6@%UveM1;f{hhEg%LO)bOU zHR=kr81zD1F)j{b;{vxpu%7jZCOe1AgZNq26OvMk(yKC9>?3XYXYq=@MoN?84=Y$Bby-B$TRD(mzs z(xSVvRqg8R*wJQfif)Q-@$%PfR#r#%9xG`V?15C#u?yXM+S0kK-JR%7+i*FM$z{0` z%T=wZsPsKwfDuDuW)9s>MNry~tc=Ujs?>R;wU@akzyAP2S*O0{s(bGlc}~yrd;CFu zj|e}`e0|I{6kg;rH@Q6BCmJJ?&iZ(M1a#4qk9WtdYcb>F&le}aSszd1UD6}W(`%ex zWxD8bA1}`~6&&~R`u(o8m74JJ{4YV5Jm=%3&r~q!;}PI6g%JoN5Jn)3Kp25A0$~I` zwGsGx!xjHDN8YSAM;E=kQ7QB838$_yX^uQw|BUW)WlI5cWxa#H4QpbG^arVA`jtwh z(m$qUnjlWUI49G@aQf*vnI>}6znGJ00yq8eoJ(%I1WyLR{`j7r- z!;f}#!aRM>xfl*?c2HkA>-JOZssF>RRIEy$KWL7$?x3t$e$82Gj<)WEF!lVo zN@XgE=$%;93ORBJwqE^YKYbetDRU5ZX8DdObL5UG&Twc{7$YM;ET&Kn}|B z*_H_e>ql7E{Z6|qpVakD9YCwg@lzvroT`*oLgOej%<{>pFMdu{J+pmS1w4dz31mZlkg%;0QPSVzm0LrEaSaqIh~A z!VKSGU%&d0f$w7(+s$Tk4vScMu3m8c zHZ-5U|NQZ9l}Z}eK2xdu5ZDcT4EQ9l8~D&WmCCcgkASZN>6&xGl&Fl_MKbAAB3)XEhbfFe7=bVXVFbbmgb@fM z5Jn)3Kp25fcLZd=p6t()bwWFLd#QAv7c=-wrBm`VF0N!*_8YHenLeITTIGV0{eZOI zp(N$+o~z_Y9^(az?5R6L3l${U+jpGHWl!JF++w|Vm9n%>qO`&V@2{+b#(2M-taHX# zgVtq~WX~S$AEh+L6$?Mf4U=|HXf35=@1E3u1@of!Va4qq;;Jcp%G3QFXMgxyr)7yd zmVsLS4wrwxvh;)9|Ia}7c?TUf^Zkx7b}{xa<{5_>gZ3XM%f8QDZEasPHXQ6NWu21I zvIRR{o10p;mbBb*6*K?A8O3d(dZ!Xpq5BfSs$2Gn8>~RXww6BKw>q3MXy$#YtgZ~^ zdjkA5%*%5l`W?tbFzOp)E+hAqR@xDDiT511rAmDp8LC&FL*eh$`YXKswZi`tnOg0C z96b4%=63e5!F>JwHOeE{5#70#6@5_4|4FTXsS1vlNo}`Wbs9bg$o>^q1W%yVr5As(+D-a(4NBfy!gtN%r*9tKVgJGQ z+y&kcr#__Ll$O`6R5ISB=po|KQZeG!oq7y>BgW5VG0}AKcJLSB{=!Hn{o53BPq4m> zH@8%&-?DxO2TA{Sguhk5wd=i%Q6PyGtky8tg(W6rIYyBz$Lo-~))2L7Tt 
zyw!v6kpq1NrzM)hzV<|K%5o9|3PjN<_4h~n)Z89;%gI=MvsPwkT_!5S*LJB^%oKu+e(%)nIkaqNftDnDpmc< zWC1No6)pEyO3T{0fBT-e72n%#VH) z?mQ6hwz{|P=#0Z6cvv7NhXFq300138@O0_mfMq8WPD1Ja@ezZnayp<;MIeAMaE}n6 z9o6RvtYk4~4dORjI&09e4_e7o)+!b8vn|4gKUQzC$hP{7108*+a_=6FsG{hgL=|CK zIMg7qqtgyGk1$jppQv^~w@9gI@kqu=^a7oNi-VHGFUWRuAX|#|mQpwplS-;6PG8uC zywdDoqBy9c$s<{KaM3BaC5P=oF_p_!2^Pu0EI3D)+PX&(=;5t^8_o>Xo#77JnI{zW|GOrXgxI&tKQC93_erJNCxtQ3?JXKKUrBKl+*#0iC zH0Kk0ndb>gJ<@*B6FdZannQ{#^GLyFw_5!@z~6qRRT>%cE%q|66`Wv2sXy5N!)(8U z^<e$ zU)+c+_`QJrI2Q=^h#{lc2|gIGm;dL2Nx3lrdvW{YfW6FT1Z90D`oZ}9gzdd*4Ax}a z&7wfip1ws<{Ka18eS$ZLA!E><_Idj4?=@US!ROdcK*o>A&mvRx{#omm$a&IcQDh9- zzZ9^S^?{%fun)HXHI!o(=6zq2b+4S;4Av*I;5)Ew@Y&0{_`xQSNTTOTbF%Osp@5zp zSr5v5T+Z*xIbB{pdj4em-3MEWzu3$Cd~72~q`V8>U#VAcF^v88D#ofDc&SGmh@Hqw zVN9IVFZ0Jnwh!u4UH-J3>t}VZNbF@^ECpikM;W`KL%59{&Tzvn2S;lsX^*^@X Date: Tue, 23 Jun 2026 12:09:36 +0200 Subject: [PATCH 23/30] inished adb_no tests --- examples/profilers/ADB/RunnerConfig.py | 86 ++++++++++--------- .../profilers/EnergiBridge/RunnerConfig.py | 3 - .../DistributedOrchestrator.py | 7 +- .../DistributedExecution/Worker.py | 9 +- .../Experiment/ExperimentController.py | 5 +- ...EnergyValidator.py => AnomaliesChecker.py} | 8 +- 6 files changed, 63 insertions(+), 55 deletions(-) rename experiment-runner/ProgressManager/Validation/{EnergyValidator.py => AnomaliesChecker.py} (96%) diff --git a/examples/profilers/ADB/RunnerConfig.py b/examples/profilers/ADB/RunnerConfig.py index ed82fe9cc..383fec2a7 100644 --- a/examples/profilers/ADB/RunnerConfig.py +++ b/examples/profilers/ADB/RunnerConfig.py @@ -6,23 +6,13 @@ from ConfigValidator.Config.Models.OperationType import OperationType from ProgressManager.Output.OutputProcedure import OutputProcedure as output from ProgressManager.Validation.RequirementsValidator import (validate_experiment_requirements) -from Plugins.Profilers.AndroidDebugBridge import AndroidBatteryMonitor, battery_monitor +from Plugins.Profilers.AndroidDebugBridge import AndroidBatteryMonitor 
from typing import Dict, List, Any, Optional from pathlib import Path from os.path import dirname, realpath import time -@battery_monitor( - device_serial=None, - data_columns=[ - 'battery_percentage', - 'battery_temperature', - 'battery_voltage', - 'charge_rate', - 'power_draw' - ] -) class RunnerConfig: ROOT_DIR = Path(dirname(realpath(__file__))) @@ -39,17 +29,14 @@ class RunnerConfig: """Time between runs (cooldown period)""" time_between_runs_in_ms: int = 3000 - """Path to log file for energy validation report. Relative to experiment output directory.""" energy_validation_log_file: str = "energy_validation_report.log" - - """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']).""" - # Dynamic configurations can be one-time satisfied here before the program takes the config as-is # e.g. Setting some variable based on some criteria def __init__(self): """Executes immediately after program start, on config load""" + self.profiler = None EventSubscriptionController.subscribe_to_multiple_events([ (RunnerEvents.VALIDATE_EXPERIMENT, self.validate_experiment), @@ -69,9 +56,6 @@ def __init__(self): def create_run_table_model(self) -> RunTableModel: """Define the experimental design with factors and data columns. - - Note: The @AndroidEnergyMonitor.energy_monitor decorator automatically - adds energy data columns to this model. 
""" # Define experimental factors workload_factor = FactorModel("workload", ['light', 'medium', 'heavy']) @@ -80,7 +64,6 @@ def create_run_table_model(self) -> RunTableModel: self.run_table_model = RunTableModel( factors=[workload_factor, screen_factor], repetitions=1, - # Add custom data columns (energy columns are added by decorator) data_columns=['workload_duration_ms', 'task_completion_status'] ) return self.run_table_model @@ -90,22 +73,32 @@ def validate_experiment(self) -> None: validate_experiment_requirements(Path(__file__)) output.console_log("Config.validate_experiment() called!") - def before_experiment(self) -> None: - """Called before experiment starts.""" - output.console_log("Starting Android energy monitoring experiment...") - output.console_log("Ensure your Android device is connected via USB or emulator is running") + def before_experiment(self): + self.profiler = AndroidBatteryMonitor( + device_serial=None, + poll_interval=2 + ) + self.profiler.open_device() + output.console_log("Android profiler initialized") def before_run(self) -> None: """Called before each run.""" output.console_log(f"Preparing device for run...") - def start_run(self, context: RunnerContext) -> None: - """Start a single experiment run.""" - output.console_log("Config.start_run() called!") + def start_run(self, context): + if self.profiler is None: + self.profiler = AndroidBatteryMonitor( + device_serial=None, + poll_interval=2 + ) + self.profiler.open_device() + + self.profiler.logfile = (context.run_dir / "android_battery.csv") def start_measurement(self, context: RunnerContext) -> None: - """Start measurement - energy monitoring begins here automatically.""" - output.console_log("Energy monitoring started (battery metrics being collected)") + """Start measurement.""" + output.console_log("Energy monitoring started") + self.profiler.start() def interact(self, context: RunnerContext): workload = context.execute_run['workload'] @@ -130,30 +123,45 @@ def interact(self, 
context: RunnerContext): def stop_measurement(self, context: RunnerContext) -> None: """Stop measurement - energy monitoring ends here automatically.""" output.console_log("Energy monitoring stopped") + self.profiler.stop() def stop_run(self, context: RunnerContext) -> None: """Stop the current run.""" output.console_log(f"Stopped run: {context.execute_run['__run_id']}") - def populate_run_data(self, context: RunnerContext) -> Optional[Dict[str, Any]]: - """Populate data columns for this run. """ + def populate_run_data(self, context: RunnerContext): + battery_log = self.profiler.parse_log(self.profiler.logfile) workload = context.execute_run['workload'] - duration_ms = { - 'light': 5000, - 'medium': 10000, - 'heavy': 15000 - }.get(workload, 5000) - + 'light':5000, + 'medium':10000, + 'heavy':15000 + }[workload] + return { - 'workload_duration_ms': duration_ms, - 'task_completion_status': 'success' + "workload_duration_ms": duration_ms, + "task_completion_status": "success", + "android_battery__battery_percentage": + battery_log.get("android_battery__percentage", 0), + "android_battery__battery_temperature": + battery_log.get("android_battery__temperature", 0), + "android_battery__battery_voltage": + battery_log.get( + "android_battery__voltage",0), + "android_battery__current_now": + battery_log.get( + "android_battery__current_now",0), + "android_battery__charge_counter": + battery_log.get( + "android_battery__charge_counter",0), + "android_battery__power_draw": + battery_log.get("android_battery__power_draw",0) } def after_experiment(self) -> None: """Called after experiment completes.""" output.console_log("Android energy monitoring experiment completed!") - output.console_log("Results stored in experiments/android_energy_monitoring_experiment/") + output.console_log(f"Results stored in {self.results_output_path}") # ================================ DO NOT ALTER BELOW THIS LINE ================================ experiment_path: Path = None \ No newline at end of 
file diff --git a/examples/profilers/EnergiBridge/RunnerConfig.py b/examples/profilers/EnergiBridge/RunnerConfig.py index 120879ff5..461996a39 100644 --- a/examples/profilers/EnergiBridge/RunnerConfig.py +++ b/examples/profilers/EnergiBridge/RunnerConfig.py @@ -37,9 +37,6 @@ class RunnerConfig: """Path to log file for energy validation report. Relative to experiment output directory.""" energy_validation_log_file: str = "energy_validation_report.log" - """List of data column names that contain energy measurements (e.g., ['energy', 'joules', 'watts']).""" - energy_validation_columns: List[str] = [] - # Dynamic configurations can be one-time satisfied here before the program takes the config as-is # e.g. Setting some variable based on some criteria def __init__(self): diff --git a/experiment-runner/DistributedExecution/DistributedOrchestrator.py b/experiment-runner/DistributedExecution/DistributedOrchestrator.py index 646c203b1..d4c9f2bfe 100644 --- a/experiment-runner/DistributedExecution/DistributedOrchestrator.py +++ b/experiment-runner/DistributedExecution/DistributedOrchestrator.py @@ -2,10 +2,7 @@ from ProgressManager.Output.CSVOutputManager import CSVOutputManager from EventManager.Models.RunnerEvents import RunnerEvents from EventManager.EventSubscriptionController import EventSubscriptionController -from ProgressManager.Validation.EnergyValidator import ( - ResultsValidator, - AnomalyReport -) +from ProgressManager.Validation.AnomaliesChecker import ResultsValidator, AnomalyReport from flask import Flask, request, jsonify import threading @@ -290,6 +287,8 @@ def monitor(self): ### | - Restore interrupted experiments | ### | - Start monitoring threads | ### | - Start the API server | +### | - If anomalies are present combined them | +### | into a report | ### | | ### | | ### ========================================================= diff --git a/experiment-runner/DistributedExecution/Worker.py b/experiment-runner/DistributedExecution/Worker.py index 
5f4ecb0a2..e060a9ecf 100644 --- a/experiment-runner/DistributedExecution/Worker.py +++ b/experiment-runner/DistributedExecution/Worker.py @@ -1,7 +1,7 @@ from ExperimentOrchestrator.Experiment.Run.RunController import RunController from EventManager.EventSubscriptionController import EventSubscriptionController from EventManager.Models.RunnerEvents import RunnerEvents -from ProgressManager.Validation.EnergyValidator import ResultsValidator +from ProgressManager.Validation.AnomaliesChecker import ResultsValidator import threading import time @@ -15,7 +15,7 @@ ### | | ### | - Connect to the master orchestrator | ### | - Request experiment runs/tasks | -### | - Execute runs locally | +### | - Execute runs locally + anomalies check | ### | - Send results back to the master | ### | - Send periodic heartbeat updates | ### | - Gracefully shutdown on master request | @@ -116,23 +116,20 @@ def _execute(self, run, config): controller = RunController(run, config, current_run, total_runs, distributed_mode=True) run_data = controller.do_run() - run_id = run["__run_id"] + # Check for anomalies in the run raw result treatment_levels = { k: v for k, v in run.items() if not k.startswith("__") } - run_dir = config.experiment_path / run_id - anomaly_report = ResultsValidator.validate_output_log( run_dir, run_id, treatment_levels ) - print(f"[WORKER] Task {run.get('__run_id')} completed") return run_data, anomaly_report diff --git a/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py b/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py index 09b6ba40e..5abc9d1d6 100644 --- a/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py +++ b/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py @@ -13,7 +13,7 @@ from ProgressManager.Output.OutputProcedure import OutputProcedure as output from EventManager.EventSubscriptionController import EventSubscriptionController from 
ConfigValidator.CustomErrors.ProgressErrors import AllRunsCompletedOnRestartError -from ProgressManager.Validation.EnergyValidator import ( +from ProgressManager.Validation.AnomaliesChecker import ( ResultsValidator, AnomalyReport ) @@ -155,6 +155,7 @@ def do_experiment(self): perform_run.start() perform_run.join() + # -- Checks for anomalies in the run raw result run_id = current_run["__run_id"] treatment_levels = { k: v @@ -185,7 +186,7 @@ def do_experiment(self): output.console_log_WARNING("Calling after_experiment config hook") EventSubscriptionController.raise_event(RunnerEvents.AFTER_EXPERIMENT) - # -- Validation summary + # -- Anomalies Report creation combined_report = AnomalyReport() for report in self.validation_results.values(): diff --git a/experiment-runner/ProgressManager/Validation/EnergyValidator.py b/experiment-runner/ProgressManager/Validation/AnomaliesChecker.py similarity index 96% rename from experiment-runner/ProgressManager/Validation/EnergyValidator.py rename to experiment-runner/ProgressManager/Validation/AnomaliesChecker.py index 15a2bbc10..1b69190cb 100644 --- a/experiment-runner/ProgressManager/Validation/EnergyValidator.py +++ b/experiment-runner/ProgressManager/Validation/AnomaliesChecker.py @@ -39,7 +39,13 @@ def has_anomalies(self) -> bool: class ResultsValidator: - + """ + Validates experiment output logs and detects: + - NaN values + - negative values + - zero values + - missing files + """ @staticmethod def _detect_numeric_columns(df: pd.DataFrame) -> List[str]: """ From 0ed985e43cd42a9745e4cb962591ceca61d60055 Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Tue, 23 Jun 2026 12:10:09 +0200 Subject: [PATCH 24/30] finsied adb - no tests --- .../Plugins/Profilers/AndroidDebugBridge.py | 388 ++++++------------ 1 file changed, 127 insertions(+), 261 deletions(-) diff --git a/experiment-runner/Plugins/Profilers/AndroidDebugBridge.py b/experiment-runner/Plugins/Profilers/AndroidDebugBridge.py index 4f00d1d22..0157d2eac 100644 --- 
a/experiment-runner/Plugins/Profilers/AndroidDebugBridge.py +++ b/experiment-runner/Plugins/Profilers/AndroidDebugBridge.py @@ -1,21 +1,19 @@ -from __future__ import annotations from pathlib import Path -from typing import Iterable, Optional, Dict, Any +from typing import Optional, Dict, Any from enum import Enum, auto import re import subprocess -import threading import csv from datetime import datetime import pandas as pd +import time +import threading -from Plugins.Profilers.DataSource import CLISource, ParameterDict +from Plugins.Profilers.DataSource import DeviceSource from ConfigValidator.Config.Models.RunnerContext import RunnerContext -from ConfigValidator.Config.RunnerConfig import RunnerConfig class DataColumns(Enum): - """Battery metrics that can be collected from Android devices via ADB dumpsys battery""" BATTERY_PERCENTAGE = auto() BATTERY_TEMPERATURE = auto() BATTERY_VOLTAGE = auto() @@ -28,289 +26,157 @@ class DataColumns(Enum): _PATTERN = re.compile(r'(android_battery__)(.+)') @property - def name(self) -> str: - return f'android_battery__{super().name.lower()}' + def column_name(self) -> str: + return f'android_battery__{self.name.lower()}' + -class AndroidBatteryMonitor(CLISource): - """Monitor battery and energy metrics from Android devices via ADB during experiment execution.""" +class AndroidBatteryMonitor(DeviceSource): source_name = "adb" supported_platforms = ["Linux", "Darwin"] - ANDROID_BATTERY_PARAMETERS = {} - - def __init__(self, device_serial: Optional[str] = None, poll_interval: int = 2, out_file: Path = "android_battery.csv", data_columns: Optional[Iterable[str]] = None): + def __init__(self, device_serial=None, poll_interval=2, out_file=Path("android_battery.csv")): super().__init__() - + self.device_serial = device_serial self.poll_interval = poll_interval - self.logfile = out_file - self.stop_monitoring = threading.Event() - self.monitoring_thread = None - self.monitor_error: Optional[Exception] = None - - # Validate ADB 
availability + self.logfile = Path(out_file) + self._validate_adb_available() - - @property - def parameters(self) -> ParameterDict: - return ParameterDict(self.ANDROID_BATTERY_PARAMETERS) - + + self._thread = None + self._stop_event = threading.Event() + def _validate_adb_available(self): - """Verify ADB is installed and accessible.""" - try: - result = subprocess.run(['adb', 'version'], capture_output=True, timeout=5) - if result.returncode != 0: - raise RuntimeError("ADB version check failed.") - except FileNotFoundError: - raise RuntimeError("ADB not found.") - except subprocess.TimeoutExpired: - raise RuntimeError("ADB timeout - check ADB installation") - - def _get_device_serial(self) -> str: + result = subprocess.run(['adb', 'version'], capture_output=True, timeout=5) + if result.returncode != 0: + raise RuntimeError("ADB version check failed.") + + def open_device(self): if self.device_serial: - return self.device_serial + return result = subprocess.run(["adb", "devices"], capture_output=True, text=True, timeout=5) - devices = [] - - for line in result.stdout.splitlines(): - line = line.strip() - if not line: - continue - if line.startswith("List of devices"): - continue - if "\tdevice" in line: - devices.append(line.split()[0]) + devices = [ + line.split()[0] + for line in result.stdout.splitlines() + if "\tdevice" in line + ] if not devices: - raise RuntimeError("No ADB devices found") - - return devices[0] + raise RuntimeError("No devices found") - def _parse_battery_data(self, dumpsys_output: str) -> Dict[str, Any]: - """Parse dumpsys battery output and extract metrics.""" - data = {} - if not dumpsys_output: - return data + self.device_serial = devices[0] - patterns = { - 'percentage' : r'^\s*level:\s+(\d+)', - 'temperature' : r'^\s*temperature:\s+(\d+)', - 'voltage' : r'^\s*voltage:\s+(\d+)', - 'health' : r'^\s*health:\s+(\d+)', - 'status' : r'^\s*status:\s+(\d+)', - 'current_now' : r'^\s*current now:\s+(-?\d+)', - 'charge_counter' : r'^\s*charge 
counter:\s+(\d+)', - } - - for key, pattern in patterns.items(): - match = re.search(pattern, dumpsys_output, re.MULTILINE) - if match: data[key] = match.group(1) - - # Calculate power draw estimate - if 'voltage' in data and 'current_now' in data: - try: - voltage_mv = int(data['voltage']) - current_ua = int(data['current_now']) - voltage_v = voltage_mv / 1000.0 - current_ma = abs(float(data["current_now"])) - power_mw = voltage_v * current_ma - data['power_draw'] = f"{power_mw:.2f}" - except (ValueError, KeyError): - pass - return data + def close_device(self): + self.device_serial = None - def start(self): - """Start monitoring battery metrics.""" - if self.monitoring_thread and self.monitoring_thread.is_alive(): - raise RuntimeError("Android energy monitoring is already running") + def list_devices(self): + result = subprocess.run(["adb", "devices"], capture_output=True, text=True, timeout=5) + return [ + line.split()[0] + for line in result.stdout.splitlines() + if "\tdevice" in line + ] + + def set_mode(self, settings=None): + return - self.stop_monitoring.clear() - self.measurements = [] - self.monitor_error = None + def read_sample(self): + result = subprocess.run(["adb", "-s", self.device_serial, "shell", "dumpsys battery"], capture_output=True, text=True, timeout=10) + return self._parse(result.stdout) + + def _parse(self, text): + patterns = { + "percentage": r"^\s*level:\s*(\d+)", + "temperature": r"^\s*temperature:\s*(\d+)", + "voltage": r"^\s*voltage:\s*(\d+)", + "current_now": r"^\s*current now:\s*(-?\d+)", + "charge_counter": r"^\s*charge counter:\s*(\d+)", + } + data = {} + for key, pattern in patterns.items(): + match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE) + if match: + data[key] = match.group(1) + + voltage_raw = data.get("voltage") + if voltage_raw is None: + fallback = re.search(r"voltage:\s*(\d+)", text) + voltage_raw = fallback.group(1) if fallback else None try: - self.logfile.parent.mkdir(parents=True, exist_ok=True) - 
except Exception as e: - raise RuntimeError(f"Failed to create log directory: {e}") + voltage_v = (float(voltage_raw) / 1000.0) if voltage_raw else None + except ValueError: + voltage_v = None - self._get_device_serial() - self.monitoring_thread = threading.Thread(target=self._monitor_loop, name="AndroidEnergyMonitor", daemon=True) - self.monitoring_thread.start() - - def _monitor_loop(self): try: - device_serial = self._get_device_serial() - with open(self.logfile, 'w', newline='') as csvfile: - fieldnames = [ - 'timestamp', - 'percentage', - 'temperature', - 'voltage', - 'health', - 'status', - 'current_now', - 'charge_counter', - 'power_draw' + current_raw = float(data.get("current_now", 0)) + current_ma = abs(current_raw) / 1000.0 + except ValueError: + current_ma = 0.0 + if voltage_v is not None: + data["voltage"] = float(voltage_raw) + data["power_draw"] = voltage_v * current_ma + else: + data["voltage"] = 0.0 + data["power_draw"] = 0.0 + + return data + + def _run(self): + self.open_device() + with open(self.logfile, "w", newline="") as f: + writer = csv.DictWriter( + f, + fieldnames=[ + "timestamp", + "percentage", + "temperature", + "voltage", + "current_now", + "charge_counter", + "power_draw" ] - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - writer.writeheader() - while not self.stop_monitoring.is_set(): - result = subprocess.run( - [ - 'adb', - '-s', - device_serial, - 'shell', - 'dumpsys battery' - ], - capture_output=True, text=True, timeout=10) - - if result.returncode != 0: - raise RuntimeError(f"ADB command failed:\n{result.stderr}") - - metrics = self._parse_battery_data(result.stdout) - metrics['timestamp'] = datetime.now().isoformat() - self.measurements.append(metrics) - writer.writerow(metrics) - csvfile.flush() - self.stop_monitoring.wait(self.poll_interval) - except Exception as e: - self.monitor_error = e - self.stop_monitoring.set() + ) + writer.writeheader() - def stop(self): - if not self.monitoring_thread: - return "" + 
while not self._stop_event.is_set(): + data = self.read_sample() + data["timestamp"] = datetime.now().isoformat() + writer.writerow(data) + f.flush() + time.sleep(self.poll_interval) + self.close_device() - print("STOP: setting event") + def log(self): + self._run() + return 0 - self.stop_monitoring.set() + def start(self): + if self._thread and self._thread.is_alive(): + raise RuntimeError("Battery monitor already running") + self._stop_event.clear() + self._thread = threading.Thread(target=self.log, name="DeviceWorker", daemon=True) + self._thread.start() - print("STOP: joining thread") + def stop(self): + self._stop_event.set() - self.monitoring_thread.join() + if self._thread: + self._thread.join(timeout=5) + self._thread = None - print("STOP: thread joined") - - def __del__(self): - """Cleanup on deletion.""" - if self.monitoring_thread and self.monitoring_thread.is_alive(): - self.stop_monitoring.set() - self.monitoring_thread.join(timeout=5) - @staticmethod - def parse_log(logfile: Path) -> Dict[str, Any]: - """Parse battery metrics CSV log file.""" - try: - df = pd.read_csv(logfile) - return df.to_dict(orient='records') - except Exception as e: - print(f"Could not parse Android battery log: {e}") + def parse_log(logfile): + df = pd.read_csv(logfile) + if df.empty: return {} -def battery_monitor(device_serial=None, poll_interval=2, data_columns=None): - def battery_monitor_decorator(cls): - cols = data_columns or [col.name for col in DataColumns] - - cls.create_run_table_model = add_data_columns(cols)(cls.create_run_table_model) - cls.start_measurement = start_battery_monitor(device_serial, poll_interval)(cls.start_measurement) - cls.stop_measurement = stop_battery_monitor(cls.stop_measurement) - cls.populate_run_data = populate_data_columns(cls.populate_run_data) - return cls - - return battery_monitor_decorator - -def start_battery_monitor(device_serial: Optional[str] = None, poll_interval: int = 2): - def start_battery_monitor_decorator(func): - def 
wrapper(*args, **kwargs): - self: RunnerConfig = args[0] - context: RunnerContext = args[1] - logfile = (context.run_dir.resolve()/ "android_battery.csv") - - self.__android_battery_monitor__ = (AndroidBatteryMonitor(device_serial=device_serial, poll_interval=poll_interval, out_file=logfile)) - self.__android_battery_monitor__.start() - return func(*args, **kwargs) - return wrapper - return start_battery_monitor_decorator - -def stop_battery_monitor(func): - def wrapper(*args, **kwargs): - self: RunnerConfig = args[0] - ret_val = func(*args, **kwargs) - - if hasattr(self, "__android_battery_monitor__"): - self.__android_battery_monitor__.stop() - return ret_val - - return wrapper - -def add_data_columns(data_cols: Iterable[str]): - """Decorator to add Android battery data columns to run table.""" - def add_data_columns_decorator(func): - def wrapper(*args, **kwargs): - self: RunnerConfig = args[0] - - func(*args, **kwargs) - for dc in data_cols: - col_name = f'android_battery__{dc.lower()}' if not dc.startswith('android_battery__') else dc - if col_name not in self.run_table_model.get_data_columns(): - self.run_table_model.get_data_columns().append(col_name) - return self.run_table_model - - return wrapper - - return add_data_columns_decorator - -def populate_data_columns(func): - def wrapper(*args, **kwargs): - self: RunnerConfig = args[0] - ret_val = func(*args, **kwargs) - - if ret_val is None: - ret_val = {} - if not hasattr(self, "__android_battery_monitor__"): - return ret_val - - logfile = self.__android_battery_monitor__.logfile - - if not logfile.exists(): - return ret_val + result = {} + for col in df.columns: + if col == "timestamp": + continue + values = pd.to_numeric(df[col], errors="coerce").dropna() - try: - df = pd.read_csv(logfile) - - if df.empty: - return ret_val - - metric_map = { - "battery_percentage": "percentage", - "battery_temperature": "temperature", - "battery_voltage": "voltage", - "battery_health": "health", - "charging_status": 
"status", - "charge_rate": "charge_rate", - "current_now": "current_now", - "power_draw": "power_draw" - } - - for dc in self.run_table_model.get_data_columns(): - m = DataColumns._PATTERN.value.match(dc) - if not m: - continue - metric_name = m.group(2) - csv_column = metric_map.get(metric_name) - - if csv_column is None: - continue - if csv_column not in df.columns: - continue - - values = pd.to_numeric(df[csv_column], errors="coerce").dropna() - if len(values) == 0: - continue - ret_val[dc] = float(values.mean()) - except Exception as e: - - print(f"Error reading Android battery metrics: {e}") - return ret_val - - return wrapper \ No newline at end of file + if len(values): + result[f"android_battery__{col}"] = float(values.mean()) + + return result \ No newline at end of file From 23e4442dae5c7bc0dd7dc86b3a2df6b53f6903a5 Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Tue, 23 Jun 2026 12:44:57 +0200 Subject: [PATCH 25/30] redone_testings --- .../Profilers/test_AndroidDebugBridge.py | 226 ++++++++++-------- test/ProgressManager/test_AnomaliesChecker.py | 138 +++++++++++ test/ProgressManager/test_EnergyValidator.py | 82 ------- 3 files changed, 269 insertions(+), 177 deletions(-) create mode 100644 test/ProgressManager/test_AnomaliesChecker.py delete mode 100644 test/ProgressManager/test_EnergyValidator.py diff --git a/test/Plugins/Profilers/test_AndroidDebugBridge.py b/test/Plugins/Profilers/test_AndroidDebugBridge.py index 67b6ce93a..dcd9d7ac2 100644 --- a/test/Plugins/Profilers/test_AndroidDebugBridge.py +++ b/test/Plugins/Profilers/test_AndroidDebugBridge.py @@ -1,112 +1,148 @@ import unittest -import shutil import tempfile +import shutil import sys - +import time +import pandas as pd from pathlib import Path -from typing import AnyStr +from unittest.mock import patch, MagicMock sys.path.append("experiment-runner") -from ConfigValidator.Config.Models.RunnerContext import RunnerContext -from ConfigValidator.Config.RunnerConfig import RunnerConfig -from 
Plugins.Profilers.AndroidDebugBridge import ( - AndroidBatteryMonitor, - battery_monitor, - start_battery_monitor, - stop_battery_monitor, - add_data_columns, - populate_data_columns, - DataColumns -) - -class TestADBIndividual(unittest.TestCase): - class BatteryConfig(RunnerConfig): - tmpdir: AnyStr = tempfile.mkdtemp() - def clear(self): - shutil.rmtree(self.__class__.tmpdir) - - @add_data_columns([ - DataColumns.BATTERY_PERCENTAGE.name, - DataColumns.BATTERY_TEMPERATURE.name, - DataColumns.CURRENT_NOW.name - ]) - def create_run_table_model(self): - return super().create_run_table_model() - - @start_battery_monitor( - poll_interval=1 - ) - def start_measurement(self, context: RunnerContext): - super().start_measurement(context) - - def interact(self, context: RunnerContext): - import time - time.sleep(3) - - @stop_battery_monitor - def stop_measurement(self, context: RunnerContext): - super().stop_measurement(context) +from Plugins.Profilers.AndroidDebugBridge import AndroidBatteryMonitor +class TestADBMonitorLoop(unittest.TestCase): def setUp(self): - self.runner_config = self.__class__.BatteryConfig() - + self.tmpdir = tempfile.mkdtemp() def tearDown(self): - self.runner_config.clear() - - def test_monitor(self): - - class FakeContext: - run_dir = Path(self.runner_config.tmpdir) - - context = FakeContext() - - self.runner_config.start_measurement(context) - self.runner_config.interact(context) - self.runner_config.stop_measurement(context) - - run_data = self.runner_config.populate_run_data(context) - - self.assertTrue((Path(self.runner_config.tmpdir)/ "android_battery.csv").is_file()) - print(run_data) - -class TestADBCombined(unittest.TestCase): - tmpdir: AnyStr = tempfile.mkdtemp() - @battery_monitor( - poll_interval=1, - data_columns=[ - DataColumns.BATTERY_PERCENTAGE.name, - DataColumns.BATTERY_TEMPERATURE.name, - DataColumns.CURRENT_NOW.name, - DataColumns.POWER_DRAW.name - ] - ) - class BatteryConfig(RunnerConfig): - def clear(self): - 
shutil.rmtree(TestADBCombined.tmpdir) - - def interact(self, context): - import time + shutil.rmtree(self.tmpdir) + def fake_subprocess(self, *args, **kwargs): + cmd = args[0] + mock = MagicMock() + mock.returncode = 0 + # CASE 1: adb devices + if "devices" in cmd: + mock.stdout = "emulator-5554\tdevice\n" + return mock + # CASE 2: dumpsys battery + mock.stdout = """ + level: 75 + temperature: 315 + voltage: 4100 + current now: -900000 + charge counter: 3810000 + """ + return mock + + def test_start_stop_monitor(self): + with patch("subprocess.run", side_effect=self.fake_subprocess): + monitor = AndroidBatteryMonitor( + out_file=Path(self.tmpdir) / "android_battery.csv", + poll_interval=1 + ) + + monitor.start() time.sleep(3) + monitor.stop() + + csv_path = Path(self.tmpdir) / "android_battery.csv" + self.assertTrue(csv_path.exists()) + + df = pd.read_csv(csv_path) + + self.assertFalse(df.empty) + self.assertIn("voltage", df.columns) + self.assertIn("power_draw", df.columns) + print(df.head()) + +class FakeBatteryDevice: + """ + Deterministic battery simulator. + Mimics Android dumpsys battery output. + """ + def __init__(self): + self.level = 80 + self.voltage = 4200 + self.temperature = 310 + self.current_now = -900000 # ยตA + self.charge_counter = 3810000 + self.tick = 0 + + def step(self): + """ + Simulate time passing. 
+ """ + self.tick += 1 + + # battery slowly drains + if self.tick % 2 == 0: + self.level = max(0, self.level - 1) + # voltage drops slightly with battery level + self.voltage = 4200 - (80 - self.level) * 2 + # current fluctuates slightly + self.current_now = -900000 - (self.tick * 1000) + + def dumpsys(self): + self.step() + return f""" + level: {self.level} + temperature: {self.temperature} + voltage: {self.voltage} + current now: {self.current_now} + charge counter: {self.charge_counter} + """ + +class FakeADB: + def __init__(self, device: FakeBatteryDevice): + self.device = device + + def run(self, cmd, *args, **kwargs): + mock = MagicMock() + mock.returncode = 0 + cmd_str = " ".join(cmd) + # adb devices + if "devices" in cmd_str: + mock.stdout = "emulator-5554\tdevice\n" + return mock + + # dumpsys battery + if "dumpsys battery" in cmd_str: + mock.stdout = self.device.dumpsys() + return mock + + mock.stdout = "" + return mock + +class TestDeterministicBattery(unittest.TestCase): def setUp(self): - self.runner_config = self.__class__.BatteryConfig() + self.tmpdir = tempfile.mkdtemp() def tearDown(self): - self.runner_config.clear() + shutil.rmtree(self.tmpdir) def test_monitor(self): - - class FakeContext: - run_dir = Path(TestADBCombined.tmpdir) - - context = FakeContext() - - self.runner_config.start_measurement(context) - self.runner_config.interact(context) - self.runner_config.stop_measurement(context) - - run_data = self.runner_config.populate_run_data(context) - - self.assertTrue((Path(TestADBCombined.tmpdir)/ "android_battery.csv").is_file()) - print(run_data) \ No newline at end of file + device = FakeBatteryDevice() + fake_adb = FakeADB(device) + + def patched_run(cmd, *args, **kwargs): + return fake_adb.run(cmd, *args, **kwargs) + + with patch("subprocess.run", side_effect=patched_run): + monitor = AndroidBatteryMonitor( + out_file=Path(self.tmpdir) / "battery.csv", + poll_interval=1 + ) + monitor.start() + + time.sleep(4) + monitor.stop() + + df = 
pd.read_csv(Path(self.tmpdir) / "battery.csv") + + # deterministic checks + self.assertGreater(len(df), 2) + self.assertIn("voltage", df.columns) + self.assertIn("power_draw", df.columns) + self.assertTrue(df["voltage"].iloc[-1] <= df["voltage"].iloc[0]) + print(df) \ No newline at end of file diff --git a/test/ProgressManager/test_AnomaliesChecker.py b/test/ProgressManager/test_AnomaliesChecker.py new file mode 100644 index 000000000..f97a795cf --- /dev/null +++ b/test/ProgressManager/test_AnomaliesChecker.py @@ -0,0 +1,138 @@ +import unittest +import tempfile +from pathlib import Path +import pandas as pd +import sys + +sys.path.append("experiment-runner") + +from ProgressManager.Validation.AnomaliesChecker import ResultsValidator, AnomalyReport + +class TestAnomaliesChecker(unittest.TestCase): + def create_run_folder(self, df): + tmpdir = tempfile.TemporaryDirectory() + run_dir = Path(tmpdir.name) + csv_file = run_dir / "energibridge.csv" + df.to_csv(csv_file, index=False) + return tmpdir, run_dir + + def test_positive_values(self): + df = pd.DataFrame({ + "CPU_ENERGY (J)": [10, 12, 15], + "CORE0_ENERGY (J)": [1.5, 1.7, 2.0] + }) + tmpdir, run_dir = self.create_run_folder(df) + report = ResultsValidator.validate_output_log( + run_dir, + "run_1", + {"workload": "light"} + ) + self.assertFalse(report.has_anomalies()) + tmpdir.cleanup() + + def test_zero_value(self): + df = pd.DataFrame({ + "CPU_ENERGY (J)": [10, 0, 15] + }) + tmpdir, run_dir = self.create_run_folder(df) + report = ResultsValidator.validate_output_log( + run_dir, + "run_1", + {"workload": "light"} + ) + self.assertTrue(report.has_anomalies()) + self.assertEqual(report.anomalies[0]["anomaly_type"], "zero") + tmpdir.cleanup() + + def test_negative_value(self): + df = pd.DataFrame({ + "CPU_ENERGY (J)": [10, -5, 15] + }) + tmpdir, run_dir = self.create_run_folder(df) + report = ResultsValidator.validate_output_log( + run_dir, + "run_1", + {"workload": "medium"} + ) + 
self.assertTrue(report.has_anomalies()) + self.assertEqual(report.anomalies[0]["anomaly_type"], "negative") + tmpdir.cleanup() + + def test_nan_value(self): + df = pd.DataFrame({ + "CPU_ENERGY (J)": [10, None, 15] + }) + tmpdir, run_dir = self.create_run_folder(df) + report = ResultsValidator.validate_output_log( + run_dir, + "run_1", + {"workload": "heavy"} + ) + self.assertTrue(report.has_anomalies()) + self.assertEqual(report.anomalies[0]["anomaly_type"], "NaN") + + tmpdir.cleanup() + + def test_missing_file(self): + tmpdir = tempfile.TemporaryDirectory() + run_dir = Path(tmpdir.name) + report = ResultsValidator.validate_output_log( + run_dir, + "run_1", + {"workload": "light"} + ) + self.assertTrue(report.has_anomalies()) + self.assertEqual(report.anomalies[0]["anomaly_type"], "missing_file") + tmpdir.cleanup() + + def test_generate_report(self): + tmpdir = tempfile.TemporaryDirectory() + experiment_path = Path(tmpdir.name) + + run0 = experiment_path / "run_0" + run0.mkdir() + + pd.DataFrame({ + "CPU_ENERGY (J)": [10, 0, 15] + }).to_csv(run0 / "energibridge.csv", index=False) + + run1 = experiment_path / "run_1" + run1.mkdir() + + pd.DataFrame({ + "CPU_ENERGY (J)": [10, -5, 15] + }).to_csv(run1 / "energibridge.csv", index=False) + + run_table = [ + {"__run_id": "run_0", "workload": "light", "brightness": "low"}, + {"__run_id": "run_1", "workload": "heavy", "brightness": "high"} + ] + final_report = AnomalyReport() + + for run in run_table: + run_id = run["__run_id"] + treatment_levels = { + k: v for k, v in run.items() + if not k.startswith("__") + } + run_dir = experiment_path / run_id + run_report = ResultsValidator.validate_output_log( + run_dir, + run_id, + treatment_levels + ) + final_report.anomalies.extend(run_report.anomalies) + + self.assertTrue(final_report.has_anomalies()) + log_file = experiment_path / "energibridge.log" + ResultsValidator.save_report_to_file( + final_report, + log_file + ) + self.assertTrue(log_file.exists()) + 
print(log_file.read_text()) + tmpdir.cleanup() + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/test/ProgressManager/test_EnergyValidator.py b/test/ProgressManager/test_EnergyValidator.py deleted file mode 100644 index 6defd635a..000000000 --- a/test/ProgressManager/test_EnergyValidator.py +++ /dev/null @@ -1,82 +0,0 @@ -import unittest -import sys -from pathlib import Path - -sys.path.append("experiment-runner") - -from ProgressManager.Validation.EnergyValidator import (EnergyValidator,EnergyAnomalyReport) - - -class TestEnergyValidator(unittest.TestCase): - def test_positive_energy(self): - run_table = [ - { - "__run_id": "run_1", - "cpu_energy": 10.5 - }] - - report = EnergyValidator.validate_run_table(run_table,["cpu_energy"]) - self.assertFalse(report.has_anomalies()) - - def test_zero_energy(self): - run_table = [ - { - "__run_id": "run_1", - "cpu_energy": 0 - }] - - report = EnergyValidator.validate_run_table(run_table,["cpu_energy"]) - self.assertTrue(report.has_anomalies()) - self.assertEqual(len(report.anomalies), 1) - - def test_negative_energy(self): - run_table = [ - { - "__run_id": "run_1", - "cpu_energy": -1 - }] - - report = EnergyValidator.validate_run_table(run_table,["cpu_energy"]) - self.assertTrue(report.has_anomalies()) - self.assertEqual(len(report.anomalies), 1) - - def test_mixed_values(self): - run_table = [ - { - "__run_id": "run_1", - "cpu_energy": 10 - }, - { - "__run_id": "run_2", - "cpu_energy": 0 - }, - { - "__run_id": "run_3", - "cpu_energy": -1 - }] - - report = EnergyValidator.validate_run_table(run_table, ["cpu_energy"]) - self.assertTrue(report.has_anomalies()) - self.assertEqual(len(report.anomalies), 2) - - def test_treatment_levels_saved(self): - run_table = [ - { - "__run_id": "run_1", - "__done": "DONE", - "fib_type": "iter", - "problem_size": 1000, - "cpu_energy": -1 - }] - - report = EnergyValidator.validate_run_table(run_table,["cpu_energy"]) - anomaly = report.anomalies[0] - - 
self.assertEqual(anomaly["treatment_levels"]["fib_type"],"iter") - self.assertEqual(anomaly["treatment_levels"]["problem_size"], 1000) - self.assertNotIn("__run_id", anomaly["treatment_levels"]) - self.assertNotIn("__done", anomaly["treatment_levels"]) - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file From 311cef2af357452d0acf90f396f5e9681603328b Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Tue, 23 Jun 2026 12:54:58 +0200 Subject: [PATCH 26/30] comments --- .../Validation/AnomaliesChecker.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/experiment-runner/ProgressManager/Validation/AnomaliesChecker.py b/experiment-runner/ProgressManager/Validation/AnomaliesChecker.py index 1b69190cb..224a222c9 100644 --- a/experiment-runner/ProgressManager/Validation/AnomaliesChecker.py +++ b/experiment-runner/ProgressManager/Validation/AnomaliesChecker.py @@ -11,19 +11,21 @@ } class AnomalyReport: + def __init__(self): self.anomalies: List[Dict[str, Any]] = [] - def add_anomaly( - self, - run_id: str, - treatment_levels: Dict[str, Any], - file_path: str, - row_number: int, - column_name: str, - value: Any, - anomaly_type: str - ): + """ + Each anomaly detected has the following structure: + "run_id": the run where is located + "treatment_levels": the specific values of the run + "file_path": the file path + "row_number": the row where is located + "column_name": the column where is located + "value": the value + "anomaly_type": NAN or Zero or Negative Number or Missing file + """ + def add_anomaly(self, run_id: str, treatment_levels: Dict[str, Any], file_path: str, row_number: int, column_name: str, value: Any, anomaly_type: str): self.anomalies.append({ "run_id": run_id, "treatment_levels": treatment_levels, From 0aff27954429ff6107d18c77d43cfd7c1d41d06b Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Tue, 23 Jun 2026 13:03:48 +0200 Subject: [PATCH 27/30] Anomalies small look waise change --- 
.../ProgressManager/Validation/AnomaliesChecker.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/experiment-runner/ProgressManager/Validation/AnomaliesChecker.py b/experiment-runner/ProgressManager/Validation/AnomaliesChecker.py index 224a222c9..ab8235ce7 100644 --- a/experiment-runner/ProgressManager/Validation/AnomaliesChecker.py +++ b/experiment-runner/ProgressManager/Validation/AnomaliesChecker.py @@ -11,7 +11,6 @@ } class AnomalyReport: - def __init__(self): self.anomalies: List[Dict[str, Any]] = [] @@ -101,12 +100,7 @@ def generate_report_text(report: AnomalyReport) -> str: return "\n".join(lines) @staticmethod - def validate_output_log( - run_dir: Path, - run_id: str, - treatment_levels: Dict[str, Any], - ) -> AnomalyReport: - + def validate_output_log(run_dir: Path,run_id: str,treatment_levels: Dict[str, Any],) -> AnomalyReport: report = AnomalyReport() csv_files = list(run_dir.glob("*.csv")) From cb099d5d121dcb7c08782f7a1c2910eb66ffd65a Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Wed, 24 Jun 2026 19:42:37 +0200 Subject: [PATCH 28/30] I changed where Validation_EXPERIMENT is Calleds --- README.md | 4 ++-- .../DistributedOrchestrator.py | 7 +++---- .../Experiment/ExperimentController.py | 18 +++++++++--------- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index a8d61337e..ef09e1ff6 100644 --- a/README.md +++ b/README.md @@ -131,11 +131,11 @@ Experiment Runner supports **distributed execution across multiple machines** us ### How to run it Start the orchestrator on the master machine: ```bash -python experiment-runner/ examples// --distribute master --host host_nr --port port_nr +python experiment-runner/ examples// --distribute master ``` On each worker machine, connect to the master: ```bash -experiment-runner/ examples// --distribute worker --master orchestor_adress +experiment-runner/ examples// --distribute worker --master ``` When the experiment finish it, the master would close 
automatically, the rest of the workers would need manually closing, they would close after 120s diff --git a/experiment-runner/DistributedExecution/DistributedOrchestrator.py b/experiment-runner/DistributedExecution/DistributedOrchestrator.py index d4c9f2bfe..8b85b2b84 100644 --- a/experiment-runner/DistributedExecution/DistributedOrchestrator.py +++ b/experiment-runner/DistributedExecution/DistributedOrchestrator.py @@ -306,6 +306,9 @@ def __init__(self, config, metadata, host="0.0.0.0", port=5000): self.experiment_path.mkdir(parents=True, exist_ok=True) self.run_table_path = (self.experiment_path / "run_table.csv") + EventSubscriptionController.raise_event( + RunnerEvents.VALIDATE_EXPERIMENT + ) if self.run_table_path.exists(): print("[MASTER] Existing experiment detected") @@ -333,10 +336,6 @@ def __init__(self, config, metadata, host="0.0.0.0", port=5000): def start(self): if self.finished_before_start: return - - EventSubscriptionController.raise_event( - RunnerEvents.VALIDATE_EXPERIMENT - ) EventSubscriptionController.raise_event( RunnerEvents.BEFORE_EXPERIMENT diff --git a/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py b/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py index 5abc9d1d6..6331839a6 100644 --- a/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py +++ b/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py @@ -42,6 +42,15 @@ def __init__(self, config: RunnerConfig, metadata: Metadata): self.csv_data_manager = CSVOutputManager(self.config.experiment_path) self.json_data_manager = JSONOutputManager(self.config.experiment_path) + + # -- Validate experiment setup + # TODO: From the user perspective, it would be nice to know if are any possible issues with the experiment before staring the experiment runs. 
For example, if the config hooks are not properly defined, or if there are any issues with the config file itself + output.console_log_WARNING("Calling validate_experiment config hook") + try: + EventSubscriptionController.raise_event(RunnerEvents.VALIDATE_EXPERIMENT) + except BaseError as e: + output.console_log_FAIL(f"Experiment validation failed: {e}") + raise run_tbl = self.config.create_run_table_model() # Add in the proper data column for energibridge @@ -123,15 +132,6 @@ def __init__(self, config: RunnerConfig, metadata: Metadata): output.console_log_WARNING("Experiment run table created...") def do_experiment(self): - # -- Validate experiment setup - # TODO: From the user perspective, it would be nice to know if are any possible issues with the experiment before staring the experiment runs. For example, if the config hooks are not properly defined, or if there are any issues with the config file itself - output.console_log_WARNING("Calling validate_experiment config hook") - try: - EventSubscriptionController.raise_event(RunnerEvents.VALIDATE_EXPERIMENT) - except BaseError as e: - output.console_log_FAIL(f"Experiment validation failed: {e}") - raise - output.console_log_OK("Experiment setup completed...") # -- Before experiment From c493d6d1babdf3790d1157f119cfc764ca9459d2 Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Wed, 24 Jun 2026 20:17:30 +0200 Subject: [PATCH 29/30] Added the CONTINUE hook --- .../DistributedExecution/DistributedOrchestrator.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/experiment-runner/DistributedExecution/DistributedOrchestrator.py b/experiment-runner/DistributedExecution/DistributedOrchestrator.py index 8b85b2b84..b4614dfca 100644 --- a/experiment-runner/DistributedExecution/DistributedOrchestrator.py +++ b/experiment-runner/DistributedExecution/DistributedOrchestrator.py @@ -1,5 +1,7 @@ from ProgressManager.RunTable.Models.RunProgress import RunProgress +from 
ConfigValidator.Config.Models.Metadata import Metadata from ProgressManager.Output.CSVOutputManager import CSVOutputManager +from ConfigValidator.Config.Models.OperationType import OperationType from EventManager.Models.RunnerEvents import RunnerEvents from EventManager.EventSubscriptionController import EventSubscriptionController from ProgressManager.Validation.AnomaliesChecker import ResultsValidator, AnomalyReport @@ -26,7 +28,8 @@ ### ========================================================= class TaskManager: - def __init__(self, run_table, experiment_path: Path): + def __init__(self,config, run_table, experiment_path: Path): + self.config = config self.run_table = run_table self.experiment_path = experiment_path self.assigned_runs = {} @@ -93,13 +96,14 @@ def complete_task(self, run_id, data): self.shutdown = True print("\n[MASTER] ALL RUNS COMPLETED\n") + if self.config.operation_type is OperationType.SEMI: + EventSubscriptionController.raise_event(RunnerEvents.CONTINUE) + # AFTER_EXPERIMENT hook print("[MASTER] Calling AFTER_EXPERIMENT hook") EventSubscriptionController.raise_event( RunnerEvents.AFTER_EXPERIMENT ) - #time.sleep(5) - #shutdown_server() def restore_crashed_runs(self): """ @@ -320,7 +324,7 @@ def __init__(self, config, metadata, host="0.0.0.0", port=5000): run_table = (config.create_run_table_model().generate_experiment_run_table()) pd.DataFrame(run_table).to_csv(self.run_table_path, index=False) - self.task_manager = TaskManager(run_table, self.experiment_path) + self.task_manager = TaskManager(self.config, run_table, self.experiment_path) self.task_manager.restore_crashed_runs() if self.task_manager.experiment_already_completed(): From 04a7b8819476ca95f6ddc152b03501c7b3b5f7f8 Mon Sep 17 00:00:00 2001 From: andaBarbu Date: Wed, 24 Jun 2026 21:11:53 +0200 Subject: [PATCH 30/30] chnage in anomalies checker --- .../DistributedOrchestrator.py | 25 +++---------- .../Experiment/ExperimentController.py | 30 ++++++++-------- 
.../Validation/AnomaliesChecker.py | 36 ++++++++++++++----- 3 files changed, 46 insertions(+), 45 deletions(-) diff --git a/experiment-runner/DistributedExecution/DistributedOrchestrator.py b/experiment-runner/DistributedExecution/DistributedOrchestrator.py index b4614dfca..270cd7616 100644 --- a/experiment-runner/DistributedExecution/DistributedOrchestrator.py +++ b/experiment-runner/DistributedExecution/DistributedOrchestrator.py @@ -144,11 +144,10 @@ def experiment_already_completed(self): ### ========================================================= class APIServer: - def __init__(self, task_manager, worker_monitor, validation_results): + def __init__(self, task_manager, worker_monitor): self.app = Flask(__name__) self.task_manager = task_manager self.monitor = worker_monitor - self.validation_results = validation_results @self.app.route('/task', methods=['GET']) def get_task(): @@ -194,7 +193,8 @@ def submit_result(): if anomalies: report = AnomalyReport() report.anomalies.extend(anomalies) - self.validation_results[run_id] = report + log_file_path = (self.task_manager.experiment_path/ self.task_manager.config.energy_validation_log_file) + ResultsValidator.update_report(report, log_file_path) return jsonify({"status": "ok"}) @self.app.route('/heartbeat', methods=['POST']) @@ -304,7 +304,6 @@ def __init__(self, config, metadata, host="0.0.0.0", port=5000): self.metadata = metadata self.host = host self.port = port - self.validation_results = {} self.experiment_path = (config.results_output_path / config.name) self.experiment_path.mkdir(parents=True, exist_ok=True) @@ -335,7 +334,7 @@ def __init__(self, config, metadata, host="0.0.0.0", port=5000): self.finished_before_start = False self.monitor = WorkerMonitor(self.task_manager) - self.api = APIServer(self.task_manager, self.monitor, self.validation_results) + self.api = APIServer(self.task_manager, self.monitor) def start(self): if self.finished_before_start: @@ -369,22 +368,6 @@ def start(self): 
print("[MASTER] Waiting for workers to shutdown...") time.sleep(10) - combined_report = AnomalyReport() - - for report in self.validation_results.values(): - combined_report.anomalies.extend(report.anomalies) - - if combined_report.has_anomalies(): - log_file_path = ( - self.experiment_path - / self.config.energy_validation_log_file - ) - - ResultsValidator.save_report_to_file( - combined_report, - log_file_path - ) - print("[MASTER] Shutting down") os._exit(0) diff --git a/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py b/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py index 6331839a6..124cc9002 100644 --- a/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py +++ b/experiment-runner/ExperimentOrchestrator/Experiment/ExperimentController.py @@ -38,13 +38,15 @@ class ExperimentController: def __init__(self, config: RunnerConfig, metadata: Metadata): self.config = config self.metadata = metadata - self.validation_results: dict[str, AnomalyReport] = {} + self.validation_state = 0 + self.validation_log_file_path = (self.config.experiment_path / self.config.energy_validation_log_file) + self.csv_data_manager = CSVOutputManager(self.config.experiment_path) self.json_data_manager = JSONOutputManager(self.config.experiment_path) - # -- Validate experiment setup # TODO: From the user perspective, it would be nice to know if are any possible issues with the experiment before staring the experiment runs. 
For example, if the config hooks are not properly defined, or if there are any issues with the config file itself + output.console_log_WARNING("Calling validate_experiment config hook") try: EventSubscriptionController.raise_event(RunnerEvents.VALIDATE_EXPERIMENT) @@ -70,6 +72,12 @@ def __init__(self, config: RunnerConfig, metadata: Metadata): output.console_log_WARNING(f"Reusing already existing experiment path: {self.config.experiment_path}") existing_run_table = self.csv_data_manager.read_run_table() + for run in existing_run_table: + if run['__done'] == RunProgress.RUNNING: + run['__done'] = RunProgress.TODO + self.csv_data_manager.write_run_table(existing_run_table) + print("[MASTER] Restored RUNNING -> TODO after restart") + # First sanity check. If there is no "TODO" in the __done column, simply abort. todo_run_found = any([current_run['__done'] != RunProgress.DONE for current_run in existing_run_table]) if not todo_run_found: @@ -170,7 +178,10 @@ def do_experiment(self): treatment_levels, ) if run_report.has_anomalies(): - self.validation_results[run_id] = run_report + ResultsValidator.update_report( + run_report, + self.validation_log_file_path + ) time_btwn_runs = self.config.time_between_runs_in_ms if time_btwn_runs > 0: @@ -184,15 +195,4 @@ def do_experiment(self): # -- After experiment output.console_log_WARNING("Calling after_experiment config hook") - EventSubscriptionController.raise_event(RunnerEvents.AFTER_EXPERIMENT) - - # -- Anomalies Report creation - combined_report = AnomalyReport() - - for report in self.validation_results.values(): - combined_report.anomalies.extend(report.anomalies) - - if combined_report.has_anomalies(): - log_file_path = (self.config.experiment_path / self.config.energy_validation_log_file) - output.console_log_WARNING(f"Anomalies detected. 
Report saved to {log_file_path}") - ResultsValidator.save_report_to_file(combined_report, log_file_path) \ No newline at end of file + EventSubscriptionController.raise_event(RunnerEvents.AFTER_EXPERIMENT) \ No newline at end of file diff --git a/experiment-runner/ProgressManager/Validation/AnomaliesChecker.py b/experiment-runner/ProgressManager/Validation/AnomaliesChecker.py index ab8235ce7..ab356bdea 100644 --- a/experiment-runner/ProgressManager/Validation/AnomaliesChecker.py +++ b/experiment-runner/ProgressManager/Validation/AnomaliesChecker.py @@ -66,16 +66,14 @@ def _detect_numeric_columns(df: pd.DataFrame) -> List[str]: return numeric_cols @staticmethod - def generate_report_text(report: AnomalyReport) -> str: + def generate_report_text(report: AnomalyReport, include_header: bool = True) -> str: lines = [] - lines.append("=" * 80) - lines.append("GENERIC MEASUREMENT VALIDATION REPORT") - lines.append("=" * 80) - lines.append("") - - if not report.has_anomalies(): - lines.append("No anomalies found.") - return "\n".join(lines) + + if include_header: + lines.append("=" * 80) + lines.append("GENERIC MEASUREMENT VALIDATION REPORT") + lines.append("=" * 80) + lines.append("") runs: Dict[str, List[Dict[str, Any]]] = {} @@ -130,6 +128,26 @@ def validate_output_log(run_dir: Path,run_id: str,treatment_levels: Dict[str, An elif value == 0: report.add_anomaly(run_id, treatment_levels, str(csv_file), row_number, column, value, "zero") return report + + @staticmethod + def update_report(report: AnomalyReport, log_file: Path): + if not report.has_anomalies(): + return + + first_report = not log_file.exists() + report_text = ResultsValidator.generate_report_text(report, include_header = first_report) + + try: + log_file.parent.mkdir(parents=True, exist_ok=True) + mode = "a" if log_file.exists() else "w" + + with open(log_file, mode) as f: + if mode == "a": + f.write("\n\n") + f.write(report_text) + output.console_log_OK(f"Results validation report updated: {log_file}") + 
except Exception as e: + output.console_log_FAIL(f"Failed to update results validation report: {e}") @staticmethod def save_report_to_file(report: EnergyAnomalyReport, log_file: Path) -> None: