From 5acb208ae0ddee1f30619a5822d51d9b0591be48 Mon Sep 17 00:00:00 2001
From: naomi-simumba <naomi.simumba@gmail.com>
Date: Thu, 14 Aug 2025 13:19:51 +0100
Subject: [PATCH 01/11] automatically plot summarized results

Signed-off-by: naomi-simumba <naomi.simumba@gmail.com>
---
 benchmark/utils.py | 139 +++++++++++++++++++++++++--------------------
 1 file changed, 78 insertions(+), 61 deletions(-)

diff --git a/benchmark/utils.py b/benchmark/utils.py
index 8c8a7f8..e8dc57c 100644
--- a/benchmark/utils.py
+++ b/benchmark/utils.py
@@ -28,7 +28,6 @@
     "0.01x_train": 1,
 }
 
-
 def unflatten(dictionary: Dict[str, Any]):
     resultDict: Dict = {}
     for key, value in dictionary.items():
@@ -212,13 +211,15 @@ def extract_repeated_experiment_results(
                 if task in task_info:
                     metric_name = task_info[task]
                     metric_name = 'test_test/' + metric_name.split("/")[-1]
-                else:
+                else:  
                     continue
 
                 if metric_name not in run.data.metrics:
                     logger.info(f"{metric_name} not found in task {task}. Skipping")
                     continue
                 score = run.data.metrics[metric_name]
+                if ("rmse" in metric_name) or ("RMSE" in metric_name):
+                    score = 1-score
                 run_names.append(run.info.run_name)
                 exp_ids.append(experiment_id)
                 exp_names.append(original_experiment_name)
@@ -305,21 +306,31 @@ def extract_parameters(
         if experiment_info is None:
             continue
         experiment_id = experiment_info.experiment_id
-        logger.info(f"\nexperiment_name: {experiment_name} ")
+        logger.info(f"\n\n\nexperiment_name: {experiment_name}.")
         logger.info(f"experiment_id: {experiment_info.experiment_id}")
+
         exp_parent_run_name = f"top_run_{experiment_name}"
         experiment_parent_run_data = client.search_runs(
             experiment_ids=[experiment_id],
             filter_string=f'tags."mlflow.runName" LIKE "{exp_parent_run_name}"',
         )
+
+        logger.info(
+                f"experiment_parent_run_data: {len(experiment_parent_run_data)}"
+            )
+        for run in experiment_parent_run_data:
+            logger.info(
+                f"{run.info.run_id}: {run.info.run_name}"
+            )
         if (len(experiment_parent_run_data) > 1) or (
             len(experiment_parent_run_data) == 0
         ):
-            logger.debug(
+            logger.info(
                 f"The number of parent runs for each experiment should be 1. \
-                         It is currently {len(experiment_parent_run_data)}"
+                         It is currently {len(experiment_parent_run_data)}. Skipping."
             )
-            raise RuntimeError
+            continue
+            #raise RuntimeError
         for run in experiment_parent_run_data:
             exp_parent_run_id = run.info.run_id
 
@@ -333,39 +344,45 @@ def extract_parameters(
 
         for task in task_names:
             logger.info(f"task: {task}")
-            matching_runs = [run for run in runs if run.info.run_name.endswith(task)]  # type: ignore
-            best_params = matching_runs[0].data.params
-
-            # eval them
-            best_params = {k: literal_eval(v) for k, v in best_params.items()}
-            best_params["experiment_name"] = experiment_name
-            best_params["dataset"] = task
-            best_params["decoder"] = matching_runs[0].data.tags["decoder"]
-            best_params["backbone"] = matching_runs[0].data.tags["backbone"]
-            best_params["early_stop_patience"] = matching_runs[0].data.tags[
-                "early_stop_patience"
-            ]
-            best_params["n_trials"] = matching_runs[0].data.tags["n_trials"]
-            best_params["partition_name"] = matching_runs[0].data.tags["partition_name"]
-            best_params["data_percentages"] = DATA_PARTITIONS[
-                best_params["partition_name"]
-            ]
-            if 'optimizer_hparams' in best_params:
-                logger.info(
-                    f"optimizer_hparams: {best_params['optimizer_hparams'].items()}"
-                )
-                optimizer_hparams = {
-                    k: v for k, v in best_params['optimizer_hparams'].items()
-                }
-                best_params.update(optimizer_hparams)
-                del best_params['optimizer_hparams']
-            if 'model_args' in best_params:
-                model_args = {k: v for k, v in best_params['model_args'].items()}
-                best_params.update(model_args)
-                del best_params['model_args']
-
-            best_params = pd.DataFrame(best_params, index=[0])
-            all_params.append(best_params)
+            try: #doing try/except because some tasks are incomplete and will raise an error
+                matching_runs = [run for run in runs if run.info.run_name.endswith(task)]  # type: ignore
+                best_params = matching_runs[0].data.params
+
+                # eval them
+                best_params = {k: literal_eval(v) for k, v in best_params.items()}
+                best_params["experiment_name"] = experiment_name
+                best_params["dataset"] = task
+                best_params["decoder"] = matching_runs[0].data.tags["decoder"] if "decoder" in matching_runs[0].data.tags else "N/A"
+                best_params["backbone"] = matching_runs[0].data.tags["backbone"]
+                best_params["early_stop_patience"] = matching_runs[0].data.tags[
+                    "early_stop_patience"
+                ]
+                best_params["n_trials"] = matching_runs[0].data.tags["n_trials"]
+                best_params["partition_name"] = matching_runs[0].data.tags["partition_name"]
+                best_params["data_percentages"] = DATA_PARTITIONS[
+                    best_params["partition_name"]
+                ]
+                if 'optimizer_hparams' in best_params:
+                    logger.info(
+                        f"optimizer_hparams: {best_params['optimizer_hparams'].items()}"
+                    )
+                    optimizer_hparams = {
+                        k: v for k, v in best_params['optimizer_hparams'].items()
+                    }
+                    best_params.update(optimizer_hparams)
+                    del best_params['optimizer_hparams']
+                if 'model_args' in best_params:
+                    model_args = {k: v for k, v in best_params['model_args'].items()}
+                    best_params.update(model_args)
+                    del best_params['model_args']
+
+                best_params = pd.DataFrame(best_params, index=[0])
+                all_params.append(best_params)
+            except Exception as e:
+                logger.info(f"error: {e}.")
+                continue
+
+    logger.info(f"\n all_params: {len(all_params)}.")
     all_params = pd.concat(all_params, axis=0)
     all_params = all_params.reset_index()
     return all_params
@@ -395,11 +412,9 @@ def get_results_and_parameters(
         pd.DataFrame with results and parameters
     """
     if Path(storage_uri).exists() and Path(storage_uri).is_dir():
-        results_dir = (
-            Path(storage_uri).parents[0] / "summarized_results" / benchmark_name
-        )
+        results_dir = Path(storage_uri).parents[0] / "summarized_results" / benchmark_name
     else:
-        print("Please use a valid directory for storage_uri")
+        logger.info("Please use a valid directory for storage_uri")
         raise ValueError
     if not os.path.exists(results_dir):
         os.makedirs(results_dir)
@@ -430,6 +445,14 @@ def get_results_and_parameters(
     results_and_parameters.to_csv(
         f"{str(results_dir)}/results_and_parameters.csv", index=False
     )
+
+    visualize_combined_results(
+            combined_results=results_and_parameters,
+            storage_uri=storage_uri,
+            logger=logger,
+            plot_file_base_name=f"multiple_models_{benchmark_name}",
+        )
+
     return results_and_parameters
 
 
@@ -577,7 +600,7 @@ def check_existing_experiments(
     exp_parent_run_name: str,
     task_names: list,
     n_trials: int,
-    backbone: str,
+    backbone: str
 ) -> Dict[str, Any]:
     """
     checks if experiment has been completed (i.e. both task run and nested individual runs are complete)
@@ -700,14 +723,15 @@ def visualize_combined_results(
     save_folder = Path(storage_uri).parents[0] / "visualizations"
     tables_folder = save_folder / "tables"
     plots_folder = save_folder / "plots"
+    normalizer_folder = save_folder / "normalizer"
     if not os.path.exists(tables_folder):
         os.makedirs(tables_folder)
     if not os.path.exists(plots_folder):
         os.makedirs(plots_folder)
+    if not os.path.exists(normalizer_folder):
+        os.makedirs(normalizer_folder)
 
-    combined_results = []
-    model_order = []
-    experiments = list(set(combined_results["experiment_name"]))
+    experiments = list(set(combined_results["experiment_name"].tolist()))
     combined_results = combined_results.rename(columns={"experiment_name": "model"})
     num_experiments = len(experiments)
     fig_size = (num_experiments * 5, 6) if num_experiments >= 3 else (15, 6)
@@ -717,12 +741,11 @@ def visualize_combined_results(
         zip(model_order, sns.color_palette("tab20", n_colors=len(model_order)))
     )
 
-    try:
+    if True:
         # plot raw values
         plot_tools.plot_per_dataset(
             combined_results,
             model_order=model_order,
-            plot_file_base_name=plot_file_base_name,
             model_colors=model_colors,
             metric="test metric",
             sharey=False,
@@ -741,12 +764,13 @@ def visualize_combined_results(
             combined_results,
             metrics=("test metric",),
             benchmark_name=plot_file_base_name,
+            normalizer_folder= normalizer_folder
         )
         bootstrapped_iqm, normalized_combined_results = (
             plot_tools.normalize_bootstrap_and_plot(
-                combined_results,
-                plot_file_base_name=plot_file_base_name,
+                df=combined_results,
                 metric="test metric",
+                normalizer_folder=normalizer_folder,
                 benchmark_name=plot_file_base_name,
                 model_order=model_order,
                 model_colors=model_colors,
@@ -754,7 +778,6 @@ def visualize_combined_results(
                 n_legend_rows=n_legend_rows,
             )
         )
-        # dataset_name_map=dataset_name_map)
 
         plt.savefig(
             str(
@@ -772,7 +795,7 @@ def visualize_combined_results(
                 tables_folder / f"{plot_file_base_name}_normalized_combined_results.csv"
             )
         )
-    except Exception as e:
+    else: #except Exception as e:
         logger.info(f"could not visualize due to error: {e}")
 
 
@@ -797,7 +820,6 @@ def get_logger(log_level="INFO", log_folder="./experiment_logs") -> logging.Root
     logging.basicConfig(level=logging.CRITICAL)
     return logger
 
-
 def import_custom_modules(
     logger: logging.RootLogger,
     custom_modules_path: str | Path | None = None,
@@ -819,17 +841,12 @@ def import_custom_modules(
                 module = importlib.import_module(module_dir)
                 logger.info(f"Found {custom_modules_path}")
             except ImportError:
-                raise ImportError(
-                    f"It was not possible to import modules from {custom_modules_path}."
-                )
+                raise ImportError(f"It was not possible to import modules from {custom_modules_path}.")
         else:
-            raise ValueError(
-                f"Modules path {custom_modules_path} isn't a directory. Check if you have defined it properly."
-            )
+            raise ValueError(f"Modules path {custom_modules_path} isn't a directory. Check if you have defined it properly.")
     else:
         logger.debug("No custom module is being used.")
 
-
 if __name__ == "__main__":
     logger = get_logger()
     storage_uri = "results_folder/hpo"  # storage_uri from config

From ee23d32c50c59e2396d1fd77fec1bfa5df54721f Mon Sep 17 00:00:00 2001
From: naomi-simumba <naomi.simumba@gmail.com>
Date: Thu, 14 Aug 2025 13:29:15 +0100
Subject: [PATCH 02/11] fix reading underscore metrics

Signed-off-by: naomi-simumba <naomi.simumba@gmail.com>
---
 benchmark/utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/benchmark/utils.py b/benchmark/utils.py
index e8dc57c..4127ee7 100644
--- a/benchmark/utils.py
+++ b/benchmark/utils.py
@@ -210,7 +210,9 @@ def extract_repeated_experiment_results(
                 seed = int(run.info.run_name.split("_")[-1])
                 if task in task_info:
                     metric_name = task_info[task]
-                    metric_name = 'test_test/' + metric_name.split("/")[-1]
+                    name_1 = 'test_test/' + metric_name.split("/")[-1]
+                    name_2 = 'test_test_' + task.metric.replace(task.metric.split('_')[0] + "_", '')
+                    metric_name = name_1 if '/' in task.metric else name_2
                 else:  
                     continue
 

From b05ec337bda90e0b775eea403632fe1187c67c69 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Mon, 18 Aug 2025 20:20:23 -0300
Subject: [PATCH 03/11] change input param: list of experiments to experiment
 name so we can test summarize function

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 benchmark/main.py  |  22 ++++---
 benchmark/utils.py | 151 +++++++++++++++++++++++----------------------
 run_tests.py       |   1 +
 3 files changed, 91 insertions(+), 83 deletions(-)

diff --git a/benchmark/main.py b/benchmark/main.py
index 8d7128f..95b43aa 100644
--- a/benchmark/main.py
+++ b/benchmark/main.py
@@ -41,7 +41,7 @@ def main():
         help="summarize results from repeated experiments",
         action="store_true",
     )
-    parser.add_argument('--list_of_experiment_names', type=list[str])
+    parser.add_argument('--list_of_experiment_names', type=str, nargs=1)
     parser.add_argument('--task_names', type=list[str])
     parser.add_argument('--task_metrics', type=list[str])
     parser.add_argument(
@@ -52,15 +52,17 @@ def main():
 
     args = parser.parse_args()
     paths: List[Any] = args.config
+    assert isinstance(paths, list), f"Error! {paths=} is not a list"
+    assert len(paths) > 0
     path = paths[0]
     config = parser.parse_path(path)
     config_init = parser.instantiate_classes(config)
 
-    summarize = args.summarize
+    summarize: bool = args.summarize
     assert isinstance(summarize, bool), f"Error! {summarize=} is not a bool"
-    repeat = args.repeat
+    repeat: bool = args.repeat
     assert isinstance(repeat, bool), f"Error! {repeat=} is not a bool"
-    hpo = args.hpo
+    hpo: bool = args.hpo
     assert isinstance(hpo, bool), f"Error! {hpo=} is not a bool"
 
     storage_uri = config_init.storage_uri
@@ -79,12 +81,12 @@ def main():
             hpo is False and repeat is False
         ), f"Error! both {repeat=} and {hpo=} must be False when summarizing results from multiple experiments."
 
-        list_of_experiment_names = config_init.list_of_experiment_names
+        experiment_name = config_init.list_of_experiment_names
         assert isinstance(
-            list_of_experiment_names, list
-        ), f"Error! {list_of_experiment_names=} is not a list"
-        for exp in list_of_experiment_names:
-            assert isinstance(exp, str), f"Error! {exp=} is not a str"
+            experiment_name, str
+        ), f"Error! {experiment_name=} is not a str"
+        # for exp in experiment_name:
+        #     assert isinstance(exp, str), f"Error! {exp=} is not a str"
 
         task_names = config_init.task_names
         assert isinstance(task_names, list), f"Error! {task_names=} is not a list"
@@ -108,7 +110,7 @@ def main():
             benchmark_name=benchmark_name,
             storage_uri=storage_uri,
             logger=logger,
-            experiments=list_of_experiment_names,
+            experiments=[experiment_name],
             task_names=task_names,
             num_repetitions=run_repetitions,
             task_metrics=task_metrics,
diff --git a/benchmark/utils.py b/benchmark/utils.py
index 4127ee7..7c494ca 100644
--- a/benchmark/utils.py
+++ b/benchmark/utils.py
@@ -28,6 +28,7 @@
     "0.01x_train": 1,
 }
 
+
 def unflatten(dictionary: Dict[str, Any]):
     resultDict: Dict = {}
     for key, value in dictionary.items():
@@ -149,7 +150,7 @@ def extract_repeated_experiment_results(
     task_metrics: list,
     task_names: list,
     num_repetitions: int = REPEATED_SEEDS_DEFAULT,
-) -> (pd.DataFrame, list):
+) -> tuple[pd.DataFrame, list]:
     """
     extracts results of repeated experiments from mlflow logs and saves them in csv
     save list of incomplete experiments to a txt file
@@ -211,9 +212,11 @@ def extract_repeated_experiment_results(
                 if task in task_info:
                     metric_name = task_info[task]
                     name_1 = 'test_test/' + metric_name.split("/")[-1]
-                    name_2 = 'test_test_' + task.metric.replace(task.metric.split('_')[0] + "_", '')
+                    name_2 = 'test_test_' + task.metric.replace(
+                        task.metric.split('_')[0] + "_", ''
+                    )
                     metric_name = name_1 if '/' in task.metric else name_2
-                else:  
+                else:
                     continue
 
                 if metric_name not in run.data.metrics:
@@ -221,7 +224,7 @@ def extract_repeated_experiment_results(
                     continue
                 score = run.data.metrics[metric_name]
                 if ("rmse" in metric_name) or ("RMSE" in metric_name):
-                    score = 1-score
+                    score = 1 - score
                 run_names.append(run.info.run_name)
                 exp_ids.append(experiment_id)
                 exp_names.append(original_experiment_name)
@@ -317,13 +320,9 @@ def extract_parameters(
             filter_string=f'tags."mlflow.runName" LIKE "{exp_parent_run_name}"',
         )
 
-        logger.info(
-                f"experiment_parent_run_data: {len(experiment_parent_run_data)}"
-            )
+        logger.info(f"experiment_parent_run_data: {len(experiment_parent_run_data)}")
         for run in experiment_parent_run_data:
-            logger.info(
-                f"{run.info.run_id}: {run.info.run_name}"
-            )
+            logger.info(f"{run.info.run_id}: {run.info.run_name}")
         if (len(experiment_parent_run_data) > 1) or (
             len(experiment_parent_run_data) == 0
         ):
@@ -332,7 +331,7 @@ def extract_parameters(
                          It is currently {len(experiment_parent_run_data)}. Skipping."
             )
             continue
-            #raise RuntimeError
+            # raise RuntimeError
         for run in experiment_parent_run_data:
             exp_parent_run_id = run.info.run_id
 
@@ -346,7 +345,7 @@ def extract_parameters(
 
         for task in task_names:
             logger.info(f"task: {task}")
-            try: #doing try/except because some tasks are incomplete and will raise an error
+            try:  # doing try/except because some tasks are incomplete and will raise an error
                 matching_runs = [run for run in runs if run.info.run_name.endswith(task)]  # type: ignore
                 best_params = matching_runs[0].data.params
 
@@ -354,13 +353,19 @@ def extract_parameters(
                 best_params = {k: literal_eval(v) for k, v in best_params.items()}
                 best_params["experiment_name"] = experiment_name
                 best_params["dataset"] = task
-                best_params["decoder"] = matching_runs[0].data.tags["decoder"] if "decoder" in matching_runs[0].data.tags else "N/A"
+                best_params["decoder"] = (
+                    matching_runs[0].data.tags["decoder"]
+                    if "decoder" in matching_runs[0].data.tags
+                    else "N/A"
+                )
                 best_params["backbone"] = matching_runs[0].data.tags["backbone"]
                 best_params["early_stop_patience"] = matching_runs[0].data.tags[
                     "early_stop_patience"
                 ]
                 best_params["n_trials"] = matching_runs[0].data.tags["n_trials"]
-                best_params["partition_name"] = matching_runs[0].data.tags["partition_name"]
+                best_params["partition_name"] = matching_runs[0].data.tags[
+                    "partition_name"
+                ]
                 best_params["data_percentages"] = DATA_PARTITIONS[
                     best_params["partition_name"]
                 ]
@@ -414,7 +419,9 @@ def get_results_and_parameters(
         pd.DataFrame with results and parameters
     """
     if Path(storage_uri).exists() and Path(storage_uri).is_dir():
-        results_dir = Path(storage_uri).parents[0] / "summarized_results" / benchmark_name
+        results_dir = (
+            Path(storage_uri).parents[0] / "summarized_results" / benchmark_name
+        )
     else:
         logger.info("Please use a valid directory for storage_uri")
         raise ValueError
@@ -449,11 +456,11 @@ def get_results_and_parameters(
     )
 
     visualize_combined_results(
-            combined_results=results_and_parameters,
-            storage_uri=storage_uri,
-            logger=logger,
-            plot_file_base_name=f"multiple_models_{benchmark_name}",
-        )
+        combined_results=results_and_parameters,
+        storage_uri=storage_uri,
+        logger=logger,
+        plot_file_base_name=f"multiple_models_{benchmark_name}",
+    )
 
     return results_and_parameters
 
@@ -602,7 +609,7 @@ def check_existing_experiments(
     exp_parent_run_name: str,
     task_names: list,
     n_trials: int,
-    backbone: str
+    backbone: str,
 ) -> Dict[str, Any]:
     """
     checks if experiment has been completed (i.e. both task run and nested individual runs are complete)
@@ -743,62 +750,54 @@ def visualize_combined_results(
         zip(model_order, sns.color_palette("tab20", n_colors=len(model_order)))
     )
 
-    if True:
-        # plot raw values
-        plot_tools.plot_per_dataset(
-            combined_results,
+    # plot raw values
+    plot_tools.plot_per_dataset(
+        combined_results,
+        model_order=model_order,
+        model_colors=model_colors,
+        metric="test metric",
+        sharey=False,
+        inner="points",
+        fig_size=fig_size,
+        n_legend_rows=n_legend_rows,
+    )
+    plt.savefig(
+        str(plots_folder / f"violin_{plot_file_base_name}_raw.png"),
+        bbox_inches="tight",
+    )
+    plt.close()
+
+    # plot normalized, bootstrapped values
+    plot_tools.make_normalizer(
+        combined_results,
+        metrics=("test metric",),
+        benchmark_name=plot_file_base_name,
+        normalizer_folder=normalizer_folder,
+    )
+    bootstrapped_iqm, normalized_combined_results = (
+        plot_tools.normalize_bootstrap_and_plot(
+            df=combined_results,
+            metric="test metric",
+            normalizer_folder=normalizer_folder,
+            benchmark_name=plot_file_base_name,
             model_order=model_order,
             model_colors=model_colors,
-            metric="test metric",
-            sharey=False,
-            inner="points",
             fig_size=fig_size,
             n_legend_rows=n_legend_rows,
         )
-        plt.savefig(
-            str(plots_folder / f"violin_{plot_file_base_name}_raw.png"),
-            bbox_inches="tight",
-        )
-        plt.close()
-
-        # plot normalized, bootstrapped values values
-        plot_tools.make_normalizer(
-            combined_results,
-            metrics=("test metric",),
-            benchmark_name=plot_file_base_name,
-            normalizer_folder= normalizer_folder
-        )
-        bootstrapped_iqm, normalized_combined_results = (
-            plot_tools.normalize_bootstrap_and_plot(
-                df=combined_results,
-                metric="test metric",
-                normalizer_folder=normalizer_folder,
-                benchmark_name=plot_file_base_name,
-                model_order=model_order,
-                model_colors=model_colors,
-                fig_size=fig_size,
-                n_legend_rows=n_legend_rows,
-            )
-        )
+    )
 
-        plt.savefig(
-            str(
-                plots_folder
-                / f"violin_{plot_file_base_name}_normalized_bootstrapped.png"
-            ),
-            bbox_inches="tight",
-        )
-        plt.close()
-        bootstrapped_iqm.to_csv(
-            str(tables_folder / f"{plot_file_base_name}_bootstrapped_iqm.csv")
-        )
-        combined_results.to_csv(
-            str(
-                tables_folder / f"{plot_file_base_name}_normalized_combined_results.csv"
-            )
-        )
-    else: #except Exception as e:
-        logger.info(f"could not visualize due to error: {e}")
+    plt.savefig(
+        str(plots_folder / f"violin_{plot_file_base_name}_normalized_bootstrapped.png"),
+        bbox_inches="tight",
+    )
+    plt.close()
+    bootstrapped_iqm.to_csv(
+        str(tables_folder / f"{plot_file_base_name}_bootstrapped_iqm.csv")
+    )
+    combined_results.to_csv(
+        str(tables_folder / f"{plot_file_base_name}_normalized_combined_results.csv")
+    )
 
 
 def get_logger(log_level="INFO", log_folder="./experiment_logs") -> logging.RootLogger:
@@ -822,6 +821,7 @@ def get_logger(log_level="INFO", log_folder="./experiment_logs") -> logging.Root
     logging.basicConfig(level=logging.CRITICAL)
     return logger
 
+
 def import_custom_modules(
     logger: logging.RootLogger,
     custom_modules_path: str | Path | None = None,
@@ -843,12 +843,17 @@ def import_custom_modules(
                 module = importlib.import_module(module_dir)
                 logger.info(f"Found {custom_modules_path}")
             except ImportError:
-                raise ImportError(f"It was not possible to import modules from {custom_modules_path}.")
+                raise ImportError(
+                    f"It was not possible to import modules from {custom_modules_path}."
+                )
         else:
-            raise ValueError(f"Modules path {custom_modules_path} isn't a directory. Check if you have defined it properly.")
+            raise ValueError(
+                f"Modules path {custom_modules_path} isn't a directory. Check if you have defined it properly."
+            )
     else:
         logger.debug("No custom module is being used.")
 
+
 if __name__ == "__main__":
     logger = get_logger()
     storage_uri = "results_folder/hpo"  # storage_uri from config
diff --git a/run_tests.py b/run_tests.py
index 95f5fc4..203b63c 100644
--- a/run_tests.py
+++ b/run_tests.py
@@ -32,6 +32,7 @@ def run_tests(test_id: Optional[str] = None):
             out_file.unlink(missing_ok=True)
             assert not out_file.exists()
         jbsub = f"bsub -e {err_file} -o {out_file} -M 40G -gpu \"num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB\" pytest -vv tests/test_benchmark.py::test_run_benchmark[{tc_id}]"
+        
         cmd = jbsub.split()
         result = subprocess.run(cmd, capture_output=True)
         if result.returncode == 0:

From d311041c4a7d6b1ef7a97b55b17d1faa967f9a7f Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Mon, 18 Aug 2025 20:22:48 -0300
Subject: [PATCH 04/11] fix invalid param name

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 benchmark/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/main.py b/benchmark/main.py
index 95b43aa..2c50aae 100644
--- a/benchmark/main.py
+++ b/benchmark/main.py
@@ -81,7 +81,7 @@ def main():
             hpo is False and repeat is False
         ), f"Error! both {repeat=} and {hpo=} must be False when summarizing results from multiple experiments."
 
-        experiment_name = config_init.list_of_experiment_names
+        experiment_name = config_init.experiment_name
         assert isinstance(
             experiment_name, str
         ), f"Error! {experiment_name=} is not a str"

From b965a5b84d68164d1f9ae36a60939fe32b6a6074 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Mon, 18 Aug 2025 20:28:54 -0300
Subject: [PATCH 05/11] fix task names and task metrics

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 benchmark/main.py                      | 6 +++---
 configs/tests/benchmark_v2_simple.yaml | 1 -
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/benchmark/main.py b/benchmark/main.py
index 2c50aae..6975f58 100644
--- a/benchmark/main.py
+++ b/benchmark/main.py
@@ -41,9 +41,9 @@ def main():
         help="summarize results from repeated experiments",
         action="store_true",
     )
-    parser.add_argument('--list_of_experiment_names', type=str, nargs=1)
-    parser.add_argument('--task_names', type=list[str])
-    parser.add_argument('--task_metrics', type=list[str])
+    parser.add_argument('--experiment_name', type=str)
+    parser.add_argument('--task_names', type=list[str], nargs='+', default=[])
+    parser.add_argument('--task_metrics', type=list[str], nargs='+', default=[])
     parser.add_argument(
         '--benchmark_name',
         type=str,
diff --git a/configs/tests/benchmark_v2_simple.yaml b/configs/tests/benchmark_v2_simple.yaml
index 608bc64..3d0cb83 100644
--- a/configs/tests/benchmark_v2_simple.yaml
+++ b/configs/tests/benchmark_v2_simple.yaml
@@ -21,7 +21,6 @@ tasks:
   - name: chesapeake
     type: segmentation
     direction: max
-    metric: val/Multiclass_Jaccard_Index
     early_stop_patience: 10
     terratorch_task:
       loss: ce

From a36f6dbebf391e617b2e61e8a015db401e6e46d9 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Mon, 18 Aug 2025 20:31:10 -0300
Subject: [PATCH 06/11] handle a single task name/metric given as a string

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 benchmark/main.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/benchmark/main.py b/benchmark/main.py
index 6975f58..1c1f78c 100644
--- a/benchmark/main.py
+++ b/benchmark/main.py
@@ -89,11 +89,15 @@ def main():
         #     assert isinstance(exp, str), f"Error! {exp=} is not a str"
 
         task_names = config_init.task_names
+        if isinstance(task_names, str):
+            task_names = [task_names]
         assert isinstance(task_names, list), f"Error! {task_names=} is not a list"
         for t in task_names:
             assert isinstance(t, str), f"Error! {t=} is not a str"
 
         task_metrics = config_init.task_metrics
+        if isinstance(task_metrics, str):
+            task_metrics = [task_metrics]
         assert isinstance(task_metrics, list), f"Error! {task_metrics=} is not a list"
         for t in task_metrics:
             assert isinstance(t, str), f"Error! {t=} is not a str"

From e4bcee5fe36a49b6f2972b49d6d28367b96f9a43 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Mon, 18 Aug 2025 20:37:01 -0300
Subject: [PATCH 07/11] remove redundant param

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 benchmark/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/main.py b/benchmark/main.py
index 1c1f78c..890aafe 100644
--- a/benchmark/main.py
+++ b/benchmark/main.py
@@ -41,7 +41,7 @@ def main():
         help="summarize results from repeated experiments",
         action="store_true",
     )
-    parser.add_argument('--experiment_name', type=str)
+    # parser.add_argument('--experiment_name', type=str)s
     parser.add_argument('--task_names', type=list[str], nargs='+', default=[])
     parser.add_argument('--task_metrics', type=list[str], nargs='+', default=[])
     parser.add_argument(

From 894e71c88f8f9b36073be1de676d840ad524fb91 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Mon, 18 Aug 2025 20:41:10 -0300
Subject: [PATCH 08/11] fix task name param

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 benchmark/main.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/benchmark/main.py b/benchmark/main.py
index 890aafe..f31fe5a 100644
--- a/benchmark/main.py
+++ b/benchmark/main.py
@@ -42,8 +42,12 @@ def main():
         action="store_true",
     )
     # parser.add_argument('--experiment_name', type=str)s
-    parser.add_argument('--task_names', type=list[str], nargs='+', default=[])
-    parser.add_argument('--task_metrics', type=list[str], nargs='+', default=[])
+    parser.add_argument(
+        '--task_names', type=list[str], nargs='+', action='append', default=[]
+    )
+    parser.add_argument(
+        '--task_metrics', type=list[str], nargs='+', action='append', default=[]
+    )
     parser.add_argument(
         '--benchmark_name',
         type=str,

From 52e95839a8797151dd6a793f6241170f0b7e0ba9 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Mon, 18 Aug 2025 20:42:27 -0300
Subject: [PATCH 09/11] fix task name param

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 benchmark/main.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmark/main.py b/benchmark/main.py
index f31fe5a..77b4ebc 100644
--- a/benchmark/main.py
+++ b/benchmark/main.py
@@ -43,10 +43,10 @@ def main():
     )
     # parser.add_argument('--experiment_name', type=str)s
     parser.add_argument(
-        '--task_names', type=list[str], nargs='+', action='append', default=[]
+        '--task_names', nargs='+', action='append', default=[]
     )
     parser.add_argument(
-        '--task_metrics', type=list[str], nargs='+', action='append', default=[]
+        '--task_metrics', nargs='+', action='append', default=[]
     )
     parser.add_argument(
         '--benchmark_name',

From ad86761b8bc723bff16d92e480b2d4f9a65dfa8d Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Mon, 18 Aug 2025 21:53:35 -0300
Subject: [PATCH 10/11] fix num repetitions

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 benchmark/main.py                      | 247 +++++++++++++------------
 configs/tests/benchmark_v2_simple.yaml |   3 +-
 2 files changed, 129 insertions(+), 121 deletions(-)

diff --git a/benchmark/main.py b/benchmark/main.py
index 77b4ebc..83ef807 100644
--- a/benchmark/main.py
+++ b/benchmark/main.py
@@ -34,6 +34,11 @@ def main():
     parser.add_argument('--report_on_best_val', type=bool, default=True)
     parser.add_argument('--test_models', type=bool, default=False)
     parser.add_argument('--bayesian_search', type=bool, default=True)
+    parser.add_argument(
+        '--benchmark_name',
+        type=str,
+        help="name of summarized results file",
+    )
     parser.add_argument("--hpo", help="optimize hyperparameters", action="store_true")
     parser.add_argument("--repeat", help="repeat best experiments", action="store_true")
     parser.add_argument(
@@ -42,17 +47,8 @@ def main():
         action="store_true",
     )
     # parser.add_argument('--experiment_name', type=str)s
-    parser.add_argument(
-        '--task_names', nargs='+', action='append', default=[]
-    )
-    parser.add_argument(
-        '--task_metrics', nargs='+', action='append', default=[]
-    )
-    parser.add_argument(
-        '--benchmark_name',
-        type=str,
-        help="name of summarized results file",
-    )
+    parser.add_argument('--task_names', nargs='+', action='append', default=[])
+    parser.add_argument('--task_metrics', nargs='+', action='append', default=[])
 
     args = parser.parse_args()
     paths: List[Any] = args.config
@@ -78,6 +74,9 @@ def main():
     else:
         logging.config.fileConfig(fname=logger_path, disable_existing_loggers=False)
         logger = logging.getLogger("terratorch-iterate")
+    
+    run_repetitions = config_init.run_repetitions
+    assert isinstance(run_repetitions, int), f"Error! {run_repetitions=} is invalid"
 
     # only summarize results from multiple experiments
     if summarize:
@@ -107,12 +106,10 @@ def main():
             assert isinstance(t, str), f"Error! {t=} is not a str"
 
         benchmark_name = config_init.benchmark_name
+        if benchmark_name is None:
+            benchmark_name = "summary.csv"
         assert isinstance(benchmark_name, str), f"Error! {benchmark_name=} is not a str"
 
-        run_repetitions = config_init.run_repetitions
-        assert (
-            isinstance(run_repetitions, int) and run_repetitions > 0
-        ), f"Error! {run_repetitions=} is invalid"
         # get results and parameters from mlflow logs
         results_and_parameters = get_results_and_parameters(
             benchmark_name=benchmark_name,
@@ -124,119 +121,129 @@ def main():
             task_metrics=task_metrics,
         )
         return
-
-    # optimize hyperparameters and/or do repeated runs for single experiments
-    assert (
-        hpo is True or repeat is True
-    ), f"Error! either {repeat=} or {hpo=} must be True"
-    parent_run_id = args.parent_run_id
-    if parent_run_id is not None:
-        assert isinstance(parent_run_id, str), f"Error! {parent_run_id=} is not a str"
-
-    # validate the objects
-    experiment_name = config_init.experiment_name
-    assert isinstance(experiment_name, str), f"Error! {experiment_name=} is not a str"
-    run_name = config_init.run_name
-    if run_name is not None:
-        assert isinstance(run_name, str), f"Error! {run_name=} is not a str"
-    # validate defaults
-    defaults = config_init.defaults
-    assert isinstance(defaults, Defaults), f"Error! {defaults=} is not a Defaults"
-
-    tasks = config_init.tasks
-    assert isinstance(tasks, list), f"Error! {tasks=} is not a list"
-    for t in tasks:
-        assert isinstance(t, Task), f"Error! {t=} is not a Task"
-        # if there is not specific terratorch_task specified, then use default terratorch_task
-        if t.terratorch_task is None:
-            t.terratorch_task = defaults.terratorch_task
-    # defaults.trainer_args["max_epochs"] = 5
-
-    optimization_space = config_init.optimization_space
-    assert isinstance(
-        optimization_space, dict
-    ), f"Error! {optimization_space=} is not a dict"
-
-    # ray_storage_path is optional
-    ray_storage_path = config_init.ray_storage_path
-    if ray_storage_path is not None:
+    else:
+        # optimize hyperparameters and/or do repeated runs for single experiments
+        assert (
+            hpo is True or repeat is True
+        ), f"Error! either {repeat=} or {hpo=} must be True"
+        parent_run_id = args.parent_run_id
+        if parent_run_id is not None:
+            assert isinstance(
+                parent_run_id, str
+            ), f"Error! {parent_run_id=} is not a str"
+
+        # validate the objects
+        experiment_name = config_init.experiment_name
         assert isinstance(
-            ray_storage_path, str
-        ), f"Error! {ray_storage_path=} is not a str"
+            experiment_name, str
+        ), f"Error! {experiment_name=} is not a str"
+        run_name = config_init.run_name
+        if run_name is not None:
+            assert isinstance(run_name, str), f"Error! {run_name=} is not a str"
+        # validate defaults
+        defaults = config_init.defaults
+        assert isinstance(defaults, Defaults), f"Error! {defaults=} is not a Defaults"
+
+        tasks = config_init.tasks
+        assert isinstance(tasks, list), f"Error! {tasks=} is not a list"
+        for t in tasks:
+            assert isinstance(t, Task), f"Error! {t=} is not a Task"
+            # if there is not specific terratorch_task specified, then use default terratorch_task
+            if t.terratorch_task is None:
+                t.terratorch_task = defaults.terratorch_task
+        # defaults.trainer_args["max_epochs"] = 5
+
+        optimization_space = config_init.optimization_space
+        assert isinstance(
+            optimization_space, dict
+        ), f"Error! {optimization_space=} is not a dict"
 
-    n_trials = config_init.n_trials
-    assert isinstance(n_trials, int) and n_trials > 0, f"Error! {n_trials=} is invalid"
-    run_repetitions = config_init.run_repetitions
+        # ray_storage_path is optional
+        ray_storage_path = config_init.ray_storage_path
+        if ray_storage_path is not None:
+            assert isinstance(
+                ray_storage_path, str
+            ), f"Error! {ray_storage_path=} is not a str"
 
-    report_on_best_val = config_init.report_on_best_val
-    assert isinstance(
-        report_on_best_val, bool
-    ), f"Error! {ray_storage_path=} is not a bool"
+        n_trials = config_init.n_trials
+        assert (
+            isinstance(n_trials, int) and n_trials > 0
+        ), f"Error! {n_trials=} is invalid"
+        run_repetitions = config_init.run_repetitions
 
-    save_models = config_init.save_models
-    assert isinstance(save_models, bool), f"Error! {save_models=} is not a bool"
+        report_on_best_val = config_init.report_on_best_val
+        assert isinstance(
+            report_on_best_val, bool
+        ), f"Error! {report_on_best_val=} is not a bool"
 
-    test_models = config_init.test_models
-    assert isinstance(test_models, bool), f"Error! {test_models=} is not a bool"
+        save_models = config_init.save_models
+        assert isinstance(save_models, bool), f"Error! {save_models=} is not a bool"
 
-    bayesian_search = config_init.bayesian_search
-    assert isinstance(bayesian_search, bool), f"Error! {bayesian_search=} is not a bool"
+        test_models = config_init.test_models
+        assert isinstance(test_models, bool), f"Error! {test_models=} is not a bool"
 
-    # custom_modules_path is optional
-    custom_modules_path = config_init.custom_modules_path
-    if custom_modules_path is not None:
+        bayesian_search = config_init.bayesian_search
         assert isinstance(
-            custom_modules_path, str
-        ), f"Error! {custom_modules_path=} is not a str"
-        import_custom_modules(logger=logger, custom_modules_path=custom_modules_path)
-
-    if repeat and not hpo:
-        output = config_init.output_path
-        if output is None:
-            storage_uri_path = Path(storage_uri)
-            assert (
-                storage_uri_path.exists() and storage_uri_path.is_dir()
-            ), f"Error! Unable to create new output_path based on storage_uri_path because the latter does not exist: {storage_uri_path}"
-            output_path = storage_uri_path.parents[0] / "repeated_exp_output_csv"
-            output_path.mkdir(parents=True, exist_ok=True)
-            output_path = output_path / f"{experiment_name}_repeated_exp_mlflow.csv"
-            output = str(output_path)
-
-        logger.info("Rerun best experiments...")
-        rerun_best_from_backbone(
-            logger=logger,
-            parent_run_id=parent_run_id,
-            output_path=str(output_path),
-            defaults=defaults,
-            tasks=tasks,
-            experiment_name=experiment_name,
-            storage_uri=storage_uri,
-            optimization_space=optimization_space,
-            run_repetitions=run_repetitions,
-            save_models=save_models,
-            report_on_best_val=report_on_best_val,
-        )
-    else:
-        if not repeat and hpo:
-            run_repetitions = 0
-
-        # run_repetitions is an optional parameter
-        benchmark_backbone(
-            defaults=defaults,
-            tasks=tasks,
-            experiment_name=experiment_name,
-            storage_uri=storage_uri,
-            ray_storage_path=ray_storage_path,
-            run_name=run_name,
-            optimization_space=optimization_space,
-            n_trials=n_trials,
-            run_repetitions=run_repetitions,
-            save_models=save_models,
-            report_on_best_val=report_on_best_val,
-            test_models=test_models,
-            bayesian_search=bayesian_search,
-            logger=logger,
-        )
+            bayesian_search, bool
+        ), f"Error! {bayesian_search=} is not a bool"
+
+        # custom_modules_path is optional
+        custom_modules_path = config_init.custom_modules_path
+        if custom_modules_path is not None:
+            assert isinstance(
+                custom_modules_path, str
+            ), f"Error! {custom_modules_path=} is not a str"
+            import_custom_modules(
+                logger=logger, custom_modules_path=custom_modules_path
+            )
+
+        if repeat and not hpo:
+            output = config_init.output_path
+            if output is None:
+                storage_uri_path = Path(storage_uri)
+                assert (
+                    storage_uri_path.exists() and storage_uri_path.is_dir()
+                ), f"Error! Unable to create new output_path based on storage_uri_path because the latter does not exist: {storage_uri_path}"
+                output_path = storage_uri_path.parents[0] / "repeated_exp_output_csv"
+                output_path.mkdir(parents=True, exist_ok=True)
+                output_path = output_path / f"{experiment_name}_repeated_exp_mlflow.csv"
+                output = str(output_path)
+
+            logger.info("Rerun best experiments...")
+            rerun_best_from_backbone(
+                logger=logger,
+                parent_run_id=parent_run_id,
+                output_path=str(output),
+                defaults=defaults,
+                tasks=tasks,
+                experiment_name=experiment_name,
+                storage_uri=storage_uri,
+                optimization_space=optimization_space,
+                run_repetitions=run_repetitions,
+                save_models=save_models,
+                report_on_best_val=report_on_best_val,
+            )
+        else:
+            if not repeat and hpo:
+                run_repetitions = 0
+
+            # run_repetitions is an optional parameter
+            benchmark_backbone(
+                defaults=defaults,
+                tasks=tasks,
+                experiment_name=experiment_name,
+                storage_uri=storage_uri,
+                ray_storage_path=ray_storage_path,
+                run_name=run_name,
+                optimization_space=optimization_space,
+                n_trials=n_trials,
+                run_repetitions=run_repetitions,
+                save_models=save_models,
+                report_on_best_val=report_on_best_val,
+                test_models=test_models,
+                bayesian_search=bayesian_search,
+                logger=logger,
+            )
 
 
 if __name__ == "__main__":
diff --git a/configs/tests/benchmark_v2_simple.yaml b/configs/tests/benchmark_v2_simple.yaml
index 3d0cb83..9e29926 100644
--- a/configs/tests/benchmark_v2_simple.yaml
+++ b/configs/tests/benchmark_v2_simple.yaml
@@ -78,7 +78,8 @@ tasks:
           - "NIR"
 n_trials: 2
 save_models: False
-storage_uri: /dccstor/geofm-finetuning/terratorch-iterate-test/
+storage_uri: /Users/ltizzei/Projects/Orgs/IBM/terratorch-iterate
+# storage_uri: /dccstor/geofm-finetuning/terratorch-iterate-test/
 ray_storage_path: /dccstor/geofm-finetuning/terratorch-iterate-test/ray_storage
 run_repetitions: 0
 optimization_space:

From 22c5f0db9a10ed194278edfd2265fffb7bfafc1f Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Mon, 18 Aug 2025 21:57:46 -0300
Subject: [PATCH 11/11] fix benchmark config

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 configs/tests/benchmark_v2_simple.yaml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/configs/tests/benchmark_v2_simple.yaml b/configs/tests/benchmark_v2_simple.yaml
index 9e29926..3d0cb83 100644
--- a/configs/tests/benchmark_v2_simple.yaml
+++ b/configs/tests/benchmark_v2_simple.yaml
@@ -78,8 +78,7 @@ tasks:
           - "NIR"
 n_trials: 2
 save_models: False
-storage_uri: /Users/ltizzei/Projects/Orgs/IBM/terratorch-iterate
-# storage_uri: /dccstor/geofm-finetuning/terratorch-iterate-test/
+storage_uri: /dccstor/geofm-finetuning/terratorch-iterate-test/
 ray_storage_path: /dccstor/geofm-finetuning/terratorch-iterate-test/ray_storage
 run_repetitions: 0
 optimization_space: