From 3143f01505d8b74b0ddb08c679870ba073eadc24 Mon Sep 17 00:00:00 2001
From: Paolo Fraccaro <paolo.fraccaro.dms@gmail.com>
Date: Fri, 11 Jul 2025 13:00:03 +0000
Subject: [PATCH 01/16] generalise monitoring: accept metrics prefixed with `val`, not only `val/`

---
 terratorch_iterate/model_fitting.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/terratorch_iterate/model_fitting.py b/terratorch_iterate/model_fitting.py
index 93a367b..9a25ed3 100644
--- a/terratorch_iterate/model_fitting.py
+++ b/terratorch_iterate/model_fitting.py
@@ -286,10 +286,10 @@ def launch_training(
         client = mlflow.tracking.MlflowClient(
             tracking_uri=storage_uri,
         )
-
-        if not metric.startswith("val/"):
+        pdb.set_trace()
+        if not metric.startswith("val"):
             raise Exception(
-                f"Metric {metric} does not start with `val/`. Please choose a validation metric"
+                f"Metric {metric} does not start with `val`. Please choose a validation metric"
             )
         for_pd_collect = []
         val_metrics_names = []

From 1640513e17f2c83d0ae953c24061761dc70bdcf1 Mon Sep 17 00:00:00 2001
From: Paolo Fraccaro <paolo.fraccaro.dms@gmail.com>
Date: Fri, 11 Jul 2025 13:01:03 +0000
Subject: [PATCH 02/16] remove leftover pdb.set_trace() from launch_training

---
 terratorch_iterate/model_fitting.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/terratorch_iterate/model_fitting.py b/terratorch_iterate/model_fitting.py
index 9a25ed3..82ea59d 100644
--- a/terratorch_iterate/model_fitting.py
+++ b/terratorch_iterate/model_fitting.py
@@ -286,7 +286,7 @@ def launch_training(
         client = mlflow.tracking.MlflowClient(
             tracking_uri=storage_uri,
         )
-        pdb.set_trace()
+        
         if not metric.startswith("val"):
             raise Exception(
                 f"Metric {metric} does not start with `val`. Please choose a validation metric"

From 8bc4c5ec071e69d94bd8e5c99d6d4b7fa3b05563 Mon Sep 17 00:00:00 2001
From: Paolo Fraccaro <paolo.fraccaro.dms@gmail.com>
Date: Sun, 13 Jul 2025 09:53:34 +0000
Subject: [PATCH 03/16] add type for Geobench

---
 benchmark/benchmark_types.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/benchmark/benchmark_types.py b/benchmark/benchmark_types.py
index bf8ccab..a3d9610 100644
--- a/benchmark/benchmark_types.py
+++ b/benchmark/benchmark_types.py
@@ -15,6 +15,7 @@
     ObjectDetectionTask,
 )
 from torchgeo.datamodules import BaseDataModule
+from geobench_v2.datamodules import GeoBenchDataModule
 
 valid_task_types = type[
     SemanticSegmentationTask
@@ -116,7 +117,7 @@ class Task:
         name (str): Name for this task
         type (TaskTypeEnum): Type of task.
         terratorch_task (dict): Arguments for the Terratorch Task.
-        datamodule (BaseDataModule): Datamodule to be used.
+        datamodule (BaseDataModule  | GeoBenchDataModule): Datamodule to be used.
         direction (str): One of min or max. Direction to optimize the metric in.
         metric (str): Metric to be optimized. Defaults to "val/loss".
         early_prune (bool): Whether to prune unpromising runs early. Defaults to False.
@@ -128,7 +129,7 @@ class Task:
 
     name: str
     type: TaskTypeEnum
-    datamodule: BaseDataModule
+    datamodule: BaseDataModule | GeoBenchDataModule
     direction: str
     terratorch_task: Optional[dict[str, Any]] = None
     metric: str = "val/loss"

From dcf06479b8281b3204420aa97c926320b87c8c4d Mon Sep 17 00:00:00 2001
From: Paolo Fraccaro <paolo.fraccaro.dms@gmail.com>
Date: Sun, 13 Jul 2025 09:55:35 +0000
Subject: [PATCH 04/16] remove unused terratorch_iterate folder

---
 terratorch_iterate/__init__.py                |   0
 terratorch_iterate/backbone_benchmark.py      | 395 --------
 terratorch_iterate/benchmark_ray.py           | 256 ------
 terratorch_iterate/benchmark_types.py         | 175 ----
 terratorch_iterate/main.py                    | 215 -----
 terratorch_iterate/model_fitting.py           | 671 --------------
 terratorch_iterate/module.py                  |  66 --
 terratorch_iterate/plot_tools.py              | 256 ------
 terratorch_iterate/py.typed                   |   0
 terratorch_iterate/repeat_best_experiment.py  | 468 ----------
 .../resources/dataset_specifications/agb.yaml |  64 --
 .../dataset_specifications/eurosat.yaml       |  28 -
 .../dataset_specifications/fire_scars.yaml    |  56 --
 .../multi_temporal_crop.yaml                  |  57 --
 .../dataset_specifications/sen1floods11.yaml  |  59 --
 .../sen1floods11_transforms.yaml              |  67 --
 terratorch_iterate/tests/__init__.py          |   0
 terratorch_iterate/utils.py                   | 866 ------------------
 18 files changed, 3699 deletions(-)
 delete mode 100644 terratorch_iterate/__init__.py
 delete mode 100644 terratorch_iterate/backbone_benchmark.py
 delete mode 100644 terratorch_iterate/benchmark_ray.py
 delete mode 100644 terratorch_iterate/benchmark_types.py
 delete mode 100644 terratorch_iterate/main.py
 delete mode 100644 terratorch_iterate/model_fitting.py
 delete mode 100644 terratorch_iterate/module.py
 delete mode 100644 terratorch_iterate/plot_tools.py
 delete mode 100644 terratorch_iterate/py.typed
 delete mode 100644 terratorch_iterate/repeat_best_experiment.py
 delete mode 100644 terratorch_iterate/resources/dataset_specifications/agb.yaml
 delete mode 100644 terratorch_iterate/resources/dataset_specifications/eurosat.yaml
 delete mode 100644 terratorch_iterate/resources/dataset_specifications/fire_scars.yaml
 delete mode 100644 terratorch_iterate/resources/dataset_specifications/multi_temporal_crop.yaml
 delete mode 100644 terratorch_iterate/resources/dataset_specifications/sen1floods11.yaml
 delete mode 100644 terratorch_iterate/resources/dataset_specifications/sen1floods11_transforms.yaml
 delete mode 100644 terratorch_iterate/tests/__init__.py
 delete mode 100644 terratorch_iterate/utils.py

diff --git a/terratorch_iterate/__init__.py b/terratorch_iterate/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/terratorch_iterate/backbone_benchmark.py b/terratorch_iterate/backbone_benchmark.py
deleted file mode 100644
index c586311..0000000
--- a/terratorch_iterate/backbone_benchmark.py
+++ /dev/null
@@ -1,395 +0,0 @@
-"""
-This module contains the high level functions for benchmarking on a single node.
-"""
-
-# import argparse
-import os
-import importlib
-from functools import partial
-from typing import Any
-from pathlib import Path
-import mlflow
-import optuna
-import pandas as pd
-import torch
-import logging
-from optuna.pruners import HyperbandPruner
-from optuna.samplers import BaseSampler, RandomSampler
-from tabulate import tabulate
-import pickle
-from benchmark.benchmark_types import (
-    Defaults,
-    ParameterBounds,
-    Task,
-    combine_with_defaults,
-    optimization_space_type,
-)
-from benchmark.model_fitting import fit_model, fit_model_with_hparams
-from benchmark.repeat_best_experiment import rerun_best_from_backbone
-from benchmark.utils import (
-    check_existing_task_parent_runs,
-    check_existing_experiments,
-    unflatten,
-    get_logger,
-    sync_mlflow_optuna,
-    REPEATED_SEEDS_DEFAULT,
-)
-
-direction_type_to_optuna = {"min": "minimize", "max": "maximize"}
-
-
-def benchmark_backbone_on_task(
-    logger,
-    defaults: Defaults,
-    task: Task,
-    storage_uri: str,
-    experiment_name: str,
-    experiment_run_id: str,
-    task_run_id: str | None = None,
-    optimization_space: optimization_space_type | None = None,
-    n_trials: int = 1,
-    save_models: bool = False,
-    sampler: BaseSampler | None = None,
-    test_models: bool = False,
-) -> tuple[float, str | list[str] | None, dict[str, Any]]:
-
-    optuna_db_path = Path(storage_uri).parents[0] / "optuna_db"
-    if not os.path.exists(optuna_db_path):
-        os.makedirs(optuna_db_path)
-    optuna_db_path = optuna_db_path / f"{experiment_name}_{experiment_run_id}"
-    optuna_db_path = str(optuna_db_path)
-
-    task_run_id = sync_mlflow_optuna(
-        optuna_db_path=optuna_db_path,
-        storage_uri=storage_uri,
-        experiment_name=experiment_name,
-        task_run_id=task_run_id,
-        task=task,
-        n_trials=n_trials,
-        logger=logger,
-    )
-
-    with mlflow.start_run(run_name=task.name, nested=True, run_id=task_run_id) as run:
-        logger.info(f"starting task run with id: {run.info.run_id}")
-        training_spec = combine_with_defaults(task, defaults)
-        if "max_epochs" not in training_spec.trainer_args:
-            raise Exception("Must specify max_epochs for the trainer")
-        task = training_spec.task
-        lightning_task_class = training_spec.task.type.get_class_from_enum()
-
-        # if no optimization params, just run it
-        if optimization_space is None:
-            return (
-                *fit_model(
-                    training_spec,
-                    lightning_task_class,
-                    run.info.run_name,
-                    experiment_name,
-                    storage_uri,
-                    run.info.run_id,
-                    save_models=save_models,
-                    test_models=test_models,
-                ),
-                {},
-            )
-
-        # if optimization parameters specified, do hyperparameter tuning
-        study = optuna.create_study(
-            sampler=sampler,
-            direction=direction_type_to_optuna[
-                training_spec.task.direction
-            ],  # in the future may want to allow user to specify this
-            pruner=HyperbandPruner(),
-            study_name=task.name,
-            storage="sqlite:///{}.db".format(optuna_db_path),
-            load_if_exists=True,
-        )
-
-        objective = partial(
-            fit_model_with_hparams,
-            training_spec,
-            lightning_task_class,
-            task.name,
-            experiment_name,
-            optimization_space,
-            storage_uri,
-            run.info.run_id,
-            save_models,
-            test_models,
-        )
-
-        n_trials = n_trials - len(study.trials)
-        for trial in study.trials:
-            if (trial.state == optuna.trial.TrialState.FAIL) | (
-                trial.state == optuna.trial.TrialState.RUNNING
-            ):
-                n_trials = n_trials + 1
-
-        study.optimize(
-            objective,
-            n_trials=n_trials,
-            # callbacks=[champion_callback],
-            catch=[torch.cuda.OutOfMemoryError],
-        )
-
-        tags = {
-            "early_stop_patience": str(training_spec.task.early_stop_patience),
-            "partition_name": str(training_spec.task.datamodule.partition) if hasattr(training_spec.task.datamodule, 'partition') else 'default',
-            "decoder": str(training_spec.task.terratorch_task["model_args"]["decoder"]),
-            "backbone": str(
-                training_spec.task.terratorch_task["model_args"]["backbone"]
-            ),
-            "n_trials": str(n_trials),
-        }
-        mlflow.set_tags(tags)
-
-        best_params = unflatten(study.best_trial.params)
-        mlflow.log_params(best_params)  # unflatten
-        mlflow.log_metric(f"best_{task.metric}", study.best_value)
-        return study.best_value, task.metric, best_params
-
-
-# Custom function to parse the optimization space argument
-def parse_optimization_space(space: dict | None) -> optimization_space_type | None:
-    if space is None:
-        return None
-    parsed_space: optimization_space_type = {}
-    for key, value in space.items():
-        if isinstance(value, dict):
-            try:
-                bounds = ParameterBounds(**value)
-                parsed_space[key] = bounds
-            except TypeError:
-                # Recursively parse nested optimization spaces
-                parsed_space[key] = parse_optimization_space(value)
-        elif isinstance(value, list):
-            # If it's a list, leave it as is
-            parsed_space[key] = value
-        else:
-            raise ValueError(f"Invalid type for {key}: {value}")
-    return parsed_space
-
-
-def benchmark_backbone(
-    defaults: Defaults,
-    tasks: list[Task],
-    experiment_name: str,
-    storage_uri: str,
-    logger: logging.RootLogger | None,
-    ray_storage_path: str | None = None,
-    backbone_import: str | None = None,
-    run_name: str | None = None,
-    n_trials: int = 1,
-    optimization_space: dict | None = None,
-    save_models: bool = False,
-    run_id: str | None = None,
-    description: str = "No description provided",
-    bayesian_search: bool = True,
-    continue_existing_experiment: bool = True,
-    test_models: bool = False,
-    run_repetitions: int = REPEATED_SEEDS_DEFAULT,
-    report_on_best_val: bool = True,
-) -> str:
-    """Highest level function to benchmark a backbone using a single node
-
-    Args:
-        defaults (Defaults): Defaults that are set for all tasks
-        tasks (list[Task]): List of Tasks to benchmark over. Will be combined with defaults to get the final parameters of the task.
-        experiment_name (str): Name of the MLFlow experiment to be used.
-        storage_uri (str): Path to MLFLow storage location.
-        ray_storage_path (str | None): Ignored. Exists for compatibility with ray configs.
-        backbone_import (str | None): Path to module that will be imported to register a potential new backbone. Defaults to None.
-        run_name (str | None, optional): Name of highest level mlflow run. Defaults to None.
-        n_trials (int, optional): Number of hyperparameter optimization trials to run. Defaults to 1.
-        optimization_space (dict | None): Parameters to optimize over. Should be a dictionary (may be nested)
-            of strings (parameter name) to list (discrete set of possibilities) or ParameterBounds, defining a range to optimize over. The structure should be the same as would be passed under tasks.terratorch_task. Defaults to None.
-        save_models (bool, optional): Whether to save the model. Defaults to False.
-        run_id (str | None): id of existing mlflow run to use as top-level run. Useful to add more experiments to a previous benchmark run. Defaults to None.
-        description (str): Optional description for mlflow parent run.
-        bayesian_search (bool): Whether to use bayesian optimization for the hyperparameter search. False uses random sampling. Defaults to True.
-        run_repetitions (int): Number of times that the experiment will be repeated. Defaults to 1.
-    """
-    base = Path(storage_uri).parents[0]
-    PATH_TO_JOB_TRACKING = base / "job_progress_tracking"
-    REPEATED_EXP_FOLDER = base / "repeated_exp_output_csv"
-
-    if logger is None:
-        logger = get_logger(log_folder=str(base / "job_logs"))
-
-    if not os.path.exists(REPEATED_EXP_FOLDER):
-        os.makedirs(REPEATED_EXP_FOLDER)
-    if not os.path.exists(PATH_TO_JOB_TRACKING):
-        os.makedirs(PATH_TO_JOB_TRACKING)
-
-    if backbone_import:
-        importlib.import_module(backbone_import)
-
-    mlflow.set_tracking_uri(storage_uri)
-    mlflow.set_experiment(experiment_name)
-
-    if bayesian_search:
-        sampler: BaseSampler | None = None  # take the default
-    else:
-        sampler = RandomSampler()
-
-    optimization_space = parse_optimization_space(optimization_space)
-    table_columns = ["Task", "Metric", "Best Score", "Hyperparameters"]
-    table_entries = []
-
-    backbone: str = defaults.terratorch_task["model_args"]["backbone"]
-    task_names = [task.name for task in tasks]
-    run_name = f"top_run_{experiment_name}" if run_name is None else run_name
-
-    completed_task_run_names = []
-    run_hpo = True
-    task_run_to_id_match = {}
-    if continue_existing_experiment:
-        # find status of existing runs, and delete incomplete runs except one with the most complete tasks
-        existing_experiments = check_existing_experiments(
-            logger=logger,
-            storage_uri=storage_uri,
-            experiment_name=experiment_name,
-            exp_parent_run_name=run_name,
-            task_names=task_names,
-            n_trials=n_trials,
-            backbone=backbone,
-        )
-        if existing_experiments["no_existing_runs"]:
-            logger.info("\nStarting new experiment from scratch")
-        else:
-            if (existing_experiments["incomplete_run_to_finish"] is not None) and (
-                run_id is None
-            ):
-                logger.info("Continuing previous experiment parent run")
-                run_id = existing_experiments["incomplete_run_to_finish"]
-                experiment_id = existing_experiments["experiment_id"]
-                run_hpo = True
-
-            if existing_experiments["finished_run"] is not None:
-                run_hpo = False
-                finished_run_id = existing_experiments["finished_run"]
-                run_id = existing_experiments["finished_run"]
-
-            # get previously completed tasks
-            completed_task_run_names, _, task_run_to_id_match = (
-                check_existing_task_parent_runs(
-                    logger, run_id, storage_uri, experiment_name, n_trials
-                )
-            )
-
-            table_entries_filename = str(
-                PATH_TO_JOB_TRACKING / f"{experiment_name}-{run_id}_table_entries.pkl"
-            )
-            if os.path.exists(table_entries_filename):
-                with open(table_entries_filename, 'rb') as handle:
-                    table_entries = pickle.load(handle)
-    else:
-        logger.info("Starting new experiment from scratch")
-
-    # only run hyperparameter optimization (HPO) if there are no experiments with finished HPO
-    if run_hpo:
-        logger.info("Running hyperparameter optimization")
-        with mlflow.start_run(
-            run_name=run_name, run_id=run_id, description=description
-        ) as run:
-            for task in tasks:
-                # only run task if it was not completed before
-                task_run_name = task.name
-                if task_run_name in completed_task_run_names:
-                    logger.info(f"{task_run_name} already completed")
-                    continue
-                else:
-                    logger.info(f"{task_run_name} not completed. starting now")
-
-                task_run_id = (
-                    task_run_to_id_match[task_run_name]
-                    if task_run_name in task_run_to_id_match
-                    else None
-                )
-                best_value, metric_name, hparams = benchmark_backbone_on_task(
-                    logger,
-                    defaults,
-                    task,
-                    storage_uri,
-                    experiment_name,
-                    experiment_run_id=run.info.run_id,
-                    task_run_id=task_run_id,
-                    optimization_space=optimization_space,
-                    n_trials=n_trials,
-                    save_models=save_models,
-                    sampler=sampler,
-                    test_models=test_models,
-                )
-                table_entries.append([task.name, metric_name, best_value, hparams])
-                table_entries_filename = str(
-                    PATH_TO_JOB_TRACKING
-                    / f"{experiment_name}-{run.info.run_id}_table_entries.pkl"
-                )
-                with open(table_entries_filename, 'wb') as handle:
-                    pickle.dump(table_entries, handle, protocol=pickle.HIGHEST_PROTOCOL)
-
-            table = tabulate(table_entries, headers=table_columns)
-            logger.info(table)
-            df = pd.DataFrame(data=table_entries, columns=table_columns)
-            df.set_index("Task")
-            logger.info("Starting to save results")
-            mlflow.log_table(
-                df,
-                "results_table.json",
-                run.info.run_id,
-            )
-            experiment_id = run.info.experiment_id
-
-        # check completion of HPO for all tasks before proceeding to next stage
-        existing_experiments = check_existing_experiments(
-            logger=logger,
-            storage_uri=storage_uri,
-            experiment_name=experiment_name,
-            exp_parent_run_name=run_name,
-            task_names=task_names,
-            n_trials=n_trials,
-            backbone=backbone
-        )
-        if existing_experiments["finished_run"] is not None:
-            finished_run_id = existing_experiments["finished_run"]
-        else:
-            logger.info("HPO is not complete. Please re-run this experiment")
-            raise RuntimeError
-    logger.info("HPO complete")
-
-    logger.info(f"run_repetitions: {run_repetitions}")
-    
-    if run_repetitions >= 1:
-        # run repeated experiments
-        logger.info(
-            f"Now running {run_repetitions} repeats per experiment \n\
-                    Parent run: {finished_run_id} \n\
-                    Experiment name: {experiment_name} \n\
-                    "
-        )
-        path_to_final_results = str(
-            REPEATED_EXP_FOLDER / f"{experiment_name}_repeated_exp_mlflow.csv"
-        )
-
-        rerun_best_from_backbone(
-            logger=logger,
-            parent_run_id=finished_run_id,
-            output_path=path_to_final_results,
-            defaults=defaults,
-            tasks=tasks,
-            experiment_name=experiment_name,
-            storage_uri=storage_uri,
-            tmp_dir=ray_storage_path,
-            backbone_import=backbone_import,
-            run_name=run_name,
-            n_trials=n_trials,
-            ray_storage_path=ray_storage_path,
-            optimization_space=optimization_space,
-            save_models=save_models,
-            description=description,
-            use_ray=False,
-            run_repetitions=run_repetitions,
-            report_on_best_val=report_on_best_val,
-        )
-
-    return finished_run_id
diff --git a/terratorch_iterate/benchmark_ray.py b/terratorch_iterate/benchmark_ray.py
deleted file mode 100644
index 81eed60..0000000
--- a/terratorch_iterate/benchmark_ray.py
+++ /dev/null
@@ -1,256 +0,0 @@
-"""
-This module contains the high level functions for benchmarking on a single node.
-"""
-
-import importlib
-import os
-
-import mlflow
-import pandas as pd
-import ray
-from jsonargparse import CLI
-from ray.tune.search import SearchAlgorithm, Searcher
-from ray.tune.search.basic_variant import BasicVariantGenerator
-from ray.tune.search.optuna import OptunaSearch
-from tabulate import tabulate
-
-from benchmark.backbone_benchmark import parse_optimization_space
-from benchmark.benchmark_types import (
-    Defaults,
-    Task,
-    TrainingSpec,
-    combine_with_defaults,
-    optimization_space_type,
-)
-from benchmark.model_fitting import fit_model, ray_tune_model, valid_task_types
-
-
-def benchmark_backbone_on_task(
-    training_spec: TrainingSpec,
-    lightning_task_class: valid_task_types,
-    storage_uri: str,
-    experiment_name: str,
-    ray_storage_path: str,
-    optimization_space: optimization_space_type | None = None,
-    n_trials: int = 1,
-    save_models: bool = False,
-    backbone_import: str | None = None,
-    searcher: SearchAlgorithm | None = None,
-) -> dict:
-    if not searcher:
-        raise ValueError("Searcher must not be None")
-    with mlflow.start_run(
-        run_name=training_spec.task.name,
-        nested=True,
-    ) as run:
-        # if no optimization params, just run it
-        if optimization_space is None:
-            raise Exception("For no optimization space, run benchmark.py")
-
-        results = ray_tune_model(
-            training_spec,
-            lightning_task_class,
-            optimization_space,
-            storage_uri,
-            ray_storage_path,
-            experiment_name,
-            save_models,
-            n_trials,
-            backbone_import=backbone_import,
-            searcher=searcher,
-        )
-
-        mlflow.log_table(
-            results.get_dataframe(),
-            f"results_{run.info.run_name}.json",
-            run.info.run_id,
-        )
-        if results.get_best_result().metrics is None:
-            raise Exception("Best result metrics were none")
-        if results.get_best_result().config is None:
-            raise Exception("Best result config was none")
-
-        mlflow.log_params(results.get_best_result().config)
-        mlflow.log_metric(
-            f"best_{training_spec.task.metric}",
-            results.get_best_result().metrics[training_spec.task.metric],
-        )
-        return {
-            "best_result": results.get_best_result().metrics[training_spec.task.metric],
-            "metric": training_spec.task.metric,
-            "best_config": results.get_best_result().config,
-        }
-
-
-@ray.remote(num_cpus=8, num_gpus=1)
-def remote_fit(
-    training_spec: TrainingSpec,
-    lightning_task_class: valid_task_types,
-    run_name: str,
-    storage_uri: str,
-    experiment_name: str,
-    parent_run_id: str,
-    save_models: bool,
-    backbone_import: str | None,
-) -> float:
-    mlflow.set_tracking_uri(storage_uri)
-    mlflow.set_experiment(experiment_name)
-    if backbone_import:
-        importlib.import_module(backbone_import)
-    return fit_model(
-        training_spec,
-        lightning_task_class,
-        run_name,
-        experiment_name,
-        storage_uri,
-        parent_run_id,
-        save_models=save_models,
-    )[0]
-
-
-def benchmark_backbone(
-    defaults: Defaults,
-    tasks: list[Task],
-    experiment_name: str,
-    storage_uri: str,
-    tmp_dir: str | None = None,
-    backbone_import: str | None = None,
-    run_name: str | None = None,
-    n_trials: int = 1,
-    ray_storage_path: str | None = None,
-    optimization_space: dict | None = None,
-    save_models: bool = False,
-    run_id: str | None = None,
-    description: str = "No description provided",
-    bayesian_search: bool = True,
-):
-    """Highest level function to benchmark a backbone using a ray cluster
-
-    Args:
-        tmp_dir (str): Path to temporary directory to be used for ray
-        defaults (Defaults): Defaults that are set for all tasks
-        tasks (list[Task]): List of Tasks to benchmark over. Will be combined with defaults to get the final parameters of the task.
-        experiment_name (str): Name of the MLFlow experiment to be used.
-        storage_uri (str): Path to MLFlow storage location.
-        ray_storage_path (str | None): Path to storage of ray outputs, including saved models, when using ray tune. Required if optimization_space is specified
-        backbone_import (str | None): Path to module that will be imported to register a potential new backbone. Defaults to None.
-        run_name (str | None, optional): Name of highest level mlflow run. Defaults to None.
-        n_trials (int, optional): Number of hyperparameter optimization trials to run. Defaults to 1.
-        optimization_space (dict | None): Parameters to optimize over. Should be a dictionary (may be nested)
-            of strings (parameter name) to list (discrete set of possibilities) or ParameterBounds, defining a range to optimize over. The structure should be the same as would be passed under tasks.terratorch_task. Defaults to None.
-        save_models (bool, optional): Whether to save the models. Defaults to False.
-        run_id (str | None): id of existing mlflow run to use as top-level run. Useful to add more experiments to a previous benchmark run. Defaults to None.
-        description (str): Optional description for mlflow parent run.
-        bayesian_search (bool): Whether to use bayesian optimization for the hyperparameter search. False uses random sampling. Defaults to True.
-    """
-    if tmp_dir is None:
-        raise Exception("tmp_dir must be specified for runs with ray.")
-    os.environ["RAY_TMPDIR"] = tmp_dir
-    ray.init(_temp_dir=tmp_dir)
-    if backbone_import:
-        importlib.import_module(backbone_import)
-    mlflow.set_tracking_uri(storage_uri)
-    mlflow.set_experiment(experiment_name)
-    # mlflow.pytorch.autolog(log_datasets=False)
-
-    if bayesian_search:
-        searcher: Searcher | SearchAlgorithm = OptunaSearch()
-    else:
-        searcher = BasicVariantGenerator()
-
-    optimization_space = parse_optimization_space(optimization_space)
-
-    table_columns = ["Task", "Metric", "Best Score", "Hyperparameters"]
-    table_entries = []
-
-    with mlflow.start_run(
-        run_name=run_name, run_id=run_id, description=description
-    ) as run:
-
-        if optimization_space is None:
-            # no hparams, parallelize over tasks
-            ray_tasks = []
-            for task in tasks:
-                training_spec = combine_with_defaults(task, defaults)
-                if "max_epochs" not in training_spec.trainer_args:
-                    raise Exception("Must specify max_epochs for the trainer")
-                task = training_spec.task
-                lightning_task_class = training_spec.task.type.get_class_from_enum()
-                ray_tasks.append(
-                    remote_fit.remote(
-                        training_spec,
-                        lightning_task_class,
-                        run.info.run_name,
-                        storage_uri,
-                        experiment_name,
-                        run.info.run_id,
-                        save_models,
-                        backbone_import,
-                    )
-                )
-            results = ray.get(ray_tasks)
-            table_entries = [
-                [
-                    task.name,
-                    task.metric,
-                    result,
-                    None,
-                ]
-                for task, result in zip(tasks, results)
-            ]
-        else:
-            if ray_storage_path is None:
-                raise Exception(
-                    "`ray_storage_path` must be specified if `optimization_space` is specified."
-                )
-            # hparams, parallelize within tasks, run one task at a time.
-            results = []
-            for task in tasks:
-                training_spec = combine_with_defaults(task, defaults)
-                if "max_epochs" not in training_spec.trainer_args:
-                    raise Exception("Must specify max_epochs for the trainer")
-                task = training_spec.task
-                lightning_task_class = training_spec.task.type.get_class_from_enum()
-                results.append(
-                    benchmark_backbone_on_task(
-                        training_spec,
-                        lightning_task_class,
-                        storage_uri,
-                        experiment_name,
-                        ray_storage_path,
-                        optimization_space=optimization_space,
-                        n_trials=n_trials,
-                        save_models=save_models,
-                        backbone_import=backbone_import,
-                        searcher=searcher,
-                    )
-                )
-
-            table_entries = [
-                [
-                    task.name,
-                    result["metric"],
-                    result["best_result"],
-                    str(result["best_config"]),
-                ]
-                for task, result in zip(tasks, results)
-            ]
-
-        table = tabulate(table_entries, headers=table_columns)
-        print(table)
-        df = pd.DataFrame(data=table_entries, columns=table_columns)
-        df.set_index("Task")
-        mlflow.log_table(
-            df,
-            "results_table.json",
-            run.info.run_id,
-        )
-        ray.shutdown()
-
-
-def main():
-    CLI(benchmark_backbone, fail_untyped=False)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/terratorch_iterate/benchmark_types.py b/terratorch_iterate/benchmark_types.py
deleted file mode 100644
index 4188d76..0000000
--- a/terratorch_iterate/benchmark_types.py
+++ /dev/null
@@ -1,175 +0,0 @@
-"""
-This module defines all the types expected at input. Used for type checking by jsonargparse.
-"""
-
-from ast import Dict
-import copy
-import enum
-from dataclasses import dataclass, field, replace
-from typing import Any, Optional, Union
-
-from terratorch.tasks import (
-    ClassificationTask,
-    MultiLabelClassificationTask,
-    PixelwiseRegressionTask,
-    SemanticSegmentationTask,
-)
-from torchgeo.datamodules import BaseDataModule
-
-valid_task_types = type[
-    SemanticSegmentationTask | ClassificationTask | PixelwiseRegressionTask
-]
-
-
-class TaskTypeEnum(enum.Enum):
-    """
-    Enum for the type of task to be performed. segmentation, regression or classification.
-    """
-
-    segmentation = "segmentation"
-    regression = "regression"
-    classification = "classification"
-    multilabel_classification = "multilabel_classification"
-
-    def get_class_from_enum(
-        self,
-    ) -> valid_task_types:
-        match self:
-            case TaskTypeEnum.segmentation:
-                return SemanticSegmentationTask
-            case TaskTypeEnum.regression:
-                return PixelwiseRegressionTask
-            case TaskTypeEnum.classification:
-                return ClassificationTask
-            case TaskTypeEnum.multilabel_classification:
-                return MultiLabelClassificationTask
-            case _:
-                raise TypeError("Task type does not exist")
-
-
-class ParameterTypeEnum(enum.Enum):
-    """
-    Enum for the type of parameter allowed in ParameterBounds. integer or real.
-    """
-
-    integer = "int"
-    real = "real"
-
-
-@dataclass
-class ParameterBounds:
-    """
-    Dataclass defining a numerical range to search over.
-
-    Args:
-        min (float | int): Minimum.
-        max (float | int): Maximum.
-        type (ParameterTypeEnum): Whether the range is in the space of integers or real numbers.
-        log (bool): Whether to search over the log space (useful for parameters that vary wildly in scale, e.g. learning rate)
-    """
-
-    min: float | int
-    max: float | int
-    type: ParameterTypeEnum
-    log: bool = False
-
-    def __post_init__(self):
-        if not isinstance(self.type, ParameterTypeEnum):
-            self.type = ParameterTypeEnum(self.type)
-
-
-optimization_space_type = dict[
-    str, Union[list, ParameterBounds, 'optimization_space_type']
-]
-
-
-@dataclass
-class Defaults:
-    """
-    Default parameters set for each of the tasks.
-
-    These parameters will be combined with task specific ones to form the final parameters for the Terratorch training.
-
-    Args:
-        trainer_args (dict): Arguments passed to Lightning Trainer.
-        terratorch_task (dict): Arguments for the Terratorch Task.
-    """
-
-    trainer_args: dict[str, Any] = field(default_factory=dict)
-    terratorch_task: dict[str, Any] = field(default_factory=dict)
-
-
-@dataclass
-class Task:
-    """
-    Parameters passed to define each of the tasks.
-
-    These parameters are combined with any specified defaults to generate the final task parameters.
-
-    Args:
-        name (str): Name for this task
-        type (TaskTypeEnum): Type of task.
-        terratorch_task (dict): Arguments for the Terratorch Task.
-        datamodule (BaseDataModule): Datamodule to be used.
-        direction (str): One of min or max. Direction to optimize the metric in.
-        metric (str): Metric to be optimized. Defaults to "val/loss".
-        early_prune (bool): Whether to prune unpromising runs early. Defaults to False.
-        early_stop_patience (int, None): Whether to use Lightning early stopping of runs. Defaults to None, which does not do early stopping.
-        optimization_except (str[str]): HyperParameters from the optimization space to be ignored for this task.
-        max_run_duration (str, None): maximum allowed run duration in the form DD:HH:MM:SS; will stop a run after this
-            amount of time. Defaults to None, which doesn't stop runs by time.
-    """
-
-    name: str
-    type: TaskTypeEnum
-    datamodule: BaseDataModule
-    direction: str
-    terratorch_task: Optional[dict[str, Any]] = None
-    metric: str = "val/loss"
-    early_prune: bool = False
-    early_stop_patience: int | None = None
-    optimization_except: set[str] = field(default_factory=set)
-    max_run_duration: str | None = None
-
-
-@dataclass
-class TrainingSpec:
-    task: Task
-    trainer_args: dict[str, Any] = field(default_factory=dict)
-
-
-def recursive_merge(first_dict: dict[str, Any], second_dict: dict[str, Any]):
-
-    # consider using deepmerge instead of this
-    for key, val in second_dict.items():
-        if key not in first_dict:
-            first_dict[key] = val
-        else:
-            # if it is a dictionary, recurse deeper
-            if isinstance(val, dict):
-                recursive_merge(first_dict[key], val)
-            # if it is not further nested, just replace the value
-            else:
-                first_dict[key] = val
-
-
-def combine_with_defaults(task: Task, defaults: Defaults) -> TrainingSpec:
-    """
-    Combine task-specific parameters with default parameters.
-
-    Args:
-        task (Task): Task object containing task-specific parameters.
-        defaults (Defaults): Defaults object containing default parameters.
-
-    Returns:
-        TrainingSpec: TrainingSpec object containing combined parameters.
-    """
-    terratorch_task: Optional[Dict[str, Any]] = copy.deepcopy(defaults.terratorch_task)
-    if terratorch_task is None:
-        terratorch_task = {}
-    if task.terratorch_task is None:
-        task.terratorch_task = {}
-    # merge task specific args with default args
-    recursive_merge(terratorch_task, task.terratorch_task)
-    task_with_defaults = replace(task, terratorch_task=terratorch_task)
-    return TrainingSpec(task_with_defaults, defaults.trainer_args)
diff --git a/terratorch_iterate/main.py b/terratorch_iterate/main.py
deleted file mode 100644
index ebad90c..0000000
--- a/terratorch_iterate/main.py
+++ /dev/null
@@ -1,215 +0,0 @@
-import logging
-import uuid
-from pathlib import Path
-from typing import Any, List
-from jsonargparse import ArgumentParser
-from benchmark.backbone_benchmark import benchmark_backbone
-from benchmark.benchmark_types import Defaults, Task
-from benchmark.repeat_best_experiment import rerun_best_from_backbone
-from benchmark.utils import (get_logger, import_custom_modules,
-                             get_results_and_parameters, extract_parameters)
-
-def main():
-    print("Running terratorch-iterate...")
-    parser = ArgumentParser()
-
-    parser.add_argument('--defaults', type=Defaults)  # to ignore model
-    parser.add_argument('--optimization_space', type=dict)  # to ignore model
-    parser.add_argument('--experiment_name', type=str)  # to ignore model
-    parser.add_argument('--run_name', type=str)  # to ignore model
-    parser.add_argument('--save_models', type=bool)  # to ignore model
-    parser.add_argument('--storage_uri', type=str)  # to ignore model
-    parser.add_argument('--ray_storage_path', type=str)  # to ignore model
-    parser.add_argument('--n_trials', type=int)  # to ignore model
-    parser.add_argument('--run_repetitions', type=int)  # to ignore model
-    parser.add_argument('--tasks', type=list[Task])
-    parser.add_argument("--parent_run_id", type=str)
-    parser.add_argument("--output_path", type=str)
-    parser.add_argument("--logger", type=str)
-    parser.add_argument("--config", action="config")
-    parser.add_argument('--custom_modules_path', type=str) 
-    parser.add_argument('--report_on_best_val', type=bool, default=True) 
-    parser.add_argument('--test_models', type=bool, default=False) 
-    parser.add_argument('--bayesian_search', type=bool, default=True)
-    parser.add_argument("--hpo", help="optimize hyperparameters", action="store_true")
-    parser.add_argument("--repeat", help="repeat best experiments", action="store_true") 
-    parser.add_argument("--summarize", help="summarize results from repeated experiments", action="store_true") 
-    
-    
-
-    args = parser.parse_args()
-    paths: List[Any] = args.config
-    path = paths[0]
-    config = parser.parse_path(path)
-    config_init = parser.instantiate_classes(config)
-
-    #summarize results from multiple experiments
-    summarize = args.summarize
-    assert isinstance(summarize, bool), f"Error! {summarize=} is not a bool"
-    if summarize:
-        assert (
-            hpo is False and repeat is False
-        ), f"Error! both {repeat=} and {hpo=} must be False when summarizing results from multiple experiments."
-        storage_uri = config_init.storage_uri
-        assert isinstance(storage_uri, str), f"Error! {storage_uri=} is not a str"
-
-        list_of_experiment_names = config_init.list_of_experiment_names
-        assert isinstance(list_of_experiment_names, list), f"Error! {list_of_experiment_names=} is not a list"
-        for exp in list_of_experiment_names:
-            assert isinstance(exp, str), f"Error! {exp=} is not a str"
-
-        task_names = config_init.task_names
-        assert isinstance(task_names, list), f"Error! {task_names=} is not a list"
-        for t in task_names:
-            assert isinstance(t, str), f"Error! {t=} is not a str"
-
-        run_repetitions = config_init.run_repetitions
-        assert isinstance(run_repetitions, int) and run_repetitions > 0, f"Error! {run_repetitions=} is invalid"
-        #get results and parameters from mlflow logs
-        results_and_parameters = get_results_and_parameters(
-                                        storage_uri = storage_uri,
-                                        logger = logger,
-                                        experiments = list_of_experiment_names,
-                                        task_names = task_names,
-                                        num_repetitions = run_repetitions
-                                        )
-        return
-
-    #optimize hyperparameters and/or do repeated runs for single experiments
-    repeat = args.repeat
-    assert isinstance(repeat, bool), f"Error! {repeat=} is not a bool"
-    hpo = args.hpo
-    assert isinstance(hpo, bool), f"Error! {hpo=} is not a bool"
-    assert (
-        hpo is True or repeat is True
-    ), f"Error! either {repeat=} or {hpo=} must be True"
-    parent_run_id = args.parent_run_id
-    if parent_run_id is not None:
-        assert isinstance(parent_run_id, str), f"Error! {parent_run_id=} is not a str"
-
-    
-    # validate the objects
-    experiment_name = config_init.experiment_name
-    assert isinstance(experiment_name, str), f"Error! {experiment_name=} is not a str"
-    run_name = config_init.run_name
-    if run_name is not None:
-        assert isinstance(run_name, str), f"Error! {run_name=} is not a str"
-    # validate defaults
-    defaults = config_init.defaults
-    assert isinstance(defaults, Defaults), f"Error! {defaults=} is not a Defaults"
-
-    tasks = config_init.tasks
-    assert isinstance(tasks, list), f"Error! {tasks=} is not a list"
-    for t in tasks:
-        assert isinstance(t, Task), f"Error! {t=} is not a Task"
-        # if there is not specific terratorch_task specified, then use default terratorch_task
-        if t.terratorch_task is None:
-            t.terratorch_task = defaults.terratorch_task
-    # defaults.trainer_args["max_epochs"] = 5
-    storage_uri = config_init.storage_uri
-    assert isinstance(storage_uri, str), f"Error! {storage_uri=} is not a str"
-
-    optimization_space = config_init.optimization_space
-    assert isinstance(
-        optimization_space, dict
-    ), f"Error! {optimization_space=} is not a dict"
-
-    # ray_storage_path is optional
-    ray_storage_path = config_init.ray_storage_path
-    if ray_storage_path is not None:
-        assert isinstance(
-            ray_storage_path, str
-        ), f"Error! {ray_storage_path=} is not a str"
-
-    n_trials = config_init.n_trials
-    assert isinstance(n_trials, int) and n_trials > 0, f"Error! {n_trials=} is invalid"
-    run_repetitions = config_init.run_repetitions
-
-    report_on_best_val = config_init.report_on_best_val
-    assert isinstance(
-            report_on_best_val, bool
-        ), f"Error! {ray_storage_path=} is not a bool"
-
-    save_models = config_init.save_models
-    assert isinstance(
-            save_models, bool
-        ), f"Error! {save_models=} is not a bool"
-
-    test_models = config_init.test_models
-    assert isinstance(
-            test_models, bool
-        ), f"Error! {test_models=} is not a bool"
-
-    bayesian_search = config_init.bayesian_search
-    assert isinstance(
-            bayesian_search, bool
-        ), f"Error! {bayesian_search=} is not a bool"
-        
-
-    logger_path = config_init.logger
-    if logger_path is None:
-        storage_uri_path = Path(storage_uri)
-        logger = get_logger(log_folder=f"{str(storage_uri_path.parents[0])}/job_logs")
-    else:
-        logging.config.fileConfig(fname=logger_path, disable_existing_loggers=False)
-        logger = logging.getLogger("terratorch-iterate")
-
-    #custom_modules_path is optional
-    custom_modules_path = config_init.custom_modules_path
-    if custom_modules_path is not None:
-        assert isinstance(
-            custom_modules_path, str
-        ), f"Error! {custom_modules_path=} is not a str"
-        import_custom_modules(logger=logger, custom_modules_path=custom_modules_path)
-        
-    if repeat and not hpo:
-        output = config_init.output_path
-        if output is None:
-            storage_uri_path = Path(storage_uri)
-            assert (
-                storage_uri_path.exists() and storage_uri_path.is_dir()
-            ), f"Error! Unable to create new output_path based on storage_uri_path because the latter does not exist: {storage_uri_path}"
-            output_path = storage_uri_path.parents[0] / "repeated_exp_output_csv"
-            output_path.mkdir(parents=True, exist_ok=True)
-            output_path = output_path /  f"{experiment_name}_repeated_exp_mlflow.csv"
-            output = str(output_path)
-
-        logger.info("Rerun best experiments...")
-        rerun_best_from_backbone(
-            logger=logger,
-            parent_run_id=parent_run_id,
-            output_path=str(output_path),
-            defaults=defaults,
-            tasks=tasks,
-            experiment_name=experiment_name,
-            storage_uri=storage_uri,
-            optimization_space=optimization_space,
-            run_repetitions=run_repetitions,
-            save_models=save_models,
-            report_on_best_val=report_on_best_val,
-        )
-    else:
-        if not repeat and hpo:
-            run_repetitions = 0
-
-        # run_repetitions is an optional parameter
-        benchmark_backbone(
-            defaults=defaults,
-            tasks=tasks,
-            experiment_name=experiment_name,
-            storage_uri=storage_uri,
-            ray_storage_path=ray_storage_path,
-            run_name=run_name,
-            optimization_space=optimization_space,
-            n_trials=n_trials,
-            run_repetitions=run_repetitions,
-            save_models=save_models,
-            report_on_best_val=report_on_best_val,
-            test_models=test_models,
-            bayesian_search=bayesian_search,
-            logger=logger,
-        )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/terratorch_iterate/model_fitting.py b/terratorch_iterate/model_fitting.py
deleted file mode 100644
index 82ea59d..0000000
--- a/terratorch_iterate/model_fitting.py
+++ /dev/null
@@ -1,671 +0,0 @@
-"""
-This module contains all the logic for fitting models
-"""
-
-import abc
-import copy
-import dataclasses
-import importlib
-import os
-import shutil
-import types
-import uuid
-import warnings
-from abc import abstractmethod
-from functools import wraps
-from typing import Callable
-import pandas as pd
-import lightning.pytorch as pl
-import mlflow
-import optuna
-from lightning import Callback, Trainer
-from lightning.pytorch.callbacks import (
-    EarlyStopping,
-    LearningRateMonitor,
-    ModelCheckpoint,
-    Timer,
-)
-from lightning.pytorch.loggers.mlflow import MLFlowLogger
-
-# from ray.air.integrations.mlflow import
-from optuna.integration import PyTorchLightningPruningCallback
-from ray import tune
-from ray.air import CheckpointConfig, RunConfig
-from ray.train._internal.storage import StorageContext
-from ray.tune.experiment import Trial
-import pdb
-# for ddp in the future if required
-# import ray
-# from ray.train import report
-# from ray import train
-# from ray.air import CheckpointConfig, ScalingConfig
-# from ray.train.lightning import (
-#     RayDeepSpeedStrategy,
-#     RayLightningEnvironment,
-#     RayTrainReportCallback,
-#     prepare_trainer,
-# )
-# from ray.train.torch import TorchTrainer
-from ray.tune.integration.pytorch_lightning import TuneReportCheckpointCallback
-from ray.tune.schedulers import FIFOScheduler, TrialScheduler
-from ray.tune.schedulers.hb_bohb import HyperBandForBOHB
-from ray.tune.search import SearchAlgorithm, Searcher
-from ray.tune.search.bohb import TuneBOHB
-from terratorch.tasks import PixelwiseRegressionTask, SemanticSegmentationTask
-from torchgeo.datamodules import BaseDataModule
-from torchgeo.trainers import BaseTask
-
-from benchmark.benchmark_types import (
-    ParameterBounds,
-    ParameterTypeEnum,
-    TrainingSpec,
-    optimization_space_type,
-    recursive_merge,
-    valid_task_types,
-)
-
-os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = (
-    "1"  # disable tune loggers, will add csv and json manually. If this is not here, it will log to tensorboard automatically
-)
-
-SEED = 42
-
-
-class ParameterPicker(abc.ABC):
-    @abstractmethod
-    def pick_categorical(self, variable, choices):
-        pass
-
-    @abstractmethod
-    def pick_int(self, variable, low, high):
-        pass
-
-    @abstractmethod
-    def pick_float(self, variable, low, high, log=False):
-        pass
-
-
-class OptunaParameterPicker(ParameterPicker):
-    def __init__(self, trial: optuna.Trial):
-        super().__init__()
-        self.trial = trial
-
-    def pick_categorical(self, variable, choices):
-        return self.trial.suggest_categorical(variable, choices)
-
-    def pick_int(self, variable, low, high):
-        return self.trial.suggest_int(variable, low, high)
-
-    def pick_float(self, variable, low, high, log=False):
-        return self.trial.suggest_float(variable, low, high, log=log)
-
-
-class RayTuneParameterPicker(ParameterPicker):
-    def __init__(self):
-        super().__init__()
-
-    def pick_categorical(self, variable, choices):
-        return tune.choice(choices)
-
-    def pick_int(self, variable, low, high):
-        return tune.quniform(low, high, 1)
-
-    def pick_float(self, variable, low, high, log=False):
-        if log:
-            return tune.loguniform(low, high)
-        return tune.uniform(low, high)
-
-
-class _TuneReportCallback(TuneReportCheckpointCallback, pl.Callback):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-
-def inject_hparams(training_spec: TrainingSpec, config: dict):
-    # treat batch size specially
-    config_without_batch_size = copy.deepcopy(config)
-    assert isinstance(
-        config_without_batch_size, dict
-    ), f"Error! Unexpected config type: {config_without_batch_size}"
-    batch_size: int | None = config_without_batch_size.pop("batch_size", None)  # type: ignore
-    datamodule_with_generated_hparams = copy.deepcopy(training_spec.task.datamodule)
-    if batch_size:
-        datamodule_with_generated_hparams.batch_size = batch_size
-
-    terratorch_task_with_generated_hparams = copy.deepcopy(
-        training_spec.task.terratorch_task
-    )
-    if terratorch_task_with_generated_hparams is None:
-        terratorch_task_with_generated_hparams = {}
-
-    recursive_merge(terratorch_task_with_generated_hparams, config_without_batch_size)
-    task_with_generated_hparams = dataclasses.replace(
-        training_spec.task,
-        terratorch_task=terratorch_task_with_generated_hparams,
-        datamodule=datamodule_with_generated_hparams,
-    )
-    training_spec_with_generated_hparams = dataclasses.replace(
-        training_spec, task=task_with_generated_hparams
-    )
-    return training_spec_with_generated_hparams
-
-
-def get_default_callbacks(
-    early_stop_patience: int | None, max_run_duration: str | None
-) -> list[Callback]:
-    default_callbacks: list[Callback] = [
-        LearningRateMonitor(logging_interval="epoch"),
-    ]
-    if early_stop_patience is not None:
-        default_callbacks.append(
-            EarlyStopping("val/loss", patience=early_stop_patience)
-        )
-    if max_run_duration is not None:
-        default_callbacks.append(Timer(duration=max_run_duration))
-    return default_callbacks
-
-
-def generate_parameters(
-    parameter_picker: ParameterPicker,
-    current_hparams: dict,
-    hparam_space: dict,
-    ignore_keys: set[str] | None = None,
-    dictionary_position: list[str] | None = None,
-):
-    if ignore_keys is None:
-        ignore_keys = set()
-    if dictionary_position is None:
-        dictionary_position = []
-    _generate_parameters(
-        parameter_picker,
-        current_hparams,
-        hparam_space,
-        ignore_keys,
-        dictionary_position,
-    )
-
-
-def _generate_parameters(
-    parameter_picker: ParameterPicker,
-    current_hparams: dict,
-    hparam_space: dict,
-    ignore_keys: set[str],
-    dictionary_position: list[str],
-):
-    for parameter, space in hparam_space.items():
-        if parameter in ignore_keys:
-            continue
-        # if its a dictionary, continue to recurse
-        if isinstance(space, dict):
-            if parameter not in current_hparams:
-                current_hparams[parameter] = {}
-            dictionary_position.append(parameter)
-            _generate_parameters(
-                parameter_picker,
-                current_hparams[parameter],
-                hparam_space[parameter],
-                ignore_keys,
-                dictionary_position,
-            )
-            dictionary_position.pop()
-        # if not, get a value from the parameter_picker and insert it with the name prepended by the dictionary position
-        # this is important so that the full path of the parameter is used
-        # this will avoid confusion between parameters with the same name but from different components
-        else:
-            full_parameter_name = ".".join(dictionary_position + [parameter])
-            if isinstance(space, list):
-                suggestion = parameter_picker.pick_categorical(
-                    full_parameter_name, space
-                )
-                current_hparams[parameter] = suggestion
-            elif isinstance(space, ParameterBounds):
-                match space.type:
-                    case ParameterTypeEnum.integer:
-                        current_hparams[parameter] = parameter_picker.pick_int(
-                            full_parameter_name,
-                            int(space.min),
-                            int(space.max),
-                        )
-                    case ParameterTypeEnum.real:
-                        current_hparams[parameter] = parameter_picker.pick_float(
-                            full_parameter_name, space.min, space.max, log=space.log
-                        )
-                    case _:
-                        raise Exception(
-                            f"Type {space.type} not recognized. Suggest one of {[e.value for e in ParameterTypeEnum]}"
-                        )
-            else:
-                raise Exception(
-                    "Leaves of optimization space must be lists or ParameterBounds"
-                )
-
-
-"""
-single node - optuna
-"""
-
-
-def launch_training(
-    trainer: Trainer,
-    task: BaseTask,
-    datamodule: BaseDataModule,
-    run_name: str,
-    experiment_name: str,
-    metric: str,
-    storage_uri: str,
-    parent_run_id: str,
-    direction: str,
-    test_models: bool,
-    delete_models_after_testing: bool,
-) -> float:
-
-    with mlflow.start_run(run_name=run_name, nested=True) as run:
-        mlflow.set_tag("mlflow.parentRunId", parent_run_id)
-        # explicitly log batch_size. Since it is not a model param, it will not be logged
-        mlflow.log_param("batch_size", datamodule.batch_size)
-
-        trainer.logger = MLFlowLogger(
-            experiment_name=experiment_name,
-            run_id=run.info.run_id,
-            save_dir=storage_uri,
-            log_model=not delete_models_after_testing,
-        )
-        trainer.fit(task, datamodule=datamodule)
-        if test_models:
-            trainer.test(ckpt_path="best", datamodule=datamodule)
-        if delete_models_after_testing:
-            # delete the checkpoints folder in the run
-            ckpts_folder = os.path.join(
-                trainer.logger.save_dir,
-                str(trainer.logger.name),
-                trainer.logger.version,
-                "checkpoints",
-            )
-            shutil.rmtree(ckpts_folder)
-
-        client = mlflow.tracking.MlflowClient(
-            tracking_uri=storage_uri,
-        )
-        
-        if not metric.startswith("val"):
-            raise Exception(
-                f"Metric {metric} does not start with `val`. Please choose a validation metric"
-            )
-        for_pd_collect = []
-        val_metrics_names = []
-        for metric_name in client.get_run(run.info.run_id).data.metrics:
-            if metric_name.startswith("val/"):
-                val_metrics_names.append(metric_name)
-                val_metric_history = client.get_metric_history(
-                    run.info.run_id, metric_name
-                )
-                pd_convertible_metric_history = [
-                    {
-                        "metric_name": mm.key,
-                        "step": mm.step,
-                        "value": mm.value,
-                    }
-                    for mm in val_metric_history
-                ]
-                for_pd_collect += pd_convertible_metric_history
-        df_val_metrics = pd.DataFrame.from_records(for_pd_collect)
-        df_val_metrics = df_val_metrics.set_index(
-            ["metric_name", "step"], verify_integrity=True
-        )
-        series_val_metrics = df_val_metrics["value"]
-        if direction == "max":
-            best_step = series_val_metrics[metric].idxmax()
-        elif direction == "min":
-            best_step = series_val_metrics[metric].idxmin()
-        else:
-            raise Exception(f"Direction must be `max` or `min` but got {direction}")
-
-        for val_metric_name in val_metrics_names:
-            mlflow.log_metric(
-                f"best_step_{val_metric_name}",
-                series_val_metrics[(val_metric_name, best_step)],
-            )
-
-        return series_val_metrics[(metric, best_step)]
-
-
-def fit_model(
-    training_spec: TrainingSpec,
-    lightning_task_class: valid_task_types,
-    run_name: str,
-    experiment_name: str,
-    storage_uri: str,
-    parent_run_id: str,
-    trial: optuna.Trial | None = None,
-    save_models: bool = False,
-    test_models: bool = False,
-) -> tuple[float, str]:
-    pl.seed_everything(SEED, workers=True)
-    training_spec_copy = copy.deepcopy(training_spec)
-    task = training_spec_copy.task
-
-    if lightning_task_class in [
-        SemanticSegmentationTask,
-        PixelwiseRegressionTask,
-    ]:
-        task.terratorch_task["plot_on_val"] = False
-    assert isinstance(
-        task.terratorch_task, dict
-    ), f"Error! Invalid type: {task.terratorch_task}"
-
-    lightning_task = lightning_task_class(**task.terratorch_task)
-
-    if len(training_spec.trainer_args.get("callbacks", [])) > 0:
-        warnings.warn(
-            "Callbacks passed to trainer. Make sure these are stateless, as they will not be reinitialized for each task!"
-        )
-    default_callbacks: list[Callback] = get_default_callbacks(
-        task.early_stop_patience, task.max_run_duration
-    )
-
-    if task.early_prune and trial is not None:
-        default_callbacks.append(
-            PyTorchLightningPruningCallback(trial, monitor="val/loss")
-        )
-
-    delete_models_after_testing = False
-    if test_models and not save_models:
-        # we need to save the models during training to be able to test but can be deleted afterwards
-        save_models = True
-        delete_models_after_testing = True
-
-    if save_models:
-        default_callbacks.append(
-            ModelCheckpoint(monitor=task.metric, mode=task.direction)
-        )
-    if "enable_checkpointing" in training_spec_copy.trainer_args:
-        warnings.warn(
-            f"enable_checkpointing found. Will be overwritten to the value of save_models {save_models}"
-        )
-    training_spec_copy.trainer_args["enable_checkpointing"] = save_models
-    training_spec_copy.trainer_args["enable_progress_bar"] = (
-        training_spec_copy.trainer_args.get("enable_progress_bar", True)
-    )
-    # get callbacks (set to empty list if none defined) and extend with default ones
-    training_spec_copy.trainer_args.setdefault("callbacks", []).extend(
-        default_callbacks
-    )  # type: ignore
-
-    trainer = Trainer(**training_spec_copy.trainer_args)
-
-    return (
-        launch_training(
-            trainer,
-            lightning_task,
-            task.datamodule,
-            run_name,
-            experiment_name,
-            task.metric,
-            storage_uri,
-            parent_run_id,
-            task.direction,
-            test_models=test_models,
-            delete_models_after_testing=delete_models_after_testing,
-        ),
-        task.metric,
-    )
-
-
-def fit_model_with_hparams(
-    training_spec: TrainingSpec,
-    lightning_task_class: valid_task_types,
-    run_name: str,
-    experiment_name: str,
-    hparam_space: optimization_space_type,
-    storage_uri: str,
-    parent_run_id: str,
-    save_models: bool,
-    test_models: bool,
-    trial: optuna.Trial,
-) -> float:
-    """
-    Generate parameters using the optuna trial from the given parameters.
-    Then inject these into the given task.
-    It is important to make sure to not overwrite the task passed in the arguments, or these updates may affect
-    subsequent trials.
-    """
-    current_hparams: dict[str, int | float | str | bool] = {}
-    task = training_spec.task
-    generate_parameters(
-        OptunaParameterPicker(trial),
-        current_hparams,
-        hparam_space,
-        ignore_keys=task.optimization_except,
-    )
-
-    training_spec_with_generated_hparams = inject_hparams(
-        training_spec, current_hparams
-    )
-    run_name = f"{run_name}_{trial.number}"
-    return fit_model(
-        training_spec_with_generated_hparams,
-        lightning_task_class,
-        run_name,
-        experiment_name,
-        storage_uri,
-        parent_run_id,
-        trial,
-        save_models=save_models,
-        test_models=test_models,
-    )[
-        0
-    ]  # return only the metric value for optuna
-
-
-"""
-multi node - ray
-"""
-
-
-def ray_tune_model(
-    training_spec: TrainingSpec,
-    lightning_task_class: valid_task_types,
-    hparam_space: optimization_space_type,
-    storage_uri: str,
-    ray_storage_path: str,
-    experiment_name: str,
-    save_models: bool,
-    num_trials: int,
-    backbone_import: str | None = None,
-    searcher: Searcher | SearchAlgorithm | None = None,
-) -> tune.ResultGrid:
-
-    if not searcher:
-        raise ValueError("searcher must be specified")
-    trainable = tune.with_parameters(
-        ray_fit_model,
-        training_spec=training_spec,
-        lightning_task_class=lightning_task_class,
-        storage_uri=storage_uri,
-        experiment_name=experiment_name,
-        parent_run_id=mlflow.active_run().info.run_id,
-        save_models=save_models,
-        backbone_import=backbone_import,
-    )
-
-    current_hparams: dict[str, int | float | str | bool] = {}
-    task = training_spec.task
-    generate_parameters(
-        RayTuneParameterPicker(),
-        current_hparams,
-        hparam_space,
-        ignore_keys=task.optimization_except,
-    )
-
-    # Early stopping
-    # It is unclear if this is working properly when checkpoints are disabled
-    if task.early_prune:
-        search_alg: Searcher | SearchAlgorithm = TuneBOHB()
-        scheduler: TrialScheduler = HyperBandForBOHB(
-            time_attr="training_iteration",
-            max_t=training_spec.trainer_args["max_epochs"],
-            reduction_factor=2,
-            stop_last_trials=False,
-        )
-        if not save_models:
-            raise RuntimeWarning(
-                "It is unclear if using `early_prune=True` with `save_models=False` produces correct results."
-            )
-    else:
-        scheduler = FIFOScheduler()
-        search_alg = searcher
-
-    # monkey patch scheduler to add trial storage dir
-    def decorate_to_add_trial_info(fn: Callable):
-        old_fn = fn
-
-        @wraps(fn)
-        def new_func(self, tune_controller, trial: Trial):
-            trial.config["trial_storage"] = trial.storage
-            return old_fn(tune_controller, trial)
-
-        return new_func
-
-    scheduler.on_trial_add = types.MethodType(
-        decorate_to_add_trial_info(scheduler.on_trial_add), scheduler
-    )
-
-    # for ddp if required in the future
-    # scaling_config = ScalingConfig(
-    #     use_gpu=True,
-    #     num_workers=1,
-    #     resources_per_worker={"CPU": 4, "GPU": 1},
-    #     trainer_resources={"CPU": 1, "GPU": 0},
-    # )
-    # ray_trainer = TorchTrainer(
-    #     trainable,
-    #     scaling_config=scaling_config,
-    # )
-
-    trainable_with_resources = tune.with_resources(
-        trainable, resources={"cpu": 8, "gpu": 1}
-    )
-
-    storage_path = os.path.join(ray_storage_path, experiment_name)
-    tuner = tune.Tuner(
-        trainable_with_resources,
-        tune_config=tune.TuneConfig(
-            metric=task.metric,
-            mode=task.direction,
-            num_samples=num_trials,
-            search_alg=search_alg,
-            scheduler=scheduler,
-            reuse_actors=False,
-        ),
-        run_config=RunConfig(
-            name=mlflow.active_run().info.run_name,
-            storage_path=storage_path,
-            callbacks=[
-                tune.logger.CSVLoggerCallback(),
-                tune.logger.JsonLoggerCallback(),
-                # RayLogArtifactsMlFlowCallback(),
-            ],
-            checkpoint_config=(
-                CheckpointConfig(
-                    num_to_keep=1,
-                    checkpoint_score_attribute=task.metric,
-                    checkpoint_score_order=task.direction,
-                )
-                if save_models
-                else None
-            ),
-            # stop={"training_iteration": training_spec.trainer_args["max_epochs"]},
-        ),
-        param_space=current_hparams,
-    )
-    results = tuner.fit()
-    return results
-
-
-def _generate_random_name(task_name: str):
-    # needed since the random names from mlflow are affected by the seed
-    # so they are always the same
-    return f"{task_name}_{uuid.uuid4().hex[:8]}"
-
-
-def ray_fit_model(
-    config: dict,
-    training_spec: TrainingSpec,
-    lightning_task_class: valid_task_types,
-    storage_uri: str,
-    experiment_name: str,
-    parent_run_id: str,
-    save_models: bool = True,
-    backbone_import: str | None = None,
-) -> None:
-    if backbone_import:
-        importlib.import_module(backbone_import)
-    print(config)
-    pl.seed_everything(SEED, workers=True)
-    tune.utils.wait_for_gpu(
-        target_util=0.07, delay_s=10, retry=50
-    )  # sometimes process needs some time to release GPU
-
-    trial_storage: StorageContext = config.pop("trial_storage", None)
-
-    training_spec_with_generated_hparams = inject_hparams(training_spec, config)
-    task = training_spec_with_generated_hparams.task
-
-    if lightning_task_class in [
-        SemanticSegmentationTask,
-        PixelwiseRegressionTask,
-    ]:
-        task.terratorch_task["plot_on_val"] = False
-    lightning_task = lightning_task_class(**task.terratorch_task)
-
-    if len(training_spec.trainer_args.get("callbacks", [])) > 0:
-        warnings.warn(
-            "Callbacks passed to trainer. Make sure these are stateless, as they will not be reinitialized for each task!"
-        )
-
-    default_callbacks: list[Callback] = get_default_callbacks(
-        task.early_stop_patience, task.max_run_duration
-    )
-    default_callbacks.append(
-        _TuneReportCallback(metrics=[task.metric], save_checkpoints=save_models)
-    )
-
-    if "enable_checkpointing" in training_spec_with_generated_hparams.trainer_args:
-        warnings.warn(
-            "enable_checkpointing found. Will be overwritten to False as ray will be responsible for saving models."
-        )
-    training_spec_with_generated_hparams.trainer_args["enable_checkpointing"] = False
-    if "enable_progress_bar" in training_spec_with_generated_hparams.trainer_args:
-        warnings.warn("enable_progress_bar found. Will be overwritten to False")
-    training_spec_with_generated_hparams.trainer_args["enable_progress_bar"] = False
-
-    # get callbacks (set to empty list if none defined) and extend with default ones
-    training_spec_with_generated_hparams.trainer_args.setdefault(
-        "callbacks", []
-    ).extend(default_callbacks)
-
-    trainer = Trainer(**training_spec_with_generated_hparams.trainer_args)
-
-    # trainer = prepare_trainer(trainer)
-
-    mlflow.set_tracking_uri(storage_uri)
-    mlflow.set_experiment(experiment_name)
-
-    with mlflow.start_run(
-        run_name=_generate_random_name(training_spec.task.name),
-        parent_run_id=parent_run_id,
-    ) as run:
-        trainer.logger = MLFlowLogger(
-            experiment_name=experiment_name,
-            run_id=run.info.run_id,
-            run_name=run.info.run_name,
-            save_dir=storage_uri,
-            log_model=save_models,
-        )
-
-        # explicitly log batch_size. Since it is not a model param, it will not be logged
-        mlflow.log_param("batch_size", task.datamodule.batch_size)
-        trainer.fit(lightning_task, datamodule=task.datamodule)
-        print("Trial Storage: ", trial_storage.trial_fs_path)
-        if trial_storage is not None:
-            mlflow.log_artifacts(trial_storage.trial_fs_path)
diff --git a/terratorch_iterate/module.py b/terratorch_iterate/module.py
deleted file mode 100644
index 53f8eb3..0000000
--- a/terratorch_iterate/module.py
+++ /dev/null
@@ -1,66 +0,0 @@
-"""Module functions."""
-
-__copyright__ = """
-LICENSED INTERNAL CODE. PROPERTY OF IBM.
-IBM Research Licensed Internal Code
-(C) Copyright IBM Corp. 2024
-ALL RIGHTS RESERVED
-"""
-
-from dataclasses import dataclass
-from typing import Callable
-
-
-def hello_world() -> str:
-    """Return Hello World."""
-    return "Hello World"
-
-
-class Foo:
-    """An example class."""
-
-    def __init__(self, a: int) -> None:
-        """Initialize Foo.
-
-        Args:
-            a : documentation for argument a.
-        """
-        self.a = a
-
-    def method_that_would_really_waste_your_time_if_it_fails(self) -> str:
-        """Static typing could help you fix a bug in here before running any test.
-
-        Returns:
-            documentation for the returned string
-        """
-        self.a_times_1 = [1] * self.a
-        # example that would trigger a mypy typechecking failure
-        # return self.a + "When will you find out that this fails?"
-        return f"{self.a} This works"
-
-
-@dataclass
-class Bar:
-    """An example dataclass."""
-
-    #: some documentation for attribute b
-    b: str
-
-    def set_b(self, compute_b: Callable[[], str]) -> None:
-        """Set b from return of a given function.
-
-        Args:
-            compute_b (Callable[[], str]): function without arguments to determine b.
-        """
-        self.b = compute_b()
-
-
-if __name__ == "__main__":
-    # foo = Foo(1.0)  # example that would fail (but mypy can tell you in advance)
-    foo = Foo(1)
-
-    bar = Bar(b="excellent to each other")
-    print(bar.b)
-
-    bar.set_b(hello_world)
-    print(bar.b)
diff --git a/terratorch_iterate/plot_tools.py b/terratorch_iterate/plot_tools.py
deleted file mode 100644
index 5ce6c82..0000000
--- a/terratorch_iterate/plot_tools.py
+++ /dev/null
@@ -1,256 +0,0 @@
-# Copyright contributors to the geobench project
-# modified from geobench (https://github.com/ServiceNow/geo-bench/blob/main/geobench/plot_tools.py)
-
-
-import numpy as np
-from matplotlib import pyplot as plt
-import pandas as pd
-import seaborn as sns
-from matplotlib.ticker import FormatStrFormatter
-import json
-from scipy.stats import trim_mean
-
-
-sns.set_style("dark", {"grid.color": "0.98", "axes.facecolor": "(0.95, 0.95, 0.97)"})
-GEO_BENCH_DIR = "geobench"
-
-
-def biqm(scores):
-    """Return a bootstram sample of iqm."""
-    b_scores = np.random.choice(scores, size=len(scores), replace=True)
-    return trim_mean(b_scores, proportiontocut=0.25, axis=None)
-
-
-def iqm(scores):
-    """Interquantile mean."""
-    return trim_mean(scores, proportiontocut=0.25, axis=None)
-
-
-def bootstrap_iqm(
-    df,
-    group_keys=("model", "dataset", "partition name"),
-    metric="test_metric",
-    repeat=100,
-):
-    """Boostram of seeds for all model and all datasets to comput iqm score distribution."""
-    df_list = []
-    for i in range(repeat):
-        series = df.groupby(list(group_keys))[metric].apply(biqm)
-        df_list.append(series.to_frame().reset_index())
-
-    return pd.concat(df_list)
-
-
-def bootstrap_iqm_aggregate(df, metric="test_metric", repeat=100):
-    """Stratified bootstrap (by dataset) of all seeds to compute iqm score distribution for each model."""
-    group = df.groupby(["model", "dataset", "partition name"])
-
-    df_list = []
-    for i in range(repeat):
-        new_df = group.sample(frac=1, replace=True)
-        series = new_df.groupby(["model", "partition name"])[metric].apply(iqm)
-        df_list.append(series.to_frame().reset_index())
-
-    new_df = pd.concat(df_list)
-    new_df.loc[:, "dataset"] = "aggregated"
-    return new_df
-
-
-def average_seeds(
-    df, group_keys=("model", "dataset", "partition name"), metric="test metric"
-):
-    """Average seeds for all model and all datasets."""
-    df_avg = df.groupby(list(group_keys))[metric].mean()
-    df_avg = df_avg.unstack(level="dataset")
-
-    df_avg = df_avg.round(3)
-    return df_avg
-
-
-def extract_1x_data(df_all):
-    """Extract only resutls trained on 100% of the data"""
-    return df_all[
-        (df_all["partition name"] == "1.00x train")
-        | (df_all["partition name"] == "default")
-    ].copy()
-
-
-def normalize_bootstrap_and_plot(
-    df,
-    metric,
-    benchmark_name,
-    model_order,
-    model_colors=None,
-    repeat=100,
-    fig_size=None,
-    n_legend_rows=2,
-):
-    """Add aggregated data as a new dataset."""
-
-    # normalize all the scores based on the benchmark name.
-    # the normalizing data is expected to be found in the benchmark directory under normalizer.json
-    if benchmark_name:
-        normalizer = load_normalizer(benchmark_name=benchmark_name)
-        new_metric = normalizer.normalize_data_frame(df, metric)
-    else:
-        new_metric = metric
-
-    # create a new df containing bootstrapped samples of iqm
-    bootstrapped_iqm = pd.concat(
-        (
-            bootstrap_iqm_aggregate(
-                df, metric=new_metric, repeat=repeat
-            ),  # stratified bootstrap across all datasets
-            bootstrap_iqm(
-                df, metric=new_metric, repeat=repeat
-            ),  # bootstrapped iqm for each dataset
-        )
-    )
-
-    # plot results per dataset (aggregated results is an extra dataset)
-    plot_per_dataset(
-        bootstrapped_iqm,
-        model_order,
-        model_colors=model_colors,
-        metric=new_metric,
-        fig_size=fig_size,
-        n_legend_rows=n_legend_rows,
-    )
-
-
-class Normalizer:
-    """Class used to normalize results beween min and max for each dataset."""
-
-    def __init__(self, range_dict):
-        """Initialize a new instance of Normalizer class."""
-        self.range_dict = range_dict
-
-    def __call__(self, ds_name, values, scale_only=False):
-        """Call the Normalizer class."""
-        mn, mx = self.range_dict[ds_name]
-        range = mx - mn
-        if scale_only:
-            return values / range
-        else:
-            return (values - mn) / range
-
-    def from_row(self, row, scale_only=False):
-        """Normalize from row."""
-        return [
-            self(ds_name, val, scale_only=scale_only) for ds_name, val in row.items()
-        ]
-
-    def normalize_data_frame(self, df, metric):
-        """Normalize the entire dataframe."""
-        new_metric = f"normalized {metric}"
-        df[new_metric] = df.apply(
-            lambda row: self.__call__(row["dataset"], row[metric]), axis=1
-        )
-        return new_metric
-
-    def save(self, benchmark_name):
-        """Save normalizer to json file."""
-        with open(GEO_BENCH_DIR / benchmark_name / "normalizer.json", "w") as f:
-            json.dump(self.range_dict, f, indent=2)
-
-
-def load_normalizer(benchmark_name):
-    """Load normalizer from json file."""
-    with open(GEO_BENCH_DIR / benchmark_name / "normalizer.json", "r") as f:
-        range_dict = json.load(f)
-    return Normalizer(range_dict)
-
-
-def make_normalizer(data_frame, metrics=("test metric",), benchmark_name=None):
-    """Extract min and max from data_frame to build Normalizer object for all datasets."""
-    datasets = data_frame["dataset"].unique()
-    range_dict = {}
-
-    for dataset in datasets:
-        sub_df = data_frame[data_frame["dataset"] == dataset]
-        data = []
-        for metric in metrics:
-            data.append(sub_df[metric].to_numpy())
-        range_dict[dataset] = (np.min(data), np.max(data))
-
-    normalizer = Normalizer(range_dict)
-
-    if benchmark_name:
-        normalizer.save(benchmark_name)
-
-    return normalizer
-
-
-def remove_violin_outline(ax):
-    """Remove the outline of the violin plot."""
-    for pc in ax.collections:
-        pc.set_edgecolor("none")
-
-
-def plot_per_dataset(
-    df,
-    model_order,
-    metric="test metric",
-    aggregated_name="aggregated",
-    sharey=True,
-    inner="box",
-    fig_size=None,
-    n_legend_rows=1,
-    model_colors=None,
-):
-    """Violin plots for each datasets and each models.
-
-    If a dataset is named `aggregated_name` it will be the first and will be highlighted in light blue.
-
-    """
-    datasets = sorted(df["dataset"].unique())
-
-    if fig_size is None:
-        fig_width = len(datasets) * 2
-        fig_size = (fig_width, 3)
-    fig, axes = plt.subplots(1, len(datasets), sharey=sharey, figsize=fig_size)
-
-    if model_colors is None:
-        colors = sns.color_palette("colorblind", n_colors=len(model_order))
-        model_colors = dict(zip(model_order, colors))
-
-    for dataset, ax in zip(datasets, axes):
-        sub_df = df[df["dataset"] == dataset]
-        sns.violinplot(
-            x="dataset",
-            y=metric,
-            hue="model",
-            data=sub_df,
-            hue_order=model_order,
-            linewidth=0.5,
-            saturation=1,
-            scale="count",
-            inner=inner,
-            palette=model_colors,
-            ax=ax,
-        )
-        remove_violin_outline(ax)
-        ax.tick_params(axis="y", labelsize=8)
-        ax.yaxis.set_major_formatter(FormatStrFormatter("%.2f"))
-        ax.grid(axis="y")
-
-        if dataset == aggregated_name:
-            ax.set_facecolor("#cff6fc")
-
-        ax.set(xlabel=None)
-
-        if dataset != datasets[int((len(datasets) - 1) / 2)]:
-            ax.get_legend().remove()
-        else:
-            ncols = int(np.ceil(len(model_order) / n_legend_rows))
-            sns.move_legend(
-                ax, loc="lower center", bbox_to_anchor=(0.5, 1), ncol=ncols, title=""
-            )
-
-        if dataset != datasets[0]:
-            ax.set(ylabel=None)
-
-    if sharey:
-        fig.subplots_adjust(wspace=0.02)
-    else:
-        fig.subplots_adjust(wspace=0.3)
diff --git a/terratorch_iterate/py.typed b/terratorch_iterate/py.typed
deleted file mode 100644
index e69de29..0000000
diff --git a/terratorch_iterate/repeat_best_experiment.py b/terratorch_iterate/repeat_best_experiment.py
deleted file mode 100644
index 4103e99..0000000
--- a/terratorch_iterate/repeat_best_experiment.py
+++ /dev/null
@@ -1,468 +0,0 @@
-"""
-This module contains functions to re-run a best backbone with different seeds
-"""
-
-import copy
-import importlib
-import os
-import glob
-import warnings
-import logging
-from ast import literal_eval
-from random import randint
-
-import mlflow
-import mlflow.entities
-import pandas as pd
-import ray
-from jsonargparse import CLI
-from lightning import Callback, Trainer
-from lightning.pytorch import seed_everything
-from lightning.pytorch.callbacks import ModelCheckpoint
-import shutil
-from tabulate import tabulate
-from terratorch.tasks import PixelwiseRegressionTask, SemanticSegmentationTask
-
-from lightning.pytorch.loggers.mlflow import MLFlowLogger
-import time
-
-from benchmark.benchmark_types import (
-    Defaults,
-    Task,
-    TrainingSpec,
-    combine_with_defaults,
-)
-from benchmark.model_fitting import (
-    get_default_callbacks,
-    inject_hparams,
-    valid_task_types,
-)
-import pdb
-
-@ray.remote(num_cpus=8, num_gpus=1)
-def remote_fit(
-    training_spec: TrainingSpec,
-    lightning_task_class: valid_task_types,
-    best_params: dict,
-    seed: int,
-    backbone_import: str | None = None,
-) -> float | None:
-    seed_everything(seed, workers=True)
-    if backbone_import:
-        importlib.import_module(backbone_import)
-
-    with mlflow.start_run(
-        run_name=f"{lightning_task_class.name}_{seed}",
-        nested=True,
-    ):
-
-        training_spec_copy = copy.deepcopy(training_spec)
-        training_spec_with_generated_hparams = inject_hparams(
-            training_spec_copy, best_params
-        )
-        task = training_spec_with_generated_hparams.task
-
-        if lightning_task_class in [
-            SemanticSegmentationTask,
-            PixelwiseRegressionTask,
-        ]:
-            task.terratorch_task["plot_on_val"] = False
-        lightning_task = lightning_task_class(**task.terratorch_task)
-
-        if len(training_spec.trainer_args.get("callbacks", [])) > 0:
-            warnings.warn(
-                "Callbacks passed to trainer. Make sure these are stateless, as they will not be reinitialized for each task!"
-            )
-
-        default_callbacks: list[Callback] = get_default_callbacks(
-            task.early_stop_patience, task.max_run_duration
-        )
-        # get callbacks (set to empty list if none defined) and extend with default ones
-        training_spec_with_generated_hparams.trainer_args.setdefault(
-            "callbacks", []
-        ).extend(
-            default_callbacks
-        )  # type: ignore
-        if "enable_checkpointing" in training_spec_with_generated_hparams.trainer_args:
-            warnings.warn(
-                "enable_checkpointing found. Will be overwritten to False as ray will be responsible for saving models."
-            )
-        training_spec_with_generated_hparams.trainer_args["enable_checkpointing"] = (
-            False
-        )
-        if "enable_progress_bar" in training_spec_with_generated_hparams.trainer_args:
-            warnings.warn("enable_progress_bar found. Will be overwritten to False")
-        training_spec_with_generated_hparams.trainer_args["enable_progress_bar"] = False
-        trainer = Trainer(**training_spec_with_generated_hparams.trainer_args)
-        try:
-            trainer.fit(lightning_task, datamodule=task.datamodule)
-            metrics = trainer.test(
-                lightning_task, datamodule=task.datamodule, verbose=False
-            )
-            metrics = metrics[0]
-        except Exception as e:
-            raise Exception(str(e))
-        #        warnings.warn(str(e))
-        #        return None
-        test_metric = "test/" + task.metric.split("/")[1]
-        mlflow.log_metric(f"test_{test_metric}", metrics[test_metric])
-        return metrics[test_metric]
-
-
-def non_remote_fit(
-    experiment_name: str,
-    parent_run_id: str,
-    storage_uri: str,
-    task: Task,
-    training_spec: TrainingSpec,
-    lightning_task_class: valid_task_types,
-    best_params: dict,
-    seed: int,
-    backbone_import: str | None = None,
-    save_models: bool = False,
-    report_on_best_val: bool = True,
-) -> float | None:
-    seed_everything(seed, workers=True)
-    if backbone_import:
-        importlib.import_module(backbone_import)
-    with mlflow.start_run(
-        run_name=f"{task.name}_{seed}",
-        nested=True,
-    ) as run:
-        mlflow.set_tag("mlflow.parentRunId", parent_run_id)
-        training_spec_copy = copy.deepcopy(training_spec)
-        training_spec_with_generated_hparams = inject_hparams(
-            training_spec_copy, best_params
-        )
-        task = training_spec_with_generated_hparams.task
-
-        if lightning_task_class in [
-            SemanticSegmentationTask,
-            PixelwiseRegressionTask,
-        ]:
-            task.terratorch_task["plot_on_val"] = False
-        lightning_task = lightning_task_class(**task.terratorch_task)
-
-        if len(training_spec.trainer_args.get("callbacks", [])) > 0:
-            warnings.warn(
-                "Callbacks passed to trainer. Make sure these are stateless, as they will not be reinitialized for each task!"
-            )
-
-        default_callbacks: list[Callback] = get_default_callbacks(
-            task.early_stop_patience, task.max_run_duration
-        )
-        delete_models_after_testing = False
-
-        if report_on_best_val and not save_models:
-            # we need to save the models to be able to report results on best validation model
-            save_models = True
-            delete_models_after_testing = True
-
-        if save_models:
-            default_callbacks.append(
-                ModelCheckpoint(monitor=task.metric, mode=task.direction)
-            )
-
-        if "enable_checkpointing" in training_spec_with_generated_hparams.trainer_args:
-            warnings.warn(
-                f"enable_checkpointing found. Will be overwritten to the value of save_models {save_models}"
-            )
-        training_spec_with_generated_hparams.trainer_args["enable_checkpointing"] = (
-            save_models
-        )
-        if "enable_progress_bar" in training_spec_with_generated_hparams.trainer_args:
-            warnings.warn("enable_progress_bar found. Will be overwritten to False")
-        training_spec_with_generated_hparams.trainer_args["enable_progress_bar"] = False
-        # get callbacks (set to empty list if none defined) and extend with default ones
-        training_spec_with_generated_hparams.trainer_args.setdefault(
-            "callbacks", []
-        ).extend(
-            default_callbacks
-        )  # type: ignore
-
-        trainer = Trainer(**training_spec_with_generated_hparams.trainer_args)
-        trainer.logger = MLFlowLogger(
-            experiment_name=experiment_name,
-            run_id=run.info.run_id,
-            save_dir=storage_uri,
-            log_model=False,  # don't copy saved checkpoints to artifacts
-        )
-        try:
-            trainer.fit(lightning_task, datamodule=task.datamodule)
-            ckpt_path = "best" if report_on_best_val else "last"
-            metrics = trainer.test(
-                lightning_task,
-                datamodule=task.datamodule,
-                verbose=False,
-                ckpt_path=ckpt_path,
-            )
-            metrics = metrics[0]
-
-            if delete_models_after_testing:
-                # delete the checkpoints' folder in the run
-                ckpts_folder = os.path.join(
-                    trainer.logger.save_dir,  # mlflow root dir
-                    str(trainer.logger.name),  # experiment_id
-                    trainer.logger.version,  # run_id
-                    "checkpoints",
-                )
-                shutil.rmtree(ckpts_folder)
-
-        except Exception as e:
-            raise Exception(str(e))
-        #        warnings.warn(str(e))
-        #        return None
-        test_metric = "test/" + task.metric.split("/")[1]
-        mlflow.log_metric(f"test_{test_metric}", metrics[test_metric])
-        return metrics[test_metric]
-
-
-def rerun_best_from_backbone(
-    logger: logging.RootLogger,
-    parent_run_id: str,
-    output_path: str,
-    defaults: Defaults,
-    tasks: list[Task],
-    experiment_name: str,
-    storage_uri: str,
-    *args,
-    tmp_dir: str | None = None,
-    run_repetitions: int = 10,
-    backbone_import: str | None = None,
-    run_name: str | None = None,
-    n_trials: int = 1,
-    ray_storage_path: str | None = None,
-    save_models: bool = False,
-    report_on_best_val: bool = True,
-    run_id: str | None = None,
-    optimization_space: dict | None = None,
-    description: str | None = None,
-    use_ray=False,
-    **kwargs,
-):
-    """Repeat best experiments from a benchmark run. Only works with a ray cluster.
-
-    Args:
-        parent_run_id (str): mlflow id of parent run
-        output_path (str): path to store the results of the run
-        tmp_dir (str): Path to temporary directory to be used for ray
-        run_repetitions (int): How many runs (each with a different seed) to run per task.
-
-    """
-    if not os.path.isabs(output_path):
-        raise Exception(
-            f"output_path must be absolute. Consider using $(pwd)/{output_path}."
-        )
-    if (tmp_dir is None) & (use_ray == True):
-        raise Exception("tmp_dir must be specified for runs with ray.")
-
-    if use_ray:
-        os.environ["RAY_TMPDIR"] = tmp_dir
-        ray.init(_temp_dir=tmp_dir)
-    if backbone_import:
-        importlib.import_module(backbone_import)
-    mlflow.set_tracking_uri(storage_uri)
-    mlflow.set_experiment(experiment_name)
-
-    runs: list[mlflow.entities.Run] = mlflow.search_runs(
-        filter_string=f"tags.mlflow.parentRunId='{parent_run_id}'", output_format="list"
-    )  # type: ignore
-    logger.info(f"\nparent_run_id {parent_run_id}")
-    logger.info(f"\nFound runs: {[run.info.run_name for run in runs]}")
-
-    task_names = [task.name for task in tasks]
-    logger.info(f"Will only run the following: {task_names}")
-
-    table_columns = [
-        "Task",
-        "Metric",
-        "Score",
-        "mlflow_run_name",
-        "mlflow_run_id",
-        "mlflow_run_status",
-    ]
-    table_entries = []
-    ray_tasks = []
-
-    repeated_storage_uri = f"{storage_uri}_repeated_exp"
-    if not os.path.exists(repeated_storage_uri):
-        os.makedirs(repeated_storage_uri)
-
-    repeated_experiment_name = f"{experiment_name}_repeated_exp"
-    mlflow.set_tracking_uri(repeated_storage_uri)
-    mlflow.set_experiment(repeated_experiment_name)
-
-    #backbone_name = defaults.terratorch_task["model_args"]["backbone"]
-    with mlflow.start_run(run_name=experiment_name, run_id=None) as run:
-        for task in tasks:
-            logger.info(f"\n\ntask: {task.name}")
-            matching_runs = [run for run in runs if run.info.run_name.endswith(task.name)]  # type: ignore
-            if len(matching_runs) == 0:
-                msg = f"No runs found for task {task.name}. Skipping."
-                warnings.warn(msg)
-                continue
-            if len(matching_runs) > 1:
-                msg = f"More than 1 run found for task {task.name}"
-                raise Exception(msg)
-
-            # check if there are already results for this task and exp in the folder
-            past_output_path = (
-                f"{output_path.split(experiment_name)[0]}{experiment_name}_*"
-            )
-            past_output_path = glob.glob(past_output_path)
-            if len(sorted(past_output_path)) > 0:
-                output_path = sorted(past_output_path)[0]
-            logger.info(f"output path: {output_path}")
-            if os.path.exists(output_path):
-                logger.info("there are previous results from repeated experiments")
-                existing_output = pd.read_csv(output_path)
-                existing_output = existing_output[table_columns]
-                existing_task_output = existing_output.loc[
-                    existing_output["Task"] == task.name
-                ].copy()
-                rows, cols = existing_task_output.shape
-                logger.info(f"rows: {rows} \t cols: {cols}")
-                if rows > run_repetitions:
-                    logger.info("task has valid results, will not re-run")
-                    continue
-                past_seeds = [
-                    int(item.split("_")[-1])
-                    for item in existing_task_output["mlflow_run_name"].tolist()
-                ]
-            else:
-                past_seeds = []
-            logger.info(f"past_seeds for task: {past_seeds}")
-
-            best_params = matching_runs[0].data.params
-            best_params = {k: literal_eval(v) for k, v in best_params.items()}
-            training_spec = combine_with_defaults(task, defaults)
-            lightning_task_class = training_spec.task.type.get_class_from_enum()
-
-            if use_ray:  # experimental
-                successful_seeds = [randint(1, 5000) for i in range(run_repetitions)]
-                for seed in successful_seeds:
-                    ray_tasks.append(
-                        remote_fit.remote(
-                            training_spec,
-                            lightning_task_class,
-                            best_params,
-                            seed,
-                            backbone_import=backbone_import,
-                        )
-                    )
-            else:
-                experiment_info = mlflow.get_experiment_by_name(
-                    repeated_experiment_name
-                )
-                seeds = [randint(1, 5000) for i in range(run_repetitions * 3)]
-                seeds = [seed for seed in seeds if seed not in past_seeds]
-
-                for seed in seeds:
-                    if len(past_seeds) >= run_repetitions:
-                        break
-
-                    seed_run_name = f"{task.name}_{seed}"
-                    logger.info(f"now trying: {seed_run_name}")
-                    seed_run_data = mlflow.search_runs(
-                        experiment_ids=[experiment_info.experiment_id],
-                        filter_string=f'tags."mlflow.runName" LIKE "{seed_run_name}"',
-                        output_format="list",
-                    )  # type: ignore
-                    if len(seed_run_data) > 0:
-                        for item in seed_run_data:
-                            logger.info(f"deleting existing run: {item}")
-                            mlflow.delete_run(item.info.run_id)
-
-                    score = non_remote_fit(
-                        experiment_name=repeated_experiment_name,
-                        parent_run_id=run.info.run_id,
-                        storage_uri=repeated_storage_uri,
-                        task=task,
-                        training_spec=training_spec,
-                        lightning_task_class=lightning_task_class,
-                        best_params=best_params,
-                        seed=seed,
-                        backbone_import=backbone_import,
-                        save_models=save_models,
-                        report_on_best_val=report_on_best_val,
-                    )
-                    # check if run with name finished successfully
-                    logger.info(f"score: {score}")
-                    # TODO improve this sleep command - try to get a better estimate than this
-                    time.sleep(60)
-                    seed_run_data = mlflow.search_runs(
-                        experiment_ids=[experiment_info.experiment_id],
-                        filter_string=f'tags."mlflow.runName" LIKE "{seed_run_name}"',
-                        output_format="list",
-                    )  # type: ignore
-
-                    logger.info(
-                        f"run for task {task.name} seed {seed} complete"
-                    )
-                    if len(seed_run_data) > 0:
-                        if seed_run_data[0].info.status != "FINISHED":
-                            mlflow.delete_run(seed_run_data[0].info.run_id)
-                            continue
-                        past_seeds.append(seed)
-                        new_data = pd.DataFrame(
-                            {
-                                "Task": [task.name],
-                                "Metric": [task.metric.split("/")[-1]],
-                                "Score": [score],
-                                "mlflow_run_name": [seed_run_name],
-                                "mlflow_run_id": [seed_run_data[0].info.run_id],
-                                "mlflow_run_status": [seed_run_data[0].info.status],
-                            }
-                        )
-                        logger.info(
-                            f"completed seeds so far for this task: {len(past_seeds)}"
-                        )
-                        if os.path.exists(output_path):
-                            logger.info(
-                                "there are previous results from repeated experiments"
-                            )
-                            existing_output = pd.read_csv(output_path)
-                            existing_output = existing_output[table_columns]
-                            existing_output.reset_index(inplace=True)
-                            existing_task_output = existing_output.loc[
-                                existing_output["Task"] == task.name
-                            ].copy()
-                            rows, cols = existing_task_output.shape
-                            logger.info(f"rows: {rows} \t cols: {cols}")
-                            if rows == 0:
-                                logger.info("no past results for this task")
-                            existing_output = pd.concat(
-                                [existing_output, new_data], axis=0
-                            )
-                            existing_output.reset_index(inplace=True)
-                            existing_output.to_csv(output_path, index=False)
-                        else:
-                            new_data.to_csv(output_path, index=False)
-
-    if use_ray:  # experimental
-        results = ray.get(ray_tasks)
-        table_entries = [
-            [
-                task.name,
-                task.metric.split("/")[-1],
-                result,
-                matching_runs[0].info.run_id,
-            ]
-            for task, result in zip(
-                [task for task in tasks for _ in seeds], results
-            )  # expand tasks
-        ]
-
-        table = tabulate(table_entries, headers=table_columns)
-        logger.info(table)
-        df = pd.DataFrame(data=table_entries, columns=table_columns)
-        df.to_csv(output_path, index=False)
-        ray.shutdown()
-
-
-def main():
-    CLI(rerun_best_from_backbone, fail_untyped=False)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/terratorch_iterate/resources/dataset_specifications/agb.yaml b/terratorch_iterate/resources/dataset_specifications/agb.yaml
deleted file mode 100644
index 33e9c95..0000000
--- a/terratorch_iterate/resources/dataset_specifications/agb.yaml
+++ /dev/null
@@ -1,64 +0,0 @@
-class_path: terratorch.datamodules.GenericNonGeoPixelwiseRegressionDataModule
-init_args:
-  batch_size: 16
-  num_workers: 4
-  train_transform:
-    - class_path: albumentations.HorizontalFlip
-      init_args:
-        p: 0.5
-    - class_path: albumentations.augmentations.geometric.rotate.Rotate
-      init_args:
-        limit: 30
-        border_mode: 0 # cv2.BORDER_CONSTANT
-        # value: 0
-        # mask_value: 1
-        p: 0.5
-      dict_kwargs:
-        value: 0
-        mask_value: 1
-    - class_path: ToTensorV2
-  dataset_bands:
-    - 0
-    - BLUE
-    - GREEN
-    - RED
-    - NIR_NARROW
-    - SWIR_1
-    - SWIR_2
-    - 1
-    - 2
-    - 3
-    - 4
-  output_bands:
-    - BLUE
-    - GREEN
-    - RED
-    - NIR_NARROW
-    - SWIR_1
-    - SWIR_2
-  rgb_indices:
-    - 2
-    - 1
-    - 0
-  train_data_root: /dccstor/hhr-weather/latest_filters_all_agb_patches_tts_clipped_0_500/train_images
-  train_label_data_root: /dccstor/hhr-weather/latest_filters_all_agb_patches_tts_clipped_0_500/train_labels
-  val_data_root: /dccstor/hhr-weather/latest_filters_all_agb_patches_tts_clipped_0_500/val_images
-  val_label_data_root: /dccstor/hhr-weather/latest_filters_all_agb_patches_tts_clipped_0_500/val_labels
-  test_data_root: /dccstor/hhr-weather/latest_filters_all_agb_patches_tts_clipped_0_500/test_images
-  test_label_data_root: /dccstor/hhr-weather/latest_filters_all_agb_patches_tts_clipped_0_500/test_labels
-  # img_grep: "*.tif"
-  # label_grep: "*.tif"
-  means:
-    - 385.88501817
-    - 714.60615207
-    - 658.96267376
-    - 3314.57774238
-    - 2238.71812558
-    - 1250.00982518
-  stds:
-    - 264.62872
-    - 355.62848
-    - 504.54855
-    - 898.4953
-    - 947.22894
-    - 828.1297
diff --git a/terratorch_iterate/resources/dataset_specifications/eurosat.yaml b/terratorch_iterate/resources/dataset_specifications/eurosat.yaml
deleted file mode 100644
index 029ee51..0000000
--- a/terratorch_iterate/resources/dataset_specifications/eurosat.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-class_path: terratorch.datamodules.TorchNonGeoDataModule
-init_args:
-  transforms:
-    # a possible way to select bands:
-    # - class_path: SelectBands
-    #   init_args:
-    #     band_indices:
-    #       - 2
-    #       - 1
-    #       - 0
-    - class_path: albumentations.augmentations.geometric.resize.Resize
-      dict_kwargs:
-        height: 224
-        width: 224
-    - class_path: ToTensorV2
-  cls: torchgeo.datamodules.EuroSATDataModule
-  batch_size: 16
-  num_workers: 4
-dict_kwargs:
-  root: /dccstor/geofm-pre/EuroSat
-  download: True
-  bands:
-    - B02
-    - B03
-    - B04
-    - B08A
-    - B11
-    - B12
diff --git a/terratorch_iterate/resources/dataset_specifications/fire_scars.yaml b/terratorch_iterate/resources/dataset_specifications/fire_scars.yaml
deleted file mode 100644
index a2f50a1..0000000
--- a/terratorch_iterate/resources/dataset_specifications/fire_scars.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-class_path: terratorch.datamodules.GenericNonGeoSegmentationDataModule
-init_args:
-  batch_size: 4
-  num_workers: 8
-  dataset_bands:
-    - BLUE
-    - GREEN
-    - RED
-    - NIR_NARROW
-    - SWIR_1
-    - SWIR_2
-  output_bands:
-    - BLUE
-    - GREEN
-    - RED
-    - NIR_NARROW
-    - SWIR_1
-    - SWIR_2
-  rgb_indices:
-    - 2
-    - 1
-    - 0
-  train_transform:
-    - class_path: albumentations.RandomCrop
-      init_args:
-        height: 224
-        width: 224
-    - class_path: albumentations.HorizontalFlip
-      init_args:
-        p: 0.5
-    - class_path: ToTensorV2
-  no_data_replace: 0
-  no_label_replace: -1
-  train_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/training
-  train_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/training
-  val_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
-  val_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
-  test_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
-  test_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
-  img_grep: "*_merged.tif"
-  label_grep: "*.mask.tif"
-  means:
-    - 0.033349706741586264
-    - 0.05701185520536176
-    - 0.05889748132001316
-    - 0.2323245113436119
-    - 0.1972854853760658
-    - 0.11944914225186566
-  stds:
-    - 0.02269135568823774
-    - 0.026807560223070237
-    - 0.04004109844362779
-    - 0.07791732423672691
-    - 0.08708738838140137
-    - 0.07241979477437814
-  num_classes: 2
diff --git a/terratorch_iterate/resources/dataset_specifications/multi_temporal_crop.yaml b/terratorch_iterate/resources/dataset_specifications/multi_temporal_crop.yaml
deleted file mode 100644
index bc30877..0000000
--- a/terratorch_iterate/resources/dataset_specifications/multi_temporal_crop.yaml
+++ /dev/null
@@ -1,57 +0,0 @@
-class_path: terratorch.datamodules.GenericNonGeoSegmentationDataModule
-init_args:
-  batch_size: 8
-  num_workers: 12
-  train_transform:
-    - class_path: FlattenTemporalIntoChannels
-    - class_path: albumentations.Flip
-    - class_path: ToTensorV2
-    - class_path: UnflattenTemporalFromChannels
-      init_args:
-        n_timesteps: 3
-  dataset_bands:
-      - BLUE
-      - GREEN
-      - RED
-      - NIR_NARROW
-      - SWIR_1
-      - SWIR_2
-  output_bands:
-    - BLUE
-    - GREEN
-    - RED
-    - NIR_NARROW
-    - SWIR_1
-    - SWIR_2
-  rgb_indices:
-    - 2
-    - 1
-    - 0
-  reduce_zero_label: True
-  expand_temporal_dimension: True
-  train_data_root: /dccstor/geofm-finetuning/hls_cdl_reclassed/training_chips
-  train_label_data_root: /dccstor/geofm-finetuning/hls_cdl_reclassed/training_chips
-  val_data_root: /dccstor/geofm-finetuning/hls_cdl_reclassed/validation_chips
-  val_label_data_root: /dccstor/geofm-finetuning/hls_cdl_reclassed/validation_chips
-  test_data_root: /dccstor/geofm-finetuning/hls_cdl_reclassed/validation_chips
-  test_label_data_root: /dccstor/geofm-finetuning/hls_cdl_reclassed/validation_chips
-  train_split: /dccstor/geofm-finetuning/hls_cdl_reclassed/training_chips/training_data.txt
-  test_split: /dccstor/geofm-finetuning/hls_cdl_reclassed/validation_chips/validation_data.txt
-  val_split: /dccstor/geofm-finetuning/hls_cdl_reclassed/validation_chips/validation_data.txt
-  img_grep: "*_merged.tif"
-  label_grep: "*.mask.tif"
-  means:
-    - 494.905781
-    - 815.239594
-    - 924.335066
-    - 2968.881459
-    - 2634.621962
-    - 1739.579917
-  stds:
-    - 284.925432
-    - 357.84876
-    - 575.566823
-    - 896.601013
-    - 951.900334
-    - 921.407808
-  num_classes: 13
diff --git a/terratorch_iterate/resources/dataset_specifications/sen1floods11.yaml b/terratorch_iterate/resources/dataset_specifications/sen1floods11.yaml
deleted file mode 100644
index d3201e1..0000000
--- a/terratorch_iterate/resources/dataset_specifications/sen1floods11.yaml
+++ /dev/null
@@ -1,59 +0,0 @@
-class_path: terratorch.datamodules.GenericNonGeoSegmentationDataModule
-init_args:
-  batch_size: 8
-  num_workers: 4
-  constant_scale: 0.0001
-  dataset_bands:
-      - COASTAL_AEROSOL
-      - BLUE
-      - GREEN
-      - RED
-      - RED_EDGE_1
-      - RED_EDGE_2
-      - RED_EDGE_3
-      - NIR_BROAD
-      - NIR_NARROW
-      - WATER_VAPOR
-      - CIRRUS
-      - SWIR_1
-      - SWIR_2
-  output_bands:
-    - BLUE
-    - GREEN
-    - RED
-    - NIR_NARROW
-    - SWIR_1
-    - SWIR_2
-  rgb_indices:
-    - 2
-    - 1
-    - 0
-  train_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/S2Hand/
-  train_label_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/LabelHand
-  val_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/S2Hand/
-  val_label_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/LabelHand
-  test_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/S2Hand/
-  test_label_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/LabelHand
-  # these must be obtained by running terratorch/examples/scripts/convert_sen1floods11_splits.py on the original split csv files
-  train_split: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/splits/flood_handlabeled/flood_train_data.txt
-  test_split: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/splits/flood_handlabeled/flood_test_data.txt
-  val_split: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/splits/flood_handlabeled/flood_valid_data.txt
-  img_grep: "*_S2Hand.tif"
-  label_grep: "*_LabelHand.tif"
-  no_label_replace: -1
-  no_data_replace: 0
-means:
-  - 0.1412956
-  - 0.13795798
-  - 0.12353792
-  - 0.30902815
-  - 0.2044958
-  - 0.11912015
-stds:
-  - 0.07406382
-  - 0.07370365
-  - 0.08692279
-  - 0.11798815
-  - 0.09772074
-  - 0.07659938
-num_classes: 2
diff --git a/terratorch_iterate/resources/dataset_specifications/sen1floods11_transforms.yaml b/terratorch_iterate/resources/dataset_specifications/sen1floods11_transforms.yaml
deleted file mode 100644
index ffea683..0000000
--- a/terratorch_iterate/resources/dataset_specifications/sen1floods11_transforms.yaml
+++ /dev/null
@@ -1,67 +0,0 @@
-class_path: terratorch.datamodules.GenericNonGeoSegmentationDataModule
-init_args:
-  batch_size: 8
-  num_workers: 4
-  constant_scale: 0.0001
-  dataset_bands:
-      - COASTAL_AEROSOL
-      - BLUE
-      - GREEN
-      - RED
-      - RED_EDGE_1
-      - RED_EDGE_2
-      - RED_EDGE_3
-      - NIR_BROAD
-      - NIR_NARROW
-      - WATER_VAPOR
-      - CIRRUS
-      - SWIR_1
-      - SWIR_2
-  output_bands:
-    - BLUE
-    - GREEN
-    - RED
-    - NIR_NARROW
-    - SWIR_1
-    - SWIR_2
-  rgb_indices:
-    - 2
-    - 1
-    - 0
-  train_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/S2Hand/
-  train_label_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/LabelHand
-  val_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/S2Hand/
-  val_label_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/LabelHand
-  test_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/S2Hand/
-  test_label_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/LabelHand
-  # these must be obtained by running terratorch/examples/scripts/convert_sen1floods11_splits.py on the original split csv files
-  train_split: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/splits/flood_handlabeled/flood_train_data.txt
-  test_split: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/splits/flood_handlabeled/flood_test_data.txt
-  val_split: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/splits/flood_handlabeled/flood_valid_data.txt
-  img_grep: "*_S2Hand.tif"
-  label_grep: "*_LabelHand.tif"
-  no_label_replace: -1
-  no_data_replace: 0
-  train_transform:
-    - class_path: albumentations.HorizontalFlip
-      init_args:
-        p: 0.5
-    - class_path: albumentations.VerticalFlip
-      init_args:
-        p: 0.5
-    - class_path: ToTensorV2
-  means:
-    - 0.1412956
-    - 0.13795798
-    - 0.12353792
-    - 0.30902815
-    - 0.2044958
-    - 0.11912015
-  stds:
-    - 0.07406382
-    - 0.07370365
-    - 0.08692279
-    - 0.11798815
-    - 0.09772074
-    - 0.07659938
-  num_classes: 2
diff --git a/terratorch_iterate/tests/__init__.py b/terratorch_iterate/tests/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/terratorch_iterate/utils.py b/terratorch_iterate/utils.py
deleted file mode 100644
index d6564c4..0000000
--- a/terratorch_iterate/utils.py
+++ /dev/null
@@ -1,866 +0,0 @@
-import os
-from typing import Any, Dict
-import mlflow
-import datetime
-import logging
-from pathlib import Path
-import pandas as pd
-import numpy as np
-import seaborn as sns
-from matplotlib import pyplot as plt
-from ast import literal_eval
-import optuna
-from benchmark.benchmark_types import Task
-from benchmark import plot_tools
-import sys
-from mlflow.entities.experiment import Experiment
-import importlib
-import logging
-
-SEGMENTATION_BASE_TASKS = [
-    'chesapeake',
-    'sa_crop_type',
-    'pv4ger_seg',
-    'cashew',
-    'neontree',
-    'nz_cattle',
-]
-CLASSIFICATION_BASE_TASKS = [
-    'pv4ger',
-    'so2sat',
-    'brick_kiln',
-    'big_earth_net',
-    'eurosat',
-    'forestnet',
-]
-N_TRIALS_DEFAULT = 16
-REPEATED_SEEDS_DEFAULT = 10
-DATA_PARTITIONS = {
-    "default": 100,
-    "1.00x_train": 100,
-    "0.50x_train": 50,
-    "0.20x_train": 20,
-    "0.10x_train": 10,
-    "0.01x_train": 1,
-}
-
-
-def unflatten(dictionary: Dict[str, Any]):
-    resultDict: Dict = {}
-    for key, value in dictionary.items():
-        parts = key.split(".")
-        d = resultDict
-        for part in parts[:-1]:
-            if part not in d:
-                d[part] = {}
-            d = d[part]
-        d[parts[-1]] = value
-    return resultDict
-
-
-def sync_mlflow_optuna(
-    optuna_db_path: str,
-    storage_uri: str,
-    experiment_name: str,
-    task_run_id: str | None,
-    task: Task,
-    n_trials: int,
-    logger: logging.RootLogger,
-) -> str | None:
-    """
-        syncs the number of completed trials in mflow and optuna
-    Args:
-        optuna_db_path: path to optuna database
-        storage_uri: path to mlflow storage folder
-        experiment_name: name on experiment in mlflow
-        task_run_id: run_id of the task
-        task: name of the task
-        logger: logging.RootLogger to save logs to file
-    Returns:
-        task_run_id: run id of the task to be continued (if one exists) or None
-    """
-    # check number of successful mlflow runs in task
-    client = mlflow.tracking.MlflowClient(tracking_uri=storage_uri)
-    completed_in_mlflow_for_task = []
-    all_mlflow_runs_for_task = []
-    if task_run_id is not None:
-        all_mlflow_runs_for_task.append(task_run_id)
-        logger.info(f"task_run_id : {task_run_id}")
-        experiment_info = client.get_experiment_by_name(experiment_name)
-        assert isinstance(
-            experiment_info, Experiment
-        ), f"Error! Unexpected type of {experiment_info=}"
-        individual_run_data = client.search_runs(
-            experiment_ids=[experiment_info.experiment_id],
-            filter_string=f'tags."mlflow.parentRunId" LIKE "{task_run_id}"',
-        )
-        for individual_run in individual_run_data:
-            if individual_run.info.status == "FINISHED":
-                completed_in_mlflow_for_task.append(individual_run.info.run_id)
-            all_mlflow_runs_for_task.append(individual_run.info.run_id)
-
-    # check number of successful optuna trials in the database
-    study_names = optuna.study.get_all_study_names(
-        storage="sqlite:///{}.db".format(optuna_db_path)
-    )
-    if task.name in study_names:
-        loaded_study = optuna.load_study(
-            study_name=task.name, storage="sqlite:///{}.db".format(optuna_db_path)
-        )
-        logger.info(f"loaded_study has : {len(loaded_study.trials)} trials")
-        incomplete = 0
-        for trial in loaded_study.trials:
-            if (trial.state == optuna.trial.TrialState.FAIL) | (
-                trial.state == optuna.trial.TrialState.RUNNING
-            ):
-                incomplete += 1
-        logger.info(f"{incomplete} trials are incomplete")
-        successful_optuna_trials = len(loaded_study.trials) - incomplete
-        too_many_trials = successful_optuna_trials > n_trials
-        no_existing_task = task_run_id is None
-        optuna_mlflow_mismatch = (
-            len(completed_in_mlflow_for_task) != successful_optuna_trials
-        )
-        logger.info(
-            f"successful optuna trials {successful_optuna_trials} . mlflow runs {len(completed_in_mlflow_for_task)}"
-        )
-
-        if too_many_trials or no_existing_task or optuna_mlflow_mismatch:
-            logger.info(f"deleting study with name {task.name}")
-            logger.info(f"too_many_trials {too_many_trials}")
-            logger.info(f"no_existing_task {no_existing_task}")
-
-            # delete optuna study in database
-            optuna.delete_study(
-                study_name=task.name, storage="sqlite:///{}.db".format(optuna_db_path)
-            )
-
-            # delete any existing mlflow runs
-            if len(all_mlflow_runs_for_task) > 0:
-                for item in all_mlflow_runs_for_task:
-                    logger.info(f"deleting {item}")
-                    client.delete_run(item)
-                    assert isinstance(
-                        experiment_info, Experiment
-                    ), f"Error! Unexpected type of {experiment_info=}"
-                    os.system(f"rm -r {experiment_info.artifact_location}/{item}")
-                    task_run_id = None
-    else:
-        # delete any existing mlflow runs
-        if len(all_mlflow_runs_for_task) > 0:
-            for item in all_mlflow_runs_for_task:
-                logger.info(f"deleting {item}")
-                client.delete_run(item)
-                assert isinstance(
-                    experiment_info, Experiment
-                ), f"Error! Unexpected type of {experiment_info=}"
-                os.system(f"rm -r {experiment_info.artifact_location}/{item}")
-            task_run_id = None
-    return task_run_id
-
-
-def extract_repeated_experiment_results(
-    storage_uri: str,
-    logger: logging.RootLogger,
-    experiments: list,
-    num_repetitions: int = REPEATED_SEEDS_DEFAULT,
-    task_names: list = SEGMENTATION_BASE_TASKS,
-) -> (pd.DataFrame, list):
-    """
-    extracts results of repeated experiments from mlflow logs and saves them in csv
-    save list of incomplete experiments to a txt file
-    Args:
-        storage_uri: path to mlflow storage folder
-        logger: logging.RootLogger to save logs to file
-        experiments: list of experiment names
-        num_repetitions: number of repeated seeds per task
-        task_names: list of tasks
-    """
-    if Path(storage_uri).exists() and Path(storage_uri).is_dir():
-        storage_uri = Path(storage_uri)
-        repeated_exp_storage_uri = storage_uri.with_name(
-            f"{storage_uri.name}_repeated_exp"
-        )
-    else:
-        print("Please use a valid directory for storage_uri")
-        raise ValueError
-    logger.info(
-        f"\n Extracting results of repeated experiments from: {str(repeated_exp_storage_uri)}"
-    )
-    client = mlflow.tracking.MlflowClient(tracking_uri=str(repeated_exp_storage_uri))
-    experiments = list(set(experiments))
-    incomplete_experiments = []
-    num_tasks = len(task_names)
-    combine_exp_results = []
-
-    for original_experiment_name in experiments:
-        experiment_name = f"{original_experiment_name}_repeated_exp"
-        logger.info(f"\nexperiment_name: {experiment_name}")
-        experiment_info = client.get_experiment_by_name(experiment_name)
-        if experiment_info is None:
-            logger.info(
-                f"EXPERIMENT {experiment_name} DOES NOT EXIST IN THIS FOLDER: {str(repeated_exp_storage_uri)}"
-            )
-            incomplete_experiments.append(experiment_name)
-            continue
-        experiment_id = experiment_info.experiment_id
-        logger.info(f"experiment_id: {experiment_id}")
-        logger.info(f"experiment_info: {experiment_info}")
-        experiment_parent_run_data = client.search_runs(experiment_ids=[experiment_id])
-        run_names = []
-        run_ids = []
-        run_seed = []
-        run_task = []
-        run_score = []
-        run_metric = []
-        run_status = []
-        exp_ids = []
-        exp_names = []
-        logger.info(f"experiment_parent_run_data: {len(experiment_parent_run_data)}")
-        for run in experiment_parent_run_data:
-            run_name = run.info.run_name
-            task = "_".join(run_name.split("_")[:-1])
-            if (task in task_names) and (run.info.status == "FINISHED"):
-                seed = int(run.info.run_name.split("_")[-1])
-                if task in SEGMENTATION_BASE_TASKS:
-                    metric_name = 'test_test/Multiclass_Jaccard_Index'
-                else:  # conditions for other task types to be added
-                    if task == "big_earth_net":
-                        metric_name = 'test_test/Multilabel_F1_Score'
-                    else:
-                        metric_name = 'test_test/Overall_Accuracy'
-
-                if metric_name not in run.data.metrics:
-                    continue
-                score = run.data.metrics[metric_name]
-                run_names.append(run.info.run_name)
-                exp_ids.append(experiment_id)
-                exp_names.append(original_experiment_name)
-                run_ids.append(run.info.run_id)
-                run_status.append(run.info.status)
-                run_seed.append(seed)
-                run_metric.append(metric_name.split("/")[-1])
-                run_task.append(task)
-                run_score.append(score)
-
-        df = pd.DataFrame(
-            {
-                "dataset": run_task,
-                "Metric": run_metric,
-                "test metric": run_score,
-                "mlflow_run_name": run_names,
-                "mlflow_run_id": run_ids,
-                "mlflow_run_status": run_status,
-                "Seed": run_seed,
-                "experiment_id": exp_ids,
-                "experiment_name": exp_names,
-            }
-        )
-        if len(run_task) == 0:
-            logger.info(
-                f"EXPERIMENT INCOMPLETE: {experiment_name} has no complete tasks."
-            )
-            incomplete_experiments.append(experiment_name)
-            continue
-        print(f"\n\n\ndf: {df}")
-
-        # get successful results per task
-        combine_task_results = []
-        for task in task_names:
-            task_df = df.loc[
-                (df["dataset"] == task) & (df["mlflow_run_status"] == "FINISHED")
-            ].copy()
-            task_df = task_df.loc[(task_df["test metric"] != 0.0)].copy()
-            rows, _ = task_df.shape
-            if (rows >= num_repetitions) and (
-                sum(np.isnan(task_df["test metric"])) == 0
-            ):
-                task_df = task_df.iloc[list(range(num_repetitions))].copy()
-                combine_task_results.append(task_df)
-            elif rows < num_repetitions:
-                logger.info(f"TASK INCOMPLETE: {task} only has {rows} seeds")
-                incomplete_experiments.append(experiment_name)
-        if len(combine_task_results) > 0:
-            combine_task_results = pd.concat(combine_task_results, axis=0)
-            combine_exp_results.append(combine_task_results)
-        if len(combine_task_results) < num_tasks:
-            logger.info(
-                f"EXPERIMENT INCOMPLETE: {experiment_name} has {len(combine_task_results)} complete tasks only"
-            )
-            incomplete_experiments.append(experiment_name)
-    combine_exp_results = pd.concat(combine_exp_results, axis=0)
-    print(f"\n\n\ncombine_exp_results: {combine_exp_results}")
-    return (combine_exp_results, incomplete_experiments)
-
-
-def extract_parameters(
-    storage_uri: str,
-    logger: logging.RootLogger,
-    experiments: list,
-    task_names: list = SEGMENTATION_BASE_TASKS,
-) -> pd.DataFrame:
-    """
-    extracts hyper-parameter information for each experiment from the mlflow logs
-    saves this information to a csv file
-
-    Args:
-        storage_uri: path to mlflow storage folder used in configs
-        logger: logging.RootLogger to save logs to file
-        experiment_data: list of experiment names
-        task_names: list of tasks
-    """
-    logger.info(f"\n Extracting parameters of experiments from: {storage_uri}")
-    experiments = list(set(experiments))
-    all_params = []
-    client = mlflow.tracking.MlflowClient(tracking_uri=storage_uri)
-    for experiment_name in experiments:
-        # get experiment id
-        experiment_info = client.get_experiment_by_name(experiment_name)
-        if experiment_info is None:
-            continue
-        experiment_id = experiment_info.experiment_id
-        logger.info(f"\nexperiment_name: {experiment_name} ")
-        logger.info(f"experiment_id: {experiment_info.experiment_id}")
-        exp_parent_run_name = f"top_run_{experiment_name}"
-        experiment_parent_run_data = client.search_runs(
-            experiment_ids=[experiment_id],
-            filter_string=f'tags."mlflow.runName" LIKE "{exp_parent_run_name}"',
-        )
-        if (len(experiment_parent_run_data) > 1) or (
-            len(experiment_parent_run_data) == 0
-        ):
-            logger.debug(
-                f"The number of parent runs for each experiment should be 1. \
-                         It is currently {len(experiment_parent_run_data)}"
-            )
-            raise RuntimeError
-        for run in experiment_parent_run_data:
-            exp_parent_run_id = run.info.run_id
-
-        mlflow.set_tracking_uri(storage_uri)
-        mlflow.set_experiment(experiment_name)
-        runs: list[mlflow.entities.Run] = mlflow.search_runs(
-            filter_string=f"tags.mlflow.parentRunId='{exp_parent_run_id}'",
-            output_format="list",
-        )  # type: ignore
-        logger.info(f"Found runs: {[run.info.run_name for run in runs]}")
-
-        for task in task_names:
-            logger.info(f"task: {task}")
-            matching_runs = [run for run in runs if run.info.run_name.endswith(task)]  # type: ignore
-            best_params = matching_runs[0].data.params
-
-            # eval them
-            best_params = {k: literal_eval(v) for k, v in best_params.items()}
-            best_params["experiment_name"] = experiment_name
-            best_params["dataset"] = task
-            best_params["decoder"] = matching_runs[0].data.tags["decoder"]
-            best_params["backbone"] = matching_runs[0].data.tags["backbone"]
-            best_params["early_stop_patience"] = matching_runs[0].data.tags[
-                "early_stop_patience"
-            ]
-            best_params["n_trials"] = matching_runs[0].data.tags["n_trials"]
-            best_params["partition_name"] = matching_runs[0].data.tags["partition_name"]
-            best_params["data_percentages"] = DATA_PARTITIONS[
-                best_params["partition_name"]
-            ]
-            if 'optimizer_hparams' in best_params:
-                logger.info(
-                    f"optimizer_hparams: {best_params['optimizer_hparams'].items()}"
-                )
-                optimizer_hparams = {
-                    k: v for k, v in best_params['optimizer_hparams'].items()
-                }
-                best_params.update(optimizer_hparams)
-                del best_params['optimizer_hparams']
-            if 'model_args' in best_params:
-                model_args = {k: v for k, v in best_params['model_args'].items()}
-                best_params.update(model_args)
-                del best_params['model_args']
-
-            best_params = pd.DataFrame(best_params, index=[0])
-            all_params.append(best_params)
-    all_params = pd.concat(all_params, axis=0)
-    all_params = all_params.reset_index()
-    return all_params
-
-
-def get_results_and_parameters(
-    storage_uri: str,
-    logger: logging.RootLogger,
-    experiments: list,
-    task_names: list = SEGMENTATION_BASE_TASKS + CLASSIFICATION_BASE_TASKS,
-    num_repetitions: int = REPEATED_SEEDS_DEFAULT,
-) -> pd.DataFrame:
-    """
-    extracts results and parameters for experiments from mlflow logs
-
-    Args:
-        storage_uri: path to mlflow storage folder used in configs
-        logger: logging.RootLogger to save logs to file
-        experiment_data: list of experiment names
-        task_names: list of tasks
-        num_repetitions: number of repeated seeds per task
-    Returns:
-        pd.DataFrame with results and parameters
-    """
-    if Path(storage_uri).exists() and Path(storage_uri).is_dir():
-        results_dir = Path(storage_uri).parents[0] / "summarized_results"
-    else:
-        print("Please use a valid directory for storage_uri")
-        raise ValueError
-    if not os.path.exists(results_dir):
-        os.makedirs(results_dir)
-
-    parameters = extract_parameters(
-        storage_uri=storage_uri,
-        logger=logger,
-        experiments=experiments,
-        task_names=task_names,
-    )
-
-    # extract repeated experiment results from mlflow logs
-    (results, incomplete_experiments) = extract_repeated_experiment_results(
-        storage_uri=storage_uri,
-        logger=logger,
-        experiments=experiments,
-        num_repetitions=num_repetitions,
-        task_names=task_names,
-    )
-
-    with open(f"{results_dir}/incomplete_experiments.txt", 'w') as f:
-        for line in incomplete_experiments:
-            f.write(f"{line}\n")
-    results_and_parameters = results.merge(
-        parameters, on=['experiment_name', 'dataset']
-    )
-    results_and_parameters.to_csv(
-        f"{str(results_dir)}/results_and_parameters.csv", index=False
-    )
-    return results_and_parameters
-
-
-def delete_nested_experiment_parent_runs(
-    logger: logging.RootLogger,
-    delete_runs: list,
-    experiment_info: mlflow.entities.experiment.Experiment,
-    client: mlflow.tracking.client.MlflowClient,
-    leave_one: bool = True,
-) -> str | None:
-    """
-    if there are multiple runs for a single experiment,
-    will delete all runs except the one with the most nested runs (most complete)
-    Args:
-        logger: logging.RootLogger to save logs to file
-        delete_runs: list of runs to delete
-        experiment_info: info of experiment
-        client: mlflow client pointing to correct storage uri
-        leave_one: if True, will not delete the most complete experiment. If False, will delete all experiments
-    Returns:
-        run id of the experiment run that was not deleted or None
-    """
-    experiment_id = experiment_info.experiment_id
-    exp_parent_run_ids = []
-    counts = []
-    runs_in_experiment = []
-    logger.info(f"Deleting from experiment_id:{experiment_id} ")
-    logger.info(f"delete_runs:{delete_runs} ")
-
-    for exp_parent_run_id in delete_runs:
-        runs = []
-        runs.append(exp_parent_run_id)
-        task_parent_run_data = client.search_runs(
-            experiment_ids=[experiment_id],
-            filter_string=f'tags."mlflow.parentRunId" LIKE "{exp_parent_run_id}"',
-        )
-        for task_parent_run in task_parent_run_data:
-            task_parent_run_id = task_parent_run.info.run_id
-            runs.append(task_parent_run_id)
-            individual_run_data = client.search_runs(
-                experiment_ids=[experiment_id],
-                filter_string=f'tags."mlflow.parentRunId" LIKE "{task_parent_run_id}"',
-            )
-            for individual_run in individual_run_data:
-                runs.append(individual_run.info.run_id)
-        exp_parent_run_ids.append(exp_parent_run_id)
-        counts.append(len(runs))
-        runs_in_experiment.append(runs)
-
-    if leave_one and (len(counts) > 0):
-        index_to_keep = counts.index(max(counts))
-        incomplete_run_to_finish = exp_parent_run_ids[index_to_keep]
-        runs_in_experiment.pop(index_to_keep)
-    else:
-        incomplete_run_to_finish = None
-
-    logger.info(f"Deleting runs:{runs_in_experiment} ")
-    logger.info(
-        f"experiment_info.artifact_location:{experiment_info.artifact_location}"
-    )
-    for runs in runs_in_experiment:
-        for run_id in runs:
-            client.delete_run(run_id)
-            os.system(f"rm -r {experiment_info.artifact_location}/{run_id}")
-    return incomplete_run_to_finish
-
-
-def check_existing_task_parent_runs(
-    logger: logging.RootLogger,
-    exp_parent_run_id: str,
-    storage_uri: str,
-    experiment_name: str,
-    n_trials: int = N_TRIALS_DEFAULT,
-):
-    """
-    checks if tasks have been completed (both task run and nested individual runs are complete)
-    Args:
-        logger: logging.RootLogger to save logs to file
-        exp_parent_run_id: run id of the experiment run being used (top level run id)
-        storage_uri: folder containing mlflow log data
-        experiment_name: name of experiment
-        n_trials: number of trials (runs) expected in HPO of each task
-    Returns:
-        complete_task_run_names: list of task names that have been completed
-        all_tasks_finished: bool showing if all tasks have been completed
-        task_run_to_id_match: dict matching task names to the task run id
-
-    """
-    client = mlflow.tracking.MlflowClient(tracking_uri=storage_uri)
-    experiment_info = client.get_experiment_by_name(experiment_name)
-    experiment_id = experiment_info.experiment_id
-    task_parent_run_data = client.search_runs(
-        experiment_ids=[experiment_id],
-        filter_string=f'tags."mlflow.parentRunId" LIKE "{exp_parent_run_id}"',
-    )
-    complete_task_run_names = []
-    all_tasks_finished = []
-    #   TO DO: make sure we only have one task_parent_run for each name (needed for repeated exps)
-    task_run_to_id_match = {}
-    for task_parent_run in task_parent_run_data:
-        task_run_statuses = []
-        task_run_ids = []
-        task_run_statuses.append(task_parent_run.info.status)
-        task_run_ids.append(task_parent_run.info.run_id)
-
-        individual_run_data = client.search_runs(
-            experiment_ids=[experiment_id],
-            filter_string=f'tags."mlflow.parentRunId" LIKE "{task_parent_run.info.run_id}"',
-        )
-        for individual_run in individual_run_data:
-            if (individual_run.info.status == "RUNNING") or (
-                individual_run.info.status == "FAILED"
-            ):
-                continue
-            task_run_statuses.append(individual_run.info.status)
-            task_run_ids.append(individual_run.info.run_id)
-
-        task_run_to_id_match[task_parent_run.info.run_name] = (
-            task_parent_run.info.run_id
-        )
-        task_run_statuses = list(set(task_run_statuses))
-
-        condition_1 = len(task_run_statuses) == 1
-        condition_2 = task_run_statuses[0] == "FINISHED"
-        # condition_3 = len(task_run_ids) == (n_trials+1)
-        if condition_1 and condition_2:  # and condition_3:
-            complete_task_run_names.append(task_parent_run.info.run_name)
-            task_parent_status = True
-        else:
-            task_parent_status = False
-        all_tasks_finished.append(task_parent_status)
-
-    if all(all_tasks_finished) and (len(all_tasks_finished) > 0):
-        all_tasks_finished = True
-    else:
-        all_tasks_finished = False
-    complete_task_run_names = list(set(complete_task_run_names))
-    return complete_task_run_names, all_tasks_finished, task_run_to_id_match
-
-
-def check_existing_experiments(
-    logger: logging.RootLogger,
-    storage_uri: str,
-    experiment_name: str,
-    exp_parent_run_name: str,
-    task_names: list,
-    n_trials: int,
-    backbone: str
-) -> Dict[str, Any]:
-    """
-    checks if experiment has been completed (i.e. both task run and nested individual runs are complete)
-    Args:
-        logger: logging.RootLogger to save logs to file
-        storage_uri: folder containing mlflow log data
-        experiment_name: name of experiment
-        exp_parent_run_name: run name of the top level experiment run
-        task_names: list of task names that should be completed
-        n_trials: number of trials (runs) expected in HPO of each task
-    Returns:
-        output: dict with:
-            no_existing_runs: bool, if True, there are no existing runs
-            incomplete_run_to_finish: str | None, run id of the experiment run to finish
-            finished_run: str | None, run id of the finished experiment run
-            experiment_id: str | None, experiment id it experiment already exists
-
-    """
-    client = mlflow.tracking.MlflowClient(tracking_uri=storage_uri)
-    experiment_info = client.get_experiment_by_name(experiment_name)
-
-    output = {
-        "no_existing_runs": True,
-        "incomplete_run_to_finish": None,
-        "finished_run": None,
-        "experiment_id": None,
-    }
-    if experiment_info is None:
-        return output
-
-    experiment_id = experiment_info.experiment_id
-    logger.info(f"\nexperiment_id: {experiment_id}")
-    logger.info(f"experiment_name: {experiment_name}")
-    output["experiment_id"] = experiment_id
-    experiment_parent_run_data = client.search_runs(
-        experiment_ids=[experiment_id],
-        filter_string=f'tags."mlflow.runName" LIKE "{exp_parent_run_name}"',
-    )
-    if len(experiment_parent_run_data) >= 1:
-        logger.info("there is at least one experiment parent run")
-        finished_run_id = None
-        incomplete_runs = []
-
-        # check if one of the runs is complete
-        for run in experiment_parent_run_data:
-            completed_task_run_names, all_tasks_in_experiment_finished, _ = (
-                check_existing_task_parent_runs(
-                    logger=logger,
-                    exp_parent_run_id=run.info.run_id,
-                    storage_uri=storage_uri,
-                    experiment_name=experiment_name,
-                    n_trials=n_trials,
-                )
-            )
-            logger.info(f"tasks that should be completed: {task_names}")
-            logger.info(f"completed_task_run_names: {completed_task_run_names}")
-            logger.info(
-                f"all_tasks_in_experiment_finished: {all_tasks_in_experiment_finished}"
-            )
-            all_expected_tasks_completed = [
-                item for item in task_names if item in completed_task_run_names
-            ]
-            all_expected_tasks_completed = len(task_names) == len(
-                all_expected_tasks_completed
-            )
-            if all_expected_tasks_completed:
-                finished_run_id = run.info.run_id
-                logger.info(
-                    f"The following run FINISHED and will be used for repeated experiments: {finished_run_id}"
-                )
-            else:
-                incomplete_tasks = [
-                    item for item in task_names if item not in completed_task_run_names
-                ]
-                logger.info(
-                    f"The following run {run.info.run_id} is incomplete, with status {run.info.status} and missing tasks: {incomplete_tasks}"
-                )
-                incomplete_runs.append(run.info.run_id)
-
-        if finished_run_id is not None:
-            # delete all incomplete runs
-            delete_nested_experiment_parent_runs(
-                logger=logger,
-                delete_runs=incomplete_runs,
-                experiment_info=experiment_info,
-                client=client,
-                leave_one=False,
-            )
-            output["finished_run"] = finished_run_id
-            output["no_existing_runs"] = False
-        else:
-            # delete all incomplete runs, leave one
-            logger.info(f"incomplete_runs: {incomplete_runs}")
-            output["incomplete_run_to_finish"] = delete_nested_experiment_parent_runs(
-                logger=logger,
-                delete_runs=incomplete_runs,
-                experiment_info=experiment_info,
-                client=client,
-                leave_one=True,
-            )
-            output["no_existing_runs"] = False
-    return output
-
-
-def visualize_combined_results(
-    combined_results: pd.DataFrame,
-    storage_uri: str,
-    logger: logging.RootLogger,
-    plot_file_base_name: str,
-):
-    """
-    compiles and visualizes results from experiment
-    Args:
-        combined_results: table containing results and parameters for all experiments
-        storage_uri: storage_uri from config
-        logger: logging.RootLogger to save logs to file
-        plot_file_base_name: unique string to be added to all file names
-    """
-    logger.info("Starting to visualize")
-    save_folder = Path(storage_uri).parents[0] / "visualizations"
-    tables_folder = save_folder / "tables"
-    plots_folder = save_folder / "plots"
-    if not os.path.exists(tables_folder):
-        os.makedirs(tables_folder)
-    if not os.path.exists(plots_folder):
-        os.makedirs(plots_folder)
-
-    combined_results = []
-    model_order = []
-    experiments = list(set(combined_results["experiment_name"]))
-    combined_results = combined_results.rename(columns={"experiment_name": "model"})
-    num_experiments = len(experiments)
-    fig_size = (num_experiments * 5, 6) if num_experiments >= 3 else (15, 6)
-    n_legend_rows = num_experiments // 3 if num_experiments >= 3 else 1
-    model_order = sorted(experiments)
-    model_colors = dict(
-        zip(model_order, sns.color_palette("tab20", n_colors=len(model_order)))
-    )
-
-    try:
-        # plot raw values
-        plot_tools.plot_per_dataset(
-            combined_results,
-            model_order=model_order,
-            plot_file_base_name=plot_file_base_name,
-            model_colors=model_colors,
-            metric="test metric",
-            sharey=False,
-            inner="points",
-            fig_size=fig_size,
-            n_legend_rows=n_legend_rows,
-        )
-        plt.savefig(
-            str(plots_folder / f"violin_{plot_file_base_name}_raw.png"),
-            bbox_inches="tight",
-        )
-        plt.close()
-
-        # plot normalized, bootstrapped values values
-        plot_tools.make_normalizer(
-            combined_results,
-            metrics=("test metric",),
-            benchmark_name=plot_file_base_name,
-        )
-        bootstrapped_iqm, normalized_combined_results = (
-            plot_tools.normalize_bootstrap_and_plot(
-                combined_results,
-                plot_file_base_name=plot_file_base_name,
-                metric="test metric",
-                benchmark_name=plot_file_base_name,
-                model_order=model_order,
-                model_colors=model_colors,
-                fig_size=fig_size,
-                n_legend_rows=n_legend_rows,
-            )
-        )
-        # dataset_name_map=dataset_name_map)
-
-        plt.savefig(
-            str(
-                plots_folder
-                / f"violin_{plot_file_base_name}_normalized_bootstrapped.png"
-            ),
-            bbox_inches="tight",
-        )
-        plt.close()
-        bootstrapped_iqm.to_csv(
-            str(tables_folder / f"{plot_file_base_name}_bootstrapped_iqm.csv")
-        )
-        combined_results.to_csv(
-            str(
-                tables_folder / f"{plot_file_base_name}_normalized_combined_results.csv"
-            )
-        )
-    except Exception as e:
-        logger.info(f"could not visualize due to error: {e}")
-
-
-def get_logger(log_level="INFO", log_folder="./experiment_logs") -> logging.RootLogger:
-    # set up logging file
-    if not os.path.exists(log_folder):
-        os.makedirs(log_folder)
-    current_time = datetime.datetime.now()
-    current_time = (
-        str(current_time).replace(" ", "_").replace(":", "-").replace(".", "-")
-    )
-    log_file = f"{log_folder}/{current_time}"
-    logger = logging.getLogger()
-    logger.setLevel(log_level)
-    handler = logging.FileHandler(log_file)
-    handler.setLevel(log_level)
-    formatter = logging.Formatter(
-        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-    )
-    handler.setFormatter(formatter)
-    logger.addHandler(handler)
-    logging.basicConfig(level=logging.CRITICAL)
-    return logger
-
-def import_custom_modules(
-    logger: logging.RootLogger,
-    custom_modules_path: str | Path | None = None,
-) -> None:
-
-    if custom_modules_path:
-
-        custom_modules_path = Path(custom_modules_path)
-
-        if custom_modules_path.is_dir():
-
-            # Add 'custom_modules' folder to sys.path
-            workdir = custom_modules_path.parents[0]
-            module_dir = custom_modules_path.name
-
-            sys.path.insert(0, str(workdir))
-
-            try:
-                module = importlib.import_module(module_dir)
-                logger.info(f"Found {custom_modules_path}")
-            except ImportError:
-                raise ImportError(f"It was not possible to import modules from {custom_modules_path}.")
-        else:
-            raise ValueError(f"Modules path {custom_modules_path} isn't a directory. Check if you have defined it properly.")
-    else:
-        logger.debug("No custom module is being used.")
-
-if __name__ == "__main__":
-    logger = get_logger()
-    storage_uri = "results_folder/hpo"  # storage_uri from config
-
-    list_of_experiments = [
-        "early_stopping_10_prithvi_600",
-        "early_stopping_10_prithvi_600_tl",
-        "early_stopping_10_dofa_vit_300",
-    ]
-    # get results and parameters from mlflow logs
-    results_and_parameters = get_results_and_parameters(
-        storage_uri=storage_uri,
-        logger=logger,
-        experiments=list_of_experiments,
-    )
-
-    settings_per_model = [
-        "early_stopping_10_data_100_perc",
-        "early_stopping_50_data_10_perc",
-        "early_stopping_50_data_100_perc",
-    ]
-
-    # create box plots across multiple models
-    for setting in settings_per_model:
-        combined_results = results_and_parameters.loc[
-            results_and_parameters["experiment_name"].str.contains(setting)
-        ].copy()
-        model_order = visualize_combined_results(
-            combined_results=results_and_parameters,
-            storage_uri=storage_uri,
-            logger=logger,
-            plot_file_base_name=f"multiple_models_{setting}",
-        )

From 8737cc056680231b95de834fd12d23aa3a610bb0 Mon Sep 17 00:00:00 2001
From: Paolo Fraccaro <paolo.fraccaro.dms@gmail.com>
Date: Fri, 18 Jul 2025 14:12:46 +0000
Subject: [PATCH 05/16] fix continue existing run

---
 benchmark/backbone_benchmark.py     |  3 +-
 benchmark/repeat_best_experiment.py | 16 +++++++--
 benchmark/utils.py                  | 51 +++++++++++++++++++++++++++++
 3 files changed, 67 insertions(+), 3 deletions(-)

diff --git a/benchmark/backbone_benchmark.py b/benchmark/backbone_benchmark.py
index 54934bc..bb0d6e6 100644
--- a/benchmark/backbone_benchmark.py
+++ b/benchmark/backbone_benchmark.py
@@ -232,6 +232,7 @@ def benchmark_backbone(
 
     mlflow.set_tracking_uri(storage_uri)
     mlflow.set_experiment(experiment_name)
+    experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id
 
     if bayesian_search:
         sampler: BaseSampler | None = None  # take the default
@@ -344,7 +345,7 @@ def benchmark_backbone(
                 "results_table.json",
                 run.info.run_id,
             )
-            experiment_id = run.info.experiment_id
+            # experiment_id = run.info.experiment_id
 
         # check completion of HPO for all tasks before proceeding to next stage
         existing_experiments = check_existing_experiments(
diff --git a/benchmark/repeat_best_experiment.py b/benchmark/repeat_best_experiment.py
index 67a8bbc..1f39e5c 100644
--- a/benchmark/repeat_best_experiment.py
+++ b/benchmark/repeat_best_experiment.py
@@ -37,6 +37,9 @@
     valid_task_types,
 )
 
+from .utils import get_nested_runs
+
+
 @ray.remote(num_cpus=8, num_gpus=1)
 def remote_fit(
     training_spec: TrainingSpec,
@@ -261,6 +264,7 @@ def rerun_best_from_backbone(
     if backbone_import:
         importlib.import_module(backbone_import)
     mlflow.set_tracking_uri(storage_uri)
+    
     mlflow.set_experiment(experiment_name)
 
     runs: list[mlflow.entities.Run] = mlflow.search_runs(
@@ -290,9 +294,17 @@ def rerun_best_from_backbone(
     repeated_experiment_name = f"{experiment_name}_repeated_exp"
     mlflow.set_tracking_uri(repeated_storage_uri)
     mlflow.set_experiment(repeated_experiment_name)
-
+    experiment_id = mlflow.get_experiment_by_name(repeated_experiment_name).experiment_id
+
+    tmp_runs = get_nested_runs(experiment_id, experiment_name, repeated_storage_uri)
+    # get_nested_runs filters by substring, so narrow to the exact run name when several match
+    if len(tmp_runs) > 1:
+        tmp_runs = [x for x in tmp_runs if x["run_name"] == experiment_name]
+    # resume the existing parent run when there is one; None makes mlflow start a new run
+    run_id = tmp_runs[0]["run_id"] if tmp_runs else None
+    
     #backbone_name = defaults.terratorch_task["model_args"]["backbone"]
-    with mlflow.start_run(run_name=experiment_name, run_id=None) as run:
+    with mlflow.start_run(run_name=experiment_name, run_id=run_id) as run:
         for task in tasks:
             logger.info(f"\n\ntask: {task.name}")
             matching_runs = [run for run in runs if run.info.run_name.endswith(task.name)]  # type: ignore
diff --git a/benchmark/utils.py b/benchmark/utils.py
index 8d77675..c3b6636 100644
--- a/benchmark/utils.py
+++ b/benchmark/utils.py
@@ -16,6 +16,11 @@
 from mlflow.entities.experiment import Experiment
 import importlib
 import logging
+from mlflow.tracking import MlflowClient
+from mlflow.entities import ViewType
+from collections import defaultdict
+
+
 
 N_TRIALS_DEFAULT = 16
 REPEATED_SEEDS_DEFAULT = 10
@@ -854,3 +859,49 @@ def import_custom_modules(
             logger=logger,
             plot_file_base_name=f"multiple_models_{setting}",
         )
+
+        
+
+### code written with the help of Perplexity platform
+def get_nested_runs(experiment_id, filter_string = None, mlflow_uri= "mlflow"):
+    client = MlflowClient(mlflow_uri)
+    
+    # Get all runs for the experiment
+    all_runs = client.search_runs(
+        experiment_ids=[experiment_id],
+        run_view_type=ViewType.ACTIVE_ONLY
+    )
+    
+    # Create a dictionary to store the run hierarchy
+    run_hierarchy = defaultdict(list)
+    parent_runs = []
+
+    # First pass: Identify parent-child relationships
+    for run in all_runs:
+        parent_run_id = run.data.tags.get("mlflow.parentRunId")
+        
+        if parent_run_id:
+            run_hierarchy[parent_run_id].append(run)
+        else:
+            parent_runs.append(run)
+
+    # Function to create a nested dictionary for a run and its children
+    def create_nested_dict(run):
+        
+        run_dict = {
+            "run": run,
+            "run_id": run.info.run_id,
+            "run_name": run.data.tags.get("mlflow.runName", "Unnamed"),
+            "status": run.info.status,
+            "start_time": run.info.start_time,
+            "end_time": run.info.end_time,
+            "children": [create_nested_dict(child) for child in run_hierarchy[run.info.run_id]]
+        }
+        return run_dict
+     # Create the final nested structure
+    if filter_string:
+        nested_runs = [create_nested_dict(parent_run) for parent_run in parent_runs if parent_run.data.tags.get("mlflow.runName", "Unnamed").find(filter_string) > -1]
+    else:
+        nested_runs = [create_nested_dict(parent_run) for parent_run in parent_runs]
+    
+    return nested_runs

From 5007ed42e38ec216667ef0844f74574ce0e750f0 Mon Sep 17 00:00:00 2001
From: Paolo Fraccaro <paolo.fraccaro.dms@gmail.com>
Date: Fri, 10 Oct 2025 11:18:27 +0000
Subject: [PATCH 06/16] fix paths

---
 benchmark/plot_tools.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/benchmark/plot_tools.py b/benchmark/plot_tools.py
index 5ce6c82..184eb0a 100644
--- a/benchmark/plot_tools.py
+++ b/benchmark/plot_tools.py
@@ -10,7 +10,6 @@
 import json
 from scipy.stats import trim_mean
 
-
 sns.set_style("dark", {"grid.color": "0.98", "axes.facecolor": "(0.95, 0.95, 0.97)"})
 GEO_BENCH_DIR = "geobench"
 
@@ -28,7 +27,7 @@ def iqm(scores):
 
 def bootstrap_iqm(
     df,
-    group_keys=("model", "dataset", "partition name"),
+    group_keys=("model", "dataset", "partition_name"),
     metric="test_metric",
     repeat=100,
 ):
@@ -43,12 +42,13 @@ def bootstrap_iqm(
 
 def bootstrap_iqm_aggregate(df, metric="test_metric", repeat=100):
     """Stratified bootstrap (by dataset) of all seeds to compute iqm score distribution for each model."""
-    group = df.groupby(["model", "dataset", "partition name"])
+
+    group = df.groupby(["model", "dataset", "partition_name"])
 
     df_list = []
     for i in range(repeat):
         new_df = group.sample(frac=1, replace=True)
-        series = new_df.groupby(["model", "partition name"])[metric].apply(iqm)
+        series = new_df.groupby(["model", "partition_name"])[metric].apply(iqm)
         df_list.append(series.to_frame().reset_index())
 
     new_df = pd.concat(df_list)
@@ -57,7 +57,7 @@ def bootstrap_iqm_aggregate(df, metric="test_metric", repeat=100):
 
 
 def average_seeds(
-    df, group_keys=("model", "dataset", "partition name"), metric="test metric"
+    df, group_keys=("model", "dataset", "partition_name"), metric="test metric"
 ):
     """Average seeds for all model and all datasets."""
     df_avg = df.groupby(list(group_keys))[metric].mean()
@@ -70,8 +70,8 @@ def average_seeds(
 def extract_1x_data(df_all):
     """Extract only resutls trained on 100% of the data"""
     return df_all[
-        (df_all["partition name"] == "1.00x train")
-        | (df_all["partition name"] == "default")
+        (df_all["partition_name"] == "1.00x train")
+        | (df_all["partition_name"] == "default")
     ].copy()
 
 
@@ -150,13 +150,14 @@ def normalize_data_frame(self, df, metric):
 
     def save(self, benchmark_name):
         """Save normalizer to json file."""
-        with open(GEO_BENCH_DIR / benchmark_name / "normalizer.json", "w") as f:
+
+        with open(f"{benchmark_name}/normalizer.json", "w") as f:
             json.dump(self.range_dict, f, indent=2)
 
 
 def load_normalizer(benchmark_name):
     """Load normalizer from json file."""
-    with open(GEO_BENCH_DIR / benchmark_name / "normalizer.json", "r") as f:
+    with open(f"{benchmark_name}/normalizer.json", "r") as f:
         range_dict = json.load(f)
     return Normalizer(range_dict)
 

From c579d27b22ba59be5f5200dbca2f587f0a1b54bf Mon Sep 17 00:00:00 2001
From: Paolo Fraccaro <paolo.fraccaro.dms@gmail.com>
Date: Fri, 10 Oct 2025 11:19:27 +0000
Subject: [PATCH 07/16] fix_plots

---
 benchmark/utils.py | 117 +++++++++++++++++++++++----------------------
 1 file changed, 60 insertions(+), 57 deletions(-)

diff --git a/benchmark/utils.py b/benchmark/utils.py
index c3b6636..77bc9c4 100644
--- a/benchmark/utils.py
+++ b/benchmark/utils.py
@@ -19,7 +19,7 @@
 from mlflow.tracking import MlflowClient
 from mlflow.entities import ViewType
 from collections import defaultdict
-
+import pdb
 
 
 N_TRIALS_DEFAULT = 16
@@ -215,7 +215,7 @@ def extract_repeated_experiment_results(
                 seed = int(run.info.run_name.split("_")[-1])
                 if task in task_info:
                     metric_name = task_info[task]
-                    metric_name = 'test_test/' + metric_name.split("/")[-1]
+                    metric_name = 'test_test/' + metric_name.split("/")[-1] if '/' in metric_name else 'test_test_' + metric_name.replace(metric_name.split('_')[0] + "_", '')
                 else:  
                     continue
 
@@ -278,7 +278,10 @@ def extract_repeated_experiment_results(
                 f"EXPERIMENT INCOMPLETE: {experiment_name} has {len(combine_task_results)} complete tasks only"
             )
             incomplete_experiments.append(experiment_name)
-    combine_exp_results = pd.concat(combine_exp_results, axis=0)
+    if len(combine_exp_results) > 0: 
+        combine_exp_results = pd.concat(combine_exp_results, axis=0)
+    else:
+        combine_exp_results = pd.DataFrame()
     print(f"\n\n\ncombine_exp_results: {combine_exp_results}")
     return (combine_exp_results, incomplete_experiments)
 
@@ -383,6 +386,7 @@ def get_results_and_parameters(
     task_metrics: list,
     task_names: list,
     num_repetitions: int = REPEATED_SEEDS_DEFAULT,
+    visualise: bool = True,
 ) -> pd.DataFrame:
     """
     extracts results and parameters for experiments from mlflow logs
@@ -395,6 +399,7 @@ def get_results_and_parameters(
         task_metrics: metrics used to evaluate each task
         task_names: list of tasks
         num_repetitions: number of repeated seeds per task
+        visualise: whether to visualise the summarised results or not
     Returns:
         pd.DataFrame with results and parameters
     """
@@ -432,6 +437,16 @@ def get_results_and_parameters(
     results_and_parameters.to_csv(
         f"{str(results_dir)}/results_and_parameters.csv", index=False
     )
+    
+    if visualise:
+
+        model_order = visualize_combined_results(
+            combined_results=results_and_parameters,
+            storage_uri=storage_uri,
+            logger=logger,
+            plot_file_base_name=f"summary_plot",
+        )
+
     return results_and_parameters
 
 
@@ -707,7 +722,6 @@ def visualize_combined_results(
     if not os.path.exists(plots_folder):
         os.makedirs(plots_folder)
 
-    combined_results = []
     model_order = []
     experiments = list(set(combined_results["experiment_name"]))
     combined_results = combined_results.rename(columns={"experiment_name": "model"})
@@ -719,63 +733,54 @@ def visualize_combined_results(
         zip(model_order, sns.color_palette("tab20", n_colors=len(model_order)))
     )
 
-    try:
-        # plot raw values
-        plot_tools.plot_per_dataset(
+    plot_tools.plot_per_dataset(
+        combined_results,
+        model_order=model_order,
+        aggregated_name=plot_file_base_name,
+        model_colors=model_colors,
+        metric="test metric",
+        sharey=False,
+        inner="points",
+        fig_size=fig_size,
+        n_legend_rows=n_legend_rows,
+    )
+    plt.savefig(
+        str(f"{plots_folder}/violin_{plot_file_base_name}_raw.png"),
+        bbox_inches="tight",
+    )
+    plt.close()
+
+    # plot normalized, bootstrapped values values
+    plot_tools.make_normalizer(
+        combined_results,
+        metrics=("test metric",),
+        benchmark_name=plots_folder,
+    )
+
+    tmp = (
+        plot_tools.normalize_bootstrap_and_plot(
             combined_results,
+            # plot_file_base_name=plot_file_base_name,
+            metric="test metric",
+            benchmark_name=plots_folder,
             model_order=model_order,
-            plot_file_base_name=plot_file_base_name,
             model_colors=model_colors,
-            metric="test metric",
-            sharey=False,
-            inner="points",
             fig_size=fig_size,
             n_legend_rows=n_legend_rows,
         )
-        plt.savefig(
-            str(plots_folder / f"violin_{plot_file_base_name}_raw.png"),
-            bbox_inches="tight",
-        )
-        plt.close()
+    )
 
-        # plot normalized, bootstrapped values values
-        plot_tools.make_normalizer(
-            combined_results,
-            metrics=("test metric",),
-            benchmark_name=plot_file_base_name,
-        )
-        bootstrapped_iqm, normalized_combined_results = (
-            plot_tools.normalize_bootstrap_and_plot(
-                combined_results,
-                plot_file_base_name=plot_file_base_name,
-                metric="test metric",
-                benchmark_name=plot_file_base_name,
-                model_order=model_order,
-                model_colors=model_colors,
-                fig_size=fig_size,
-                n_legend_rows=n_legend_rows,
-            )
-        )
-        # dataset_name_map=dataset_name_map)
-
-        plt.savefig(
-            str(
-                plots_folder
-                / f"violin_{plot_file_base_name}_normalized_bootstrapped.png"
-            ),
-            bbox_inches="tight",
-        )
-        plt.close()
-        bootstrapped_iqm.to_csv(
-            str(tables_folder / f"{plot_file_base_name}_bootstrapped_iqm.csv")
-        )
-        combined_results.to_csv(
-            str(
-                tables_folder / f"{plot_file_base_name}_normalized_combined_results.csv"
-            )
+    plt.savefig(
+        str(f"{plots_folder}/violin_{plot_file_base_name}_normalized_bootstrapped.png"
+        ),
+        bbox_inches="tight",
+    )
+    plt.close()
+
+    combined_results.to_csv(
+        str(f"{tables_folder}/{plot_file_base_name}_normalized_combined_results.csv"
         )
-    except Exception as e:
-        logger.info(f"could not visualize due to error: {e}")
+    )
 
 
 def get_logger(log_level="INFO", log_folder="./experiment_logs") -> logging.RootLogger:
@@ -843,11 +848,9 @@ def import_custom_modules(
     )
 
     settings_per_model = [
-        "early_stopping_10_data_100_perc",
-        "early_stopping_50_data_10_perc",
-        "early_stopping_50_data_100_perc",
+        "detection",
     ]
-
+    
     # create box plots across multiple models
     for setting in settings_per_model:
         combined_results = results_and_parameters.loc[

From 112b591a83d27927efc3ff8fd28a107672f7fcae Mon Sep 17 00:00:00 2001
From: Paolo Fraccaro <paolo.fraccaro@ibm.com>
Date: Fri, 10 Oct 2025 14:16:39 +0200
Subject: [PATCH 08/16] rename

---
 {benchmark => terratorch_iterate}/__init__.py             | 0
 {benchmark => terratorch_iterate}/backbone_benchmark.py   | 8 ++++----
 {benchmark => terratorch_iterate}/benchmark_ray.py        | 6 +++---
 {benchmark => terratorch_iterate}/benchmark_types.py      | 0
 {benchmark => terratorch_iterate}/main.py                 | 8 ++++----
 {benchmark => terratorch_iterate}/model_fitting.py        | 2 +-
 {benchmark => terratorch_iterate}/module.py               | 0
 {benchmark => terratorch_iterate}/plot_tools.py           | 0
 {benchmark => terratorch_iterate}/py.typed                | 0
 .../repeat_best_experiment.py                             | 4 ++--
 .../resources/dataset_specifications/agb.yaml             | 0
 .../resources/dataset_specifications/eurosat.yaml         | 0
 .../resources/dataset_specifications/fire_scars.yaml      | 0
 .../dataset_specifications/multi_temporal_crop.yaml       | 0
 .../resources/dataset_specifications/sen1floods11.yaml    | 0
 .../dataset_specifications/sen1floods11_transforms.yaml   | 0
 {benchmark => terratorch_iterate}/tests/__init__.py       | 0
 {benchmark => terratorch_iterate}/utils.py                | 4 ++--
 18 files changed, 16 insertions(+), 16 deletions(-)
 rename {benchmark => terratorch_iterate}/__init__.py (100%)
 rename {benchmark => terratorch_iterate}/backbone_benchmark.py (98%)
 rename {benchmark => terratorch_iterate}/benchmark_ray.py (97%)
 rename {benchmark => terratorch_iterate}/benchmark_types.py (100%)
 rename {benchmark => terratorch_iterate}/main.py (97%)
 rename {benchmark => terratorch_iterate}/model_fitting.py (99%)
 rename {benchmark => terratorch_iterate}/module.py (100%)
 rename {benchmark => terratorch_iterate}/plot_tools.py (100%)
 rename {benchmark => terratorch_iterate}/py.typed (100%)
 rename {benchmark => terratorch_iterate}/repeat_best_experiment.py (99%)
 rename {benchmark => terratorch_iterate}/resources/dataset_specifications/agb.yaml (100%)
 rename {benchmark => terratorch_iterate}/resources/dataset_specifications/eurosat.yaml (100%)
 rename {benchmark => terratorch_iterate}/resources/dataset_specifications/fire_scars.yaml (100%)
 rename {benchmark => terratorch_iterate}/resources/dataset_specifications/multi_temporal_crop.yaml (100%)
 rename {benchmark => terratorch_iterate}/resources/dataset_specifications/sen1floods11.yaml (100%)
 rename {benchmark => terratorch_iterate}/resources/dataset_specifications/sen1floods11_transforms.yaml (100%)
 rename {benchmark => terratorch_iterate}/tests/__init__.py (100%)
 rename {benchmark => terratorch_iterate}/utils.py (99%)

diff --git a/benchmark/__init__.py b/terratorch_iterate/__init__.py
similarity index 100%
rename from benchmark/__init__.py
rename to terratorch_iterate/__init__.py
diff --git a/benchmark/backbone_benchmark.py b/terratorch_iterate/backbone_benchmark.py
similarity index 98%
rename from benchmark/backbone_benchmark.py
rename to terratorch_iterate/backbone_benchmark.py
index bb0d6e6..aa0e8a0 100644
--- a/benchmark/backbone_benchmark.py
+++ b/terratorch_iterate/backbone_benchmark.py
@@ -17,16 +17,16 @@
 from optuna.samplers import BaseSampler, RandomSampler
 from tabulate import tabulate
 import pickle
-from benchmark.benchmark_types import (
+from terratorch_iterate.benchmark_types import (
     Defaults,
     ParameterBounds,
     Task,
     combine_with_defaults,
     optimization_space_type,
 )
-from benchmark.model_fitting import fit_model, fit_model_with_hparams
-from benchmark.repeat_best_experiment import rerun_best_from_backbone
-from benchmark.utils import (
+from terratorch_iterate.model_fitting import fit_model, fit_model_with_hparams
+from terratorch_iterate.repeat_best_experiment import rerun_best_from_backbone
+from terratorch_iterate.utils import (
     check_existing_task_parent_runs,
     check_existing_experiments,
     unflatten,
diff --git a/benchmark/benchmark_ray.py b/terratorch_iterate/benchmark_ray.py
similarity index 97%
rename from benchmark/benchmark_ray.py
rename to terratorch_iterate/benchmark_ray.py
index 81eed60..087fa2b 100644
--- a/benchmark/benchmark_ray.py
+++ b/terratorch_iterate/benchmark_ray.py
@@ -14,15 +14,15 @@
 from ray.tune.search.optuna import OptunaSearch
 from tabulate import tabulate
 
-from benchmark.backbone_benchmark import parse_optimization_space
-from benchmark.benchmark_types import (
+from terratorch_iterate.backbone_benchmark import parse_optimization_space
+from terratorch_iterate.benchmark_types import (
     Defaults,
     Task,
     TrainingSpec,
     combine_with_defaults,
     optimization_space_type,
 )
-from benchmark.model_fitting import fit_model, ray_tune_model, valid_task_types
+from terratorch_iterate.model_fitting import fit_model, ray_tune_model, valid_task_types
 
 
 def benchmark_backbone_on_task(
diff --git a/benchmark/benchmark_types.py b/terratorch_iterate/benchmark_types.py
similarity index 100%
rename from benchmark/benchmark_types.py
rename to terratorch_iterate/benchmark_types.py
diff --git a/benchmark/main.py b/terratorch_iterate/main.py
similarity index 97%
rename from benchmark/main.py
rename to terratorch_iterate/main.py
index 376de06..d6ad13f 100644
--- a/benchmark/main.py
+++ b/terratorch_iterate/main.py
@@ -3,10 +3,10 @@
 from pathlib import Path
 from typing import Any, List
 from jsonargparse import ArgumentParser
-from benchmark.backbone_benchmark import benchmark_backbone
-from benchmark.benchmark_types import Defaults, Task
-from benchmark.repeat_best_experiment import rerun_best_from_backbone
-from benchmark.utils import (get_logger, import_custom_modules,
+from terratorch_iterate.backbone_benchmark import benchmark_backbone
+from terratorch_iterate.benchmark_types import Defaults, Task
+from terratorch_iterate.repeat_best_experiment import rerun_best_from_backbone
+from terratorch_iterate.utils import (get_logger, import_custom_modules,
                              get_results_and_parameters, extract_parameters)
 
 def main():
diff --git a/benchmark/model_fitting.py b/terratorch_iterate/model_fitting.py
similarity index 99%
rename from benchmark/model_fitting.py
rename to terratorch_iterate/model_fitting.py
index 61375b9..5fea265 100644
--- a/benchmark/model_fitting.py
+++ b/terratorch_iterate/model_fitting.py
@@ -55,7 +55,7 @@
 from torchgeo.datamodules import BaseDataModule
 from torchgeo.trainers import BaseTask
 
-from benchmark.benchmark_types import (
+from terratorch_iterate.benchmark_types import (
     ParameterBounds,
     ParameterTypeEnum,
     TrainingSpec,
diff --git a/benchmark/module.py b/terratorch_iterate/module.py
similarity index 100%
rename from benchmark/module.py
rename to terratorch_iterate/module.py
diff --git a/benchmark/plot_tools.py b/terratorch_iterate/plot_tools.py
similarity index 100%
rename from benchmark/plot_tools.py
rename to terratorch_iterate/plot_tools.py
diff --git a/benchmark/py.typed b/terratorch_iterate/py.typed
similarity index 100%
rename from benchmark/py.typed
rename to terratorch_iterate/py.typed
diff --git a/benchmark/repeat_best_experiment.py b/terratorch_iterate/repeat_best_experiment.py
similarity index 99%
rename from benchmark/repeat_best_experiment.py
rename to terratorch_iterate/repeat_best_experiment.py
index 1f39e5c..5fa921e 100644
--- a/benchmark/repeat_best_experiment.py
+++ b/terratorch_iterate/repeat_best_experiment.py
@@ -25,13 +25,13 @@
 from lightning.pytorch.loggers.mlflow import MLFlowLogger
 import time
 import pdb
-from benchmark.benchmark_types import (
+from terratorch_iterate.benchmark_types import (
     Defaults,
     Task,
     TrainingSpec,
     combine_with_defaults,
 )
-from benchmark.model_fitting import (
+from terratorch_iterate.model_fitting import (
     get_default_callbacks,
     inject_hparams,
     valid_task_types,
diff --git a/benchmark/resources/dataset_specifications/agb.yaml b/terratorch_iterate/resources/dataset_specifications/agb.yaml
similarity index 100%
rename from benchmark/resources/dataset_specifications/agb.yaml
rename to terratorch_iterate/resources/dataset_specifications/agb.yaml
diff --git a/benchmark/resources/dataset_specifications/eurosat.yaml b/terratorch_iterate/resources/dataset_specifications/eurosat.yaml
similarity index 100%
rename from benchmark/resources/dataset_specifications/eurosat.yaml
rename to terratorch_iterate/resources/dataset_specifications/eurosat.yaml
diff --git a/benchmark/resources/dataset_specifications/fire_scars.yaml b/terratorch_iterate/resources/dataset_specifications/fire_scars.yaml
similarity index 100%
rename from benchmark/resources/dataset_specifications/fire_scars.yaml
rename to terratorch_iterate/resources/dataset_specifications/fire_scars.yaml
diff --git a/benchmark/resources/dataset_specifications/multi_temporal_crop.yaml b/terratorch_iterate/resources/dataset_specifications/multi_temporal_crop.yaml
similarity index 100%
rename from benchmark/resources/dataset_specifications/multi_temporal_crop.yaml
rename to terratorch_iterate/resources/dataset_specifications/multi_temporal_crop.yaml
diff --git a/benchmark/resources/dataset_specifications/sen1floods11.yaml b/terratorch_iterate/resources/dataset_specifications/sen1floods11.yaml
similarity index 100%
rename from benchmark/resources/dataset_specifications/sen1floods11.yaml
rename to terratorch_iterate/resources/dataset_specifications/sen1floods11.yaml
diff --git a/benchmark/resources/dataset_specifications/sen1floods11_transforms.yaml b/terratorch_iterate/resources/dataset_specifications/sen1floods11_transforms.yaml
similarity index 100%
rename from benchmark/resources/dataset_specifications/sen1floods11_transforms.yaml
rename to terratorch_iterate/resources/dataset_specifications/sen1floods11_transforms.yaml
diff --git a/benchmark/tests/__init__.py b/terratorch_iterate/tests/__init__.py
similarity index 100%
rename from benchmark/tests/__init__.py
rename to terratorch_iterate/tests/__init__.py
diff --git a/benchmark/utils.py b/terratorch_iterate/utils.py
similarity index 99%
rename from benchmark/utils.py
rename to terratorch_iterate/utils.py
index 77bc9c4..6ca0580 100644
--- a/benchmark/utils.py
+++ b/terratorch_iterate/utils.py
@@ -10,8 +10,8 @@
 from matplotlib import pyplot as plt
 from ast import literal_eval
 import optuna
-from benchmark.benchmark_types import Task
-from benchmark import plot_tools
+from terratorch_iterate.benchmark_types import Task
+from terratorch_iterate import plot_tools
 import sys
 from mlflow.entities.experiment import Experiment
 import importlib

From a1e13abdf687b42ef9d48fcbda917c8cd7d09df4 Mon Sep 17 00:00:00 2001
From: Paolo Fraccaro <paolo.fraccaro@ibm.com>
Date: Fri, 10 Oct 2025 14:17:28 +0200
Subject: [PATCH 09/16] fix test

---
 tests/test_benchmark.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 62d2091..81c46b2 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -1,8 +1,8 @@
 import itertools
 import logging
-from benchmark.benchmark_types import Defaults, Task, TaskTypeEnum
+from terratorch_iterate.benchmark_types import Defaults, Task, TaskTypeEnum
 import pytest
-from benchmark.backbone_benchmark import benchmark_backbone
+from terratorch_iterate.backbone_benchmark import benchmark_backbone
 from terratorch.datamodules import MChesapeakeLandcoverNonGeoDataModule
 from albumentations import HorizontalFlip, VerticalFlip, Resize
 from albumentations.pytorch.transforms import ToTensorV2

From 4ead581500ca9c1e715119de8556d5c298ced032 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Mon, 13 Oct 2025 10:38:14 -0300
Subject: [PATCH 10/16] change minimum python version; move module to unit test

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 pyproject.toml                   |  2 +-
 tests/unit/test_model_fitting.py | 38 ++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 tests/unit/test_model_fitting.py

diff --git a/pyproject.toml b/pyproject.toml
index e36edb0..16d7f11 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,7 +14,7 @@ include = ["benchmark*"]
 
 name = "terratorch-iterate"
 version = "0.2.0"
-requires-python = ">= 3.10"
+requires-python = ">= 3.11"
 description = "A terratorch's plugin for benchmarking and hyperparameter optimization"
 authors = [
       { name = "Carlos Gomes"},
diff --git a/tests/unit/test_model_fitting.py b/tests/unit/test_model_fitting.py
new file mode 100644
index 0000000..7a4dcb2
--- /dev/null
+++ b/tests/unit/test_model_fitting.py
@@ -0,0 +1,38 @@
+from pathlib import Path
+
+from jsonargparse import ArgumentParser, Namespace
+from terratorch_iterate.iterate_types import Task
+import uuid
+import pytest
+
+
+@pytest.mark.skip()
+def test_launch_training():
+    # experiment_name='dofa_large_patch16_224_upernetdecoder_true_modified_continue_False_test_models_True' metric='val/loss' storage_uri='/dccstor/geofm-finetuning/terratorch-iterate-test/39d14a9ed79e4ee39739fa92a4cdd758/hpo' direction='max'
+    random_hex = uuid.uuid4().hex
+
+    storage_uri = Path(f"/tmp/{random_hex}")
+    if not storage_uri.exists():
+        storage_uri.mkdir()
+    parser = ArgumentParser()
+    config_path = (
+        Path(__file__).parent.parent.parent
+        / "configs/tests/dofa_large_patch16_224_upernetdecoder_true_modified.yaml"
+    )
+    assert config_path.exists()
+    config = parser.parse_path(config_path)
+    config_init: Namespace = parser.instantiate_classes(config)
+    tasks = config_init.tasks
+    assert isinstance(tasks, list), f"Error! {tasks=} is not a list"
+    for t in tasks:
+        assert isinstance(t, Task), f"Error! {t=} is not a Task"
+    # data_module = MNzCattleNonGeoDataModule()
+    # trainer = Trainer(**training_spec_copy.trainer_args)
+    # launch_training(
+    #     trainer=trainer,
+    #     datamodule=datamodule,
+    #     experiment_name=experiment_name,
+    #     metric=metric,
+    #     direction=direction,
+    #     storage_uri=storage_uri,
+    # )

From ea78d5d595cbb9a5d05fd14d4976aa33831d8d77 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Mon, 20 Oct 2025 11:07:47 -0300
Subject: [PATCH 11/16] add mlflow; change version

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 .pre-commit-config.yaml | 3 ++-
 .secrets.baseline       | 4 ++--
 pyproject.toml          | 5 +++--
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 08e218f..0b8e668 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -10,9 +10,10 @@ repos:
     # You are encouraged to use static refs such as tags, instead of branch name
     #
     # Running "pre-commit autoupdate" automatically updates rev to latest tag
-    rev: 0.13.1+ibm.61.dss
+    rev: 0.13.1+ibm.62.dss
     hooks:
       - id: detect-secrets # pragma: whitelist secret
+        additional_dependencies: [boxsdk<4]
         # Add options for detect-secrets-hook binary. You can run `detect-secrets-hook --help` to list out all possible options.
         # You may also run `pre-commit run detect-secrets` to preview the scan result.
         # when "--baseline" without "--use-all-plugins", pre-commit scan with just plugins in baseline file
diff --git a/.secrets.baseline b/.secrets.baseline
index e3ac1a7..959a570 100644
--- a/.secrets.baseline
+++ b/.secrets.baseline
@@ -3,7 +3,7 @@
     "files": "^.secrets.baseline$",
     "lines": null
   },
-  "generated_at": "2025-10-01T20:02:29Z",
+  "generated_at": "2025-10-20T14:06:44Z",
   "plugins_used": [
     {
       "name": "AWSKeyDetector"
@@ -138,7 +138,7 @@
       }
     ]
   },
-  "version": "0.13.1+ibm.61.dss",
+  "version": "0.13.1+ibm.62.dss",
   "word_list": {
     "file": null,
     "hash": null
diff --git a/pyproject.toml b/pyproject.toml
index 16d7f11..1f1933b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,12 +8,12 @@ requires = ["setuptools >= 77.0.3"]
 build-backend = "setuptools.build_meta"
 
 [tool.setuptools.packages.find]
-include = ["benchmark*"]
+include = ["terratorch_iterate*"]
 
 [project]
 
 name = "terratorch-iterate"
-version = "0.2.0"
+version = "0.2.2rc1"
 requires-python = ">= 3.11"
 description = "A terratorch's plugin for benchmarking and hyperparameter optimization"
 authors = [
@@ -61,6 +61,7 @@ dependencies = [
 "more-itertools", 
 "importlib-metadata",
 "numpy",
+"mlflow",
 "optuna",
 "types-tabulate",
 "ray",

From 54a3ff0932ce9ab277952c9aebefe052b30e6393 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Mon, 27 Oct 2025 17:14:14 -0300
Subject: [PATCH 12/16] run tests using iterate command instead of terratorch
 iterate

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 run_tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/run_tests.py b/run_tests.py
index 60fa0b1..bd814df 100644
--- a/run_tests.py
+++ b/run_tests.py
@@ -59,7 +59,7 @@ def submit_job(
     if tc_id is not None:
         jbsub = f'bsub -e {err_file} -o {out_file} -M 40G -gpu "num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB" pytest -vv tests/integration/test_main.py::test_main[{tc_id}]'
     elif config is not None:
-        jbsub = f'bsub -e {err_file} -o {out_file} -M 40G -gpu "num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB" terratorch iterate --hpo --config {config}'
+        jbsub = f'bsub -e {err_file} -o {out_file} -M 40G -gpu "num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB" iterate --hpo --config {config}'
     else:
         raise ValueError("Error! Either tc_id or config must be not None")
     cmd = jbsub.split()

From d7c56528f3a4b4b874cfd02734e43f8df2657196 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Tue, 28 Oct 2025 09:51:04 -0300
Subject: [PATCH 13/16] replace flake8 by ruff on github workflows; fix linter
 errors

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 .github/workflows/pylint.yml                 | 23 -----
 .github/workflows/python-package.yml         |  8 +-
 .pre-commit-config.yaml                      | 12 ++-
 .secrets.baseline                            | 18 ++--
 plotting/plot_results_mlflow.ipynb           |  5 +-
 plotting/plot_results_repeated_runs.ipynb    |  9 +-
 pyproject.toml                               |  4 +-
 terratorch_iterate/benchmark_types.py        |  1 -
 terratorch_iterate/iterate_types.py          |  1 -
 terratorch_iterate/repeat_best_experiment.py | 15 +--
 terratorch_iterate/utils.py                  | 98 ++++++++++----------
 11 files changed, 85 insertions(+), 109 deletions(-)
 delete mode 100644 .github/workflows/pylint.yml

diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
deleted file mode 100644
index 3a2b5d1..0000000
--- a/.github/workflows/pylint.yml
+++ /dev/null
@@ -1,23 +0,0 @@
-name: Pylint
-
-on: [push]
-
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.10", "3.11", "3.12"]
-    steps:
-    - uses: actions/checkout@v5
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v6
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install pylint
-    - name: Analysing the code with pylint
-      run: |
-        pylint $(git ls-files '*.py')
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 72e1ce6..1067638 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -27,13 +27,11 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        python -m pip install flake8 pytest
+        python -m pip install ruff pytest
         python -m pip install -e .
         python -m pip install -e ".[dev]"
         python -m pip install -e ".[test]"
-    - name: Lint with flake8
+    - name: Lint with ruff
       run: |
         # stop the build if there are Python syntax errors or undefined names
-        flake8 . --count --select=F821,F401 --show-source --statistics
-        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+        ruff check
\ No newline at end of file
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0b8e668..3df104d 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,9 +21,13 @@ repos:
         # add "--fail-on-unaudited" to fail pre-commit for unaudited potential secrets
         args: [--baseline, .secrets.baseline, --use-all-plugins]
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.3
+    # Ruff version.
+    rev: v0.14.2
     hooks:
+      # Run the linter.
+      - id: ruff-check
+        types_or: [ python, pyi ]
+        args: [ --fix ]
+      # Run the formatter.
       - id: ruff-format
-        types_or:
-          - python
-          - jupyter
+        types_or: [ python, pyi ]
diff --git a/.secrets.baseline b/.secrets.baseline
index 959a570..d18435e 100644
--- a/.secrets.baseline
+++ b/.secrets.baseline
@@ -3,7 +3,7 @@
     "files": "^.secrets.baseline$",
     "lines": null
   },
-  "generated_at": "2025-10-20T14:06:44Z",
+  "generated_at": "2025-10-28T12:40:55Z",
   "plugins_used": [
     {
       "name": "AWSKeyDetector"
@@ -81,21 +81,21 @@
       {
         "hashed_secret": "5810b71c07271f259208c5790992170ac1e13b37",
         "is_verified": false,
-        "line_number": 437,
+        "line_number": 436,
         "type": "Base64 High Entropy String",
         "verified_result": null
       },
       {
         "hashed_secret": "1c1dc227208cec78bbdb8d9247164879f908a9ad",
         "is_verified": false,
-        "line_number": 482,
+        "line_number": 481,
         "type": "Base64 High Entropy String",
         "verified_result": null
       },
       {
         "hashed_secret": "e57967bc8f018a30bb192717673876f0ebdbe5d9",
         "is_verified": false,
-        "line_number": 558,
+        "line_number": 557,
         "type": "Base64 High Entropy String",
         "verified_result": null
       }
@@ -104,35 +104,35 @@
       {
         "hashed_secret": "e52b18568a4fa073b958134ea5ec0f9407b6ebc3",
         "is_verified": false,
-        "line_number": 352,
+        "line_number": 345,
         "type": "Base64 High Entropy String",
         "verified_result": null
       },
       {
         "hashed_secret": "43cf2641021e5833120affd5a2bcdf35089eaf75",
         "is_verified": false,
-        "line_number": 417,
+        "line_number": 410,
         "type": "Base64 High Entropy String",
         "verified_result": null
       },
       {
         "hashed_secret": "78f9a422a3afb6ff5aff30094699c2b299dfd614",
         "is_verified": false,
-        "line_number": 949,
+        "line_number": 942,
         "type": "Base64 High Entropy String",
         "verified_result": null
       },
       {
         "hashed_secret": "2525429c7a93512ed0c4b799b867a83a6b19f7ff",
         "is_verified": false,
-        "line_number": 1014,
+        "line_number": 1007,
         "type": "Base64 High Entropy String",
         "verified_result": null
       },
       {
         "hashed_secret": "8915fab07d3bf85d3755089a7fc82e911405d40a",
         "is_verified": false,
-        "line_number": 1080,
+        "line_number": 1073,
         "type": "Base64 High Entropy String",
         "verified_result": null
       }
diff --git a/plotting/plot_results_mlflow.ipynb b/plotting/plot_results_mlflow.ipynb
index 5d3a752..2f1865e 100644
--- a/plotting/plot_results_mlflow.ipynb
+++ b/plotting/plot_results_mlflow.ipynb
@@ -2,14 +2,13 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "import pandas as pd\n",
     "import seaborn as sns\n",
-    "import matplotlib.pyplot as plt\n",
-    "import json"
+    "import matplotlib.pyplot as plt"
    ]
   },
   {
diff --git a/plotting/plot_results_repeated_runs.ipynb b/plotting/plot_results_repeated_runs.ipynb
index bcb8f85..a612fa6 100644
--- a/plotting/plot_results_repeated_runs.ipynb
+++ b/plotting/plot_results_repeated_runs.ipynb
@@ -19,25 +19,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "import os\n",
     "\n",
     "os.environ[\"GEO_BENCH_DIR\"] = \"/Users/cpi/terratorch\"\n",
-    "import numpy as np\n",
     "from matplotlib import pyplot as plt\n",
     "import pandas as pd\n",
-    "from pathlib import Path\n",
     "import seaborn as sns\n",
     "\n",
-    "import geobench as gb\n",
-    "\n",
     "# from geobench_exp.experiment import parse_results\n",
-    "from matplotlib.ticker import FormatStrFormatter\n",
-    "import json\n",
-    "from scipy.stats import trim_mean\n",
     "import plot_tools"
    ]
   },
diff --git a/pyproject.toml b/pyproject.toml
index 1f1933b..0d4e4d6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -70,7 +70,6 @@ dependencies = [
 "opencv-python-headless",
 "configspace",
 "optuna-integration",
-"seaborn",
 "psutil",
 "tabulate>=0.9.0",
 ]
@@ -87,7 +86,8 @@ dev = [
   "mkdocstrings[python]",
   "mike", # for building docs with versions
   "tox",
-  "pre-commit"
+  "pre-commit",
+  "seaborn"
 ]
 
 test = [
diff --git a/terratorch_iterate/benchmark_types.py b/terratorch_iterate/benchmark_types.py
index aaf3ea6..1ff0201 100644
--- a/terratorch_iterate/benchmark_types.py
+++ b/terratorch_iterate/benchmark_types.py
@@ -15,7 +15,6 @@
     ObjectDetectionTask,
 )
 from torchgeo.datamodules import BaseDataModule
-from geobench_v2.datamodules import GeoBenchDataModule
 
 valid_task_types = type[
     SemanticSegmentationTask
diff --git a/terratorch_iterate/iterate_types.py b/terratorch_iterate/iterate_types.py
index aaf3ea6..1ff0201 100644
--- a/terratorch_iterate/iterate_types.py
+++ b/terratorch_iterate/iterate_types.py
@@ -15,7 +15,6 @@
     ObjectDetectionTask,
 )
 from torchgeo.datamodules import BaseDataModule
-from geobench_v2.datamodules import GeoBenchDataModule
 
 valid_task_types = type[
     SemanticSegmentationTask
diff --git a/terratorch_iterate/repeat_best_experiment.py b/terratorch_iterate/repeat_best_experiment.py
index 70e91dc..6391369 100644
--- a/terratorch_iterate/repeat_best_experiment.py
+++ b/terratorch_iterate/repeat_best_experiment.py
@@ -257,7 +257,7 @@ def rerun_best_from_backbone(
         raise Exception(
             f"output_path must be absolute. Consider using $(pwd)/{output_path}."
         )
-    if (tmp_dir is None) & (use_ray == True):
+    if (tmp_dir is None) & use_ray:
         raise Exception("tmp_dir must be specified for runs with ray.")
 
     if use_ray:
@@ -266,7 +266,7 @@ def rerun_best_from_backbone(
     if backbone_import:
         importlib.import_module(backbone_import)
     mlflow.set_tracking_uri(storage_uri)
-    
+
     mlflow.set_experiment(experiment_name)
 
     runs: list[mlflow.entities.Run] = mlflow.search_runs(
@@ -296,16 +296,19 @@ def rerun_best_from_backbone(
     repeated_experiment_name = f"{experiment_name}_repeated_exp"
     mlflow.set_tracking_uri(repeated_storage_uri)
     mlflow.set_experiment(repeated_experiment_name)
-    experiment_id = mlflow.get_experiment_by_name(repeated_experiment_name).experiment_id
+    experiment_id = mlflow.get_experiment_by_name(
+        repeated_experiment_name
+    ).experiment_id
 
     tmp_runs = get_nested_runs(experiment_id, experiment_name, repeated_storage_uri)
     if len(tmp_runs) > 0:
-        if len(tmp_runs) > 1: tmp_runs = [x for x in runs if x["run_name"] == experiment_name]
+        if len(tmp_runs) > 1:
+            tmp_runs = [x for x in runs if x["run_name"] == experiment_name]
         run_id = tmp_runs[0]["run_id"]
     else:
         run_id = None
-    
-    #backbone_name = defaults.terratorch_task["model_args"]["backbone"]
+
+    # backbone_name = defaults.terratorch_task["model_args"]["backbone"]
     with mlflow.start_run(run_name=experiment_name, run_id=run_id) as run:
         for task in tasks:
             logger.info(f"\n\ntask: {task.name}")
diff --git a/terratorch_iterate/utils.py b/terratorch_iterate/utils.py
index bc6cf96..5fcce35 100644
--- a/terratorch_iterate/utils.py
+++ b/terratorch_iterate/utils.py
@@ -15,12 +15,9 @@
 import sys
 from mlflow.entities.experiment import Experiment
 import importlib
-import logging
 from mlflow.tracking import MlflowClient
 from mlflow.entities import ViewType
 from collections import defaultdict
-import pdb
-
 
 N_TRIALS_DEFAULT = 16
 REPEATED_SEEDS_DEFAULT = 10
@@ -220,8 +217,13 @@ def extract_repeated_experiment_results(
                 seed = int(run.info.run_name.split("_")[-1])
                 if task in task_info:
                     metric_name = task_info[task]
-                    metric_name = 'test_test/' + metric_name.split("/")[-1] if '/' in metric_name else 'test_test_' + metric_name.replace(metric_name.split('_')[0] + "_", '')
-                else:  
+                    metric_name = (
+                        "test_test/" + metric_name.split("/")[-1]
+                        if "/" in metric_name
+                        else "test_test_"
+                        + metric_name.replace(metric_name.split("_")[0] + "_", "")
+                    )
+                else:
                     continue
 
                 if metric_name not in run.data.metrics:
@@ -283,7 +285,7 @@ def extract_repeated_experiment_results(
                 f"EXPERIMENT INCOMPLETE: {experiment_name} has {len(combine_task_results)} complete tasks only"
             )
             incomplete_experiments.append(experiment_name)
-    if len(combine_exp_results) > 0: 
+    if len(combine_exp_results) > 0:
         combine_exp_results = pd.concat(combine_exp_results, axis=0)
     else:
         combine_exp_results = pd.DataFrame()
@@ -444,14 +446,13 @@ def get_results_and_parameters(
     results_and_parameters.to_csv(
         f"{str(results_dir)}/results_and_parameters.csv", index=False
     )
-    
-    if visualise:
 
-        model_order = visualize_combined_results(
+    if visualise:
+        visualize_combined_results(
             combined_results=results_and_parameters,
             storage_uri=storage_uri,
             logger=logger,
-            plot_file_base_name=f"summary_plot",
+            plot_file_base_name="summary_plot",
         )
 
     return results_and_parameters
@@ -647,14 +648,16 @@ def check_existing_experiments(
 
         # check if one of the runs is complete
         for run in experiment_parent_run_data:
-            completed_task_run_names, all_tasks_in_experiment_finished, _ = (
-                check_existing_task_parent_runs(
-                    logger=logger,
-                    exp_parent_run_id=run.info.run_id,
-                    storage_uri=storage_uri,
-                    experiment_name=experiment_name,
-                    n_trials=n_trials,
-                )
+            (
+                completed_task_run_names,
+                all_tasks_in_experiment_finished,
+                _,
+            ) = check_existing_task_parent_runs(
+                logger=logger,
+                exp_parent_run_id=run.info.run_id,
+                storage_uri=storage_uri,
+                experiment_name=experiment_name,
+                n_trials=n_trials,
             )
             logger.info(f"tasks that should be completed: {task_names}")
             logger.info(f"completed_task_run_names: {completed_task_run_names}")
@@ -764,29 +767,25 @@ def visualize_combined_results(
         benchmark_name=plots_folder,
     )
 
-    tmp = (
-        plot_tools.normalize_bootstrap_and_plot(
-            combined_results,
-            # plot_file_base_name=plot_file_base_name,
-            metric="test metric",
-            benchmark_name=plots_folder,
-            model_order=model_order,
-            model_colors=model_colors,
-            fig_size=fig_size,
-            n_legend_rows=n_legend_rows,
-        )
+    plot_tools.normalize_bootstrap_and_plot(
+        combined_results,
+        # plot_file_base_name=plot_file_base_name,
+        metric="test metric",
+        benchmark_name=plots_folder,
+        model_order=model_order,
+        model_colors=model_colors,
+        fig_size=fig_size,
+        n_legend_rows=n_legend_rows,
     )
 
     plt.savefig(
-        str(f"{plots_folder}/violin_{plot_file_base_name}_normalized_bootstrapped.png"
-        ),
+        str(f"{plots_folder}/violin_{plot_file_base_name}_normalized_bootstrapped.png"),
         bbox_inches="tight",
     )
     plt.close()
 
     combined_results.to_csv(
-        str(f"{tables_folder}/{plot_file_base_name}_normalized_combined_results.csv"
-        )
+        str(f"{tables_folder}/{plot_file_base_name}_normalized_combined_results.csv")
     )
 
 
@@ -827,7 +826,7 @@ def import_custom_modules(
             sys.path.insert(0, str(workdir))
 
             try:
-                module = importlib.import_module(module_dir)
+                importlib.import_module(module_dir)
                 logger.info(f"Found {custom_modules_path}")
             except ImportError:
                 raise ImportError(
@@ -860,7 +859,7 @@ def import_custom_modules(
     settings_per_model = [
         "detection",
     ]
-    
+
     # create box plots across multiple models
     for setting in settings_per_model:
         combined_results = results_and_parameters.loc[
@@ -873,18 +872,16 @@ def import_custom_modules(
             plot_file_base_name=f"multiple_models_{setting}",
         )
 
-        
 
 ### code written with the help of Perplexity platform
-def get_nested_runs(experiment_id, filter_string = None, mlflow_uri= "mlflow"):
+def get_nested_runs(experiment_id, filter_string=None, mlflow_uri="mlflow"):
     client = MlflowClient(mlflow_uri)
-    
+
     # Get all runs for the experiment
     all_runs = client.search_runs(
-        experiment_ids=[experiment_id],
-        run_view_type=ViewType.ACTIVE_ONLY
+        experiment_ids=[experiment_id], run_view_type=ViewType.ACTIVE_ONLY
     )
-    
+
     # Create a dictionary to store the run hierarchy
     run_hierarchy = defaultdict(list)
     parent_runs = []
@@ -892,7 +889,7 @@ def get_nested_runs(experiment_id, filter_string = None, mlflow_uri= "mlflow"):
     # First pass: Identify parent-child relationships
     for run in all_runs:
         parent_run_id = run.data.tags.get("mlflow.parentRunId")
-        
+
         if parent_run_id:
             run_hierarchy[parent_run_id].append(run)
         else:
@@ -900,7 +897,6 @@ def get_nested_runs(experiment_id, filter_string = None, mlflow_uri= "mlflow"):
 
     # Function to create a nested dictionary for a run and its children
     def create_nested_dict(run):
-        
         run_dict = {
             "run": run,
             "run_id": run.info.run_id,
@@ -908,13 +904,21 @@ def create_nested_dict(run):
             "status": run.info.status,
             "start_time": run.info.start_time,
             "end_time": run.info.end_time,
-            "children": [create_nested_dict(child) for child in run_hierarchy[run.info.run_id]]
+            "children": [
+                create_nested_dict(child) for child in run_hierarchy[run.info.run_id]
+            ],
         }
         return run_dict
-     # Create the final nested structure
+
+    # Create the final nested structure
     if filter_string:
-        nested_runs = [create_nested_dict(parent_run) for parent_run in parent_runs if parent_run.data.tags.get("mlflow.runName", "Unnamed").find(filter_string) > -1]
+        nested_runs = [
+            create_nested_dict(parent_run)
+            for parent_run in parent_runs
+            if parent_run.data.tags.get("mlflow.runName", "Unnamed").find(filter_string)
+            > -1
+        ]
     else:
         nested_runs = [create_nested_dict(parent_run) for parent_run in parent_runs]
-    
+
     return nested_runs

From 60102cb73a8511a1cc72f291ce0d6a51614f05bc Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Tue, 28 Oct 2025 09:51:04 -0300
Subject: [PATCH 14/16] replace flake8 by ruff on github workflows; fix linter
 errors

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
Signed-off-by: Leonardo P Tizzei <leonardo.tizzei@gmail.com>

Third-Party DCO Remediation Commit for Paolo Fraccaro <paolo.fraccaro@ibm.com>

On behalf of Paolo Fraccaro <paolo.fraccaro@ibm.com>, I, Leonardo P. Tizzei <leonardo.tizzei@gmail.com>, hereby add my Signed-off-by to this commit: 112b591a83d27927efc3ff8fd28a107672f7fcae
On behalf of Paolo Fraccaro <paolo.fraccaro@ibm.com>, I, Leonardo P. Tizzei <leonardo.tizzei@gmail.com>, hereby add my Signed-off-by to this commit: a1e13abdf687b42ef9d48fcbda917c8cd7d09df4

Signed-off-by: Leonardo P. Tizzei <leonardo.tizzei@gmail.com>
---
 .github/workflows/pylint.yml                 | 23 -----
 .github/workflows/python-package.yml         |  8 +-
 .pre-commit-config.yaml                      | 12 ++-
 .secrets.baseline                            | 18 ++--
 plotting/plot_results_mlflow.ipynb           |  5 +-
 plotting/plot_results_repeated_runs.ipynb    |  9 +-
 pyproject.toml                               |  4 +-
 terratorch_iterate/benchmark_types.py        |  1 -
 terratorch_iterate/iterate_types.py          |  1 -
 terratorch_iterate/repeat_best_experiment.py | 15 +--
 terratorch_iterate/utils.py                  | 98 ++++++++++----------
 11 files changed, 85 insertions(+), 109 deletions(-)
 delete mode 100644 .github/workflows/pylint.yml

diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
deleted file mode 100644
index 3a2b5d1..0000000
--- a/.github/workflows/pylint.yml
+++ /dev/null
@@ -1,23 +0,0 @@
-name: Pylint
-
-on: [push]
-
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.10", "3.11", "3.12"]
-    steps:
-    - uses: actions/checkout@v5
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v6
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install pylint
-    - name: Analysing the code with pylint
-      run: |
-        pylint $(git ls-files '*.py')
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 72e1ce6..1067638 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -27,13 +27,11 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        python -m pip install flake8 pytest
+        python -m pip install ruff pytest
         python -m pip install -e .
         python -m pip install -e ".[dev]"
         python -m pip install -e ".[test]"
-    - name: Lint with flake8
+    - name: Lint with ruff
       run: |
         # stop the build if there are Python syntax errors or undefined names
-        flake8 . --count --select=F821,F401 --show-source --statistics
-        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+        ruff check
\ No newline at end of file
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0b8e668..3df104d 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,9 +21,13 @@ repos:
         # add "--fail-on-unaudited" to fail pre-commit for unaudited potential secrets
         args: [--baseline, .secrets.baseline, --use-all-plugins]
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.3
+    # Ruff version.
+    rev: v0.14.2
     hooks:
+      # Run the linter.
+      - id: ruff-check
+        types_or: [ python, pyi ]
+        args: [ --fix ]
+      # Run the formatter.
       - id: ruff-format
-        types_or:
-          - python
-          - jupyter
+        types_or: [ python, pyi ]
diff --git a/.secrets.baseline b/.secrets.baseline
index 959a570..d18435e 100644
--- a/.secrets.baseline
+++ b/.secrets.baseline
@@ -3,7 +3,7 @@
     "files": "^.secrets.baseline$",
     "lines": null
   },
-  "generated_at": "2025-10-20T14:06:44Z",
+  "generated_at": "2025-10-28T12:40:55Z",
   "plugins_used": [
     {
       "name": "AWSKeyDetector"
@@ -81,21 +81,21 @@
       {
         "hashed_secret": "5810b71c07271f259208c5790992170ac1e13b37",
         "is_verified": false,
-        "line_number": 437,
+        "line_number": 436,
         "type": "Base64 High Entropy String",
         "verified_result": null
       },
       {
         "hashed_secret": "1c1dc227208cec78bbdb8d9247164879f908a9ad",
         "is_verified": false,
-        "line_number": 482,
+        "line_number": 481,
         "type": "Base64 High Entropy String",
         "verified_result": null
       },
       {
         "hashed_secret": "e57967bc8f018a30bb192717673876f0ebdbe5d9",
         "is_verified": false,
-        "line_number": 558,
+        "line_number": 557,
         "type": "Base64 High Entropy String",
         "verified_result": null
       }
@@ -104,35 +104,35 @@
       {
         "hashed_secret": "e52b18568a4fa073b958134ea5ec0f9407b6ebc3",
         "is_verified": false,
-        "line_number": 352,
+        "line_number": 345,
         "type": "Base64 High Entropy String",
         "verified_result": null
       },
       {
         "hashed_secret": "43cf2641021e5833120affd5a2bcdf35089eaf75",
         "is_verified": false,
-        "line_number": 417,
+        "line_number": 410,
         "type": "Base64 High Entropy String",
         "verified_result": null
       },
       {
         "hashed_secret": "78f9a422a3afb6ff5aff30094699c2b299dfd614",
         "is_verified": false,
-        "line_number": 949,
+        "line_number": 942,
         "type": "Base64 High Entropy String",
         "verified_result": null
       },
       {
         "hashed_secret": "2525429c7a93512ed0c4b799b867a83a6b19f7ff",
         "is_verified": false,
-        "line_number": 1014,
+        "line_number": 1007,
         "type": "Base64 High Entropy String",
         "verified_result": null
       },
       {
         "hashed_secret": "8915fab07d3bf85d3755089a7fc82e911405d40a",
         "is_verified": false,
-        "line_number": 1080,
+        "line_number": 1073,
         "type": "Base64 High Entropy String",
         "verified_result": null
       }
diff --git a/plotting/plot_results_mlflow.ipynb b/plotting/plot_results_mlflow.ipynb
index 5d3a752..2f1865e 100644
--- a/plotting/plot_results_mlflow.ipynb
+++ b/plotting/plot_results_mlflow.ipynb
@@ -2,14 +2,13 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "import pandas as pd\n",
     "import seaborn as sns\n",
-    "import matplotlib.pyplot as plt\n",
-    "import json"
+    "import matplotlib.pyplot as plt"
    ]
   },
   {
diff --git a/plotting/plot_results_repeated_runs.ipynb b/plotting/plot_results_repeated_runs.ipynb
index bcb8f85..a612fa6 100644
--- a/plotting/plot_results_repeated_runs.ipynb
+++ b/plotting/plot_results_repeated_runs.ipynb
@@ -19,25 +19,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "import os\n",
     "\n",
     "os.environ[\"GEO_BENCH_DIR\"] = \"/Users/cpi/terratorch\"\n",
-    "import numpy as np\n",
     "from matplotlib import pyplot as plt\n",
     "import pandas as pd\n",
-    "from pathlib import Path\n",
     "import seaborn as sns\n",
     "\n",
-    "import geobench as gb\n",
-    "\n",
     "# from geobench_exp.experiment import parse_results\n",
-    "from matplotlib.ticker import FormatStrFormatter\n",
-    "import json\n",
-    "from scipy.stats import trim_mean\n",
     "import plot_tools"
    ]
   },
diff --git a/pyproject.toml b/pyproject.toml
index 1f1933b..0d4e4d6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -70,7 +70,6 @@ dependencies = [
 "opencv-python-headless",
 "configspace",
 "optuna-integration",
-"seaborn",
 "psutil",
 "tabulate>=0.9.0",
 ]
@@ -87,7 +86,8 @@ dev = [
   "mkdocstrings[python]",
   "mike", # for building docs with versions
   "tox",
-  "pre-commit"
+  "pre-commit",
+  "seaborn"
 ]
 
 test = [
diff --git a/terratorch_iterate/benchmark_types.py b/terratorch_iterate/benchmark_types.py
index aaf3ea6..1ff0201 100644
--- a/terratorch_iterate/benchmark_types.py
+++ b/terratorch_iterate/benchmark_types.py
@@ -15,7 +15,6 @@
     ObjectDetectionTask,
 )
 from torchgeo.datamodules import BaseDataModule
-from geobench_v2.datamodules import GeoBenchDataModule
 
 valid_task_types = type[
     SemanticSegmentationTask
diff --git a/terratorch_iterate/iterate_types.py b/terratorch_iterate/iterate_types.py
index aaf3ea6..1ff0201 100644
--- a/terratorch_iterate/iterate_types.py
+++ b/terratorch_iterate/iterate_types.py
@@ -15,7 +15,6 @@
     ObjectDetectionTask,
 )
 from torchgeo.datamodules import BaseDataModule
-from geobench_v2.datamodules import GeoBenchDataModule
 
 valid_task_types = type[
     SemanticSegmentationTask
diff --git a/terratorch_iterate/repeat_best_experiment.py b/terratorch_iterate/repeat_best_experiment.py
index 70e91dc..6391369 100644
--- a/terratorch_iterate/repeat_best_experiment.py
+++ b/terratorch_iterate/repeat_best_experiment.py
@@ -257,7 +257,7 @@ def rerun_best_from_backbone(
         raise Exception(
             f"output_path must be absolute. Consider using $(pwd)/{output_path}."
         )
-    if (tmp_dir is None) & (use_ray == True):
+    if (tmp_dir is None) & use_ray:
         raise Exception("tmp_dir must be specified for runs with ray.")
 
     if use_ray:
@@ -266,7 +266,7 @@ def rerun_best_from_backbone(
     if backbone_import:
         importlib.import_module(backbone_import)
     mlflow.set_tracking_uri(storage_uri)
-    
+
     mlflow.set_experiment(experiment_name)
 
     runs: list[mlflow.entities.Run] = mlflow.search_runs(
@@ -296,16 +296,19 @@ def rerun_best_from_backbone(
     repeated_experiment_name = f"{experiment_name}_repeated_exp"
     mlflow.set_tracking_uri(repeated_storage_uri)
     mlflow.set_experiment(repeated_experiment_name)
-    experiment_id = mlflow.get_experiment_by_name(repeated_experiment_name).experiment_id
+    experiment_id = mlflow.get_experiment_by_name(
+        repeated_experiment_name
+    ).experiment_id
 
     tmp_runs = get_nested_runs(experiment_id, experiment_name, repeated_storage_uri)
     if len(tmp_runs) > 0:
-        if len(tmp_runs) > 1: tmp_runs = [x for x in runs if x["run_name"] == experiment_name]
+        if len(tmp_runs) > 1:
+            tmp_runs = [x for x in runs if x["run_name"] == experiment_name]
         run_id = tmp_runs[0]["run_id"]
     else:
         run_id = None
-    
-    #backbone_name = defaults.terratorch_task["model_args"]["backbone"]
+
+    # backbone_name = defaults.terratorch_task["model_args"]["backbone"]
     with mlflow.start_run(run_name=experiment_name, run_id=run_id) as run:
         for task in tasks:
             logger.info(f"\n\ntask: {task.name}")
diff --git a/terratorch_iterate/utils.py b/terratorch_iterate/utils.py
index bc6cf96..5fcce35 100644
--- a/terratorch_iterate/utils.py
+++ b/terratorch_iterate/utils.py
@@ -15,12 +15,9 @@
 import sys
 from mlflow.entities.experiment import Experiment
 import importlib
-import logging
 from mlflow.tracking import MlflowClient
 from mlflow.entities import ViewType
 from collections import defaultdict
-import pdb
-
 
 N_TRIALS_DEFAULT = 16
 REPEATED_SEEDS_DEFAULT = 10
@@ -220,8 +217,13 @@ def extract_repeated_experiment_results(
                 seed = int(run.info.run_name.split("_")[-1])
                 if task in task_info:
                     metric_name = task_info[task]
-                    metric_name = 'test_test/' + metric_name.split("/")[-1] if '/' in metric_name else 'test_test_' + metric_name.replace(metric_name.split('_')[0] + "_", '')
-                else:  
+                    metric_name = (
+                        "test_test/" + metric_name.split("/")[-1]
+                        if "/" in metric_name
+                        else "test_test_"
+                        + metric_name.replace(metric_name.split("_")[0] + "_", "")
+                    )
+                else:
                     continue
 
                 if metric_name not in run.data.metrics:
@@ -283,7 +285,7 @@ def extract_repeated_experiment_results(
                 f"EXPERIMENT INCOMPLETE: {experiment_name} has {len(combine_task_results)} complete tasks only"
             )
             incomplete_experiments.append(experiment_name)
-    if len(combine_exp_results) > 0: 
+    if len(combine_exp_results) > 0:
         combine_exp_results = pd.concat(combine_exp_results, axis=0)
     else:
         combine_exp_results = pd.DataFrame()
@@ -444,14 +446,13 @@ def get_results_and_parameters(
     results_and_parameters.to_csv(
         f"{str(results_dir)}/results_and_parameters.csv", index=False
     )
-    
-    if visualise:
 
-        model_order = visualize_combined_results(
+    if visualise:
+        visualize_combined_results(
             combined_results=results_and_parameters,
             storage_uri=storage_uri,
             logger=logger,
-            plot_file_base_name=f"summary_plot",
+            plot_file_base_name="summary_plot",
         )
 
     return results_and_parameters
@@ -647,14 +648,16 @@ def check_existing_experiments(
 
         # check if one of the runs is complete
         for run in experiment_parent_run_data:
-            completed_task_run_names, all_tasks_in_experiment_finished, _ = (
-                check_existing_task_parent_runs(
-                    logger=logger,
-                    exp_parent_run_id=run.info.run_id,
-                    storage_uri=storage_uri,
-                    experiment_name=experiment_name,
-                    n_trials=n_trials,
-                )
+            (
+                completed_task_run_names,
+                all_tasks_in_experiment_finished,
+                _,
+            ) = check_existing_task_parent_runs(
+                logger=logger,
+                exp_parent_run_id=run.info.run_id,
+                storage_uri=storage_uri,
+                experiment_name=experiment_name,
+                n_trials=n_trials,
             )
             logger.info(f"tasks that should be completed: {task_names}")
             logger.info(f"completed_task_run_names: {completed_task_run_names}")
@@ -764,29 +767,25 @@ def visualize_combined_results(
         benchmark_name=plots_folder,
     )
 
-    tmp = (
-        plot_tools.normalize_bootstrap_and_plot(
-            combined_results,
-            # plot_file_base_name=plot_file_base_name,
-            metric="test metric",
-            benchmark_name=plots_folder,
-            model_order=model_order,
-            model_colors=model_colors,
-            fig_size=fig_size,
-            n_legend_rows=n_legend_rows,
-        )
+    plot_tools.normalize_bootstrap_and_plot(
+        combined_results,
+        # plot_file_base_name=plot_file_base_name,
+        metric="test metric",
+        benchmark_name=plots_folder,
+        model_order=model_order,
+        model_colors=model_colors,
+        fig_size=fig_size,
+        n_legend_rows=n_legend_rows,
     )
 
     plt.savefig(
-        str(f"{plots_folder}/violin_{plot_file_base_name}_normalized_bootstrapped.png"
-        ),
+        str(f"{plots_folder}/violin_{plot_file_base_name}_normalized_bootstrapped.png"),
         bbox_inches="tight",
     )
     plt.close()
 
     combined_results.to_csv(
-        str(f"{tables_folder}/{plot_file_base_name}_normalized_combined_results.csv"
-        )
+        str(f"{tables_folder}/{plot_file_base_name}_normalized_combined_results.csv")
     )
 
 
@@ -827,7 +826,7 @@ def import_custom_modules(
             sys.path.insert(0, str(workdir))
 
             try:
-                module = importlib.import_module(module_dir)
+                importlib.import_module(module_dir)
                 logger.info(f"Found {custom_modules_path}")
             except ImportError:
                 raise ImportError(
@@ -860,7 +859,7 @@ def import_custom_modules(
     settings_per_model = [
         "detection",
     ]
-    
+
     # create box plots across multiple models
     for setting in settings_per_model:
         combined_results = results_and_parameters.loc[
@@ -873,18 +872,16 @@ def import_custom_modules(
             plot_file_base_name=f"multiple_models_{setting}",
         )
 
-        
 
 ### code written with the help of Perplexity platform
-def get_nested_runs(experiment_id, filter_string = None, mlflow_uri= "mlflow"):
+def get_nested_runs(experiment_id, filter_string=None, mlflow_uri="mlflow"):
     client = MlflowClient(mlflow_uri)
-    
+
     # Get all runs for the experiment
     all_runs = client.search_runs(
-        experiment_ids=[experiment_id],
-        run_view_type=ViewType.ACTIVE_ONLY
+        experiment_ids=[experiment_id], run_view_type=ViewType.ACTIVE_ONLY
     )
-    
+
     # Create a dictionary to store the run hierarchy
     run_hierarchy = defaultdict(list)
     parent_runs = []
@@ -892,7 +889,7 @@ def get_nested_runs(experiment_id, filter_string = None, mlflow_uri= "mlflow"):
     # First pass: Identify parent-child relationships
     for run in all_runs:
         parent_run_id = run.data.tags.get("mlflow.parentRunId")
-        
+
         if parent_run_id:
             run_hierarchy[parent_run_id].append(run)
         else:
@@ -900,7 +897,6 @@ def get_nested_runs(experiment_id, filter_string = None, mlflow_uri= "mlflow"):
 
     # Function to create a nested dictionary for a run and its children
     def create_nested_dict(run):
-        
         run_dict = {
             "run": run,
             "run_id": run.info.run_id,
@@ -908,13 +904,21 @@ def create_nested_dict(run):
             "status": run.info.status,
             "start_time": run.info.start_time,
             "end_time": run.info.end_time,
-            "children": [create_nested_dict(child) for child in run_hierarchy[run.info.run_id]]
+            "children": [
+                create_nested_dict(child) for child in run_hierarchy[run.info.run_id]
+            ],
         }
         return run_dict
-     # Create the final nested structure
+
+    # Create the final nested structure
     if filter_string:
-        nested_runs = [create_nested_dict(parent_run) for parent_run in parent_runs if parent_run.data.tags.get("mlflow.runName", "Unnamed").find(filter_string) > -1]
+        nested_runs = [
+            create_nested_dict(parent_run)
+            for parent_run in parent_runs
+            if parent_run.data.tags.get("mlflow.runName", "Unnamed").find(filter_string)
+            > -1
+        ]
     else:
         nested_runs = [create_nested_dict(parent_run) for parent_run in parent_runs]
-    
+
     return nested_runs

From 14391a9d3813ff4d67558528e5d8f6928dc60ec6 Mon Sep 17 00:00:00 2001
From: Leonardo P Tizzei <leonardo.tizzei@gmail.com>
Date: Tue, 28 Oct 2025 11:17:32 -0300
Subject: [PATCH 15/16] Third-Party DCO Remediation Commit for Paolo Fraccaro
 <paolo.fraccaro@ibm.com>

On behalf of Paolo Fraccaro <paolo.fraccaro@ibm.com>, I, Leonardo P. Tizzei <leonardo.tizzei@gmail.com>, hereby add my Signed-off-by to this commit: 112b591a83d27927efc3ff8fd28a107672f7fcae
On behalf of Paolo Fraccaro <paolo.fraccaro@ibm.com>, I, Leonardo P. Tizzei <leonardo.tizzei@gmail.com>, hereby add my Signed-off-by to this commit: a1e13abdf687b42ef9d48fcbda917c8cd7d09df4

Signed-off-by: Leonardo P. Tizzei <leonardo.tizzei@gmail.com>
Signed-off-by: Leonardo P Tizzei <leonardo.tizzei@gmail.com>
---
 README.md | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 6af3542..58bf59d 100644
--- a/README.md
+++ b/README.md
@@ -75,28 +75,33 @@ If users want to optimize hyperparameters:
 terratorch iterate --hpo --config <config-file>
 ```
 
+Alternatively, terratorch-iterate can be invoked directly, without the `terratorch` prefix:
+```shell
+iterate --hpo --config <config-file>
+```
+
 For instance:
 ```shell
-terratorch iterate --hpo --config configs/dofa_large_patch16_224_upernetdecoder_true_modified.yaml
+iterate --hpo --config configs/dofa_large_patch16_224_upernetdecoder_true_modified.yaml
 ```
 
 
 If users want to rerun best experiment, please use the same config file. Additionally, the `parent_run_id`, which is the mlflow run id from optimization, should be added as shown below:
 ```shell
-terratorch iterate --repeat --config <config-file> --parent_run_id <mlflow run_id from hpo>
+iterate --repeat --config <config-file> --parent_run_id <mlflow run_id from hpo>
 ```
 For instance:
 ```shell
-terratorch iterate --repeat --config configs/dofa_large_patch16_224_upernetdecoder_true_modified.yaml --parent_run_id 61bdee4a35a94f988ad30c46c87d4fbd
+iterate --repeat --config configs/dofa_large_patch16_224_upernetdecoder_true_modified.yaml --parent_run_id 61bdee4a35a94f988ad30c46c87d4fbd
 ```
 
 If users want to optimize hyperparameters then the rerun best experiment in a single command, please use both settings as shown below:
 ```shell
-terratorch iterate --hpo --repeat --config <config-file>
+iterate --hpo --repeat --config <config-file>
 ```
 For instance:
 ```shell
-terratorch iterate --hpo --repeat --config configs/dofa_large_patch16_224_upernetdecoder_true_modified.yaml
+iterate --hpo --repeat --config configs/dofa_large_patch16_224_upernetdecoder_true_modified.yaml
 ```
 
 To check the experiment results, use `mlflow ui --host $(hostname -f) --port <port> --backend-store-uri <storage_uri>` 
@@ -121,11 +126,11 @@ See `configs/summarize_results_template.yaml` in the git repo for an example.
 
 To summarize results and hyperparameters, please run the following: 
 ```shell
-terratorch iterate --summarize --config <summarize-config-file>
+iterate --summarize --config <summarize-config-file>
 ```
 For instance:
 ```shell
-terratorch iterate --summarize --config configs/summarize_results.yaml
+iterate --summarize --config configs/summarize_results.yaml
 ```
 
 The results and hyperparameters are extracted into a csv file. For example, if `storage_uri` is `/opt/benchmark_experiments/hpo`, then sumarized results will be saved in last file as shown below:

From 5afd4ccdfb4a973d75cfa57f66718d12eb54e52e Mon Sep 17 00:00:00 2001
From: Leonardo P Tizzei <leonardo.tizzei@gmail.com>
Date: Tue, 28 Oct 2025 12:54:05 -0300
Subject: [PATCH 16/16] sign commit messages

Signed-off-by: Leonardo P Tizzei <leonardo.tizzei@gmail.com>
---
 .pre-commit-config.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 3df104d..e7e44d1 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -31,3 +31,7 @@ repos:
       # Run the formatter.
       - id: ruff-format
         types_or: [ python, pyi ]
+  - repo: https://github.com/mattlqx/pre-commit-sign
+    rev: v1.2.0
+    hooks:
+    - id: sign-commit