From dba1ee1b8a662c787914c9a67826b7d78adeed34 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Mon, 22 Sep 2025 17:24:04 -0300
Subject: [PATCH 01/40] write test case

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 .../config_util/build_geobench_configs.py     |  15 +-
 .../oracle/convnext_LM_iterate.yaml           | 106 ++++++++++++
 .../test_config_util__convnext.yaml           | 106 ++++++++++++
 .../test_case_01/convnext.yaml                | 157 ++++++++++++++++++
 tests/test_build_geobench_configs.py          | 107 ++++++------
 5 files changed, 433 insertions(+), 58 deletions(-)
 create mode 100644 configs/tests/terratorch-iterate-configs/test_case_01/oracle/convnext_LM_iterate.yaml
 create mode 100644 configs/tests/terratorch-iterate-configs/test_case_01/test_config_util__convnext.yaml
 create mode 100644 configs/tests/terratorch_configs/test_case_01/convnext.yaml

diff --git a/benchmark/config_util/build_geobench_configs.py b/benchmark/config_util/build_geobench_configs.py
index d704883..4807b6a 100644
--- a/benchmark/config_util/build_geobench_configs.py
+++ b/benchmark/config_util/build_geobench_configs.py
@@ -71,7 +71,7 @@ def _create_task(
     direction: str,
     max_run_duration: str | None = None,
     early_stop_patience: int | None = None,
-    early_prune: bool = False,
+    early_prune: bool | None = None,
 ) -> dict:
     """instantiate Task dataclass and convert it to dict
 
@@ -97,10 +97,15 @@ def _create_task(
         "direction": direction,
         "metric": metric,
         "terratorch_task": terratorch_task,
-        "max_run_duration": max_run_duration,
-        "early_stop_patience": early_stop_patience,
-        "early_prune": early_prune,
     }
+    # set optional fields if they are not None
+    for k, v in [
+        ("max_run_duration", max_run_duration),
+        ("early_stop_patience", early_stop_patience),
+        ("early_prune", early_prune),
+    ]:
+        if v is not None:
+            task_dict[k] = v
 
     return task_dict
 
@@ -179,7 +184,7 @@ def generate_iterate_config(
             ):
                 metric = 'val_map'
             else:
-                metric = 'val_segm_map'
+                metric = 'val/loss'
 
             # terratorchtask is the data.model.init_args of terratorch config file
             terratorch_task = data['model']['init_args']
diff --git a/configs/tests/terratorch-iterate-configs/test_case_01/oracle/convnext_LM_iterate.yaml b/configs/tests/terratorch-iterate-configs/test_case_01/oracle/convnext_LM_iterate.yaml
new file mode 100644
index 0000000..b448c5a
--- /dev/null
+++ b/configs/tests/terratorch-iterate-configs/test_case_01/oracle/convnext_LM_iterate.yaml
@@ -0,0 +1,106 @@
+defaults:
+  terratorch_task:
+    model_args:
+      backbone: timm_convnext_large.fb_in22k
+      backbone_pretrained: true
+    model_factory: EncoderDecoderFactory
+    optimizer: AdamW
+    scheduler: ReduceLROnPlateau
+    scheduler_hparams:
+      cooldown: 0
+      eps: 1.0e-08
+      factor: 0.5
+      min_lr: 0.0
+      mode: min
+      patience: 5
+      threshold: 0.0001
+      threshold_mode: rel
+      verbose: deprecated
+  trainer_args:
+    log_every_n_steps: 1
+    max_epochs: 5
+experiment_name: convnext_LM
+n_trials: 1
+optimization_space:
+  lr:
+    log: true
+    max: 1e-3
+    min: 1e-6
+    type: real
+run_repetitions: 5
+save_models: false
+storage_uri: ./mlflow
+tasks:
+- datamodule:
+    class_path: terratorch.datamodules.GenericNonGeoSegmentationDataModule
+    init_args:
+      batch_size: 16
+      check_stackability: false
+      constant_scale: 1.0
+      dataset_bands:
+      - RED
+      - GREEN
+      - BLUE
+      img_grep: '*train.tif'
+      label_grep: '*label.tif'
+      means:
+      - 104.24203383423682
+      - 109.92963788132441
+      - 100.98120642006803
+      no_data_replace: 0
+      no_label_replace: -1
+      num_classes: 2
+      num_workers: 16
+      output_bands:
+      - RED
+      - GREEN
+      - BLUE
+      rgb_indices:
+      - 0
+      - 1
+      - 2
+      stds:
+      - 51.593745217159935
+      - 47.218880227273814
+      - 45.45813303733705
+      test_data_root: AerialImageDatasetTiledMergedFixedLabels_sample/
+      test_label_data_root: AerialImageDatasetTiledMergedFixedLabels_sample/
+      test_split: test.txt
+      test_transform:
+      - class_path: ToTensorV2
+      train_data_root: AerialImageDatasetTiledMergedFixedLabels_sample/
+      train_label_data_root: AerialImageDatasetTiledMergedFixedLabels_sample/
+      train_split: train.txt
+      train_transform:
+      - class_path: albumentations.D4
+      - class_path: ToTensorV2
+      val_data_root: AerialImageDatasetTiledMergedFixedLabels_sample/
+      val_label_data_root: AerialImageDatasetTiledMergedFixedLabels_sample/
+      val_split: val.txt
+      val_transform:
+      - class_path: ToTensorV2
+  direction: max
+  metric: val/loss
+  name: LM
+  terratorch_task:
+    freeze_backbone: false
+    freeze_decoder: false
+    ignore_index: -1
+    loss: dice
+    model_args:
+      backbone: timm_convnext_large.fb_in22k
+      backbone_pretrained: true
+      decoder: UNetDecoder
+      decoder_channels:
+      - 512
+      - 256
+      - 128
+      - 64
+      head_channel_list:
+      - 256
+      head_dropout: 0.1
+      necks: null
+      num_classes: 2
+    model_factory: EncoderDecoderFactory
+    plot_on_val: 2
+  type: segmentation
diff --git a/configs/tests/terratorch-iterate-configs/test_case_01/test_config_util__convnext.yaml b/configs/tests/terratorch-iterate-configs/test_case_01/test_config_util__convnext.yaml
new file mode 100644
index 0000000..c19de81
--- /dev/null
+++ b/configs/tests/terratorch-iterate-configs/test_case_01/test_config_util__convnext.yaml
@@ -0,0 +1,106 @@
+defaults:
+  terratorch_task:
+    model_args:
+      backbone: X
+      backbone_pretrained: true
+    model_factory: EncoderDecoderFactory
+    optimizer: AdamW
+    scheduler: ReduceLROnPlateau
+    scheduler_hparams:
+      cooldown: 0
+      eps: 1.0e-08
+      factor: 0.5
+      min_lr: 0.0
+      mode: min
+      patience: 5
+      threshold: 0.0001
+      threshold_mode: rel
+      verbose: deprecated
+  trainer_args:
+    log_every_n_steps: 1
+    max_epochs: 5
+experiment_name: test_config_util__convnext
+n_trials: 1
+optimization_space:
+  lr:
+    log: true
+    max: 1e-3
+    min: 1e-6
+    type: real
+run_repetitions: 5
+save_models: false
+storage_uri: ./mlflow
+tasks:
+- datamodule:
+    class_path: terratorch.datamodules.GenericNonGeoSegmentationDataModule
+    init_args:
+      batch_size: 16
+      check_stackability: false
+      constant_scale: 1.0
+      dataset_bands:
+      - RED
+      - GREEN
+      - BLUE
+      img_grep: '*train.tif'
+      label_grep: '*label.tif'
+      means:
+      - 104.24203383423682
+      - 109.92963788132441
+      - 100.98120642006803
+      no_data_replace: 0
+      no_label_replace: -1
+      num_classes: 2
+      num_workers: 16
+      output_bands:
+      - RED
+      - GREEN
+      - BLUE
+      rgb_indices:
+      - 0
+      - 1
+      - 2
+      stds:
+      - 51.593745217159935
+      - 47.218880227273814
+      - 45.45813303733705
+      test_data_root: AerialImageDatasetTiledMergedFixedLabels_sample/
+      test_label_data_root: AerialImageDatasetTiledMergedFixedLabels_sample/
+      test_split: test.txt
+      test_transform:
+      - class_path: ToTensorV2
+      train_data_root: AerialImageDatasetTiledMergedFixedLabels_sample/
+      train_label_data_root: AerialImageDatasetTiledMergedFixedLabels_sample/
+      train_split: train.txt
+      train_transform:
+      - class_path: albumentations.D4
+      - class_path: ToTensorV2
+      val_data_root: AerialImageDatasetTiledMergedFixedLabels_sample/
+      val_label_data_root: AerialImageDatasetTiledMergedFixedLabels_sample/
+      val_split: val.txt
+      val_transform:
+      - class_path: ToTensorV2
+  direction: max
+  metric: val/loss
+  name: convnext.yaml
+  terratorch_task:
+    freeze_backbone: false
+    freeze_decoder: false
+    ignore_index: -1
+    loss: dice
+    model_args:
+      backbone: timm_convnext_large.fb_in22k
+      backbone_pretrained: true
+      decoder: UNetDecoder
+      decoder_channels:
+      - 512
+      - 256
+      - 128
+      - 64
+      head_channel_list:
+      - 256
+      head_dropout: 0.1
+      necks: null
+      num_classes: 2
+    model_factory: EncoderDecoderFactory
+    plot_on_val: 2
+  type: segmentation
diff --git a/configs/tests/terratorch_configs/test_case_01/convnext.yaml b/configs/tests/terratorch_configs/test_case_01/convnext.yaml
new file mode 100644
index 0000000..ff30037
--- /dev/null
+++ b/configs/tests/terratorch_configs/test_case_01/convnext.yaml
@@ -0,0 +1,157 @@
+# lightning.pytorch==2.1.1
+seed_everything: 0
+trainer:
+  accelerator: auto
+  strategy: auto
+  devices: auto
+  num_nodes: 1
+  precision: 16-mixed
+               
+  callbacks:
+    - class_path: RichProgressBar
+    - class_path: LearningRateMonitor
+      init_args:
+        logging_interval: epoch
+    # - class_path: ModelCheckpoint
+    #   init_args:
+    #       mode: min
+    #       monitor: val/loss
+    #       filename: best-{epoch:02d}
+    - class_path: EarlyStopping
+      init_args:
+        monitor: val/loss
+        patience: 20
+    # ---- Early stop if ----
+    # ---- Early stop endif ----
+  max_epochs: 50
+  check_val_every_n_epoch: 1
+  log_every_n_steps: 5
+  enable_checkpointing: false
+  default_root_dir: logs/
+
+data:
+  class_path: terratorch.datamodules.GenericNonGeoSegmentationDataModule
+  init_args:
+    batch_size: 16
+    num_workers: 16
+    no_label_replace: -1
+    no_data_replace: 0
+    constant_scale: 1.0
+    dataset_bands:
+      - 'RED'
+      - 'GREEN'
+      - 'BLUE'
+
+    output_bands:
+      - 'RED'
+      - 'GREEN'
+      - 'BLUE'
+
+    rgb_indices:
+      - 0
+      - 1
+      - 2
+
+    train_data_root: AerialImageDatasetTiledMergedFixedLabels_sample/
+    train_label_data_root: AerialImageDatasetTiledMergedFixedLabels_sample/
+    val_data_root: AerialImageDatasetTiledMergedFixedLabels_sample/
+    val_label_data_root: AerialImageDatasetTiledMergedFixedLabels_sample/
+    test_data_root: AerialImageDatasetTiledMergedFixedLabels_sample/
+    test_label_data_root: AerialImageDatasetTiledMergedFixedLabels_sample/
+    img_grep: "*train.tif"
+    label_grep: "*label.tif"
+    train_split: train.txt
+    val_split: val.txt
+    test_split: test.txt
+    # constant_scale: 0.0039
+    # means: [0.485, 0.456, 0.406]
+    # stds: [0.229, 0.224, 0.225]
+    means: 
+      - 104.24203383423682
+      - 109.92963788132441
+      - 100.98120642006803
+
+    stds: 
+      - 51.593745217159935
+      - 47.218880227273814
+      - 45.45813303733705
+    
+    check_stackability: false
+
+    num_classes: 2
+
+    train_transform:
+      - class_path: albumentations.D4
+      - class_path: ToTensorV2
+    val_transform:
+      - class_path: ToTensorV2
+    test_transform:
+      - class_path: ToTensorV2
+model:
+  class_path: terratorch.tasks.SemanticSegmentationTask
+  init_args:
+    model_factory: EncoderDecoderFactory
+    model_args:
+      backbone: timm_convnext_large.fb_in22k
+      num_classes: 2
+      backbone_pretrained: true
+      necks:
+      # - name: SelectIndices
+      #   indices: [1,2,3,4]
+      decoder: UNetDecoder
+      decoder_channels: [512, 256, 128, 64]
+      head_channel_list: [256]
+      head_dropout: 0.1
+    loss: dice
+    # loss: ce
+    plot_on_val: 2
+    ignore_index: -1
+    freeze_backbone: false
+    freeze_decoder: false
+        
+    # tiled_inference_parameters: 
+    #   h_crop: 224
+    #   h_stride: 198
+    #   w_crop: 224
+    #   w_stride: 198
+    #   average_patches: True
+    
+optimizer:
+  class_path: torch.optim.AdamW
+  init_args:
+    lr: 5.0e-05
+    # betas:
+    # - 0.9
+    # - 0.999
+    # eps: 1.0e-08
+    # weight_decay: 0.05
+    # amsgrad: false
+    # maximize: false
+    # capturable: false
+    # differentiable: false
+    # ---- Optimizer stop if ----
+# lr_scheduler:
+#   class_path: CosineAnnealingLR
+#   init_args:
+#     T_max: 20
+
+# lr_scheduler_interval: step
+# lr_scheduler:
+#   class_path: torch.optim.lr_scheduler.CosineAnnealingWarmRestarts
+#   init_args:
+#     T_0: 1000   # first cycle: 1000 steps
+#     T_mult: 2   # cycles: 1000, 2000, 4000, ... (fits well in 10k)
+#     eta_min: 1.0e-6
+lr_scheduler:
+  class_path: lightning.pytorch.cli.ReduceLROnPlateau
+  init_args:
+    monitor: val/loss
+    mode: min
+    factor: 0.5
+    patience: 5
+    threshold: 0.0001
+    threshold_mode: rel
+    cooldown: 0
+    min_lr: 0.0
+    eps: 1.0e-08
+    verbose: deprecated
diff --git a/tests/test_build_geobench_configs.py b/tests/test_build_geobench_configs.py
index 62987d6..1649cd4 100644
--- a/tests/test_build_geobench_configs.py
+++ b/tests/test_build_geobench_configs.py
@@ -3,84 +3,85 @@
 import yaml
 from benchmark.config_util.build_geobench_configs import generate_iterate_config
 from deepdiff import DeepDiff
+import logging
+
+logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
 
 
 @pytest.mark.parametrize(
-    "input_dir, output_dir, template, prefix",
+    "input_dir, output_dir, template, prefix, oracle_config_file",
     [
         (
-            # terratorch branch geobench_v2_od
-            "/Users/ltizzei/Projects/Orgs/IBM/terratorch/examples/confs/geobenchv2_detection",
-            "/Users/ltizzei/Projects/Orgs/IBM/terratorch-iterate/tests/test_config_util",
-            "/Users/ltizzei/Projects/Orgs/IBM/terratorch-iterate/benchmark/config_util/geobenchv2_template.yaml",
-            "test_examples_confs_geobenchv2_detection",
-        ),
-        (
-            "/Users/ltizzei/Projects/Orgs/IBM/terratorch/tests/resources/configs",
-            "/Users/ltizzei/Projects/Orgs/IBM/terratorch-iterate/tests/test_config_util",
-            "/Users/ltizzei/Projects/Orgs/IBM/terratorch-iterate/benchmark/config_util/geobenchv2_template.yaml",
+            "./configs/tests/terratorch_configs/test_case_01",
+            "./configs/tests/terratorch-iterate-configs/test_case_01",
+            "./configs/templates/template.yaml",
             "test_config_util_",
+            "./configs/tests/terratorch-iterate-configs/test_case_01/oracle/convnext_LM_iterate.yaml",
         ),
     ],
 )
-def test__generate_iterate_config(input_dir, output_dir, template, prefix):
-    input_dir_path = Path(input_dir)
+def test__generate_iterate_config(
+    input_dir, output_dir, template, prefix, oracle_config_file
+):
+    # Get the absolute path of the current script file
+    script_path = Path(__file__).resolve()
+
+    # Get the home directory 
+    repo_home_dir = script_path.parent.parent
+    input_dir_path: Path = repo_home_dir / input_dir
     assert input_dir_path.exists()
     assert input_dir_path.is_dir()
-    output_path = Path(output_dir)
+    output_path: Path = repo_home_dir / output_dir
     assert output_path.exists()
     assert output_path.is_dir()
     # warning! delete all files of the output dir
     for item in output_path.iterdir():
         if item.is_file():
+            logging.debug(f"Cleaning up directory: {item} deleted")
             item.unlink()
 
     generate_iterate_config(
         input_dir=input_dir_path,
         output_dir=output_path,
-        template=template,
+        template=repo_home_dir / template,
         prefix=prefix,
     )
     generated_config_files = list(output_path.glob(f'**/{prefix}*.yaml'))
     assert len(generated_config_files) > 0
 
-    oracle_config_files = [
-        f for f in input_dir_path.glob(f'**/geobench*.yaml') if "template" not in str(f)
-    ]
-    for gen_config_file in generated_config_files:
-        end_gen_config_filename = gen_config_file.name.replace(prefix, "")
-        for oracle_config_file in oracle_config_files:
-            end_oracle_config_filename = oracle_config_file.name.replace(
-                "geobenchv2", ""
-            )
-            if end_gen_config_filename == end_oracle_config_filename:
-                with open(gen_config_file, "r") as gen_file:
-                    new_config = yaml.safe_load(gen_file)
-                with open(oracle_config_file, "r") as gt_file:
-                    oracle_config = yaml.safe_load(gt_file)
+    if oracle_config_file is not None:
+        oracle_path: Path = repo_home_dir / oracle_config_file
+        with open(oracle_path, "r") as gt_file:
+            oracle_config = yaml.safe_load(gt_file)
+
+        for gen_config_file in generated_config_files:
+
+            with open(gen_config_file, "r") as gen_file:
+                new_config = yaml.safe_load(gen_file)
 
-                oracle_tasks = oracle_config["tasks"]
-                new_config_tasks = new_config["tasks"]
-                # comparing the tasks
-                for oracle_task in oracle_tasks:
-                    oracle_task_name = oracle_task["name"]
-                    found = False
-                    for new_config_task in new_config_tasks:
-                        new_config_task_name = new_config_task["name"]
-                        if new_config_task_name == oracle_task_name:
+            oracle_tasks = oracle_config["tasks"]
+            new_config_tasks = new_config["tasks"]
+            # comparing the tasks
+            for oracle_task in oracle_tasks:
+                found = False
+                if oracle_task.get("name") is not None:
+                    del oracle_task["name"]
+                for new_config_task in new_config_tasks:
+                    if new_config_task.get("name") is not None:
+                        del new_config_task["name"]
 
-                            diff = DeepDiff(new_config_task, oracle_task)
-                            if len(diff) == 0:
-                                found = True
-                            else:
-                                for k in [
-                                    "datamodule",
-                                    "direction",
-                                    "metric",
-                                    "terratorch_task",
-                                    "type",
-                                ]:
-                                    diff = DeepDiff(new_config_task[k], oracle_task[k])
-                                    assert len(diff) == 0, f"Error! {diff}"
-                                found = True
-                    assert found
+                    diff = DeepDiff(new_config_task, oracle_task)
+                    if len(diff) == 0:
+                        found = True
+                    else:
+                        for k in [
+                            "datamodule",
+                            "direction",
+                            "metric",
+                            "terratorch_task",
+                            "type",
+                        ]:
+                            diff = DeepDiff(new_config_task[k], oracle_task[k])
+                            assert len(diff) == 0, f"Error! {diff}"
+                        found = True
+                assert found, f"Error! task not found: {oracle_task}"

From 1bfd8fdd697541c288436fd518776e39bf6b32da Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Wed, 24 Sep 2025 13:57:09 -0300
Subject: [PATCH 02/40] add another test case

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 ...ncoderdecoder_eo_v2_300_model_factory.yaml | 131 +++++++++++++++
 ...ncoderdecoder_eo_v2_300_model_factory.yaml | 156 ++++++++++++++++++
 tests/test_build_geobench_configs.py          |   9 +-
 3 files changed, 295 insertions(+), 1 deletion(-)
 create mode 100644 configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
 create mode 100644 configs/tests/terratorch_configs/test_case_02/test_encoderdecoder_eo_v2_300_model_factory.yaml

diff --git a/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml b/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
new file mode 100644
index 0000000..4895d6b
--- /dev/null
+++ b/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
@@ -0,0 +1,131 @@
+defaults:
+  terratorch_task:
+    model_args:
+      backbone: X
+      backbone_pretrained: true
+    model_factory: EncoderDecoderFactory
+    optimizer: AdamW
+    scheduler: ReduceLROnPlateau
+    scheduler_hparams:
+      cooldown: 0
+      eps: 1.0e-08
+      factor: 0.5
+      min_lr: 0.0
+      mode: min
+      patience: 5
+      threshold: 0.0001
+      threshold_mode: rel
+      verbose: deprecated
+  trainer_args:
+    log_every_n_steps: 1
+    max_epochs: 5
+experiment_name: test_config_util__encoderdecoder_eo_v2_300_model_factory
+n_trials: 1
+optimization_space:
+  lr:
+    log: true
+    max: 1e-3
+    min: 1e-6
+    type: real
+run_repetitions: 5
+save_models: false
+storage_uri: ./mlflow
+tasks:
+- datamodule:
+    class_path: terratorch.datamodules.GenericNonGeoSegmentationDataModule
+    init_args:
+      allow_substring_split_file: true
+      batch_size: 4
+      constant_scale: 1.0
+      dataset_bands:
+      - '0'
+      - '1'
+      - '2'
+      - '3'
+      - '4'
+      - '5'
+      ignore_split_file_extensions: true
+      img_grep: '*_merged.tif'
+      label_grep: '*.mask.tif'
+      means:
+      - 0.052829564761523104
+      - 0.07822514779700994
+      - 0.09545302348640401
+      - 0.2128596444116123
+      - 0.2363016737011897
+      - 0.17234100022878698
+      no_data_replace: 0
+      no_label_replace: -1
+      num_classes: 2
+      num_workers: 2
+      output_bands:
+      - '0'
+      - '1'
+      - '2'
+      - '3'
+      - '4'
+      - '5'
+      rgb_indices:
+      - 0
+      - 1
+      - 2
+      stds:
+      - 0.028757146620143812
+      - 0.03540772770593507
+      - 0.05291947163682527
+      - 0.06949186937256507
+      - 0.08958868240264736
+      - 0.08198354165348874
+      test_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+      test_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+      test_transform:
+      - class_path: ToTensorV2
+      train_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/training
+      train_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/training
+      val_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+      val_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+  direction: max
+  metric: val/loss
+  name: test
+  terratorch_task:
+    freeze_backbone: false
+    freeze_decoder: false
+    ignore_index: -1
+    loss: ce
+    model_args:
+      backbone: prithvi_eo_v2_300
+      backbone_bands:
+      - '0'
+      - '1'
+      - '2'
+      - '3'
+      - '4'
+      - '5'
+      backbone_drop_path: 0.1
+      backbone_pretrained: true
+      decoder: UNetDecoder
+      decoder_channels:
+      - 512
+      - 256
+      - 128
+      - 64
+      head_dropout: 0.1
+      necks:
+      - indices:
+        - 5
+        - 11
+        - 17
+        - 23
+        name: SelectIndices
+      - name: ReshapeTokensToImage
+      - name: LearnedInterpolateToPyramidal
+      num_classes: 2
+    model_factory: EncoderDecoderFactory
+    plot_on_val: 2
+    tiled_inference_parameters:
+      average_patches: true
+      h_crop: 512
+      h_stride: 448
+      w_crop: 512
+      w_stride: 448
+  type: segmentation
diff --git a/configs/tests/terratorch_configs/test_case_02/test_encoderdecoder_eo_v2_300_model_factory.yaml b/configs/tests/terratorch_configs/test_case_02/test_encoderdecoder_eo_v2_300_model_factory.yaml
new file mode 100644
index 0000000..c12eee4
--- /dev/null
+++ b/configs/tests/terratorch_configs/test_case_02/test_encoderdecoder_eo_v2_300_model_factory.yaml
@@ -0,0 +1,156 @@
+# lightning.pytorch==2.1.1
+seed_everything: 0
+trainer:
+  accelerator: auto
+  strategy: auto
+  devices: auto
+  num_nodes: 1
+  precision: 16-mixed
+  logger: true
+  max_epochs: 2
+               
+  callbacks:
+    - class_path: RichProgressBar
+    - class_path: LearningRateMonitor
+      init_args:
+        logging_interval: epoch
+    # ---- Early stop if ----
+    - class_path: EarlyStopping
+      init_args:
+        monitor: val/loss
+        patience: 20
+     # ---- Early stop endif ----
+    - class_path: ModelCheckpoint
+      init_args:
+        dirpath: /dccstor/terratorch/tmp/eo_v2_300/
+        mode: min
+        monitor: val/loss
+        filename: best-state_dict-{epoch:02d}
+        save_weights_only: True
+  check_val_every_n_epoch: 1
+  log_every_n_steps: 50
+  enable_checkpointing: true
+  default_root_dir: /dccstor/terratorch/tmp/eo_v2_300/
+
+data:
+  class_path: terratorch.datamodules.GenericNonGeoSegmentationDataModule
+  init_args:
+    batch_size: 4
+    num_workers: 2
+    no_label_replace: -1
+    no_data_replace: 0
+    constant_scale: 1.0
+    dataset_bands:
+      - '0'
+      - '1'
+      - '2'
+      - '3'
+      - '4'
+      - '5'
+
+    output_bands:
+      - '0'
+      - '1'
+      - '2'
+      - '3'
+      - '4'
+      - '5'
+
+    rgb_indices:
+      - 0
+      - 1
+      - 2
+
+    train_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/training
+    train_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/training
+    val_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+    val_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+    test_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+    test_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+    # Splits not available in ccc for burnscars data
+    # train_split: /data//geodata-060bbc44822a11efb3260a580a830dad/split_files/train_data.txt
+    # test_split: /data//geodata-060bbc44822a11efb3260a580a830dad/split_files/test_data.txt
+    # val_split: /data//geodata-060bbc44822a11efb3260a580a830dad/split_files/val_data.txt
+    ignore_split_file_extensions: true
+    allow_substring_split_file: true
+    img_grep: "*_merged.tif"
+    label_grep: "*.mask.tif"
+    means: 
+      - 0.052829564761523104
+      - 0.07822514779700994
+      - 0.09545302348640401
+      - 0.2128596444116123
+      - 0.2363016737011897
+      - 0.17234100022878698
+
+    stds: 
+      - 0.028757146620143812
+      - 0.03540772770593507
+      - 0.05291947163682527
+      - 0.06949186937256507
+      - 0.08958868240264736
+      - 0.08198354165348874
+
+    num_classes: 2
+    # ---- train_transform if ----
+    # ---- train_transform endif ----
+
+    # if backbone is prithvi-EO-v2
+    test_transform:
+      - class_path: ToTensorV2
+model:
+  class_path: terratorch.tasks.SemanticSegmentationTask
+  init_args:
+    model_args:
+      backbone_pretrained: true 
+      backbone: prithvi_eo_v2_300
+      # backbone_ckpt_path: /terratorch/gfm_models/prithvi_eo_v2_300/Prithvi_EO_V2_300M.pt 
+      backbone_drop_path: 0.1 
+      backbone_bands:
+        - '0'
+        - '1'
+        - '2'
+        - '3'
+        - '4'
+        - '5'
+
+
+      necks: 
+        - name: SelectIndices
+          indices: [5, 11, 17, 23]  # 300M models
+        - name: ReshapeTokensToImage # required
+        - name: LearnedInterpolateToPyramidal 
+      decoder: UNetDecoder
+      #TODO user provided channels
+      decoder_channels: [512, 256, 128, 64]
+      num_classes: 2
+      head_dropout: 0.1
+    model_factory: EncoderDecoderFactory 
+    loss: ce
+    plot_on_val: 2
+    ignore_index: -1
+    freeze_backbone: false
+    freeze_decoder: false
+
+    # ---- optimizer start ----
+    # ---- optimizer end ----
+    
+    tiled_inference_parameters: 
+      h_crop: 512
+      h_stride: 448
+      w_crop: 512
+      w_stride: 448
+      average_patches: True
+    
+optimizer:
+  class_path: torch.optim.Adam
+  init_args:
+    # ---- Optimizer start if ----
+    lr: 6e-05
+    
+    weight_decay: 0.05
+    # ---- Optimizer stop if ----
+lr_scheduler:
+  class_path: ReduceLROnPlateau
+  init_args:
+    monitor: val/loss
\ No newline at end of file
diff --git a/tests/test_build_geobench_configs.py b/tests/test_build_geobench_configs.py
index 1649cd4..9f3f89c 100644
--- a/tests/test_build_geobench_configs.py
+++ b/tests/test_build_geobench_configs.py
@@ -18,6 +18,13 @@
             "test_config_util_",
             "./configs/tests/terratorch-iterate-configs/test_case_01/oracle/convnext_LM_iterate.yaml",
         ),
+        (
+            "./configs/tests/terratorch_configs/test_case_02",
+            "./configs/tests/terratorch-iterate-configs/test_case_02",
+            "./configs/templates/template.yaml",
+            "test_config_util_",
+            None,
+        ),
     ],
 )
 def test__generate_iterate_config(
@@ -26,7 +33,7 @@ def test__generate_iterate_config(
     # Get the absolute path of the current script file
     script_path = Path(__file__).resolve()
 
-    # Get the home directory 
+    # Get the home directory
     repo_home_dir = script_path.parent.parent
     input_dir_path: Path = repo_home_dir / input_dir
     assert input_dir_path.exists()

From 1c0770721b8093ca64f1155aa960740e493cb409 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Wed, 24 Sep 2025 16:02:17 -0300
Subject: [PATCH 03/40] new util function to run single job

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 benchmark/main.py                             |  1 -
 ...ncoderdecoder_eo_v2_300_model_factory.yaml |  4 +-
 run_tests.py                                  | 40 ++++++++++++++-----
 tests/integration/__init__.py                 |  0
 tests/integration/test_main.py                | 25 ++++++++++++
 tests/test_benchmark.py                       |  5 +--
 .../{ => unit}/test_build_geobench_configs.py |  0
 tests/{ => unit}/test_cli.py                  |  0
 8 files changed, 58 insertions(+), 17 deletions(-)
 create mode 100644 tests/integration/__init__.py
 create mode 100644 tests/integration/test_main.py
 rename tests/{ => unit}/test_build_geobench_configs.py (100%)
 rename tests/{ => unit}/test_cli.py (100%)

diff --git a/benchmark/main.py b/benchmark/main.py
index b72b160..28ffc55 100644
--- a/benchmark/main.py
+++ b/benchmark/main.py
@@ -1,7 +1,6 @@
 from jsonargparse import Namespace
 import logging
 from pathlib import Path
-from typing import Any, List
 from jsonargparse import ArgumentParser
 import pandas as pd
 from benchmark.backbone_benchmark import benchmark_backbone
diff --git a/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml b/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
index 4895d6b..09410b3 100644
--- a/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
+++ b/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
@@ -2,7 +2,7 @@ defaults:
   terratorch_task:
     model_args:
       backbone: X
-      backbone_pretrained: true
+      # backbone_pretrained: true
     model_factory: EncoderDecoderFactory
     optimizer: AdamW
     scheduler: ReduceLROnPlateau
@@ -102,7 +102,7 @@ tasks:
       - '4'
       - '5'
       backbone_drop_path: 0.1
-      backbone_pretrained: true
+      # backbone_pretrained: true
       decoder: UNetDecoder
       decoder_channels:
       - 512
diff --git a/run_tests.py b/run_tests.py
index 7427fb7..607b7b7 100644
--- a/run_tests.py
+++ b/run_tests.py
@@ -7,6 +7,23 @@
 # rm geobench_v1_prithvi* && bsub -e ~/geobench_v1_prithvi.err -o ~/geobench_v1_prithvi.out -M 40G -gpu "num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB" terratorch iterate --hpo --config configs/geobench_v1_prithvi.yaml
 
 
+def submit_job(err_file: str, out_file: str, tc_id: str | None, config: str | None):
+    if tc_id is not None:
+        jbsub = f"bsub -e {err_file} -o {out_file} -M 40G -gpu \"num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB\" pytest -vv tests/test_benchmark.py::test_run_benchmark[{tc_id}]"
+    else:
+        jbsub = f"bsub -e {err_file} -o {out_file} -M 40G -gpu \"num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB\" terratorch iterate --hpo --config {config}"
+
+    cmd = jbsub.split()
+    result = subprocess.run(cmd, capture_output=True)
+    if result.returncode == 0:
+        print(f"Command executed successfully: {jbsub}")
+
+    else:
+        print(f"Command failed: {jbsub}")
+        print("Command failed with error code:", result.returncode)
+        print("stderr:", result.stderr)
+
+
 @click.command()
 @click.option('--test_id', default=None, help='test ID')
 def run_tests(test_id: Optional[str] = None):
@@ -32,16 +49,19 @@ def run_tests(test_id: Optional[str] = None):
             print(f"Delete file {out_file}")
             out_file.unlink(missing_ok=True)
             assert not out_file.exists()
-        jbsub = f"bsub -e {err_file} -o {out_file} -M 40G -gpu \"num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB\" pytest -vv tests/test_benchmark.py::test_run_benchmark[{tc_id}]"
-        cmd = jbsub.split()
-        result = subprocess.run(cmd, capture_output=True)
-        if result.returncode == 0:
-            print(f"Command executed successfully: {jbsub}")
-
-        else:
-            print(f"Command failed: {jbsub}")
-            print("Command failed with error code:", result.returncode)
-            print("stderr:", result.stderr)
+        submit_job(err_file=err_file, out_file=out_file, tc_id=tc_id)
+
+
+@click.command()
+@click.option('--config', default=None, help='path to config file')
+def run_job(config: str):
+    home_dir = Path(__file__).parent
+    config_path = home_dir / config
+    assert config_path.exists()
+    stem = config_path.stem
+    err_file = f"{stem}.err"
+    out_file = f"{stem}.out"
+    submit_job(err_file=err_file, out_file=out_file, config=config)
 
 
 if __name__ == "__main__":
diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/integration/test_main.py b/tests/integration/test_main.py
new file mode 100644
index 0000000..e4d5ae3
--- /dev/null
+++ b/tests/integration/test_main.py
@@ -0,0 +1,25 @@
+from pathlib import Path
+from benchmark.main import main
+import pytest
+import sys
+
+
+# terratorch iterate --hpo --config configs/tests/benchmark_v2_simple.yaml
+@pytest.mark.parametrize(
+    "hpo, config",
+    [
+        (
+            True,
+            "configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml"
+        )
+    ],
+)
+def test_main(hpo: bool, config: str):
+    home_dir = Path(__file__).parent.parent.parent 
+    config_file: Path = home_dir / config
+    assert config_file.exists()
+    arguments = ["terratorch", "--config", str(config_file.resolve())]
+    if hpo:
+        arguments.insert(1, "--hpo")
+    sys.argv = arguments
+    main()
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 54cea49..94bd3ca 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -104,13 +104,10 @@ def find_file(directory: str, filename: str):
 
 
 CONFIG_FILES = [
-    # "configs/tests/geobench_v1_resnet_cashew.yaml",
-    # "configs/tests/geobench_v1_prithvi_cashew.yaml",
     "configs/tests/benchmark_v2_simple.yaml",
     "configs/tests/dofa_large_patch16_224_upernetdecoder_true_modified.yaml",
     "configs/tests/test_config_util__prithvi_eo_v1_100.yaml",
-    # "configs/tests/geobench_v1_ssl4eos12_resnet50_sentinel2_all_moco_smp_unet_true.yaml",
-    # "configs/nasabench_vit_b_os.yaml",
+    "configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml",
 ]
 CONTINUE_EXISTING_EXPERIMENT = [True, False]
 TEST_MODELS = [True, False]
diff --git a/tests/test_build_geobench_configs.py b/tests/unit/test_build_geobench_configs.py
similarity index 100%
rename from tests/test_build_geobench_configs.py
rename to tests/unit/test_build_geobench_configs.py
diff --git a/tests/test_cli.py b/tests/unit/test_cli.py
similarity index 100%
rename from tests/test_cli.py
rename to tests/unit/test_cli.py

From 24c757138bbafe3c7111aaea32bbdf1feea6c523 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Wed, 24 Sep 2025 16:17:23 -0300
Subject: [PATCH 04/40] add template for converting configs

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 configs/templates/template.yaml | 41 +++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 configs/templates/template.yaml

diff --git a/configs/templates/template.yaml b/configs/templates/template.yaml
new file mode 100644
index 0000000..0e31607
--- /dev/null
+++ b/configs/templates/template.yaml
@@ -0,0 +1,41 @@
+experiment_name: X
+defaults:
+  trainer_args:
+    max_epochs: 5
+    log_every_n_steps: 1
+  terratorch_task:
+    model_factory: EncoderDecoderFactory
+    model_args:
+      backbone: X  
+      backbone_pretrained: true
+    optimizer: AdamW
+    scheduler: ReduceLROnPlateau
+    scheduler_hparams:
+      mode: min
+      factor: 0.5
+      patience: 5
+      threshold: 0.0001
+      threshold_mode: rel
+      cooldown: 0
+      min_lr: 0.0
+      eps: 1.0e-08
+    
+    
+tasks:
+  - name: X
+    type: segmentation
+    direction: max
+    metric: X
+    terratorch_task:
+    datamodule:
+
+n_trials: 1
+save_models: False
+storage_uri: ./mlflow
+run_repetitions: 5
+optimization_space:
+  lr:
+    max: 1e-3
+    min: 1e-6
+    type: real
+    log: true

From a745162ae5391492a6e5d5254b827a56ab81aaa2 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Wed, 24 Sep 2025 16:21:12 -0300
Subject: [PATCH 05/40] fix test script

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 run_tests.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/run_tests.py b/run_tests.py
index 607b7b7..8f63d5a 100644
--- a/run_tests.py
+++ b/run_tests.py
@@ -7,12 +7,13 @@
 # rm geobench_v1_prithvi* && bsub -e ~/geobench_v1_prithvi.err -o ~/geobench_v1_prithvi.out -M 40G -gpu "num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB" terratorch iterate --hpo --config configs/geobench_v1_prithvi.yaml
 
 
-def submit_job(err_file: str, out_file: str, tc_id: str | None, config: str | None):
+def submit_job(err_file: str, out_file: str, tc_id: str | None = None, config: str | None = None):
     if tc_id is not None:
         jbsub = f"bsub -e {err_file} -o {out_file} -M 40G -gpu \"num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB\" pytest -vv tests/test_benchmark.py::test_run_benchmark[{tc_id}]"
-    else:
+    elif config is not None:
         jbsub = f"bsub -e {err_file} -o {out_file} -M 40G -gpu \"num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB\" terratorch iterate --hpo --config {config}"
-
+    else:
+        raise ValueError("Error! Either tc_id or config must be not None")
     cmd = jbsub.split()
     result = subprocess.run(cmd, capture_output=True)
     if result.returncode == 0:

From c22cb9b363c381d343e7a413d2ada39e075af05d Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Wed, 24 Sep 2025 16:29:41 -0300
Subject: [PATCH 06/40] fix test script

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 run_tests.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/run_tests.py b/run_tests.py
index 8f63d5a..7035f69 100644
--- a/run_tests.py
+++ b/run_tests.py
@@ -7,7 +7,14 @@
 # rm geobench_v1_prithvi* && bsub -e ~/geobench_v1_prithvi.err -o ~/geobench_v1_prithvi.out -M 40G -gpu "num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB" terratorch iterate --hpo --config configs/geobench_v1_prithvi.yaml
 
 
-def submit_job(err_file: str, out_file: str, tc_id: str | None = None, config: str | None = None):
+@click.group()
+def cli():
+    pass
+
+
+def submit_job(
+    err_file: str, out_file: str, tc_id: str | None = None, config: str | None = None
+):
     if tc_id is not None:
         jbsub = f"bsub -e {err_file} -o {out_file} -M 40G -gpu \"num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB\" pytest -vv tests/test_benchmark.py::test_run_benchmark[{tc_id}]"
     elif config is not None:
@@ -65,5 +72,8 @@ def run_job(config: str):
     submit_job(err_file=err_file, out_file=out_file, config=config)
 
 
-if __name__ == "__main__":
-    run_tests()
+cli.add_command(run_job)
+cli.add_command(run_tests)
+
+if __name__ == '__main__':
+    cli()

From c7ab31855a8974dc7fbcb620f197461a9c070e19 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Wed, 24 Sep 2025 16:35:10 -0300
Subject: [PATCH 07/40] comment out failing tests

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 tests/test_benchmark.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 94bd3ca..7cf8007 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -104,9 +104,9 @@ def find_file(directory: str, filename: str):
 
 
 CONFIG_FILES = [
-    "configs/tests/benchmark_v2_simple.yaml",
-    "configs/tests/dofa_large_patch16_224_upernetdecoder_true_modified.yaml",
-    "configs/tests/test_config_util__prithvi_eo_v1_100.yaml",
+    # "configs/tests/benchmark_v2_simple.yaml",
+    # "configs/tests/dofa_large_patch16_224_upernetdecoder_true_modified.yaml",
+    # "configs/tests/test_config_util__prithvi_eo_v1_100.yaml",
     "configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml",
 ]
 CONTINUE_EXISTING_EXPERIMENT = [True, False]

From 9d5cd3ccd2acae8fd1ad614b8431e8011a9f9890 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Wed, 24 Sep 2025 16:42:45 -0300
Subject: [PATCH 08/40] remove verbose deprecated

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 .../oracle/convnext_LM_iterate.yaml           |  1 -
 .../test_config_util__convnext.yaml           |  1 -
 ...ncoderdecoder_eo_v2_300_model_factory.yaml |  1 -
 .../test_case_01/convnext.yaml                |  1 -
 run_tests.py                                  | 35 ++++++++++---------
 5 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/configs/tests/terratorch-iterate-configs/test_case_01/oracle/convnext_LM_iterate.yaml b/configs/tests/terratorch-iterate-configs/test_case_01/oracle/convnext_LM_iterate.yaml
index b448c5a..975390b 100644
--- a/configs/tests/terratorch-iterate-configs/test_case_01/oracle/convnext_LM_iterate.yaml
+++ b/configs/tests/terratorch-iterate-configs/test_case_01/oracle/convnext_LM_iterate.yaml
@@ -15,7 +15,6 @@ defaults:
       patience: 5
       threshold: 0.0001
       threshold_mode: rel
-      verbose: deprecated
   trainer_args:
     log_every_n_steps: 1
     max_epochs: 5
diff --git a/configs/tests/terratorch-iterate-configs/test_case_01/test_config_util__convnext.yaml b/configs/tests/terratorch-iterate-configs/test_case_01/test_config_util__convnext.yaml
index c19de81..b465f60 100644
--- a/configs/tests/terratorch-iterate-configs/test_case_01/test_config_util__convnext.yaml
+++ b/configs/tests/terratorch-iterate-configs/test_case_01/test_config_util__convnext.yaml
@@ -15,7 +15,6 @@ defaults:
       patience: 5
       threshold: 0.0001
       threshold_mode: rel
-      verbose: deprecated
   trainer_args:
     log_every_n_steps: 1
     max_epochs: 5
diff --git a/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml b/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
index 09410b3..7ea9152 100644
--- a/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
+++ b/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
@@ -15,7 +15,6 @@ defaults:
       patience: 5
       threshold: 0.0001
       threshold_mode: rel
-      verbose: deprecated
   trainer_args:
     log_every_n_steps: 1
     max_epochs: 5
diff --git a/configs/tests/terratorch_configs/test_case_01/convnext.yaml b/configs/tests/terratorch_configs/test_case_01/convnext.yaml
index ff30037..c983aeb 100644
--- a/configs/tests/terratorch_configs/test_case_01/convnext.yaml
+++ b/configs/tests/terratorch_configs/test_case_01/convnext.yaml
@@ -154,4 +154,3 @@ lr_scheduler:
     cooldown: 0
     min_lr: 0.0
     eps: 1.0e-08
-    verbose: deprecated
diff --git a/run_tests.py b/run_tests.py
index 7035f69..11c1327 100644
--- a/run_tests.py
+++ b/run_tests.py
@@ -13,8 +13,24 @@ def cli():
 
 
 def submit_job(
-    err_file: str, out_file: str, tc_id: str | None = None, config: str | None = None
+    stderr_file: str,
+    stdout_file: str,
+    tc_id: str | None = None,
+    config: str | None = None,
 ):
+    err_file = Path.home() / stderr_file
+    # delete file if it exists
+    if err_file.exists():
+        print(f"Delete file {err_file}")
+        err_file.unlink(missing_ok=True)
+        assert not err_file.exists()
+
+    out_file = Path.home() / stdout_file
+    # delete file if it exists
+    if out_file.exists():
+        print(f"Delete file {out_file}")
+        out_file.unlink(missing_ok=True)
+        assert not out_file.exists()
     if tc_id is not None:
         jbsub = f"bsub -e {err_file} -o {out_file} -M 40G -gpu \"num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB\" pytest -vv tests/test_benchmark.py::test_run_benchmark[{tc_id}]"
     elif config is not None:
@@ -44,20 +60,7 @@ def run_tests(test_id: Optional[str] = None):
         stderr_file = f"test-iterate-test_benchmark-{tc_id}.err"
         stdout_file = f"test-iterate-test_benchmark-{tc_id}.out"
 
-        err_file = Path.home() / stderr_file
-        # delete file if it exists
-        if err_file.exists():
-            print(f"Delete file {err_file}")
-            err_file.unlink(missing_ok=True)
-            assert not err_file.exists()
-        out_file = Path.home() / stdout_file
-
-        # delete file if it exists
-        if out_file.exists():
-            print(f"Delete file {out_file}")
-            out_file.unlink(missing_ok=True)
-            assert not out_file.exists()
-        submit_job(err_file=err_file, out_file=out_file, tc_id=tc_id)
+        submit_job(stderr_file=stderr_file, stdout_file=stdout_file, tc_id=tc_id)
 
 
 @click.command()
@@ -69,7 +72,7 @@ def run_job(config: str):
     stem = config_path.stem
     err_file = f"{stem}.err"
     out_file = f"{stem}.out"
-    submit_job(err_file=err_file, out_file=out_file, config=config)
+    submit_job(stdout_file=out_file, stderr_file=err_file, config=config)
 
 
 cli.add_command(run_job)

From 0e8654cd440b15be6c897629cecb81a5bbf1f3ca Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Wed, 24 Sep 2025 16:52:24 -0300
Subject: [PATCH 09/40] improve run test script

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 run_tests.py            |  4 ++--
 tests/test_benchmark.py | 11 +++++++++--
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/run_tests.py b/run_tests.py
index 11c1327..4e458c4 100644
--- a/run_tests.py
+++ b/run_tests.py
@@ -1,7 +1,7 @@
 import subprocess
 from pathlib import Path
 from typing import Optional
-from tests.test_benchmark import TEST_CASE_IDS
+from tests.test_benchmark import get_test_ids
 import click
 
 # rm geobench_v1_prithvi* && bsub -e ~/geobench_v1_prithvi.err -o ~/geobench_v1_prithvi.out -M 40G -gpu "num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB" terratorch iterate --hpo --config configs/geobench_v1_prithvi.yaml
@@ -52,7 +52,7 @@ def submit_job(
 @click.option('--test_id', default=None, help='test ID')
 def run_tests(test_id: Optional[str] = None):
     if test_id is None:
-        test_ids = TEST_CASE_IDS
+        test_ids = get_test_ids()
     else:
         test_ids = [test_id]
     for tc_id in test_ids:
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 7cf8007..6087d68 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -114,13 +114,20 @@ def find_file(directory: str, filename: str):
 INPUT_TEST_RUN_BENCHMARK = list(
     itertools.product(CONFIG_FILES, CONTINUE_EXISTING_EXPERIMENT, TEST_MODELS)
 )
-TEST_CASE_IDS = [str(i) for i in range(0, len(INPUT_TEST_RUN_BENCHMARK))]
 
 
+def get_test_ids() -> list[str]:
+    test_case_ids = list()
+    for config, cee, tm in INPUT_TEST_RUN_BENCHMARK:
+        filename = config.split("/")[-1].replace(".yaml", "")
+        tid = f"{filename}_{cee}_{tm}"
+        test_case_ids.append(tid)
+    return test_case_ids
+
 @pytest.mark.parametrize(
     "config, continue_existing_experiment, test_models",
     INPUT_TEST_RUN_BENCHMARK,
-    ids=TEST_CASE_IDS,
+    ids=get_test_ids(),
 )
 def test_run_benchmark(
     config: str, continue_existing_experiment: bool, test_models: bool

From be5fca8a3059cf5b96881218ae4e6d6d2f6ff306 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Wed, 24 Sep 2025 16:54:52 -0300
Subject: [PATCH 10/40] minor modification

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 ...est_config_util__encoderdecoder_eo_v2_300_model_factory.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml b/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
index 7ea9152..284cbec 100644
--- a/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
+++ b/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
@@ -28,7 +28,7 @@ optimization_space:
     type: real
 run_repetitions: 5
 save_models: false
-storage_uri: ./mlflow
+storage_uri: /dccstor/geofm-finetuning/terratorch-iterate-test/
 tasks:
 - datamodule:
     class_path: terratorch.datamodules.GenericNonGeoSegmentationDataModule

From ccd540b0164bc2dbd923071fe367758124e37015 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Wed, 24 Sep 2025 17:27:36 -0300
Subject: [PATCH 11/40] improve test script

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 run_tests.py            |  4 ++--
 tests/test_benchmark.py | 11 ++++++++++-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/run_tests.py b/run_tests.py
index 4e458c4..9e15de0 100644
--- a/run_tests.py
+++ b/run_tests.py
@@ -57,8 +57,8 @@ def run_tests(test_id: Optional[str] = None):
         test_ids = [test_id]
     for tc_id in test_ids:
         print(f"Running test case: tests/test_benchmark.py::test_run_benchmark {tc_id}")
-        stderr_file = f"test-iterate-test_benchmark-{tc_id}.err"
-        stdout_file = f"test-iterate-test_benchmark-{tc_id}.out"
+        stderr_file = f"test-{tc_id}.err"
+        stdout_file = f"test-{tc_id}.out"
 
         submit_job(stderr_file=stderr_file, stdout_file=stdout_file, tc_id=tc_id)
 
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 6087d68..81e6aa7 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -1,4 +1,5 @@
 import itertools
+import shutil
 from benchmark.benchmark_types import Defaults, Task, TaskTypeEnum
 import pytest
 from benchmark.backbone_benchmark import benchmark_backbone
@@ -228,7 +229,7 @@ def test_run_benchmark(
 def test_run_benchmark_no_specific_terratorch_task(
     config: str, continue_existing_experiment: bool, test_models: bool
 ):
-
+    delete_new_dirs = True
     path = os.path.join(os.getcwd(), config)
     config_path = Path(path)
     assert (
@@ -318,6 +319,14 @@ def test_run_benchmark_no_specific_terratorch_task(
         finished_run_id=finished_run_id,
     )
 
+    if storage_uri_path.is_dir() and delete_new_dirs:
+        try:
+            shutil.rmtree(storage_uri_path)
+            print(f"Directory '{storage_uri_path}' and its contents removed successfully.")
+        except OSError as e:
+            print(f"Error: {storage_uri_path} : {e.strerror}")
+    else:
+        print(f"Directory '{storage_uri_path}' does not exist.")
 
 def validate_results(experiment_name: str, storage_uri: str, finished_run_id: str):
     # get the most recent modified directory

From 8950f9d96b1b37c67b47a740c0326198caa2dfa5 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Wed, 24 Sep 2025 17:42:50 -0300
Subject: [PATCH 12/40] minor modification

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 ...est_config_util__encoderdecoder_eo_v2_300_model_factory.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml b/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
index 284cbec..8625619 100644
--- a/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
+++ b/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
@@ -28,7 +28,7 @@ optimization_space:
     type: real
 run_repetitions: 5
 save_models: false
-storage_uri: /dccstor/geofm-finetuning/terratorch-iterate-test/
+storage_uri: /u/ltizzei/test_terratorch_iterate
 tasks:
 - datamodule:
     class_path: terratorch.datamodules.GenericNonGeoSegmentationDataModule

From e7717d6caa40796ac058729dd4a2242dcd6bb48c Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Thu, 25 Sep 2025 09:59:46 -0300
Subject: [PATCH 13/40] add another test case

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 ..._decoder_timm_resnet101_model_factory.yaml | 123 ++++++++++++++
 ..._decoder_timm_resnet101_model_factory.yaml | 154 ++++++++++++++++++
 pyproject.toml                                |   2 +-
 run_tests.py                                  |   4 +-
 tests/test_benchmark.py                       |  14 +-
 tests/unit/test_build_geobench_configs.py     |   7 +
 6 files changed, 294 insertions(+), 10 deletions(-)
 create mode 100644 configs/tests/terratorch-iterate-configs/test_case_03/terratorch__encoder_decoder_timm_resnet101_model_factory.yaml
 create mode 100644 configs/tests/terratorch_configs/test_case_03/test_encoder_decoder_timm_resnet101_model_factory.yaml

diff --git a/configs/tests/terratorch-iterate-configs/test_case_03/terratorch__encoder_decoder_timm_resnet101_model_factory.yaml b/configs/tests/terratorch-iterate-configs/test_case_03/terratorch__encoder_decoder_timm_resnet101_model_factory.yaml
new file mode 100644
index 0000000..9bdfa27
--- /dev/null
+++ b/configs/tests/terratorch-iterate-configs/test_case_03/terratorch__encoder_decoder_timm_resnet101_model_factory.yaml
@@ -0,0 +1,123 @@
+defaults:
+  terratorch_task:
+    model_args:
+      backbone: X
+      backbone_pretrained: true
+    model_factory: EncoderDecoderFactory
+    optimizer: AdamW
+    scheduler: ReduceLROnPlateau
+    scheduler_hparams:
+      cooldown: 0
+      eps: 1.0e-08
+      factor: 0.5
+      min_lr: 0.0
+      mode: min
+      patience: 5
+      threshold: 0.0001
+      threshold_mode: rel
+  trainer_args:
+    log_every_n_steps: 1
+    max_epochs: 5
+experiment_name: terratorch__encoder_decoder_timm_resnet101_model_factory
+n_trials: 1
+optimization_space:
+  lr:
+    log: true
+    max: 1e-3
+    min: 1e-6
+    type: real
+run_repetitions: 5
+save_models: false
+storage_uri: ./mlflow
+tasks:
+- datamodule:
+    class_path: terratorch.datamodules.GenericNonGeoSegmentationDataModule
+    init_args:
+      allow_substring_split_file: true
+      batch_size: 4
+      constant_scale: 1.0
+      dataset_bands:
+      - '0'
+      - '1'
+      - '2'
+      - '3'
+      - '4'
+      - '5'
+      ignore_split_file_extensions: true
+      img_grep: '*_merged.tif'
+      label_grep: '*.mask.tif'
+      means:
+      - 0.052829564761523104
+      - 0.07822514779700994
+      - 0.09545302348640401
+      - 0.2128596444116123
+      - 0.2363016737011897
+      - 0.17234100022878698
+      no_data_replace: 0
+      no_label_replace: -1
+      num_classes: 2
+      num_workers: 2
+      output_bands:
+      - '0'
+      - '1'
+      - '2'
+      - '3'
+      - '4'
+      - '5'
+      rgb_indices:
+      - 0
+      - 1
+      - 2
+      stds:
+      - 0.028757146620143812
+      - 0.03540772770593507
+      - 0.05291947163682527
+      - 0.06949186937256507
+      - 0.08958868240264736
+      - 0.08198354165348874
+      test_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+      test_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+      test_transform:
+      - class_path: ToTensorV2
+      train_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/training
+      train_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/training
+      val_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+      val_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+  direction: max
+  metric: val/loss
+  name: test
+  terratorch_task:
+    freeze_backbone: false
+    freeze_decoder: false
+    ignore_index: -1
+    loss: ce
+    model_args:
+      backbone: timm_resnet101
+      backbone_in_chans: 6
+      backbone_pretrained: true
+      decoder: UNetDecoder
+      decoder_channels:
+      - 512
+      - 256
+      - 128
+      - 64
+      head_channel_list:
+      - 256
+      head_dropout: 0.1
+      necks:
+      - indices:
+        - 0
+        - 1
+        - 2
+        - 3
+        name: SelectIndices
+      num_classes: 3
+    model_factory: EncoderDecoderFactory
+    plot_on_val: 2
+    tiled_inference_parameters:
+      average_patches: true
+      h_crop: 224
+      h_stride: 196
+      w_crop: 224
+      w_stride: 196
+  type: segmentation
diff --git a/configs/tests/terratorch_configs/test_case_03/test_encoder_decoder_timm_resnet101_model_factory.yaml b/configs/tests/terratorch_configs/test_case_03/test_encoder_decoder_timm_resnet101_model_factory.yaml
new file mode 100644
index 0000000..253de10
--- /dev/null
+++ b/configs/tests/terratorch_configs/test_case_03/test_encoder_decoder_timm_resnet101_model_factory.yaml
@@ -0,0 +1,154 @@
+################################################################
+# Licensed Materials - Property of IBM
+# "Restricted Materials of IBM"
+# Copyright IBM Corp. 2025 ALL RIGHTS RESERVED
+################################################################
+
+
+# lightning.pytorch==2.1.1
+seed_everything: 0
+trainer:
+  accelerator: auto
+  strategy: auto
+  devices: auto
+  num_nodes: 1
+  precision: 16-mixed
+  logger: true
+  callbacks:
+    - class_path: RichProgressBar
+    - class_path: LearningRateMonitor
+      init_args:
+        logging_interval: epoch
+    # ---- Early stop if ----
+    - class_path: EarlyStopping
+      init_args:
+        monitor: val/loss
+        patience: 20
+     # ---- Early stop endif ----
+    - class_path: ModelCheckpoint
+      init_args:
+        dirpath: /dccstor/terratorch/tmp/timm_resnet101/
+        mode: min
+        monitor: val/loss
+        filename: best-state_dict-{epoch:02d}
+        save_weights_only: True
+      
+  max_epochs: 2
+  check_val_every_n_epoch: 1
+  log_every_n_steps: 50
+  enable_checkpointing: true
+  default_root_dir: /dccstor/terratorch/tmp/timm_resnet101/
+data:
+  class_path: terratorch.datamodules.GenericNonGeoSegmentationDataModule
+  init_args:
+    batch_size: 4
+    num_workers: 2
+    no_label_replace: -1
+    no_data_replace: 0
+    constant_scale: 1.0
+    dataset_bands:
+      - '0'
+      - '1'
+      - '2'
+      - '3'
+      - '4'
+      - '5'
+
+    output_bands:
+      - '0'
+      - '1'
+      - '2'
+      - '3'
+      - '4'
+      - '5'
+
+    rgb_indices:
+      - 0
+      - 1
+      - 2
+
+    train_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/training
+    train_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/training
+    val_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+    val_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+    test_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+    test_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+    # Splits not available in ccc for burnscars data
+    # train_split: /data//geodata-060bbc44822a11efb3260a580a830dad/split_files/train_data.txt
+    # test_split: /data//geodata-060bbc44822a11efb3260a580a830dad/split_files/test_data.txt
+    # val_split: /data//geodata-060bbc44822a11efb3260a580a830dad/split_files/val_data.txt
+    ignore_split_file_extensions: true
+    allow_substring_split_file: true
+    img_grep: "*_merged.tif"
+    label_grep: "*.mask.tif"
+    means: 
+      - 0.052829564761523104
+      - 0.07822514779700994
+      - 0.09545302348640401
+      - 0.2128596444116123
+      - 0.2363016737011897
+      - 0.17234100022878698
+
+    stds: 
+      - 0.028757146620143812
+      - 0.03540772770593507
+      - 0.05291947163682527
+      - 0.06949186937256507
+      - 0.08958868240264736
+      - 0.08198354165348874
+
+    num_classes: 2
+    # ---- train_transform if ----
+    # ---- train_transform endif ----
+
+    # if backbone is prithvi-EO-v2
+    test_transform:
+      - class_path: ToTensorV2
+model:
+  class_path: terratorch.tasks.SemanticSegmentationTask
+  init_args:
+    model_args: 
+      backbone: timm_resnet101 # timm_resnet34 , timm_resnet18 , timm_resnet50 , timm_resnet101 , timm_resnet152
+      backbone_pretrained: true 
+      num_classes: 3
+      backbone_in_chans: 6 # To be used with RGB when pretrained, can be more if not retrained
+      necks: 
+        - name: SelectIndices
+          indices:  [0, 1, 2, 3]
+      decoder: UNetDecoder
+      #TODO user provided channels
+      decoder_channels: [512, 256, 128, 64]
+      head_channel_list:
+        - 256
+
+      head_dropout: 0.1
+    
+    model_factory: EncoderDecoderFactory
+    loss: ce
+    plot_on_val: 2
+    ignore_index: -1
+    freeze_backbone: false
+    freeze_decoder: false
+
+    # ---- optimizer start ----
+    # ---- optimizer end ----
+    
+    tiled_inference_parameters: 
+      h_crop: 224
+      h_stride: 196
+      w_crop: 224
+      w_stride: 196
+      average_patches: True
+    
+optimizer:
+  class_path: torch.optim.Adam
+  init_args:
+    # ---- Optimizer start if ----
+    lr: 6e-05
+    
+    weight_decay: 0.05
+    # ---- Optimizer stop if ----
+lr_scheduler:
+  class_path: ReduceLROnPlateau
+  init_args:
+    monitor: val/loss
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index e226ff1..999542c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,7 +38,7 @@ readme = "README.md"
 
 dependencies = [
 # ObjectDetection is not supported on terratorch==1.0.2, so iterate relies on main branch
-"terratorch", 
+"terratorch>=1.1.0", 
 # "terratorch",
 # requests>=2.32.0 because of this vulnerability https://github.com/psf/requests/security/advisories/GHSA-9wx4-h78v-vm56
 "requests>=2.32.0",
diff --git a/run_tests.py b/run_tests.py
index 9e15de0..ad3fa76 100644
--- a/run_tests.py
+++ b/run_tests.py
@@ -57,8 +57,8 @@ def run_tests(test_id: Optional[str] = None):
         test_ids = [test_id]
     for tc_id in test_ids:
         print(f"Running test case: tests/test_benchmark.py::test_run_benchmark {tc_id}")
-        stderr_file = f"test-{tc_id}.err"
-        stdout_file = f"test-{tc_id}.out"
+        stderr_file = f"{tc_id}.err"
+        stdout_file = f"{tc_id}.out"
 
         submit_job(stderr_file=stderr_file, stdout_file=stdout_file, tc_id=tc_id)
 
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 81e6aa7..ae19bc8 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -21,9 +21,6 @@
     "SEGMENTATION_V1", "/dccstor/geofm-finetuning/datasets/geobench/segmentation_v1.0"
 )
 
-# OUTPUT_DIR = os.getenv(
-#     "OUTPUT_DIR", "/dccstor/geofm-finetuning/terratorch-iterate-test/"
-# )
 
 RAY_STORAGE = os.getenv(
     "RAY_STORAGE", "/dccstor/geofm-finetuning/terratorch-iterate-test/ray_storage"
@@ -105,9 +102,8 @@ def find_file(directory: str, filename: str):
 
 
 CONFIG_FILES = [
-    # "configs/tests/benchmark_v2_simple.yaml",
-    # "configs/tests/dofa_large_patch16_224_upernetdecoder_true_modified.yaml",
-    # "configs/tests/test_config_util__prithvi_eo_v1_100.yaml",
+    "configs/tests/benchmark_v2_simple.yaml",
+    "configs/tests/dofa_large_patch16_224_upernetdecoder_true_modified.yaml",
     "configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml",
 ]
 CONTINUE_EXISTING_EXPERIMENT = [True, False]
@@ -125,6 +121,7 @@ def get_test_ids() -> list[str]:
         test_case_ids.append(tid)
     return test_case_ids
 
+
 @pytest.mark.parametrize(
     "config, continue_existing_experiment, test_models",
     INPUT_TEST_RUN_BENCHMARK,
@@ -322,12 +319,15 @@ def test_run_benchmark_no_specific_terratorch_task(
     if storage_uri_path.is_dir() and delete_new_dirs:
         try:
             shutil.rmtree(storage_uri_path)
-            print(f"Directory '{storage_uri_path}' and its contents removed successfully.")
+            print(
+                f"Directory '{storage_uri_path}' and its contents removed successfully."
+            )
         except OSError as e:
             print(f"Error: {storage_uri_path} : {e.strerror}")
     else:
         print(f"Directory '{storage_uri_path}' does not exist.")
 
+
 def validate_results(experiment_name: str, storage_uri: str, finished_run_id: str):
     # get the most recent modified directory
     dir_path = Path(storage_uri) / finished_run_id
diff --git a/tests/unit/test_build_geobench_configs.py b/tests/unit/test_build_geobench_configs.py
index 9f3f89c..7b007fc 100644
--- a/tests/unit/test_build_geobench_configs.py
+++ b/tests/unit/test_build_geobench_configs.py
@@ -25,6 +25,13 @@
             "test_config_util_",
             None,
         ),
+        (
+            "./configs/tests/terratorch_configs/test_case_03",
+            "./configs/tests/terratorch-iterate-configs/test_case_03",
+            "./configs/templates/template.yaml",
+            "test_config_util_",
+            None,
+        ),
     ],
 )
 def test__generate_iterate_config(

From 6b802e13904518c429b0664f318246925405d872 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Thu, 25 Sep 2025 10:05:40 -0300
Subject: [PATCH 14/40] add another parametrized test case

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 tests/test_benchmark.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index ae19bc8..aa23798 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -105,6 +105,7 @@ def find_file(directory: str, filename: str):
     "configs/tests/benchmark_v2_simple.yaml",
     "configs/tests/dofa_large_patch16_224_upernetdecoder_true_modified.yaml",
     "configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml",
+    "configs/tests/terratorch-iterate-configs/test_case_03/terratorch__encoder_decoder_timm_resnet101_model_factory.yaml",
 ]
 CONTINUE_EXISTING_EXPERIMENT = [True, False]
 TEST_MODELS = [True, False]

From 991bf0d5a29f371450ef0dd0a56561be98182a92 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Thu, 25 Sep 2025 10:37:47 -0300
Subject: [PATCH 15/40] handle relative paths

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 benchmark/main.py | 6 ++++++
 run_tests.py      | 9 +++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/benchmark/main.py b/benchmark/main.py
index 28ffc55..6ddb626 100644
--- a/benchmark/main.py
+++ b/benchmark/main.py
@@ -193,6 +193,12 @@ def main():
 
         storage_uri = config_init.storage_uri
         assert isinstance(storage_uri, str), f"Error! {storage_uri=} is not a str"
+        # handling relative paths
+        if storage_uri.startswith(".") or storage_uri.startswith(".."):
+            repo_home_dir = Path(__file__).parent.parent 
+            abs_path = repo_home_dir / storage_uri
+            storage_uri = str(abs_path.resolve())
+
         logger_path = config_init.logger
         if logger_path is None:
             storage_uri_path = Path(storage_uri)
diff --git a/run_tests.py b/run_tests.py
index ad3fa76..a9ee2f6 100644
--- a/run_tests.py
+++ b/run_tests.py
@@ -6,6 +6,11 @@
 
 # rm geobench_v1_prithvi* && bsub -e ~/geobench_v1_prithvi.err -o ~/geobench_v1_prithvi.out -M 40G -gpu "num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB" terratorch iterate --hpo --config configs/geobench_v1_prithvi.yaml
 
+REPO_HOME_DIR = Path(__file__).parent
+LOGS_DIR = REPO_HOME_DIR / "logs"
+
+if not LOGS_DIR.exists():
+    LOGS_DIR.mkdir()
 
 @click.group()
 def cli():
@@ -18,14 +23,14 @@ def submit_job(
     tc_id: str | None = None,
     config: str | None = None,
 ):
-    err_file = Path.home() / stderr_file
+    err_file = LOGS_DIR / stderr_file
     # delete file if it exists
     if err_file.exists():
         print(f"Delete file {err_file}")
         err_file.unlink(missing_ok=True)
         assert not err_file.exists()
 
-    out_file = Path.home() / stdout_file
+    out_file = LOGS_DIR / stdout_file
     # delete file if it exists
     if out_file.exists():
         print(f"Delete file {out_file}")

From f1a6fc13c84a3b828540c383d6ba40659f2e0ef0 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Thu, 25 Sep 2025 10:56:41 -0300
Subject: [PATCH 16/40] minor modification

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 run_tests.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/run_tests.py b/run_tests.py
index a9ee2f6..0d2d84e 100644
--- a/run_tests.py
+++ b/run_tests.py
@@ -12,6 +12,11 @@
 if not LOGS_DIR.exists():
     LOGS_DIR.mkdir()
 
+# Delete all files in logs dir
+for item in LOGS_DIR.iterdir():
+    if item.is_file():
+        item.unlink()  
+
 @click.group()
 def cli():
     pass

From 00ac29be100c600f981e4c9d4f8b49581c4f3853 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Thu, 25 Sep 2025 14:12:55 -0300
Subject: [PATCH 17/40] add test oracle

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 .../config_util/build_geobench_configs.py     |  63 +++++----
 ...ncoderdecoder_eo_v2_300_model_factory.yaml |   6 +-
 ..._decoder_timm_resnet101_model_factory.yaml | 123 ------------------
 tests/unit/test_build_geobench_configs.py     |   4 +-
 4 files changed, 35 insertions(+), 161 deletions(-)
 delete mode 100644 configs/tests/terratorch-iterate-configs/test_case_03/terratorch__encoder_decoder_timm_resnet101_model_factory.yaml

diff --git a/benchmark/config_util/build_geobench_configs.py b/benchmark/config_util/build_geobench_configs.py
index 4807b6a..2342727 100644
--- a/benchmark/config_util/build_geobench_configs.py
+++ b/benchmark/config_util/build_geobench_configs.py
@@ -39,27 +39,27 @@ def _build_dataframe(config_files) -> pd.DataFrame:
     return df
 
 
-def _create_basemodule(data: dict[str, Any], model_filter: str) -> dict:
-    """create a dict based on the "data" field of the terratorch config
+# def _create_basemodule(data: dict[str, Any], model_filter: str) -> dict:
+#     """create a dict based on the "data" field of the terratorch config
 
-    Args:
-        data (dict[str, Any]): _description_
-        model_filter (str): model name is used to specify batch_size and eval_batch_size
+#     Args:
+#         data (dict[str, Any]): _description_
+#         model_filter (str): model name is used to specify batch_size and eval_batch_size
 
-    Returns:
-        dict: returns a dict that represents the datamodule field of iterate config file
-    """
-    base_module = dict()
-    base_module["class_path"] = data["class_path"]
-    if "dict_kwargs" in data.keys():
-        dict_kwargs = data["dict_kwargs"]
-        batch_size = 8 if model_filter != PRITHVI_600M else 4
-        dict_kwargs["batch_size"] = batch_size
-        dict_kwargs['eval_batch_size'] = 8 if model_filter != PRITHVI_600M else 4
+#     Returns:
+#         dict: returns a dict that represents the datamodule field of iterate config file
+#     """
+#     base_module = dict()
+#     base_module["class_path"] = data["class_path"]
+#     if "dict_kwargs" in data.keys():
+#         dict_kwargs = data["dict_kwargs"]
+#         batch_size = 8 if model_filter != PRITHVI_600M else 4
+#         dict_kwargs["batch_size"] = batch_size
+#         dict_kwargs['eval_batch_size'] = 8 if model_filter != PRITHVI_600M else 4
 
-        base_module["dict_kwargs"] = dict_kwargs
-    base_module["init_args"] = data["init_args"]
-    return base_module
+#         base_module["dict_kwargs"] = dict_kwargs
+#     base_module["init_args"] = data["init_args"]
+#     return base_module
 
 
 def _create_task(
@@ -146,31 +146,29 @@ def generate_iterate_config(
         input_dir (Path): contains all terratorch yaml files
         output_dir (Path): filename of the result
         template (Path): template file that contains pre-defined values
+        prefix (str): prefix for creating new config files
     """
 
     config_files = input_dir.glob('**/*.yaml')
     files_df = _build_dataframe(config_files=config_files)
 
-    files_df = files_df[files_df['dataset'].values != 'M4SAR']
-    files_df = files_df[files_df['model'].values != 'resnet50_torchgeo']
-
-    files_df = files_df.sort_values(['model', 'dataset'])
-
     models = files_df['model'].unique()
 
     with open(template, 'r') as file:
-        template = yaml.safe_load(file)
+        template_dict: dict = yaml.safe_load(file)
 
     # generate one config per model
     for model in models:
-        model_specific_template = deepcopy(template)
+        model_specific_template = deepcopy(template_dict)
+        # create unique name for experiment
         model_specific_template["experiment_name"] = f"{prefix}_{model}"
-        tasks = list()
-
+        tasks = list()  
+        
+        # filter dataframe by model
         single_model_df = files_df[files_df['model'].values == model]
 
         for i in range(single_model_df.shape[0]):
-
+            # open terratorch config file
             with open(single_model_df['file'].values[i], 'r') as file:
                 data = yaml.safe_load(file)
 
@@ -186,13 +184,12 @@ def generate_iterate_config(
             else:
                 metric = 'val/loss'
 
-            # terratorchtask is the data.model.init_args of terratorch config file
+            # terratorchtask is extracted from the data.model.init_args of terratorch config file
             terratorch_task = data['model']['init_args']
             # create datamodule based on data field
-            data = data['data']
-            datamodule = _create_basemodule(data=data, model_filter=model)
-            task_type = _get_task_type(template=template)
-            task_direction = _get_task_direction(template=template)
+            datamodule = data['data']
+            task_type = _get_task_type(template=template_dict)
+            task_direction = _get_task_direction(template=template_dict)
             task = _create_task(
                 name=name,
                 datamodule=datamodule,
diff --git a/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml b/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
index 8625619..dac536c 100644
--- a/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
+++ b/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
@@ -2,7 +2,7 @@ defaults:
   terratorch_task:
     model_args:
       backbone: X
-      # backbone_pretrained: true
+      backbone_pretrained: true
     model_factory: EncoderDecoderFactory
     optimizer: AdamW
     scheduler: ReduceLROnPlateau
@@ -28,7 +28,7 @@ optimization_space:
     type: real
 run_repetitions: 5
 save_models: false
-storage_uri: /u/ltizzei/test_terratorch_iterate
+storage_uri: ./mlflow
 tasks:
 - datamodule:
     class_path: terratorch.datamodules.GenericNonGeoSegmentationDataModule
@@ -101,7 +101,7 @@ tasks:
       - '4'
       - '5'
       backbone_drop_path: 0.1
-      # backbone_pretrained: true
+      backbone_pretrained: true
       decoder: UNetDecoder
       decoder_channels:
       - 512
diff --git a/configs/tests/terratorch-iterate-configs/test_case_03/terratorch__encoder_decoder_timm_resnet101_model_factory.yaml b/configs/tests/terratorch-iterate-configs/test_case_03/terratorch__encoder_decoder_timm_resnet101_model_factory.yaml
deleted file mode 100644
index 9bdfa27..0000000
--- a/configs/tests/terratorch-iterate-configs/test_case_03/terratorch__encoder_decoder_timm_resnet101_model_factory.yaml
+++ /dev/null
@@ -1,123 +0,0 @@
-defaults:
-  terratorch_task:
-    model_args:
-      backbone: X
-      backbone_pretrained: true
-    model_factory: EncoderDecoderFactory
-    optimizer: AdamW
-    scheduler: ReduceLROnPlateau
-    scheduler_hparams:
-      cooldown: 0
-      eps: 1.0e-08
-      factor: 0.5
-      min_lr: 0.0
-      mode: min
-      patience: 5
-      threshold: 0.0001
-      threshold_mode: rel
-  trainer_args:
-    log_every_n_steps: 1
-    max_epochs: 5
-experiment_name: terratorch__encoder_decoder_timm_resnet101_model_factory
-n_trials: 1
-optimization_space:
-  lr:
-    log: true
-    max: 1e-3
-    min: 1e-6
-    type: real
-run_repetitions: 5
-save_models: false
-storage_uri: ./mlflow
-tasks:
-- datamodule:
-    class_path: terratorch.datamodules.GenericNonGeoSegmentationDataModule
-    init_args:
-      allow_substring_split_file: true
-      batch_size: 4
-      constant_scale: 1.0
-      dataset_bands:
-      - '0'
-      - '1'
-      - '2'
-      - '3'
-      - '4'
-      - '5'
-      ignore_split_file_extensions: true
-      img_grep: '*_merged.tif'
-      label_grep: '*.mask.tif'
-      means:
-      - 0.052829564761523104
-      - 0.07822514779700994
-      - 0.09545302348640401
-      - 0.2128596444116123
-      - 0.2363016737011897
-      - 0.17234100022878698
-      no_data_replace: 0
-      no_label_replace: -1
-      num_classes: 2
-      num_workers: 2
-      output_bands:
-      - '0'
-      - '1'
-      - '2'
-      - '3'
-      - '4'
-      - '5'
-      rgb_indices:
-      - 0
-      - 1
-      - 2
-      stds:
-      - 0.028757146620143812
-      - 0.03540772770593507
-      - 0.05291947163682527
-      - 0.06949186937256507
-      - 0.08958868240264736
-      - 0.08198354165348874
-      test_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
-      test_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
-      test_transform:
-      - class_path: ToTensorV2
-      train_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/training
-      train_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/training
-      val_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
-      val_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
-  direction: max
-  metric: val/loss
-  name: test
-  terratorch_task:
-    freeze_backbone: false
-    freeze_decoder: false
-    ignore_index: -1
-    loss: ce
-    model_args:
-      backbone: timm_resnet101
-      backbone_in_chans: 6
-      backbone_pretrained: true
-      decoder: UNetDecoder
-      decoder_channels:
-      - 512
-      - 256
-      - 128
-      - 64
-      head_channel_list:
-      - 256
-      head_dropout: 0.1
-      necks:
-      - indices:
-        - 0
-        - 1
-        - 2
-        - 3
-        name: SelectIndices
-      num_classes: 3
-    model_factory: EncoderDecoderFactory
-    plot_on_val: 2
-    tiled_inference_parameters:
-      average_patches: true
-      h_crop: 224
-      h_stride: 196
-      w_crop: 224
-      w_stride: 196
-  type: segmentation
diff --git a/tests/unit/test_build_geobench_configs.py b/tests/unit/test_build_geobench_configs.py
index 7b007fc..313c3ef 100644
--- a/tests/unit/test_build_geobench_configs.py
+++ b/tests/unit/test_build_geobench_configs.py
@@ -23,7 +23,7 @@
             "./configs/tests/terratorch-iterate-configs/test_case_02",
             "./configs/templates/template.yaml",
             "test_config_util_",
-            None,
+            "./configs/tests/terratorch-iterate-configs/test_case_02/oracle/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml",
         ),
         (
             "./configs/tests/terratorch_configs/test_case_03",
@@ -41,7 +41,7 @@ def test__generate_iterate_config(
     script_path = Path(__file__).resolve()
 
     # Get the home directory
-    repo_home_dir = script_path.parent.parent
+    repo_home_dir = script_path.parent.parent.parent
     input_dir_path: Path = repo_home_dir / input_dir
     assert input_dir_path.exists()
     assert input_dir_path.is_dir()

From 24a874783bf2e7679443ebaac9ba4401d46f94f8 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Thu, 25 Sep 2025 14:32:23 -0300
Subject: [PATCH 18/40] add missing directories

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 .../config_util/geobenchv2_template.yaml      |  28 ----
 ...ncoderdecoder_eo_v2_300_model_factory.yaml | 130 ++++++++++++++++++
 ..._decoder_timm_resnet101_model_factory.yaml | 123 +++++++++++++++++
 3 files changed, 253 insertions(+), 28 deletions(-)
 delete mode 100644 benchmark/config_util/geobenchv2_template.yaml
 create mode 100644 configs/tests/terratorch-iterate-configs/test_case_02/oracle/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
 create mode 100644 configs/tests/terratorch-iterate-configs/test_case_03/test_config_util__encoder_decoder_timm_resnet101_model_factory.yaml

diff --git a/benchmark/config_util/geobenchv2_template.yaml b/benchmark/config_util/geobenchv2_template.yaml
deleted file mode 100644
index 38351a3..0000000
--- a/benchmark/config_util/geobenchv2_template.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-experiment_name: my_experiment
-defaults:
-  terratorch_task:
-    model_args:
-      backbone: terramind_v1_large
-      backbone_pretrained: true
-    model_factory: ObjectDetectionModelFactory
-    optimizer: AdamW
-  trainer_args:
-    log_every_n_steps: 1
-    max_epochs: 1
-tasks:
-  - name: X
-    type: object_detection
-    direction: max
-    metric: X
-    terratorch_task:
-    datamodule:
-n_trials: 1
-save_models: False
-storage_uri: /opt/app-root/src/fm-geospatial/pf/logs/geobench/mlflow
-run_repetitions: 1
-optimization_space:
-  lr:
-    max: 1e-3
-    min: 1e-6
-    type: real
-    log: true
diff --git a/configs/tests/terratorch-iterate-configs/test_case_02/oracle/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml b/configs/tests/terratorch-iterate-configs/test_case_02/oracle/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
new file mode 100644
index 0000000..0f57244
--- /dev/null
+++ b/configs/tests/terratorch-iterate-configs/test_case_02/oracle/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
@@ -0,0 +1,130 @@
+defaults:
+  terratorch_task:
+    model_args:
+      backbone: X
+      backbone_pretrained: true
+    model_factory: EncoderDecoderFactory
+    optimizer: AdamW
+    scheduler: ReduceLROnPlateau
+    scheduler_hparams:
+      cooldown: 0
+      eps: 1.0e-08
+      factor: 0.5
+      min_lr: 0.0
+      mode: min
+      patience: 5
+      threshold: 0.0001
+      threshold_mode: rel
+  trainer_args:
+    log_every_n_steps: 1
+    max_epochs: 5
+experiment_name: test_config_util__encoderdecoder_eo_v2_300_model_factory
+n_trials: 1
+optimization_space:
+  lr:
+    log: true
+    max: 1e-3
+    min: 1e-6
+    type: real
+run_repetitions: 5
+save_models: false
+storage_uri: /u/ltizzei/test_terratorch_iterate
+tasks:
+- datamodule:
+    class_path: terratorch.datamodules.GenericNonGeoSegmentationDataModule
+    init_args:
+      allow_substring_split_file: true
+      batch_size: 4
+      constant_scale: 1.0
+      dataset_bands:
+      - '0'
+      - '1'
+      - '2'
+      - '3'
+      - '4'
+      - '5'
+      ignore_split_file_extensions: true
+      img_grep: '*_merged.tif'
+      label_grep: '*.mask.tif'
+      means:
+      - 0.052829564761523104
+      - 0.07822514779700994
+      - 0.09545302348640401
+      - 0.2128596444116123
+      - 0.2363016737011897
+      - 0.17234100022878698
+      no_data_replace: 0
+      no_label_replace: -1
+      num_classes: 2
+      num_workers: 2
+      output_bands:
+      - '0'
+      - '1'
+      - '2'
+      - '3'
+      - '4'
+      - '5'
+      rgb_indices:
+      - 0
+      - 1
+      - 2
+      stds:
+      - 0.028757146620143812
+      - 0.03540772770593507
+      - 0.05291947163682527
+      - 0.06949186937256507
+      - 0.08958868240264736
+      - 0.08198354165348874
+      test_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+      test_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+      test_transform:
+      - class_path: ToTensorV2
+      train_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/training
+      train_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/training
+      val_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+      val_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+  direction: max
+  metric: val/loss
+  name: test
+  terratorch_task:
+    freeze_backbone: false
+    freeze_decoder: false
+    ignore_index: -1
+    loss: ce
+    model_args:
+      backbone: prithvi_eo_v2_300
+      backbone_bands:
+      - '0'
+      - '1'
+      - '2'
+      - '3'
+      - '4'
+      - '5'
+      backbone_drop_path: 0.1
+      # backbone_pretrained: true
+      decoder: UNetDecoder
+      decoder_channels:
+      - 512
+      - 256
+      - 128
+      - 64
+      head_dropout: 0.1
+      necks:
+      - indices:
+        - 5
+        - 11
+        - 17
+        - 23
+        name: SelectIndices
+      - name: ReshapeTokensToImage
+      - name: LearnedInterpolateToPyramidal
+      num_classes: 2
+    model_factory: EncoderDecoderFactory
+    plot_on_val: 2
+    tiled_inference_parameters:
+      average_patches: true
+      h_crop: 512
+      h_stride: 448
+      w_crop: 512
+      w_stride: 448
+  type: segmentation
diff --git a/configs/tests/terratorch-iterate-configs/test_case_03/test_config_util__encoder_decoder_timm_resnet101_model_factory.yaml b/configs/tests/terratorch-iterate-configs/test_case_03/test_config_util__encoder_decoder_timm_resnet101_model_factory.yaml
new file mode 100644
index 0000000..7f63285
--- /dev/null
+++ b/configs/tests/terratorch-iterate-configs/test_case_03/test_config_util__encoder_decoder_timm_resnet101_model_factory.yaml
@@ -0,0 +1,123 @@
+defaults:
+  terratorch_task:
+    model_args:
+      backbone: X
+      backbone_pretrained: true
+    model_factory: EncoderDecoderFactory
+    optimizer: AdamW
+    scheduler: ReduceLROnPlateau
+    scheduler_hparams:
+      cooldown: 0
+      eps: 1.0e-08
+      factor: 0.5
+      min_lr: 0.0
+      mode: min
+      patience: 5
+      threshold: 0.0001
+      threshold_mode: rel
+  trainer_args:
+    log_every_n_steps: 1
+    max_epochs: 5
+experiment_name: test_config_util__encoder_decoder_timm_resnet101_model_factory
+n_trials: 1
+optimization_space:
+  lr:
+    log: true
+    max: 1e-3
+    min: 1e-6
+    type: real
+run_repetitions: 5
+save_models: false
+storage_uri: ./mlflow
+tasks:
+- datamodule:
+    class_path: terratorch.datamodules.GenericNonGeoSegmentationDataModule
+    init_args:
+      allow_substring_split_file: true
+      batch_size: 4
+      constant_scale: 1.0
+      dataset_bands:
+      - '0'
+      - '1'
+      - '2'
+      - '3'
+      - '4'
+      - '5'
+      ignore_split_file_extensions: true
+      img_grep: '*_merged.tif'
+      label_grep: '*.mask.tif'
+      means:
+      - 0.052829564761523104
+      - 0.07822514779700994
+      - 0.09545302348640401
+      - 0.2128596444116123
+      - 0.2363016737011897
+      - 0.17234100022878698
+      no_data_replace: 0
+      no_label_replace: -1
+      num_classes: 2
+      num_workers: 2
+      output_bands:
+      - '0'
+      - '1'
+      - '2'
+      - '3'
+      - '4'
+      - '5'
+      rgb_indices:
+      - 0
+      - 1
+      - 2
+      stds:
+      - 0.028757146620143812
+      - 0.03540772770593507
+      - 0.05291947163682527
+      - 0.06949186937256507
+      - 0.08958868240264736
+      - 0.08198354165348874
+      test_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+      test_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+      test_transform:
+      - class_path: ToTensorV2
+      train_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/training
+      train_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/training
+      val_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+      val_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
+  direction: max
+  metric: val/loss
+  name: test
+  terratorch_task:
+    freeze_backbone: false
+    freeze_decoder: false
+    ignore_index: -1
+    loss: ce
+    model_args:
+      backbone: timm_resnet101
+      backbone_in_chans: 6
+      backbone_pretrained: true
+      decoder: UNetDecoder
+      decoder_channels:
+      - 512
+      - 256
+      - 128
+      - 64
+      head_channel_list:
+      - 256
+      head_dropout: 0.1
+      necks:
+      - indices:
+        - 0
+        - 1
+        - 2
+        - 3
+        name: SelectIndices
+      num_classes: 3
+    model_factory: EncoderDecoderFactory
+    plot_on_val: 2
+    tiled_inference_parameters:
+      average_patches: true
+      h_crop: 224
+      h_stride: 196
+      w_crop: 224
+      w_stride: 196
+  type: segmentation

From 15cec0827c1ee27c2cad040d671c55ab5131d5b5 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Thu, 25 Sep 2025 14:37:07 -0300
Subject: [PATCH 19/40] ignore test case because checkpoint has been removed

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 tests/test_benchmark.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index aa23798..ecc2850 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -102,7 +102,7 @@ def find_file(directory: str, filename: str):
 
 
 CONFIG_FILES = [
-    "configs/tests/benchmark_v2_simple.yaml",
+    # "configs/tests/benchmark_v2_simple.yaml",
     "configs/tests/dofa_large_patch16_224_upernetdecoder_true_modified.yaml",
     "configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml",
     "configs/tests/terratorch-iterate-configs/test_case_03/terratorch__encoder_decoder_timm_resnet101_model_factory.yaml",

From dbe992fddbff4271ec8864f774178e377e8fad5a Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Thu, 25 Sep 2025 14:48:42 -0300
Subject: [PATCH 20/40] replace test_benchmark by test_main

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 run_tests.py                   |   4 +-
 tests/integration/test_main.py |  40 +++-
 tests/test_benchmark.py        | 358 ---------------------------------
 3 files changed, 39 insertions(+), 363 deletions(-)
 delete mode 100644 tests/test_benchmark.py

diff --git a/run_tests.py b/run_tests.py
index 0d2d84e..d98e08c 100644
--- a/run_tests.py
+++ b/run_tests.py
@@ -1,7 +1,7 @@
 import subprocess
 from pathlib import Path
 from typing import Optional
-from tests.test_benchmark import get_test_ids
+from tests.integration.test_main import get_test_ids
 import click
 
 # rm geobench_v1_prithvi* && bsub -e ~/geobench_v1_prithvi.err -o ~/geobench_v1_prithvi.out -M 40G -gpu "num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB" terratorch iterate --hpo --config configs/geobench_v1_prithvi.yaml
@@ -42,7 +42,7 @@ def submit_job(
         out_file.unlink(missing_ok=True)
         assert not out_file.exists()
     if tc_id is not None:
-        jbsub = f"bsub -e {err_file} -o {out_file} -M 40G -gpu \"num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB\" pytest -vv tests/test_benchmark.py::test_run_benchmark[{tc_id}]"
+        jbsub = f"bsub -e {err_file} -o {out_file} -M 40G -gpu \"num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB\" pytest -vv tests/integration/test_main.py::test_main[{tc_id}]"
     elif config is not None:
         jbsub = f"bsub -e {err_file} -o {out_file} -M 40G -gpu \"num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB\" terratorch iterate --hpo --config {config}"
     else:
diff --git a/tests/integration/test_main.py b/tests/integration/test_main.py
index e4d5ae3..9789470 100644
--- a/tests/integration/test_main.py
+++ b/tests/integration/test_main.py
@@ -1,8 +1,37 @@
+import itertools
 from pathlib import Path
 from benchmark.main import main
 import pytest
 import sys
 
+CONFIG_FILES = [
+    # "configs/tests/benchmark_v2_simple.yaml",
+    "configs/tests/dofa_large_patch16_224_upernetdecoder_true_modified.yaml",
+    "configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml",
+    "configs/tests/terratorch-iterate-configs/test_case_03/terratorch__encoder_decoder_timm_resnet101_model_factory.yaml",
+]
+# CONTINUE_EXISTING_EXPERIMENT = [True, False]
+# TEST_MODELS = [True, False]
+HPO = [True]
+INPUT_TEST_MAIN = list(
+    itertools.product(HPO, CONFIG_FILES)
+)
+
+
+def get_test_ids() -> list[str]:
+    test_case_ids = list()
+    for config, cee, tm in INPUT_TEST_MAIN:
+        filename = config.split("/")[-1].replace(".yaml", "")
+        tid = f"{filename}_{cee}_{tm}"
+        test_case_ids.append(tid)
+    return test_case_ids
+
+
+@pytest.mark.parametrize(
+    "config, continue_existing_experiment, test_models",
+    INPUT_TEST_MAIN,
+    ids=get_test_ids(),
+)
 
 # terratorch iterate --hpo --config configs/tests/benchmark_v2_simple.yaml
 @pytest.mark.parametrize(
@@ -10,12 +39,17 @@
     [
         (
             True,
-            "configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml"
+            "configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml",
         )
     ],
 )
-def test_main(hpo: bool, config: str):
-    home_dir = Path(__file__).parent.parent.parent 
+def test_main(
+    hpo: bool,
+    config: str,
+    continue_existing_experiment: bool,
+    test_models: bool,
+):
+    home_dir = Path(__file__).parent.parent.parent
     config_file: Path = home_dir / config
     assert config_file.exists()
     arguments = ["terratorch", "--config", str(config_file.resolve())]
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
deleted file mode 100644
index ecc2850..0000000
--- a/tests/test_benchmark.py
+++ /dev/null
@@ -1,358 +0,0 @@
-import itertools
-import shutil
-from benchmark.benchmark_types import Defaults, Task, TaskTypeEnum
-import pytest
-from benchmark.backbone_benchmark import benchmark_backbone
-from terratorch.datamodules import MChesapeakeLandcoverNonGeoDataModule
-from albumentations import HorizontalFlip, VerticalFlip, Resize
-from albumentations.pytorch.transforms import ToTensorV2
-import os
-from pathlib import Path
-import uuid
-from jsonargparse import ArgumentParser
-
-
-BACKBONE_PRETRAINED_FILE = os.getenv(
-    "BACKBONE_PRETRAINED_FILE",
-    "/dccstor/geofm-finetuning/pretrain_ckpts/v9_no_sea/vit_b/epoch-395-loss-0.0339_clean.pt",
-)
-
-SEGMENTATION_V1 = os.getenv(
-    "SEGMENTATION_V1", "/dccstor/geofm-finetuning/datasets/geobench/segmentation_v1.0"
-)
-
-
-RAY_STORAGE = os.getenv(
-    "RAY_STORAGE", "/dccstor/geofm-finetuning/terratorch-iterate-test/ray_storage"
-)
-
-
-@pytest.fixture(scope="module")
-def defaults() -> Defaults:
-    file = BACKBONE_PRETRAINED_FILE
-    assert Path(file).exists(), f"Error! {file=} does not exist"
-    trainer_args = {
-        "precision": "bf16-mixed",
-        "max_epochs": 10,
-    }
-    terratorch_task = {
-        "model_args": {
-            "pretrained": True,
-            "backbone": "prithvi_vit_100",
-            "backbone_out_indices": [2, 5, 8, 11],
-            "backbone_pretrained_cfg_overlay": {"file": file},
-        },
-        "model_factory": "PrithviModelFactory",
-        "optimizer": "AdamW",
-    }
-    return Defaults(trainer_args=trainer_args, terratorch_task=terratorch_task)
-
-
-@pytest.fixture(scope="module")
-def mchesapeakelandcovernongeodatamodule() -> MChesapeakeLandcoverNonGeoDataModule:
-    data_root = SEGMENTATION_V1
-    assert Path(data_root).exists(), f"Error! Directory {data_root} does not exist"
-    train_transform = [Resize(height=224, width=224), ToTensorV2()]
-    test_transform = [
-        HorizontalFlip(p=0.5),
-        VerticalFlip(p=0.5),
-        Resize(height=224, width=224),
-        ToTensorV2(),
-    ]
-    return MChesapeakeLandcoverNonGeoDataModule(
-        num_workers=6,
-        batch_size=16,
-        partition="0.10x_train",
-        train_transform=train_transform,
-        test_transform=test_transform,
-        data_root=data_root,
-        bands=["RED", "GREEN", "BLUE", "NIR"],
-    )
-
-
-@pytest.fixture(scope="module")
-def tasks(mchesapeakelandcovernongeodatamodule):
-
-    t = Task(
-        name="chesapeake",
-        type=TaskTypeEnum.segmentation,
-        direction="max",
-        metric="val/Multiclass_Jaccard_Index",
-        early_stop_patience=10,
-        terratorch_task={
-            "loss": "ce",
-            "model_args": {
-                "decoder": "UperNetDecoder",
-                "decoder_channels": 128,
-                "decoder_scale_modules": True,
-                "bands": ["RED", "GREEN", "BLUE", "NIR"],
-                "num_classes": 7,
-            },
-        },
-        datamodule=mchesapeakelandcovernongeodatamodule,
-    )
-    return [t]
-
-
-def find_file(directory: str, filename: str):
-    for root, _, files in os.walk(directory):
-        if filename in files:
-            return os.path.join(root, filename)
-    return None
-
-
-CONFIG_FILES = [
-    # "configs/tests/benchmark_v2_simple.yaml",
-    "configs/tests/dofa_large_patch16_224_upernetdecoder_true_modified.yaml",
-    "configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml",
-    "configs/tests/terratorch-iterate-configs/test_case_03/terratorch__encoder_decoder_timm_resnet101_model_factory.yaml",
-]
-CONTINUE_EXISTING_EXPERIMENT = [True, False]
-TEST_MODELS = [True, False]
-INPUT_TEST_RUN_BENCHMARK = list(
-    itertools.product(CONFIG_FILES, CONTINUE_EXISTING_EXPERIMENT, TEST_MODELS)
-)
-
-
-def get_test_ids() -> list[str]:
-    test_case_ids = list()
-    for config, cee, tm in INPUT_TEST_RUN_BENCHMARK:
-        filename = config.split("/")[-1].replace(".yaml", "")
-        tid = f"{filename}_{cee}_{tm}"
-        test_case_ids.append(tid)
-    return test_case_ids
-
-
-@pytest.mark.parametrize(
-    "config, continue_existing_experiment, test_models",
-    INPUT_TEST_RUN_BENCHMARK,
-    ids=get_test_ids(),
-)
-def test_run_benchmark(
-    config: str, continue_existing_experiment: bool, test_models: bool
-):
-    path = os.path.join(os.getcwd(), config)
-    config_path = Path(path)
-    # instantiate objects from yaml
-    parser = ArgumentParser()
-    parser.add_argument('--defaults', type=Defaults)  # to ignore model
-    parser.add_argument('--optimization_space', type=dict)  # to ignore model
-    parser.add_argument('--experiment_name', type=str)  # to ignore model
-    parser.add_argument('--run_name', type=str)  # to ignore model
-    parser.add_argument('--save_models', type=bool)  # to ignore model
-    parser.add_argument('--storage_uri', type=str)  # to ignore model
-    parser.add_argument('--ray_storage_path', type=str)  # to ignore model
-    parser.add_argument('--n_trials', type=int)  # to ignore model
-    parser.add_argument('--run_repetitions', type=int)  # to ignore model
-    parser.add_argument('--tasks', type=list[Task])
-    config = parser.parse_path(str(config_path))
-    config_init = parser.instantiate_classes(config)
-    # validate the objects
-    experiment_name = config_init.experiment_name
-    experiment_name = f"{experiment_name}_continue_{continue_existing_experiment}_test_models_{test_models}"
-    assert isinstance(experiment_name, str), f"Error! {experiment_name=} is not a str"
-    run_name = config_init.run_name
-    if run_name is not None:
-        assert isinstance(run_name, str), f"Error! {run_name=} is not a str"
-    tasks = config_init.tasks
-    assert isinstance(tasks, list), f"Error! {tasks=} is not a list"
-    for t in tasks:
-        assert isinstance(t, Task), f"Error! {t=} is not a Task"
-    defaults = config_init.defaults
-    assert isinstance(defaults, Defaults), f"Error! {defaults=} is not a Defaults"
-    # defaults.trainer_args["max_epochs"] = 5
-    storage_uri = config_init.storage_uri
-    assert isinstance(storage_uri, str), f"Error! {storage_uri=} is not a str"
-    storage_uri_path = Path(storage_uri) / uuid.uuid4().hex / "hpo"
-    if not storage_uri_path.exists():
-        try:
-            storage_uri_path.mkdir(parents=True, exist_ok=True)
-            print(f"Directory created at: {path}")
-        except FileNotFoundError as e:
-            print(f"Error creating directory: {e}")
-
-    optimization_space = config_init.optimization_space
-    assert isinstance(
-        optimization_space, dict
-    ), f"Error! {optimization_space=} is not a dict"
-    ray_storage = RAY_STORAGE
-    assert isinstance(ray_storage, str), f"Error! {ray_storage=} is not a str"
-    ray_storage_path = Path(ray_storage) / uuid.uuid4().hex
-    if not ray_storage_path.exists():
-        try:
-            ray_storage_path.mkdir(parents=True, exist_ok=True)
-            print(f"Directory created at: {path}")
-        except FileNotFoundError as e:
-            print(f"Error creating directory: {e}")
-    n_trials = config_init.n_trials
-    assert isinstance(n_trials, int) and n_trials > 0, f"Error! {n_trials=} is invalid"
-    # run_repetions is an optional parameter
-    run_repetitions = config_init.run_repetitions
-    if run_repetitions is not None:
-        assert (
-            isinstance(run_repetitions, int) and run_repetitions >= 0
-        ), f"Error! {run_repetitions=} is invalid"
-    else:
-        run_repetitions = 0
-    mlflow_info = benchmark_backbone(
-        experiment_name=experiment_name,
-        run_name=run_name,
-        run_id=None,
-        defaults=defaults,
-        tasks=tasks,
-        n_trials=n_trials,
-        save_models=False,
-        storage_uri=str(storage_uri_path),
-        ray_storage_path=str(ray_storage_path),
-        optimization_space=optimization_space,
-        continue_existing_experiment=continue_existing_experiment,
-        test_models=test_models,
-        run_repetitions=run_repetitions,
-        logger=None,
-    )
-    assert isinstance(mlflow_info, dict), f"Error! {mlflow_info=} is not a dict"
-    validate_results(
-        experiment_name=experiment_name,
-        storage_uri=str(storage_uri_path),
-        finished_run_id=mlflow_info["experiment_id"],
-    )
-
-
-@pytest.mark.parametrize(
-    "config, continue_existing_experiment, test_models",
-    [
-        ("configs/tests/benchmark_marida_l2a_terramind_base.yaml", False, False),
-    ],
-)
-def test_run_benchmark_no_specific_terratorch_task(
-    config: str, continue_existing_experiment: bool, test_models: bool
-):
-    delete_new_dirs = True
-    path = os.path.join(os.getcwd(), config)
-    config_path = Path(path)
-    assert (
-        config_path.exists()
-    ), f"Error! config does not exist: {config_path.resolve()}"
-    # instantiate objects from yaml
-    parser = ArgumentParser()
-    parser.add_argument('--defaults', type=Defaults)  # to ignore model
-    parser.add_argument('--optimization_space', type=dict)  # to ignore model
-    parser.add_argument('--experiment_name', type=str)  # to ignore model
-    parser.add_argument('--run_name', type=str)  # to ignore model
-    parser.add_argument('--save_models', type=bool)  # to ignore model
-    parser.add_argument('--storage_uri', type=str)  # to ignore model
-    parser.add_argument('--ray_storage_path', type=str)  # to ignore model
-    parser.add_argument('--n_trials', type=int)  # to ignore model
-    parser.add_argument('--run_repetitions', type=int)  # to ignore model
-    parser.add_argument('--tasks', type=list[Task])
-    config = parser.parse_path(str(config_path))
-    config_init = parser.instantiate_classes(config)
-    # validate the objects
-    experiment_name = config_init.experiment_name
-    experiment_name = f"{experiment_name}_continue_{continue_existing_experiment}_test_models_{test_models}"
-    assert isinstance(experiment_name, str), f"Error! {experiment_name=} is not a str"
-    run_name = config_init.run_name
-    if run_name is not None:
-        assert isinstance(run_name, str), f"Error! {run_name=} is not a str"
-    tasks = config_init.tasks
-    assert isinstance(tasks, list), f"Error! {tasks=} is not a list"
-    for t in tasks:
-        assert isinstance(t, Task), f"Error! {t=} is not a Task"
-        if t.terratorch_task is not None:
-            t.terratorch_task = None
-
-    defaults = config_init.defaults
-    assert isinstance(defaults, Defaults), f"Error! {defaults=} is not a Defaults"
-    # defaults.trainer_args["max_epochs"] = 5
-    storage_uri = config_init.storage_uri
-    assert isinstance(storage_uri, str), f"Error! {storage_uri=} is not a str"
-    storage_uri_path = Path(storage_uri) / uuid.uuid4().hex / "hpo"
-    if not storage_uri_path.exists():
-        try:
-            storage_uri_path.mkdir(parents=True, exist_ok=True)
-            print(f"Directory created at: {path}")
-        except FileNotFoundError as e:
-            print(f"Error creating directory: {e}")
-    optimization_space = config_init.optimization_space
-    assert isinstance(
-        optimization_space, dict
-    ), f"Error! {optimization_space=} is not a dict"
-    ray_storage = RAY_STORAGE
-    assert isinstance(ray_storage, str), f"Error! {ray_storage=} is not a str"
-    ray_storage_path = Path(ray_storage) / uuid.uuid4().hex
-    if not ray_storage_path.exists():
-        try:
-            ray_storage_path.mkdir(parents=True, exist_ok=True)
-            print(f"Directory created at: {path}")
-        except FileNotFoundError as e:
-            print(f"Error creating directory: {e}")
-    n_trials = config_init.n_trials
-    assert isinstance(n_trials, int) and n_trials > 0, f"Error! {n_trials=} is invalid"
-    # run_repetions is an optional parameter
-    run_repetitions = config_init.run_repetitions
-    if run_repetitions is not None:
-        assert (
-            isinstance(run_repetitions, int) and run_repetitions >= 0
-        ), f"Error! {run_repetitions=} is invalid"
-    else:
-        run_repetitions = 0
-    finished_run_id = benchmark_backbone(
-        experiment_name=experiment_name,
-        run_name=run_name,
-        run_id=None,
-        defaults=defaults,
-        tasks=tasks,
-        n_trials=n_trials,
-        save_models=False,
-        storage_uri=str(storage_uri_path),
-        ray_storage_path=str(ray_storage_path),
-        optimization_space=optimization_space,
-        continue_existing_experiment=continue_existing_experiment,
-        test_models=test_models,
-        run_repetitions=run_repetitions,
-    )
-    validate_results(
-        experiment_name=experiment_name,
-        storage_uri=str(storage_uri_path),
-        finished_run_id=finished_run_id,
-    )
-
-    if storage_uri_path.is_dir() and delete_new_dirs:
-        try:
-            shutil.rmtree(storage_uri_path)
-            print(
-                f"Directory '{storage_uri_path}' and its contents removed successfully."
-            )
-        except OSError as e:
-            print(f"Error: {storage_uri_path} : {e.strerror}")
-    else:
-        print(f"Directory '{storage_uri_path}' does not exist.")
-
-
-def validate_results(experiment_name: str, storage_uri: str, finished_run_id: str):
-    # get the most recent modified directory
-    dir_path = Path(storage_uri) / finished_run_id
-    assert dir_path.exists(), f"Error! Directory does not exist: {dir_path}"
-    # find mlflow.runName files within the result dir
-    meta_yaml = "meta.yaml"
-
-    meta_yaml_path = dir_path / meta_yaml
-    assert (
-        meta_yaml_path.exists()
-    ), f"Error! meta.yaml file {meta_yaml_path} does not exist"
-    # open file and check that the experiment name is the same
-    with open(meta_yaml_path, mode="r") as f:
-        # read all the lines
-        lines = f.readlines()
-        # try to find experiment id and name in these lines
-        experiment_name_found: bool = False
-        experiment_id_found: bool = False
-        for line in lines:
-            if experiment_name in line:
-                experiment_name_found = True
-            if finished_run_id in line:
-                experiment_id_found = True
-        assert (
-            experiment_name_found and experiment_id_found
-        ), f"Error! Both experiment name ({experiment_name=}) and finished run id ({finished_run_id=}) must be in the {meta_yaml_path=}: {experiment_id_found=} {experiment_name_found=}"
-    # TODO delete the directories that were created by this test case

From c2792bfac98a66c74d07a60880c52a44acc0a514 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Thu, 25 Sep 2025 14:52:10 -0300
Subject: [PATCH 21/40] fix run test

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 tests/integration/test_main.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/integration/test_main.py b/tests/integration/test_main.py
index 9789470..bcd6ff8 100644
--- a/tests/integration/test_main.py
+++ b/tests/integration/test_main.py
@@ -20,15 +20,15 @@
 
 def get_test_ids() -> list[str]:
     test_case_ids = list()
-    for config, cee, tm in INPUT_TEST_MAIN:
+    for hpo, config in INPUT_TEST_MAIN:
         filename = config.split("/")[-1].replace(".yaml", "")
-        tid = f"{filename}_{cee}_{tm}"
+        tid = f"{filename}_hpo_{hpo}"
         test_case_ids.append(tid)
     return test_case_ids
 
 
 @pytest.mark.parametrize(
-    "config, continue_existing_experiment, test_models",
+    "hpo, config",
     INPUT_TEST_MAIN,
     ids=get_test_ids(),
 )

From ed6fa9fef2f7bcbd141b05ec252f52568b3e528a Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Thu, 25 Sep 2025 14:55:52 -0300
Subject: [PATCH 22/40] fix another bug

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 tests/integration/test_main.py | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

diff --git a/tests/integration/test_main.py b/tests/integration/test_main.py
index bcd6ff8..f65d80e 100644
--- a/tests/integration/test_main.py
+++ b/tests/integration/test_main.py
@@ -10,8 +10,6 @@
     "configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml",
     "configs/tests/terratorch-iterate-configs/test_case_03/terratorch__encoder_decoder_timm_resnet101_model_factory.yaml",
 ]
-# CONTINUE_EXISTING_EXPERIMENT = [True, False]
-# TEST_MODELS = [True, False]
 HPO = [True]
 INPUT_TEST_MAIN = list(
     itertools.product(HPO, CONFIG_FILES)
@@ -21,8 +19,11 @@
 def get_test_ids() -> list[str]:
     test_case_ids = list()
     for hpo, config in INPUT_TEST_MAIN:
+        # get the filename
         filename = config.split("/")[-1].replace(".yaml", "")
+        # set test id
         tid = f"{filename}_hpo_{hpo}"
+        # append to list of test ids
         test_case_ids.append(tid)
     return test_case_ids
 
@@ -32,22 +33,9 @@ def get_test_ids() -> list[str]:
     INPUT_TEST_MAIN,
     ids=get_test_ids(),
 )
-
-# terratorch iterate --hpo --config configs/tests/benchmark_v2_simple.yaml
-@pytest.mark.parametrize(
-    "hpo, config",
-    [
-        (
-            True,
-            "configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml",
-        )
-    ],
-)
 def test_main(
     hpo: bool,
     config: str,
-    continue_existing_experiment: bool,
-    test_models: bool,
 ):
     home_dir = Path(__file__).parent.parent.parent
     config_file: Path = home_dir / config

From 1dd0372879fe7966e33dae313e484c924c251819 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Thu, 25 Sep 2025 15:42:02 -0300
Subject: [PATCH 23/40] set default continue_experiments to False

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 benchmark/backbone_benchmark.py | 199 ++++++++++++++++++++------------
 tests/integration/test_main.py  |   2 +-
 2 files changed, 123 insertions(+), 78 deletions(-)

diff --git a/benchmark/backbone_benchmark.py b/benchmark/backbone_benchmark.py
index cf161a8..7b9c2c0 100644
--- a/benchmark/backbone_benchmark.py
+++ b/benchmark/backbone_benchmark.py
@@ -179,6 +179,99 @@ def parse_optimization_space(space: dict | None) -> optimization_space_type | No
     return parsed_space
 
 
+def _run_hpo(
+    run_name: str,
+    run_id: str,
+    description: str,
+    tasks: list,
+    completed_task_run_names: list,
+    task_run_to_id_match: dict,
+    defaults,
+    storage_uri: str,
+    experiment_name: str,
+    optimization_space,
+    n_trials,
+    save_models,
+    sampler,
+    test_models,
+    table_entries,
+    table_columns,
+    backbone,
+    task_names,
+    PATH_TO_JOB_TRACKING,
+    logger,
+) -> tuple[str, str]:
+    logger.info("Running hyperparameter optimization")
+    with mlflow.start_run(
+        run_name=run_name, run_id=run_id, description=description
+    ) as run:
+        for task in tasks:
+            # only run task if it was not completed before
+            task_run_name = task.name
+            if task_run_name in completed_task_run_names:
+                logger.info(f"{task_run_name} already completed")
+                continue
+            else:
+                logger.info(f"{task_run_name} not completed. starting now")
+
+            task_run_id = (
+                task_run_to_id_match[task_run_name]
+                if task_run_name in task_run_to_id_match
+                else None
+            )
+            best_value, metric_name, hparams = benchmark_backbone_on_task(
+                logger,
+                defaults,
+                task,
+                storage_uri,
+                experiment_name,
+                experiment_run_id=run.info.run_id,
+                task_run_id=task_run_id,
+                optimization_space=optimization_space,
+                n_trials=n_trials,
+                save_models=save_models,
+                sampler=sampler,
+                test_models=test_models,
+            )
+            table_entries.append([task.name, metric_name, best_value, hparams])
+            table_entries_filename = str(
+                PATH_TO_JOB_TRACKING
+                / f"{experiment_name}-{run.info.run_id}_table_entries.pkl"
+            )
+            with open(table_entries_filename, 'wb') as handle:
+                pickle.dump(table_entries, handle, protocol=pickle.HIGHEST_PROTOCOL)
+
+        table = tabulate(table_entries, headers=table_columns)
+        logger.info(table)
+        df = pd.DataFrame(data=table_entries, columns=table_columns)
+        df.set_index("Task")
+        logger.info("Starting to save results")
+        mlflow.log_table(
+            df,
+            "results_table.json",
+            run.info.run_id,
+        )
+        experiment_id = run.info.experiment_id
+
+        # check completion of HPO for all tasks before proceeding to next stage
+        existing_experiments = check_existing_experiments(
+            logger=logger,
+            storage_uri=storage_uri,
+            experiment_name=experiment_name,
+            exp_parent_run_name=run_name,
+            task_names=task_names,
+            n_trials=n_trials,
+            backbone=backbone,
+        )
+        if existing_experiments["finished_run"] is not None:
+            finished_run_id = existing_experiments["finished_run"]
+        else:
+            logger.info("HPO is not complete. Please re-run this experiment")
+            raise RuntimeError
+
+        return experiment_id, finished_run_id
+
+
 def benchmark_backbone(
     defaults: Defaults,
     tasks: list[Task],
@@ -194,7 +287,7 @@ def benchmark_backbone(
     run_id: str | None = None,
     description: str = "No description provided",
     bayesian_search: bool = True,
-    continue_existing_experiment: bool = True,
+    continue_existing_experiment: bool = False,
     test_models: bool = False,
     run_repetitions: int = REPEATED_SEEDS_DEFAULT,
     report_on_best_val: bool = True,
@@ -240,21 +333,14 @@ def benchmark_backbone(
     mlflow.set_tracking_uri(storage_uri)
     mlflow.set_experiment(experiment_name)
 
-    if bayesian_search:
-        sampler: BaseSampler | None = None  # take the default
-    else:
-        sampler = RandomSampler()
-
     optimization_space = parse_optimization_space(optimization_space)
-    table_columns = ["Task", "Metric", "Best Score", "Hyperparameters"]
-    table_entries = []
 
     backbone: str = defaults.terratorch_task["model_args"]["backbone"]
     task_names = [task.name for task in tasks]
     run_name = f"top_run_{experiment_name}" if run_name is None else run_name
 
     completed_task_run_names = []
-    run_hpo = True
+    optimize_hyperparams = True
     task_run_to_id_match = {}
     if continue_existing_experiment:
         # find status of existing runs, and delete incomplete runs except one with the most complete tasks
@@ -276,10 +362,10 @@ def benchmark_backbone(
                 logger.info("Continuing previous experiment parent run")
                 run_id = existing_experiments["incomplete_run_to_finish"]
                 experiment_id = existing_experiments["experiment_id"]
-                run_hpo = True
+                optimize_hyperparams = True
 
             if existing_experiments["finished_run"] is not None:
-                run_hpo = False
+                optimize_hyperparams = False
                 finished_run_id = existing_experiments["finished_run"]
                 run_id = existing_experiments["finished_run"]
 
@@ -300,79 +386,38 @@ def benchmark_backbone(
         logger.info("Starting new experiment from scratch")
 
     # only run hyperparameter optimization (HPO) if there are no experiments with finished HPO
-    if run_hpo:
-        logger.info("Running hyperparameter optimization")
-        with mlflow.start_run(
-            run_name=run_name, run_id=run_id, description=description
-        ) as run:
-            for task in tasks:
-                # only run task if it was not completed before
-                task_run_name = task.name
-                if task_run_name in completed_task_run_names:
-                    logger.info(f"{task_run_name} already completed")
-                    continue
-                else:
-                    logger.info(f"{task_run_name} not completed. starting now")
-
-                task_run_id = (
-                    task_run_to_id_match[task_run_name]
-                    if task_run_name in task_run_to_id_match
-                    else None
-                )
-                best_value, metric_name, hparams = benchmark_backbone_on_task(
-                    logger,
-                    defaults,
-                    task,
-                    storage_uri,
-                    experiment_name,
-                    experiment_run_id=run.info.run_id,
-                    task_run_id=task_run_id,
-                    optimization_space=optimization_space,
-                    n_trials=n_trials,
-                    save_models=save_models,
-                    sampler=sampler,
-                    test_models=test_models,
-                )
-                table_entries.append([task.name, metric_name, best_value, hparams])
-                table_entries_filename = str(
-                    PATH_TO_JOB_TRACKING
-                    / f"{experiment_name}-{run.info.run_id}_table_entries.pkl"
-                )
-                with open(table_entries_filename, 'wb') as handle:
-                    pickle.dump(table_entries, handle, protocol=pickle.HIGHEST_PROTOCOL)
-
-            table = tabulate(table_entries, headers=table_columns)
-            logger.info(table)
-            df = pd.DataFrame(data=table_entries, columns=table_columns)
-            df.set_index("Task")
-            logger.info("Starting to save results")
-            mlflow.log_table(
-                df,
-                "results_table.json",
-                run.info.run_id,
-            )
-            experiment_id = run.info.experiment_id
-
-        # check completion of HPO for all tasks before proceeding to next stage
-        existing_experiments = check_existing_experiments(
-            logger=logger,
+    if optimize_hyperparams:
+        if bayesian_search:
+            sampler: BaseSampler | None = None  # take the default
+        else:
+            sampler = RandomSampler()
+        table_columns = ["Task", "Metric", "Best Score", "Hyperparameters"]
+        table_entries = []
+        experiment_id, finished_run_id = _run_hpo(
+            run_name=run_name,
+            run_id=run_id,
+            description=description,
+            tasks=tasks,
+            task_names=task_names,
+            completed_task_run_names=completed_task_run_names,
+            task_run_to_id_match=task_run_to_id_match,
+            defaults=defaults,
             storage_uri=storage_uri,
             experiment_name=experiment_name,
-            exp_parent_run_name=run_name,
-            task_names=task_names,
             n_trials=n_trials,
+            save_models=save_models,
+            sampler=sampler,
+            test_models=test_models,
+            table_entries=table_entries,
+            table_columns=table_columns,
             backbone=backbone,
+            PATH_TO_JOB_TRACKING=PATH_TO_JOB_TRACKING,
+            logger=logger,
         )
-        if existing_experiments["finished_run"] is not None:
-            finished_run_id = existing_experiments["finished_run"]
-        else:
-            logger.info("HPO is not complete. Please re-run this experiment")
-            raise RuntimeError
-    logger.info("HPO complete")
-
-    logger.info(f"run_repetitions: {run_repetitions}")
+        logger.info("HPO complete")
 
     if run_repetitions >= 1:
+        logger.info(f"run_repetitions: {run_repetitions}")
         # run repeated experiments
         logger.info(
             f"Now running {run_repetitions} repeats per experiment \n\
diff --git a/tests/integration/test_main.py b/tests/integration/test_main.py
index f65d80e..10b7dfd 100644
--- a/tests/integration/test_main.py
+++ b/tests/integration/test_main.py
@@ -8,7 +8,7 @@
     # "configs/tests/benchmark_v2_simple.yaml",
     "configs/tests/dofa_large_patch16_224_upernetdecoder_true_modified.yaml",
     "configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml",
-    "configs/tests/terratorch-iterate-configs/test_case_03/terratorch__encoder_decoder_timm_resnet101_model_factory.yaml",
+    "configs/tests/terratorch-iterate-configs/test_case_03/test_config_util__encoder_decoder_timm_resnet101_model_factory.yaml",
 ]
 HPO = [True]
 INPUT_TEST_MAIN = list(

From 66fe8e73f054f9e888531865de77f3af0da9ef66 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Thu, 25 Sep 2025 16:14:52 -0300
Subject: [PATCH 24/40] add missing param

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 benchmark/backbone_benchmark.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmark/backbone_benchmark.py b/benchmark/backbone_benchmark.py
index 7b9c2c0..0eb1f94 100644
--- a/benchmark/backbone_benchmark.py
+++ b/benchmark/backbone_benchmark.py
@@ -412,6 +412,7 @@ def benchmark_backbone(
             table_columns=table_columns,
             backbone=backbone,
             PATH_TO_JOB_TRACKING=PATH_TO_JOB_TRACKING,
+            optimization_space=optimization_space,
             logger=logger,
         )
         logger.info("HPO complete")

From 604b44d20b1a55add0098e4db97de9b5ce6b2d31 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Thu, 25 Sep 2025 17:00:13 -0300
Subject: [PATCH 25/40] validate test results

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 benchmark/main.py              |  3 +-
 tests/integration/test_main.py | 53 +++++++++++++++++++++++++++++++---
 2 files changed, 51 insertions(+), 5 deletions(-)

diff --git a/benchmark/main.py b/benchmark/main.py
index 6ddb626..076651b 100644
--- a/benchmark/main.py
+++ b/benchmark/main.py
@@ -307,7 +307,7 @@ def main():
                 run_repetitions = 0
 
             # run_repetitions is an optional parameter
-            benchmark_backbone(
+            experiment_info: dict = benchmark_backbone(
                 defaults=defaults,
                 tasks=tasks,
                 experiment_name=experiment_name,
@@ -323,6 +323,7 @@ def main():
                 bayesian_search=bayesian_search,
                 logger=logger,
             )
+            return experiment_info
 
 
 if __name__ == "__main__":
diff --git a/tests/integration/test_main.py b/tests/integration/test_main.py
index 10b7dfd..d3caaf1 100644
--- a/tests/integration/test_main.py
+++ b/tests/integration/test_main.py
@@ -1,5 +1,7 @@
 import itertools
 from pathlib import Path
+
+import yaml
 from benchmark.main import main
 import pytest
 import sys
@@ -11,9 +13,7 @@
     "configs/tests/terratorch-iterate-configs/test_case_03/test_config_util__encoder_decoder_timm_resnet101_model_factory.yaml",
 ]
 HPO = [True]
-INPUT_TEST_MAIN = list(
-    itertools.product(HPO, CONFIG_FILES)
-)
+INPUT_TEST_MAIN = list(itertools.product(HPO, CONFIG_FILES))
 
 
 def get_test_ids() -> list[str]:
@@ -28,6 +28,35 @@ def get_test_ids() -> list[str]:
     return test_case_ids
 
 
+def validate_results(experiment_name: str, storage_uri: str, finished_run_id: str):
+    # get the most recent modified directory
+    dir_path = Path(storage_uri) / finished_run_id
+    assert dir_path.exists(), f"Error! Directory does not exist: {dir_path}"
+    # find mlflow.runName files within the result dir
+    meta_yaml = "meta.yaml"
+
+    meta_yaml_path = dir_path / meta_yaml
+    assert (
+        meta_yaml_path.exists()
+    ), f"Error! meta.yaml file {meta_yaml_path} does not exist"
+    # open file and check that the experiment name is the same
+    with open(meta_yaml_path, mode="r") as f:
+        # read all the lines
+        lines = f.readlines()
+        # try to find experiment id and name in these lines
+        experiment_name_found: bool = False
+        experiment_id_found: bool = False
+        for line in lines:
+            if experiment_name in line:
+                experiment_name_found = True
+            if finished_run_id in line:
+                experiment_id_found = True
+        assert (
+            experiment_name_found and experiment_id_found
+        ), f"Error! Both experiment name ({experiment_name=}) and finished run id ({finished_run_id=}) must be in the {meta_yaml_path=}: {experiment_id_found=} {experiment_name_found=}"
+    # TODO delete the directories that were created by this test case
+
+
 @pytest.mark.parametrize(
     "hpo, config",
     INPUT_TEST_MAIN,
@@ -40,8 +69,24 @@ def test_main(
     home_dir = Path(__file__).parent.parent.parent
     config_file: Path = home_dir / config
     assert config_file.exists()
+    with open(config_file, 'r') as file:
+        config_data = yaml.safe_load(file)
+    storage_uri: str = config_data["storage_uri"]
+    # handling relative paths
+    if storage_uri.startswith(".") or storage_uri.startswith(".."):
+        repo_home_dir = Path(__file__).parent.parent 
+        abs_path = repo_home_dir / storage_uri
+        storage_uri = str(abs_path.resolve())
+    experiment_name = config_data["experiment_name"]
     arguments = ["terratorch", "--config", str(config_file.resolve())]
     if hpo:
         arguments.insert(1, "--hpo")
     sys.argv = arguments
-    main()
+    # main only returns a dict when hpo is True
+    mlflow_info = main()
+    assert isinstance(mlflow_info, dict), f"Error! {mlflow_info=} is not a dict"
+    validate_results(
+        experiment_name=experiment_name,
+        storage_uri=storage_uri,
+        finished_run_id=mlflow_info["experiment_id"],
+    )

From 3368166bb418c1033a68e6873065d44e8a8a42d7 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Thu, 25 Sep 2025 18:20:22 -0300
Subject: [PATCH 26/40] fix path to storage_uri

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 tests/integration/test_main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/test_main.py b/tests/integration/test_main.py
index d3caaf1..613b5f8 100644
--- a/tests/integration/test_main.py
+++ b/tests/integration/test_main.py
@@ -74,7 +74,7 @@ def test_main(
     storage_uri: str = config_data["storage_uri"]
     # handling relative paths
     if storage_uri.startswith(".") or storage_uri.startswith(".."):
-        repo_home_dir = Path(__file__).parent.parent 
+        repo_home_dir = Path(__file__).parent.parent.parent 
         abs_path = repo_home_dir / storage_uri
         storage_uri = str(abs_path.resolve())
     experiment_name = config_data["experiment_name"]

From 4c21ef555472b0f0fd97c58dd8ef608552a98e01 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Thu, 25 Sep 2025 18:38:02 -0300
Subject: [PATCH 27/40] allow users to set continue_existing_experiments

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 benchmark/main.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/benchmark/main.py b/benchmark/main.py
index 076651b..9069718 100644
--- a/benchmark/main.py
+++ b/benchmark/main.py
@@ -155,6 +155,11 @@ def main():
     parser.add_argument('--bayesian_search', type=bool, default=True)
     parser.add_argument("--hpo", help="optimize hyperparameters", action="store_true")
     parser.add_argument("--repeat", help="repeat best experiments", action="store_true")
+    parser.add_argument(
+        "--continue_existing_experiments",
+        help="continue existing experiments",
+        action="store_true",
+    )
     parser.add_argument(
         "--summarize",
         help="summarize results from repeated experiments",
@@ -191,11 +196,16 @@ def main():
         hpo = args.hpo
         assert isinstance(hpo, bool), f"Error! {hpo=} is not a bool"
 
+        continue_existing_experiments: bool = args.continue_existing_experiments
+        assert isinstance(
+            continue_existing_experiments, bool
+        ), f"Error! {continue_existing_experiments=} is not a bool"
+
         storage_uri = config_init.storage_uri
         assert isinstance(storage_uri, str), f"Error! {storage_uri=} is not a str"
         # handling relative paths
         if storage_uri.startswith(".") or storage_uri.startswith(".."):
-            repo_home_dir = Path(__file__).parent.parent 
+            repo_home_dir = Path(__file__).parent.parent
             abs_path = repo_home_dir / storage_uri
             storage_uri = str(abs_path.resolve())
 
@@ -321,6 +331,7 @@ def main():
                 report_on_best_val=report_on_best_val,
                 test_models=test_models,
                 bayesian_search=bayesian_search,
+                continue_existing_experiment=continue_existing_experiments,
                 logger=logger,
             )
             return experiment_info

From 00cac8702c6c00a31c94eaecf77359c7ce0b9f66 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Fri, 26 Sep 2025 11:59:10 -0300
Subject: [PATCH 28/40] allow users to convert configs using terratorch iterate
 command

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 .../config_util/build_geobench_configs.py     | 258 -----------
 benchmark/main.py                             | 375 +++++++++-------
 .../geobench_v1_prithvi_big_earth_net.yaml    | 111 -----
 configs/tests/geobench_v1_prithvi_cashew.yaml | 108 -----
 .../tests/geobench_v1_prithvi_chesapeake.yaml | 104 -----
 configs/tests/geobench_v1_resnet_cashew.yaml  |  89 ----
 .../tests/geobench_v1_resnet_chesapeake.yaml  |  90 ----
 ...et50_sentinel2_all_moco_smp_unet_true.yaml | 402 ------------------
 ...ncoderdecoder_eo_v2_300_model_factory.yaml |   2 +-
 ...ncoderdecoder_eo_v2_300_model_factory.yaml |   2 +-
 tests/unit/test_build_geobench_configs.py     |  42 +-
 11 files changed, 254 insertions(+), 1329 deletions(-)
 delete mode 100644 benchmark/config_util/build_geobench_configs.py
 delete mode 100644 configs/tests/geobench_v1_prithvi_big_earth_net.yaml
 delete mode 100644 configs/tests/geobench_v1_prithvi_cashew.yaml
 delete mode 100644 configs/tests/geobench_v1_prithvi_chesapeake.yaml
 delete mode 100644 configs/tests/geobench_v1_resnet_cashew.yaml
 delete mode 100644 configs/tests/geobench_v1_resnet_chesapeake.yaml
 delete mode 100644 configs/tests/geobench_v1_ssl4eos12_resnet50_sentinel2_all_moco_smp_unet_true.yaml

diff --git a/benchmark/config_util/build_geobench_configs.py b/benchmark/config_util/build_geobench_configs.py
deleted file mode 100644
index 2342727..0000000
--- a/benchmark/config_util/build_geobench_configs.py
+++ /dev/null
@@ -1,258 +0,0 @@
-from pathlib import Path
-from typing import Any
-import yaml
-import pandas as pd
-import click
-from benchmark.benchmark_types import (
-    TaskTypeEnum,
-)
-from copy import deepcopy
-
-PRITHVI_600M = 'prithvi_600M'
-
-
-def _build_dataframe(config_files) -> pd.DataFrame:
-    """
-    build a pandas dataframe using the parameters of the specified config files
-    """
-    files = list()
-    dataset = list()
-    models = list()
-    for config_file in config_files:
-        try:
-            # extract dataset name from filename
-            ds = str(config_file).split('/')[-1].split('_')[0]
-            dataset.append(ds)
-            # append file path
-            files.append(str(config_file))
-        except KeyError as e:
-            msg = f"Error in file: {config_file}\n{e}"
-            print(msg)
-            raise KeyError(msg)
-
-    df = pd.DataFrame(data={"file": files, "dataset": dataset})
-    models = [
-        x.split('/')[-1].replace(y + '_', '').replace('.yaml', '')
-        for x, y in zip(df['file'].values, df['dataset'].values)
-    ]
-    df["model"] = models
-    return df
-
-
-# def _create_basemodule(data: dict[str, Any], model_filter: str) -> dict:
-#     """create a dict based on the "data" field of the terratorch config
-
-#     Args:
-#         data (dict[str, Any]): _description_
-#         model_filter (str): model name is used to specify batch_size and eval_batch_size
-
-#     Returns:
-#         dict: returns a dict that represents the datamodule field of iterate config file
-#     """
-#     base_module = dict()
-#     base_module["class_path"] = data["class_path"]
-#     if "dict_kwargs" in data.keys():
-#         dict_kwargs = data["dict_kwargs"]
-#         batch_size = 8 if model_filter != PRITHVI_600M else 4
-#         dict_kwargs["batch_size"] = batch_size
-#         dict_kwargs['eval_batch_size'] = 8 if model_filter != PRITHVI_600M else 4
-
-#         base_module["dict_kwargs"] = dict_kwargs
-#     base_module["init_args"] = data["init_args"]
-#     return base_module
-
-
-def _create_task(
-    name: str,
-    datamodule: dict,
-    metric: str,
-    terratorch_task: dict,
-    task_type: TaskTypeEnum,
-    direction: str,
-    max_run_duration: str | None = None,
-    early_stop_patience: int | None = None,
-    early_prune: bool | None = None,
-) -> dict:
-    """instantiate Task dataclass and convert it to dict
-
-    Args:
-        name (str): name of the task - comes from terratorch config - data.init_args.cls
-        datamodule (dict): _description_
-        metric (str): _description_
-        terratorch_task (dict): _description_
-        task_type (TaskTypeEnum): type of task, e.g., regression, classification
-        direction (str): direction to optimize
-        max_run_duration (str | None, optional): _description_. Defaults to None.
-        early_stop_patience (int | None, optional): _description_. Defaults to None.
-        early_prune (bool, optional): _description_. Defaults to False.
-
-    Returns:
-        dict: _description_
-    """
-
-    task_dict = {
-        "name": name,
-        "datamodule": datamodule,
-        "type": task_type.value,
-        "direction": direction,
-        "metric": metric,
-        "terratorch_task": terratorch_task,
-    }
-    # set optional fields if they are not None
-    for k, v in [
-        ("max_run_duration", max_run_duration),
-        ("early_stop_patience", early_stop_patience),
-        ("early_prune", early_prune),
-    ]:
-        if v is not None:
-            task_dict[k] = v
-
-    return task_dict
-
-
-def _get_task_type(template: dict) -> TaskTypeEnum:
-    tasks = template["tasks"]
-    task = tasks[0]
-    task_type = task["type"]
-    assert isinstance(task_type, str)
-
-    return TaskTypeEnum(value=task_type)
-
-
-def _get_task_direction(template: dict) -> str:
-    """extract task direction from template
-
-    Args:
-        template (dict): template created by user
-
-    Returns:
-        str: direction of the optimization (max or min)
-    """
-    tasks = template["tasks"]
-    task = tasks[0]
-    direction = task["direction"]
-    assert isinstance(direction, str)
-    assert direction in ["min", "max"]
-    return direction
-
-
-def generate_iterate_config(
-    input_dir: Path, template: Path, output_dir: Path, prefix: str = "test_"
-):
-    """generate the tt-iterate based on yaml files located within the specified directory, based
-    on previously defined template and save the result using specified output filename
-
-    Args:
-        input_dir (Path): contains all terratorch yaml files
-        output_dir (Path): filename of the result
-        template (Path): template file that contains pre-defined values
-        prefix (str): prefix for creating new config files
-    """
-
-    config_files = input_dir.glob('**/*.yaml')
-    files_df = _build_dataframe(config_files=config_files)
-
-    models = files_df['model'].unique()
-
-    with open(template, 'r') as file:
-        template_dict: dict = yaml.safe_load(file)
-
-    # generate one config per model
-    for model in models:
-        model_specific_template = deepcopy(template_dict)
-        # create unique name for experiment
-        model_specific_template["experiment_name"] = f"{prefix}_{model}"
-        tasks = list()  
-        
-        # filter dataframe by model
-        single_model_df = files_df[files_df['model'].values == model]
-
-        for i in range(single_model_df.shape[0]):
-            # open terratorch config file
-            with open(single_model_df['file'].values[i], 'r') as file:
-                data = yaml.safe_load(file)
-
-            name = single_model_df['dataset'].values[i]
-
-            model_args: dict = data['model']['init_args']['model_args']
-            # framework is an optional field of terratorch config
-            if (
-                model_args.get("framework") is not None
-                and model_args.get("framework") == "faster-rcnn"
-            ):
-                metric = 'val_map'
-            else:
-                metric = 'val/loss'
-
-            # terratorchtask is extracted from the data.model.init_args of terratorch config file
-            terratorch_task = data['model']['init_args']
-            # create datamodule based on data field
-            datamodule = data['data']
-            task_type = _get_task_type(template=template_dict)
-            task_direction = _get_task_direction(template=template_dict)
-            task = _create_task(
-                name=name,
-                datamodule=datamodule,
-                metric=metric,
-                terratorch_task=terratorch_task,
-                task_type=task_type,
-                direction=task_direction,
-            )
-            tasks.append(task)
-
-        model_specific_template['tasks'] = tasks
-        path = output_dir / f"{prefix}_{model}.yaml"
-        if path.exists():
-            path.unlink()
-        with open(path, 'w') as file:
-            yaml.dump(model_specific_template, file)
-            print(f"{path} file has been created")
-
-
-@click.command()
-@click.option(
-    '--input_dir',
-    prompt='Full path to the directory that contains all terratorch config yaml files',
-    help='Full path to the directory that contains all terratorch config yaml files',
-)
-@click.option(
-    '--output_dir',
-    prompt='Full path to the directory in which the new config files will be stored',
-    help='Full path to the directory in which the new config files will be stored',
-)
-@click.option(
-    '--template',
-    prompt='Full path to the template file',
-    help='Full path to the template file',
-)
-@click.option(
-    '--prefix',
-    prompt='Prefix of the config filename, e.g., my-config-',
-    help='Prefix of the config filename',
-)
-def generate_tt_iterate_config(
-    input_dir: str, output_dir: str, template: str, prefix: str
-):
-    directory_path = Path(input_dir)
-    assert directory_path.exists()
-    assert directory_path.is_dir
-
-    template_path = Path(template)
-    assert template_path.exists()
-    assert template_path.is_file
-
-    output_path = Path(output_dir)
-    assert output_path.exists()
-    assert output_path.is_dir
-
-    assert isinstance(prefix, str), f"Error! {type(prefix)} is not a str"
-    generate_iterate_config(
-        input_dir=directory_path,
-        output_dir=output_path,
-        template=template_path,
-        prefix=prefix,
-    )
-
-
-if __name__ == '__main__':
-    generate_tt_iterate_config()
diff --git a/benchmark/main.py b/benchmark/main.py
index 9069718..39e2847 100644
--- a/benchmark/main.py
+++ b/benchmark/main.py
@@ -11,6 +11,7 @@
     import_custom_modules,
     get_results_and_parameters,
 )
+from benchmark.config_util import build_iterate_config
 
 
 def _summarize(
@@ -131,6 +132,42 @@ def _repeat_experiment(
     )
 
 
+def _convert_config(args: Namespace):
+    """
+    This function processes command-line arguments to convert configuration files.
+
+    Parameters:
+    args (argparse.Namespace): Namespace object containing command-line arguments.
+
+    Raises:
+    AssertionError: If input or output paths are invalid or missing.
+
+    This function performs the following steps:
+    1. Asserts that the 'input' argument is a non-empty string and checks if the file exists.
+    2. Asserts that the 'output' argument is a non-empty string.
+    3. Calls the `generate_iterate_config` function from the `build_iterate_config` module, passing the input path, output path, prefix (if provided), and template (if provided).
+    """
+    input: str = args.input
+    assert input is not None and isinstance(
+        input, str
+    ), f"Error! Invalid value: {input=}"
+    input_path = Path(input)
+    assert input_path.exists()
+
+    output: str = args.output
+    assert output is not None and isinstance(
+        output, str
+    ), f"Error! Invalid value: {output=}"
+    output_path = Path(output)
+    template: str | None = args.template
+
+    prefix: str | None = args.prefix
+
+    template: str | None = args.template
+    build_iterate_config.generate_iterate_config(
+        input=input_path, output=output_path, prefix=prefix, template=template
+    )
+
 def main():
 
     parser = ArgumentParser()
@@ -173,168 +210,206 @@ def main():
         type=str,
         help="name of summarized results file",
     )
+    # arguments to convert terratorch's config into iterate's config
+    parser.add_argument(
+        "--build_iterate_config",
+        help="convert terratorch's config into terratorch-iterate's config",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--input",
+        help="input file or directory",
+        type=str,
+    )
+    parser.add_argument(
+        "--output",
+        help="output file or directory",
+        type=str,
+    )
+    parser.add_argument(
+        "--template",
+        help="template for creating config files",
+        type=str,
+    )
+    parser.add_argument(
+        "--prefix",
+        help="prefix of new config files",
+        type=str,
+    )
 
     args = parser.parse_args()
-    config_path: str | None = args.config
-    if config_path is None:
-        msg = """
-        Error: config argument has not been passed
-        usage: terratorch [-h] [--hpo] [--repeat] [--summarize] [--config CONFIG] 
-        """
-        print(msg)
+    if (
+            args.build_iterate_config is not None
+            and args.build_iterate_config is True
+        ):
+        _convert_config(args)
     else:
-        assert isinstance(
-            config_path, str
-        ), f"Error! Unexpected config type: {config_path}"
-        config = parser.parse_path(config_path)
-        config_init: Namespace = parser.instantiate_classes(config)
-
-        summarize: bool = args.summarize
-        assert isinstance(summarize, bool), f"Error! {summarize=} is not a bool"
-        repeat = args.repeat
-        assert isinstance(repeat, bool), f"Error! {repeat=} is not a bool"
-        hpo = args.hpo
-        assert isinstance(hpo, bool), f"Error! {hpo=} is not a bool"
-
-        continue_existing_experiments: bool = args.continue_existing_experiments
-        assert isinstance(
-            continue_existing_experiments, bool
-        ), f"Error! {continue_existing_experiments=} is not a bool"
-
-        storage_uri = config_init.storage_uri
-        assert isinstance(storage_uri, str), f"Error! {storage_uri=} is not a str"
-        # handling relative paths
-        if storage_uri.startswith(".") or storage_uri.startswith(".."):
-            repo_home_dir = Path(__file__).parent.parent
-            abs_path = repo_home_dir / storage_uri
-            storage_uri = str(abs_path.resolve())
-
-        logger_path = config_init.logger
-        if logger_path is None:
-            storage_uri_path = Path(storage_uri)
-            logger = get_logger(
-                log_folder=f"{str(storage_uri_path.parents[0])}/job_logs"
-            )
+        config_path: str | None = args.config
+        if config_path is None:
+            msg = """
+            Error: config argument has not been passed
+            usage: terratorch iterate [-h] [--hpo] [--repeat] [--summarize] [--config CONFIG] 
+            """
+            print(msg)
         else:
-            logging.config.fileConfig(fname=logger_path, disable_existing_loggers=False)
-            logger = logging.getLogger("terratorch-iterate")
-
-        # only summarize results from multiple experiments
-        if summarize:
-            return _summarize(
-                config_init=config_init,
-            )
-
-        # optimize hyperparameters and/or do repeated runs for single experiments
-        assert (
-            hpo is True or repeat is True
-        ), f"Error! either {repeat=} or {hpo=} must be True"
-        parent_run_id = args.parent_run_id
-        if parent_run_id is not None:
             assert isinstance(
-                parent_run_id, str
-            ), f"Error! {parent_run_id=} is not a str"
-
-        # validate the objects
-        experiment_name = config_init.experiment_name
-        assert isinstance(
-            experiment_name, str
-        ), f"Error! {experiment_name=} is not a str"
-        run_name = config_init.run_name
-        if run_name is not None:
-            assert isinstance(run_name, str), f"Error! {run_name=} is not a str"
-        # validate defaults
-        defaults = config_init.defaults
-        assert isinstance(defaults, Defaults), f"Error! {defaults=} is not a Defaults"
-
-        tasks = config_init.tasks
-        assert isinstance(tasks, list), f"Error! {tasks=} is not a list"
-        for t in tasks:
-            assert isinstance(t, Task), f"Error! {t=} is not a Task"
-            # if there is not specific terratorch_task specified, then use default terratorch_task
-            if t.terratorch_task is None:
-                t.terratorch_task = defaults.terratorch_task
-        # defaults.trainer_args["max_epochs"] = 5
-
-        optimization_space = config_init.optimization_space
-        assert isinstance(
-            optimization_space, dict
-        ), f"Error! {optimization_space=} is not a dict"
-
-        # ray_storage_path is optional
-        ray_storage_path = config_init.ray_storage_path
-        if ray_storage_path is not None:
-            assert isinstance(
-                ray_storage_path, str
-            ), f"Error! {ray_storage_path=} is not a str"
+                config_path, str
+            ), f"Error! Unexpected config type: {config_path}"
+            config = parser.parse_path(config_path)
+        
 
-        n_trials = config_init.n_trials
-        assert (
-            isinstance(n_trials, int) and n_trials > 0
-        ), f"Error! {n_trials=} is invalid"
-        run_repetitions = config_init.run_repetitions
+            config_init: Namespace = parser.instantiate_classes(config)
 
-        report_on_best_val = config_init.report_on_best_val
-        assert isinstance(
-            report_on_best_val, bool
-        ), f"Error! {ray_storage_path=} is not a bool"
+            summarize: bool = args.summarize
+            assert isinstance(summarize, bool), f"Error! {summarize=} is not a bool"
+            repeat = args.repeat
+            assert isinstance(repeat, bool), f"Error! {repeat=} is not a bool"
+            hpo = args.hpo
+            assert isinstance(hpo, bool), f"Error! {hpo=} is not a bool"
 
-        save_models = config_init.save_models
-        assert isinstance(save_models, bool), f"Error! {save_models=} is not a bool"
+            continue_existing_experiments: bool = args.continue_existing_experiments
+            assert isinstance(
+                continue_existing_experiments, bool
+            ), f"Error! {continue_existing_experiments=} is not a bool"
+
+            storage_uri = config_init.storage_uri
+            assert isinstance(storage_uri, str), f"Error! {storage_uri=} is not a str"
+            # handling relative paths
+            if storage_uri.startswith(".") or storage_uri.startswith(".."):
+                repo_home_dir = Path(__file__).parent.parent
+                abs_path = repo_home_dir / storage_uri
+                storage_uri = str(abs_path.resolve())
+
+            logger_path = config_init.logger
+            if logger_path is None:
+                storage_uri_path = Path(storage_uri)
+                logger = get_logger(
+                    log_folder=f"{str(storage_uri_path.parents[0])}/job_logs"
+                )
+            else:
+                logging.config.fileConfig(
+                    fname=logger_path, disable_existing_loggers=False
+                )
+                logger = logging.getLogger("terratorch-iterate")
+
+            # only summarize results from multiple experiments
+            if summarize:
+                return _summarize(
+                    config_init=config_init,
+                )
+
+            # optimize hyperparameters and/or do repeated runs for single experiments
+            assert (
+                hpo is True or repeat is True
+            ), f"Error! either {repeat=} or {hpo=} must be True"
+            parent_run_id = args.parent_run_id
+            if parent_run_id is not None:
+                assert isinstance(
+                    parent_run_id, str
+                ), f"Error! {parent_run_id=} is not a str"
+
+            # validate the objects
+            experiment_name = config_init.experiment_name
+            assert isinstance(
+                experiment_name, str
+            ), f"Error! {experiment_name=} is not a str"
+            run_name = config_init.run_name
+            if run_name is not None:
+                assert isinstance(run_name, str), f"Error! {run_name=} is not a str"
+            # validate defaults
+            defaults = config_init.defaults
+            assert isinstance(
+                defaults, Defaults
+            ), f"Error! {defaults=} is not a Defaults"
+
+            tasks = config_init.tasks
+            assert isinstance(tasks, list), f"Error! {tasks=} is not a list"
+            for t in tasks:
+                assert isinstance(t, Task), f"Error! {t=} is not a Task"
+                # if there is not specific terratorch_task specified, then use default terratorch_task
+                if t.terratorch_task is None:
+                    t.terratorch_task = defaults.terratorch_task
+            # defaults.trainer_args["max_epochs"] = 5
+
+            optimization_space = config_init.optimization_space
+            assert isinstance(
+                optimization_space, dict
+            ), f"Error! {optimization_space=} is not a dict"
+
+            # ray_storage_path is optional
+            ray_storage_path = config_init.ray_storage_path
+            if ray_storage_path is not None:
+                assert isinstance(
+                    ray_storage_path, str
+                ), f"Error! {ray_storage_path=} is not a str"
+
+            n_trials = config_init.n_trials
+            assert (
+                isinstance(n_trials, int) and n_trials > 0
+            ), f"Error! {n_trials=} is invalid"
+            run_repetitions = config_init.run_repetitions
+
+            report_on_best_val = config_init.report_on_best_val
+            assert isinstance(
+                report_on_best_val, bool
+            ), f"Error! {ray_storage_path=} is not a bool"
 
-        test_models = config_init.test_models
-        assert isinstance(test_models, bool), f"Error! {test_models=} is not a bool"
+            save_models = config_init.save_models
+            assert isinstance(save_models, bool), f"Error! {save_models=} is not a bool"
 
-        bayesian_search = config_init.bayesian_search
-        assert isinstance(
-            bayesian_search, bool
-        ), f"Error! {bayesian_search=} is not a bool"
+            test_models = config_init.test_models
+            assert isinstance(test_models, bool), f"Error! {test_models=} is not a bool"
 
-        # custom_modules_path is optional
-        custom_modules_path = config_init.custom_modules_path
-        if custom_modules_path is not None:
+            bayesian_search = config_init.bayesian_search
             assert isinstance(
-                custom_modules_path, str
-            ), f"Error! {custom_modules_path=} is not a str"
-            import_custom_modules(
-                logger=logger, custom_modules_path=custom_modules_path
-            )
-
-        if repeat and not hpo:
-            _repeat_experiment(
-                config_init=config_init,
-                storage_uri=storage_uri,
-                experiment_name=experiment_name,
-                defaults=defaults,
-                tasks=tasks,
-                optimization_space=optimization_space,
-                run_repetitions=run_repetitions,
-                save_models=save_models,
-                logger=logger,
-            )
-        else:
-            if not repeat and hpo:
-                run_repetitions = 0
-
-            # run_repetitions is an optional parameter
-            experiment_info: dict = benchmark_backbone(
-                defaults=defaults,
-                tasks=tasks,
-                experiment_name=experiment_name,
-                storage_uri=storage_uri,
-                ray_storage_path=ray_storage_path,
-                run_name=run_name,
-                optimization_space=optimization_space,
-                n_trials=n_trials,
-                run_repetitions=run_repetitions,
-                save_models=save_models,
-                report_on_best_val=report_on_best_val,
-                test_models=test_models,
-                bayesian_search=bayesian_search,
-                continue_existing_experiment=continue_existing_experiments,
-                logger=logger,
-            )
-            return experiment_info
+                bayesian_search, bool
+            ), f"Error! {bayesian_search=} is not a bool"
+
+            # custom_modules_path is optional
+            custom_modules_path = config_init.custom_modules_path
+            if custom_modules_path is not None:
+                assert isinstance(
+                    custom_modules_path, str
+                ), f"Error! {custom_modules_path=} is not a str"
+                import_custom_modules(
+                    logger=logger, custom_modules_path=custom_modules_path
+                )
+
+            if repeat and not hpo:
+                _repeat_experiment(
+                    config_init=config_init,
+                    storage_uri=storage_uri,
+                    experiment_name=experiment_name,
+                    defaults=defaults,
+                    tasks=tasks,
+                    optimization_space=optimization_space,
+                    run_repetitions=run_repetitions,
+                    save_models=save_models,
+                    logger=logger,
+                )
+            else:
+                if not repeat and hpo:
+                    run_repetitions = 0
+
+                # run_repetitions is an optional parameter
+                experiment_info: dict = benchmark_backbone(
+                    defaults=defaults,
+                    tasks=tasks,
+                    experiment_name=experiment_name,
+                    storage_uri=storage_uri,
+                    ray_storage_path=ray_storage_path,
+                    run_name=run_name,
+                    optimization_space=optimization_space,
+                    n_trials=n_trials,
+                    run_repetitions=run_repetitions,
+                    save_models=save_models,
+                    report_on_best_val=report_on_best_val,
+                    test_models=test_models,
+                    bayesian_search=bayesian_search,
+                    continue_existing_experiment=continue_existing_experiments,
+                    logger=logger,
+                )
+                return experiment_info
 
 
 if __name__ == "__main__":
diff --git a/configs/tests/geobench_v1_prithvi_big_earth_net.yaml b/configs/tests/geobench_v1_prithvi_big_earth_net.yaml
deleted file mode 100644
index 6c19c15..0000000
--- a/configs/tests/geobench_v1_prithvi_big_earth_net.yaml
+++ /dev/null
@@ -1,111 +0,0 @@
-experiment_name: geobench_v2_test
-run_name: test_models_saved_multiple_epochs_no_ray
-defaults:
-  trainer_args:
-    precision: bf16-mixed # for these new models pretrained with bf16-mixed we should probably finetune with bf16-mixed
-    max_epochs: 5
-  terratorch_task:
-    model_args:
-      pretrained: True
-      backbone: prithvi_eo_v1_100
-      backbone_out_indices:
-        - 2
-        - 5
-        - 8
-        - 11
-      backbone_pretrained_cfg_overlay:
-        file: /dccstor/geofm-finetuning/pretrain_ckpts/v9_no_sea/vit_b/epoch-395-loss-0.0339_clean.pt
-    model_factory: PrithviModelFactory
-    optimizer: AdamW
-    
-tasks:
-   # class
-  - name: big_earth_net
-    type: multilabel_classification
-    direction: max
-    terratorch_task:
-      loss: balanced_bce
-      model_args:
-        bands:
-          - RED
-          - GREEN
-          - BLUE
-          - NIR_NARROW
-          - SWIR_1
-          - SWIR_2
-        num_classes: 43
-        decoder: IdentityDecoder
-        head_linear_after_pool: True
-    datamodule:
-      class_path: terratorch.datamodules.MBigEarthNonGeoDataModule
-      init_args:
-        partition: 0.10x_train
-        train_transform:
-          - class_path: albumentations.HorizontalFlip
-            init_args:
-              p: 0.5
-          # - class_path: albumentations.RandomRotate90
-          #   init_args:
-          #     p: 0.5
-          - class_path: albumentations.VerticalFlip
-            init_args:
-              p: 0.5
-          # - class_path: albumentations.RandomBrightnessContrast
-          #   init_args:
-          #     p: 0.8
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        val_transform:
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        test_transform:
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        batch_size: 16
-        num_workers: 6
-        data_root: "/dccstor/geofm-finetuning/datasets/geobench/classification_v1.0"
-        bands:
-          - "RED"
-          - "GREEN"
-          - "BLUE"
-          - "NIR_NARROW"
-          - "SWIR_1"
-          - "SWIR_2"
-    optimization_except:
-      - decoder_channels
-      - head_dropout
-    metric: val/Multilabel_F1_Score
-    early_stop_patience: 5
-n_trials: 2
-save_models: False
-storage_uri: /dccstor/geofm-finetuning/terratorch-iterate-test/benchmark
-ray_storage_path: /dccstor/geofm-finetuning/terratorch-iterate-test/ray_storage
-optimization_space:
-  batch_size:
-      - 8
-      - 32
-      - 64
-  lr:
-    max: 1e-3
-    min: 1e-6
-    type: real
-    log: true
-  optimizer_hparams:
-    weight_decay:
-      min: 0
-      max: 0.4
-      type: real
-  model_args:
-    decoder_channels:
-      - 64
-      - 128
-      - 256
\ No newline at end of file
diff --git a/configs/tests/geobench_v1_prithvi_cashew.yaml b/configs/tests/geobench_v1_prithvi_cashew.yaml
deleted file mode 100644
index 2bb7a1c..0000000
--- a/configs/tests/geobench_v1_prithvi_cashew.yaml
+++ /dev/null
@@ -1,108 +0,0 @@
-experiment_name: geobench_v2_test
-run_name: test_models_saved_multiple_epochs_no_ray
-defaults:
-  trainer_args:
-    precision: bf16-mixed # for these new models pretrained with bf16-mixed we should probably finetune with bf16-mixed
-    max_epochs: 300
-  terratorch_task:
-    model_args:
-      pretrained: True
-      backbone: prithvi_eo_v1_100
-      backbone_out_indices:
-        - 2
-        - 5
-        - 8
-        - 11
-      backbone_pretrained_cfg_overlay:
-        file: /dccstor/geofm-finetuning/pretrain_ckpts/v9_no_sea/vit_b/epoch-395-loss-0.0339_clean.pt
-    model_factory: PrithviModelFactory
-    optimizer: AdamW
-    
-tasks:
-  - name: cashew
-    type: segmentation
-    direction: max
-    metric: val/Multiclass_Jaccard_Index
-    early_stop_patience: 50
-    terratorch_task:
-      loss: ce
-      model_args:
-        num_classes: 7
-        bands:
-          - RED
-          - GREEN
-          - BLUE
-          - NIR_NARROW
-          - SWIR_1
-          - SWIR_2
-        decoder: UperNetDecoder
-        decoder_channels: 128
-        decoder_scale_modules: true
-    datamodule:
-      class_path: terratorch.datamodules.MBeninSmallHolderCashewsNonGeoDataModule
-      init_args:
-        partition: 0.10x_train
-        train_transform:
-          - class_path: albumentations.HorizontalFlip
-            init_args:
-              p: 0.5
-          # - class_path: albumentations.RandomRotate90
-          #   init_args:
-          #     p: 0.5
-          - class_path: albumentations.VerticalFlip
-            init_args:
-              p: 0.5
-          # - class_path: albumentations.RandomBrightnessContrast
-          #   init_args:
-          #     p: 0.8
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        val_transform:
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        test_transform:
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        batch_size: 16
-        num_workers: 6
-        data_root: "/dccstor/geofm-finetuning/datasets/geobench/segmentation_v1.0"
-        bands:
-          - "RED"
-          - "GREEN"
-          - "BLUE"
-          - "NIR_NARROW"
-          - "SWIR_1"
-          - "SWIR_2"
-n_trials: 16
-save_models: False
-storage_uri: /dccstor/geofm-finetuning/carlosgomes/benchmark
-ray_storage_path: /dccstor/geofm-finetuning/carlosgomes/ray_storage
-optimization_space:
-  batch_size:
-      - 8
-      - 32
-      - 64
-  lr:
-    max: 1e-3
-    min: 1e-6
-    type: real
-    log: true
-  optimizer_hparams:
-    weight_decay:
-      min: 0
-      max: 0.4
-      type: real
-  model_args:
-    decoder_channels:
-      - 64
-      - 128
-      - 256
\ No newline at end of file
diff --git a/configs/tests/geobench_v1_prithvi_chesapeake.yaml b/configs/tests/geobench_v1_prithvi_chesapeake.yaml
deleted file mode 100644
index 52be24a..0000000
--- a/configs/tests/geobench_v1_prithvi_chesapeake.yaml
+++ /dev/null
@@ -1,104 +0,0 @@
-experiment_name: geobench_v2_test
-run_name: test_models_saved_multiple_epochs_no_ray
-defaults:
-  trainer_args:
-    precision: bf16-mixed # for these new models pretrained with bf16-mixed we should probably finetune with bf16-mixed
-    max_epochs: 300
-  terratorch_task:
-    model_args:
-      pretrained: True
-      backbone: prithvi_eo_v1_100
-      backbone_out_indices:
-        - 2
-        - 5
-        - 8
-        - 11
-      backbone_pretrained_cfg_overlay:
-        file: /dccstor/geofm-finetuning/pretrain_ckpts/v9_no_sea/vit_b/epoch-395-loss-0.0339_clean.pt
-    model_factory: PrithviModelFactory
-    optimizer: AdamW
-    
-tasks:
-  - name: chesapeake
-    type: segmentation
-    direction: max
-    metric: val/Multiclass_Jaccard_Index
-    early_stop_patience: 50
-    terratorch_task:
-      loss: ce
-      model_args:
-        decoder: UperNetDecoder
-        decoder_channels: 128
-        decoder_scale_modules: true
-        bands:
-        - RED
-        - GREEN
-        - BLUE
-        - NIR_NARROW
-        num_classes: 7
-    datamodule:
-      class_path: terratorch.datamodules.MChesapeakeLandcoverNonGeoDataModule
-      init_args:
-        partition: 0.10x_train
-        train_transform:
-          - class_path: albumentations.HorizontalFlip
-            init_args:
-              p: 0.5
-          # - class_path: albumentations.RandomRotate90
-          #   init_args:
-          #     p: 0.5
-          - class_path: albumentations.VerticalFlip
-            init_args:
-              p: 0.5
-          # - class_path: albumentations.RandomBrightnessContrast
-          #   init_args:
-          #     p: 0.8
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        val_transform:
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        test_transform:
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        batch_size: 16
-        num_workers: 6
-        data_root: "/dccstor/geofm-finetuning/datasets/geobench/segmentation_v1.0"
-        bands:
-          - "RED"
-          - "GREEN"
-          - "BLUE"
-          - "NIR"
-n_trials: 16
-save_models: False
-storage_uri: /dccstor/geofm-finetuning/carlosgomes/benchmark
-ray_storage_path: /dccstor/geofm-finetuning/carlosgomes/ray_storage
-optimization_space:
-  batch_size:
-      - 8
-      - 32
-      - 64
-  lr:
-    max: 1e-3
-    min: 1e-6
-    type: real
-    log: true
-  optimizer_hparams:
-    weight_decay:
-      min: 0
-      max: 0.4
-      type: real
-  model_args:
-    decoder_channels:
-      - 64
-      - 128
-      - 256
\ No newline at end of file
diff --git a/configs/tests/geobench_v1_resnet_cashew.yaml b/configs/tests/geobench_v1_resnet_cashew.yaml
deleted file mode 100644
index f07937e..0000000
--- a/configs/tests/geobench_v1_resnet_cashew.yaml
+++ /dev/null
@@ -1,89 +0,0 @@
-experiment_name: geobench_resnet
-run_name: resnet_50_rgb_only_16_trials
-bayesian_search: False
-defaults:
-  trainer_args:
-    precision: bf16-mixed # for these new models pretrained with bf16-mixed we should probably finetune with bf16-mixed
-    max_epochs: 2
-  terratorch_task:
-    model_args:
-      pretrained: True
-      backbone: resnet50
-    optimizer: AdamW
-    
-tasks:
-  - name: cashew
-    type: segmentation
-    direction: max
-    metric: val/Multiclass_Jaccard_Index
-    early_stop_patience: 50
-    terratorch_task:
-      loss: ce
-      model_factory: SMPModelFactory
-      model_args:
-        num_classes: 7
-        bands:
-          - RED
-          - GREEN
-          - BLUE
-        model: Unet
-    datamodule:
-      class_path: terratorch.datamodules.MBeninSmallHolderCashewsNonGeoDataModule
-      init_args:
-        partition: 0.10x_train
-        train_transform:
-          - class_path: albumentations.HorizontalFlip
-            init_args:
-              p: 0.5
-          # - class_path: albumentations.RandomRotate90
-          #   init_args:
-          #     p: 0.5
-          - class_path: albumentations.VerticalFlip
-            init_args:
-              p: 0.5
-          # - class_path: albumentations.RandomBrightnessContrast
-          #   init_args:
-          #     p: 0.8
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        val_transform:
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        test_transform:
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        batch_size: 16
-        num_workers: 6
-        data_root: "/dccstor/geofm-finetuning/datasets/geobench/segmentation_v1.0"
-        bands:
-          - "RED"
-          - "GREEN"
-          - "BLUE"
-n_trials: 16
-save_models: False
-storage_uri: /dccstor/geofm-finetuning/carlosgomes/benchmark
-ray_storage_path: /dccstor/geofm-finetuning/carlosgomes/ray_storage
-optimization_space:
-  batch_size:
-      - 8
-      - 32
-      - 64
-  lr:
-    max: 1e-3
-    min: 1e-6
-    type: real
-    log: true
-  optimizer_hparams:
-    weight_decay:
-      min: 0
-      max: 0.4
-      type: real
\ No newline at end of file
diff --git a/configs/tests/geobench_v1_resnet_chesapeake.yaml b/configs/tests/geobench_v1_resnet_chesapeake.yaml
deleted file mode 100644
index bd842ea..0000000
--- a/configs/tests/geobench_v1_resnet_chesapeake.yaml
+++ /dev/null
@@ -1,90 +0,0 @@
-experiment_name: geobench_resnet
-run_name: resnet_50_rgb_only_16_trials
-bayesian_search: False
-defaults:
-  trainer_args:
-    precision: bf16-mixed # for these new models pretrained with bf16-mixed we should probably finetune with bf16-mixed
-    max_epochs: 2
-  terratorch_task:
-    model_args:
-      pretrained: True
-      backbone: resnet50
-    optimizer: AdamW
-    
-tasks:
-  - name: chesapeake
-    type: segmentation
-    direction: max
-    metric: val/Multiclass_Jaccard_Index
-    early_stop_patience: 50
-    terratorch_task:
-      loss: ce
-      model_factory: SMPModelFactory
-      model_args:
-        model: Unet
-        bands:
-        - RED
-        - GREEN
-        - BLUE
-        num_classes: 7
-    datamodule:
-      class_path: terratorch.datamodules.MChesapeakeLandcoverNonGeoDataModule
-      init_args:
-        partition: 0.10x_train
-        train_transform:
-          - class_path: albumentations.HorizontalFlip
-            init_args:
-              p: 0.5
-          # - class_path: albumentations.RandomRotate90
-          #   init_args:
-          #     p: 0.5
-          - class_path: albumentations.VerticalFlip
-            init_args:
-              p: 0.5
-          # - class_path: albumentations.RandomBrightnessContrast
-          #   init_args:
-          #     p: 0.8
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        val_transform:
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        test_transform:
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        batch_size: 16
-        num_workers: 6
-        data_root: "/dccstor/geofm-finetuning/datasets/geobench/segmentation_v1.0"
-        bands:
-          - "RED"
-          - "GREEN"
-          - "BLUE"
-  
-n_trials: 16
-save_models: False
-storage_uri: /dccstor/geofm-finetuning/carlosgomes/benchmark
-ray_storage_path: /dccstor/geofm-finetuning/carlosgomes/ray_storage
-optimization_space:
-  batch_size:
-      - 8
-      - 32
-      - 64
-  lr:
-    max: 1e-3
-    min: 1e-6
-    type: real
-    log: true
-  optimizer_hparams:
-    weight_decay:
-      min: 0
-      max: 0.4
-      type: real
\ No newline at end of file
diff --git a/configs/tests/geobench_v1_ssl4eos12_resnet50_sentinel2_all_moco_smp_unet_true.yaml b/configs/tests/geobench_v1_ssl4eos12_resnet50_sentinel2_all_moco_smp_unet_true.yaml
deleted file mode 100644
index 1076979..0000000
--- a/configs/tests/geobench_v1_ssl4eos12_resnet50_sentinel2_all_moco_smp_unet_true.yaml
+++ /dev/null
@@ -1,402 +0,0 @@
-experiment_name: ssl4eos12_resnet50_sentinel2_all_moco_smp_unet
-defaults:
-  trainer_args:
-    max_epochs: 1
-    log_every_n_steps: 1
-  terratorch_task:
-    model_args:
-      backbone_pretrained: True
-      backbone: ssl4eos12_resnet50_sentinel2_all_moco
-      backbone_out_indices:
-        - 0
-        - 1
-        - 2
-        - 3
-        - 4
-    model_factory: EncoderDecoderFactory
-    optimizer: AdamW
-    
-tasks:
-  - name: chesapeake
-    type: segmentation
-    direction: max
-    metric: val/Multiclass_Jaccard_Index
-    early_stop_patience: 5
-    terratorch_task:
-      loss: ce
-      model_args:
-        decoder: smp_Unet
-        decoder_decoder_channels:
-          - 512
-          - 256
-          - 128
-          - 64
-        backbone_model_bands:
-        - RED
-        - GREEN
-        - BLUE
-        - NIR_NARROW
-        num_classes: 7
-    datamodule:
-      class_path: terratorch.datamodules.MChesapeakeLandcoverNonGeoDataModule
-      init_args:
-        partition: "0.01x_train"
-        train_transform:
-          - class_path: albumentations.HorizontalFlip
-            init_args:
-              p: 0.5
-          - class_path: albumentations.VerticalFlip
-            init_args:
-              p: 0.5
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        val_transform:
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        test_transform:
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        batch_size: 16
-        num_workers: 4
-        data_root: "/dccstor/geofm-finetuning/datasets/geobench/segmentation_v1.0"
-        bands:
-          - "RED"
-          - "GREEN"
-          - "BLUE"
-          - "NIR"
-  - name: cashew
-    type: segmentation
-    direction: max
-    metric: val/Multiclass_Jaccard_Index
-    early_stop_patience: 50
-    terratorch_task:
-      loss: ce
-      model_args:
-        num_classes: 7
-        backbone_model_bands:
-        - "COASTAL_AEROSOL"
-        - "BLUE"
-        - "GREEN"
-        - "RED"
-        - "RED_EDGE_1"
-        - "RED_EDGE_2"
-        - "RED_EDGE_3"
-        - "NIR_BROAD"
-        - "NIR_NARROW"
-        - "WATER_VAPOR"
-        - "SWIR_1"
-        - "SWIR_2"
-        decoder: smp_Unet
-        decoder_decoder_channels:
-          - 512
-          - 256
-          - 128
-          - 64
-    datamodule:
-      class_path: terratorch.datamodules.MBeninSmallHolderCashewsNonGeoDataModule
-      init_args:
-        partition: "0.01x_train"
-        train_transform:
-          - class_path: albumentations.HorizontalFlip
-            init_args:
-              p: 0.5
-          - class_path: albumentations.VerticalFlip
-            init_args:
-              p: 0.5
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        val_transform:
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        test_transform:
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        batch_size: 16
-        num_workers: 4
-        data_root: "/dccstor/geofm-finetuning/datasets/geobench/segmentation_v1.0"
-        bands:
-          - "COASTAL_AEROSOL"
-          - "BLUE"
-          - "GREEN"
-          - "RED"
-          - "RED_EDGE_1"
-          - "RED_EDGE_2"
-          - "RED_EDGE_3"
-          - "NIR_BROAD"
-          - "NIR_NARROW"
-          - "WATER_VAPOR"
-          - "SWIR_1"
-          - "SWIR_2"
-  - name: neontree
-    type: segmentation
-    direction: max
-    metric: val/Multiclass_Jaccard_Index
-    early_stop_patience: 5
-    terratorch_task:
-      loss: ce
-      model_args:
-        num_classes: 2
-        backbone_model_bands:
-          - RED
-          - GREEN
-          - BLUE
-        decoder: smp_Unet
-        decoder_decoder_channels:
-          - 512
-          - 256
-          - 128
-          - 64
-    datamodule:
-      class_path: terratorch.datamodules.MNeonTreeNonGeoDataModule
-      init_args:
-        partition: "0.01x_train"
-        train_transform:
-          - class_path: albumentations.HorizontalFlip
-            init_args:
-              p: 0.5
-          - class_path: albumentations.VerticalFlip
-            init_args:
-              p: 0.5
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        val_transform:
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        test_transform:
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        batch_size: 8
-        num_workers: 4
-        data_root: "/dccstor/geofm-finetuning/datasets/geobench/segmentation_v1.0"
-        bands:
-          - "RED"
-          - "GREEN"
-          - "BLUE"
-  - name: nz_cattle
-    type: segmentation
-    direction: max
-    metric: val/Multiclass_Jaccard_Index
-    early_stop_patience: 5
-    terratorch_task:
-      loss: ce
-      model_args:
-        backbone_model_bands:
-          - RED
-          - GREEN
-          - BLUE
-        num_classes: 2
-        decoder: smp_Unet
-        decoder_decoder_channels:
-          - 512
-          - 256
-          - 128
-          - 64
-    datamodule:
-      class_path: terratorch.datamodules.MNzCattleNonGeoDataModule
-      init_args:
-        partition: "0.01x_train"
-        train_transform:
-          - class_path: albumentations.HorizontalFlip
-            init_args:
-              p: 0.5
-          - class_path: albumentations.VerticalFlip
-            init_args:
-              p: 0.5
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        val_transform:
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        test_transform:
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        batch_size: 16
-        num_workers: 4
-        data_root: "/dccstor/geofm-finetuning/datasets/geobench/segmentation_v1.0"
-        bands:
-          - "RED"
-          - "GREEN"
-          - "BLUE"
-  - name: pv4ger_seg
-    type: segmentation
-    direction: max
-    metric: val/Multiclass_Jaccard_Index
-    early_stop_patience: 5
-    terratorch_task:
-      loss: ce
-      model_args:
-        decoder: smp_Unet
-        decoder_decoder_channels:
-          - 512
-          - 256
-          - 128
-          - 64
-
-        backbone_model_bands:
-          - RED
-          - GREEN
-          - BLUE
-        num_classes: 2
-    datamodule:
-      class_path: terratorch.datamodules.MPv4gerSegNonGeoDataModule
-      init_args:
-        partition: "0.01x_train"
-        train_transform:
-          - class_path: albumentations.HorizontalFlip
-            init_args:
-              p: 0.5
-          - class_path: albumentations.VerticalFlip
-            init_args:
-              p: 0.5
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        val_transform:
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        test_transform:
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        batch_size: 16
-        num_workers: 4
-        data_root: "/dccstor/geofm-finetuning/datasets/geobench/segmentation_v1.0"
-        bands:
-          - "RED"
-          - "GREEN"
-          - "BLUE"
-  - name: sa_crop_type
-    type: segmentation
-    direction: max
-    metric: val/Multiclass_Jaccard_Index
-    early_stop_patience: 5
-    terratorch_task:
-      loss: ce
-      model_args:
-        decoder: smp_Unet
-        decoder_decoder_channels:
-          - 512
-          - 256
-          - 128
-          - 64
-        backbone_model_bands:
-          - "COASTAL_AEROSOL"
-          - "BLUE"
-          - "GREEN"
-          - "RED"
-          - "RED_EDGE_1"
-          - "RED_EDGE_2"
-          - "RED_EDGE_3"
-          - "NIR_BROAD"
-          - "NIR_NARROW"
-          - "WATER_VAPOR"
-          - "SWIR_1"
-          - "SWIR_2"
-        num_classes: 10
-    datamodule:
-      class_path: terratorch.datamodules.m_SA_crop_type.MSACropTypeNonGeoDataModule
-      init_args:
-        partition: "0.01x_train"
-        train_transform:
-          - class_path: albumentations.HorizontalFlip
-            init_args:
-              p: 0.5
-          - class_path: albumentations.VerticalFlip
-            init_args:
-              p: 0.5
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        val_transform:
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        test_transform:
-          - class_path: albumentations.Resize
-            init_args:
-              height: 224
-              width: 224
-          - class_path: ToTensorV2
-        batch_size: 16
-        num_workers: 4
-        data_root: "/dccstor/geofm-finetuning/datasets/geobench/segmentation_v1.0"
-        bands:
-          - "COASTAL_AEROSOL"
-          - "BLUE"
-          - "GREEN"
-          - "RED"
-          - "RED_EDGE_1"
-          - "RED_EDGE_2"
-          - "RED_EDGE_3"
-          - "NIR_BROAD"
-          - "NIR_NARROW"
-          - "WATER_VAPOR"
-          - "SWIR_1"
-          - "SWIR_2"
-
-n_trials: 16
-save_models: False
-storage_uri: /dccstor/geofm-finetuning/terratorch-iterate-test/benchmark
-ray_storage_path: /dccstor/geofm-finetuning/terratorch-iterate-test/benchmark/ray_storage_results
-optimization_space:
-  batch_size:
-    - 8
-    - 16
-    - 32
-  lr:
-    min: 6e-5
-    max: 1e-3
-    type: real
-    log: true
-  optimizer_hparams:
-    weight_decay:
-      min: 0
-      max: 0.4
-      type: real
diff --git a/configs/tests/terratorch-iterate-configs/test_case_02/oracle/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml b/configs/tests/terratorch-iterate-configs/test_case_02/oracle/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
index 0f57244..c5d97b8 100644
--- a/configs/tests/terratorch-iterate-configs/test_case_02/oracle/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
+++ b/configs/tests/terratorch-iterate-configs/test_case_02/oracle/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
@@ -101,7 +101,7 @@ tasks:
       - '4'
       - '5'
       backbone_drop_path: 0.1
-      # backbone_pretrained: true
+      backbone_pretrained: true
       decoder: UNetDecoder
       decoder_channels:
       - 512
diff --git a/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml b/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
index dac536c..9fa2765 100644
--- a/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
+++ b/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
@@ -18,7 +18,7 @@ defaults:
   trainer_args:
     log_every_n_steps: 1
     max_epochs: 5
-experiment_name: test_config_util__encoderdecoder_eo_v2_300_model_factory
+experiment_name: tt-iterate-_encoderdecoder_eo_v2_300_model_factory
 n_trials: 1
 optimization_space:
   lr:
diff --git a/tests/unit/test_build_geobench_configs.py b/tests/unit/test_build_geobench_configs.py
index 313c3ef..e5744d5 100644
--- a/tests/unit/test_build_geobench_configs.py
+++ b/tests/unit/test_build_geobench_configs.py
@@ -1,7 +1,7 @@
 from pathlib import Path
 import pytest
 import yaml
-from benchmark.config_util.build_geobench_configs import generate_iterate_config
+from benchmark.config_util.build_iterate_config import generate_iterate_config
 from deepdiff import DeepDiff
 import logging
 
@@ -9,7 +9,7 @@
 
 
 @pytest.mark.parametrize(
-    "input_dir, output_dir, template, prefix, oracle_config_file",
+    "input, output, template, prefix, oracle_config_file",
     [
         (
             "./configs/tests/terratorch_configs/test_case_01",
@@ -25,6 +25,13 @@
             "test_config_util_",
             "./configs/tests/terratorch-iterate-configs/test_case_02/oracle/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml",
         ),
+        (
+            "./configs/tests/terratorch_configs/test_case_02/test_encoderdecoder_eo_v2_300_model_factory.yaml",
+            "./configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml",
+            "./configs/templates/template.yaml",
+            "test_config_util_",
+            "./configs/tests/terratorch-iterate-configs/test_case_02/oracle/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml",
+        ),
         (
             "./configs/tests/terratorch_configs/test_case_03",
             "./configs/tests/terratorch-iterate-configs/test_case_03",
@@ -35,32 +42,37 @@
     ],
 )
 def test__generate_iterate_config(
-    input_dir, output_dir, template, prefix, oracle_config_file
+    input, output, template, prefix, oracle_config_file
 ):
     # Get the absolute path of the current script file
     script_path = Path(__file__).resolve()
 
     # Get the home directory
     repo_home_dir = script_path.parent.parent.parent
-    input_dir_path: Path = repo_home_dir / input_dir
-    assert input_dir_path.exists()
-    assert input_dir_path.is_dir()
-    output_path: Path = repo_home_dir / output_dir
+    input_path: Path = repo_home_dir / input
+    assert input_path.exists()
+    output_path: Path = repo_home_dir / output
     assert output_path.exists()
-    assert output_path.is_dir()
     # warning! delete all files of the output dir
-    for item in output_path.iterdir():
-        if item.is_file():
-            logging.debug(f"Cleaning up directory: {item} deleted")
-            item.unlink()
+    if output_path.is_dir():
+        for item in output_path.iterdir():
+            if item.is_file():
+                logging.debug(f"Cleaning up directory: {item} deleted")
+                item.unlink()
+    else:
+        output_path.unlink()
 
     generate_iterate_config(
-        input_dir=input_dir_path,
-        output_dir=output_path,
+        input=input_path,
+        output=output_path,
         template=repo_home_dir / template,
         prefix=prefix,
     )
-    generated_config_files = list(output_path.glob(f'**/{prefix}*.yaml'))
+    if output_path.is_dir():
+        generated_config_files = list(output_path.glob(f'**/{prefix}*.yaml'))
+    else:
+        generated_config_files = [output_path]
+        
     assert len(generated_config_files) > 0
 
     if oracle_config_file is not None:

From e45814bf4c54d4c4dc946fa22002ced59994a9f8 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Fri, 26 Sep 2025 12:02:59 -0300
Subject: [PATCH 29/40] update pre-commit to apply ruff format

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 .pre-commit-config.yaml                                    | 7 +++++++
 ...onfig_util__encoderdecoder_eo_v2_300_model_factory.yaml | 2 +-
 pyproject.toml                                             | 2 +-
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index ec30804..08e218f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,3 +19,10 @@ repos:
         # when "--baseline" with "--use-all-plugins", pre-commit scan with all available plugins
         # add "--fail-on-unaudited" to fail pre-commit for unaudited potential secrets
         args: [--baseline, .secrets.baseline, --use-all-plugins]
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.12.3
+    hooks:
+      - id: ruff-format
+        types_or:
+          - python
+          - jupyter
diff --git a/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml b/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
index 9fa2765..dac536c 100644
--- a/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
+++ b/configs/tests/terratorch-iterate-configs/test_case_02/test_config_util__encoderdecoder_eo_v2_300_model_factory.yaml
@@ -18,7 +18,7 @@ defaults:
   trainer_args:
     log_every_n_steps: 1
     max_epochs: 5
-experiment_name: tt-iterate-_encoderdecoder_eo_v2_300_model_factory
+experiment_name: test_config_util__encoderdecoder_eo_v2_300_model_factory
 n_trials: 1
 optimization_space:
   lr:
diff --git a/pyproject.toml b/pyproject.toml
index 999542c..0ce4837 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -83,7 +83,7 @@ Issues = "https://github.com/IBM/terratorch-iterate/issues"
 
 [project.optional-dependencies]
 dev = [
-  "black",
+  "ruff",
   "flake8", 
   "mkdocs-material",
   "mkdocstrings[python]",

From da2fa53fb528eabc2dafea1dcc251709407b45c8 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Wed, 1 Oct 2025 17:05:17 -0300
Subject: [PATCH 30/40] add logging to run_tests script; improve documentation

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 .secrets.baseline                         | 146 +++++++++++++++
 benchmark/backbone_benchmark.py           |  16 +-
 benchmark/benchmark_ray.py                |   1 -
 benchmark/benchmark_types.py              |   3 +-
 benchmark/main.py                         | 152 ++++++++-------
 benchmark/model_fitting.py                |  23 +--
 benchmark/repeat_best_experiment.py       |  23 +--
 benchmark/utils.py                        |  41 ++--
 plotting/plot_results_mlflow.ipynb        | 218 +++++++++++++++++-----
 plotting/plot_results_repeated_runs.ipynb | 162 +++++++++++++---
 run_tests.py                              |  41 ++--
 tests/integration/test_main.py            |  16 +-
 tests/unit/test_build_geobench_configs.py |  11 +-
 tests/unit/test_cli.py                    |   2 +-
 14 files changed, 607 insertions(+), 248 deletions(-)
 create mode 100644 .secrets.baseline

diff --git a/.secrets.baseline b/.secrets.baseline
new file mode 100644
index 0000000..e3ac1a7
--- /dev/null
+++ b/.secrets.baseline
@@ -0,0 +1,146 @@
+{
+  "exclude": {
+    "files": "^.secrets.baseline$",
+    "lines": null
+  },
+  "generated_at": "2025-10-01T20:02:29Z",
+  "plugins_used": [
+    {
+      "name": "AWSKeyDetector"
+    },
+    {
+      "name": "ArtifactoryDetector"
+    },
+    {
+      "name": "AzureStorageKeyDetector"
+    },
+    {
+      "base64_limit": 4.5,
+      "name": "Base64HighEntropyString"
+    },
+    {
+      "name": "BasicAuthDetector"
+    },
+    {
+      "name": "BoxDetector"
+    },
+    {
+      "name": "CloudantDetector"
+    },
+    {
+      "ghe_instance": "github.ibm.com",
+      "name": "GheDetector"
+    },
+    {
+      "name": "GitHubTokenDetector"
+    },
+    {
+      "hex_limit": 3,
+      "name": "HexHighEntropyString"
+    },
+    {
+      "name": "IbmCloudIamDetector"
+    },
+    {
+      "name": "IbmCosHmacDetector"
+    },
+    {
+      "name": "JwtTokenDetector"
+    },
+    {
+      "keyword_exclude": null,
+      "name": "KeywordDetector"
+    },
+    {
+      "name": "MailchimpDetector"
+    },
+    {
+      "name": "NpmDetector"
+    },
+    {
+      "name": "PrivateKeyDetector"
+    },
+    {
+      "name": "SlackDetector"
+    },
+    {
+      "name": "SoftlayerDetector"
+    },
+    {
+      "name": "SquareOAuthDetector"
+    },
+    {
+      "name": "StripeDetector"
+    },
+    {
+      "name": "TwilioKeyDetector"
+    }
+  ],
+  "results": {
+    "plotting/plot_results_mlflow.ipynb": [
+      {
+        "hashed_secret": "5810b71c07271f259208c5790992170ac1e13b37",
+        "is_verified": false,
+        "line_number": 437,
+        "type": "Base64 High Entropy String",
+        "verified_result": null
+      },
+      {
+        "hashed_secret": "1c1dc227208cec78bbdb8d9247164879f908a9ad",
+        "is_verified": false,
+        "line_number": 482,
+        "type": "Base64 High Entropy String",
+        "verified_result": null
+      },
+      {
+        "hashed_secret": "e57967bc8f018a30bb192717673876f0ebdbe5d9",
+        "is_verified": false,
+        "line_number": 558,
+        "type": "Base64 High Entropy String",
+        "verified_result": null
+      }
+    ],
+    "plotting/plot_results_repeated_runs.ipynb": [
+      {
+        "hashed_secret": "e52b18568a4fa073b958134ea5ec0f9407b6ebc3",
+        "is_verified": false,
+        "line_number": 352,
+        "type": "Base64 High Entropy String",
+        "verified_result": null
+      },
+      {
+        "hashed_secret": "43cf2641021e5833120affd5a2bcdf35089eaf75",
+        "is_verified": false,
+        "line_number": 417,
+        "type": "Base64 High Entropy String",
+        "verified_result": null
+      },
+      {
+        "hashed_secret": "78f9a422a3afb6ff5aff30094699c2b299dfd614",
+        "is_verified": false,
+        "line_number": 949,
+        "type": "Base64 High Entropy String",
+        "verified_result": null
+      },
+      {
+        "hashed_secret": "2525429c7a93512ed0c4b799b867a83a6b19f7ff",
+        "is_verified": false,
+        "line_number": 1014,
+        "type": "Base64 High Entropy String",
+        "verified_result": null
+      },
+      {
+        "hashed_secret": "8915fab07d3bf85d3755089a7fc82e911405d40a",
+        "is_verified": false,
+        "line_number": 1080,
+        "type": "Base64 High Entropy String",
+        "verified_result": null
+      }
+    ]
+  },
+  "version": "0.13.1+ibm.61.dss",
+  "word_list": {
+    "file": null,
+    "hash": null
+  }
+}
diff --git a/benchmark/backbone_benchmark.py b/benchmark/backbone_benchmark.py
index 0eb1f94..b6cd52c 100644
--- a/benchmark/backbone_benchmark.py
+++ b/benchmark/backbone_benchmark.py
@@ -52,7 +52,6 @@ def benchmark_backbone_on_task(
     sampler: BaseSampler | None = None,
     test_models: bool = False,
 ) -> tuple[float, str | list[str] | None, dict[str, Any]]:
-
     optuna_db_path = Path(storage_uri).parents[0] / "optuna_db"
     if not os.path.exists(optuna_db_path):
         os.makedirs(optuna_db_path)
@@ -136,15 +135,15 @@ def benchmark_backbone_on_task(
             "early_stop_patience": str(training_spec.task.early_stop_patience),
             "partition_name": (
                 str(training_spec.task.datamodule.partition)
-                if hasattr(training_spec.task.datamodule, 'partition')
-                else 'default'
+                if hasattr(training_spec.task.datamodule, "partition")
+                else "default"
             ),
             "decoder": (
                 str(training_spec.task.terratorch_task["model_args"]["decoder"])
                 if "decoder" in training_spec.task.terratorch_task["model_args"]
-                else training_spec.task.terratorch_task['model_args']['framework']
+                else training_spec.task.terratorch_task["model_args"]["framework"]
             ),
-            "task": str(training_spec.task.type).split('.')[-1],
+            "task": str(training_spec.task.type).split(".")[-1],
             "backbone": str(
                 training_spec.task.terratorch_task["model_args"]["backbone"]
             ),
@@ -238,7 +237,7 @@ def _run_hpo(
                 PATH_TO_JOB_TRACKING
                 / f"{experiment_name}-{run.info.run_id}_table_entries.pkl"
             )
-            with open(table_entries_filename, 'wb') as handle:
+            with open(table_entries_filename, "wb") as handle:
                 pickle.dump(table_entries, handle, protocol=pickle.HIGHEST_PROTOCOL)
 
         table = tabulate(table_entries, headers=table_columns)
@@ -329,8 +328,9 @@ def benchmark_backbone(
 
     if backbone_import:
         importlib.import_module(backbone_import)
-
+    logger.info(f"Setting tracking URI: {storage_uri}")
     mlflow.set_tracking_uri(storage_uri)
+    logger.info(f"Setting experiment name: {experiment_name}")
     mlflow.set_experiment(experiment_name)
 
     optimization_space = parse_optimization_space(optimization_space)
@@ -380,7 +380,7 @@ def benchmark_backbone(
                 PATH_TO_JOB_TRACKING / f"{experiment_name}-{run_id}_table_entries.pkl"
             )
             if os.path.exists(table_entries_filename):
-                with open(table_entries_filename, 'rb') as handle:
+                with open(table_entries_filename, "rb") as handle:
                     table_entries = pickle.load(handle)
     else:
         logger.info("Starting new experiment from scratch")
diff --git a/benchmark/benchmark_ray.py b/benchmark/benchmark_ray.py
index 81eed60..4d3767e 100644
--- a/benchmark/benchmark_ray.py
+++ b/benchmark/benchmark_ray.py
@@ -166,7 +166,6 @@ def benchmark_backbone(
     with mlflow.start_run(
         run_name=run_name, run_id=run_id, description=description
     ) as run:
-
         if optimization_space is None:
             # no hparams, parallelize over tasks
             ray_tasks = []
diff --git a/benchmark/benchmark_types.py b/benchmark/benchmark_types.py
index f8f8d31..b9e6082 100644
--- a/benchmark/benchmark_types.py
+++ b/benchmark/benchmark_types.py
@@ -86,7 +86,7 @@ def __post_init__(self):
 
 
 optimization_space_type = dict[
-    str, Union[list, ParameterBounds, 'optimization_space_type']
+    str, Union[list, ParameterBounds, "optimization_space_type"]
 ]
 
 
@@ -146,7 +146,6 @@ class TrainingSpec:
 
 
 def recursive_merge(first_dict: dict[str, Any], second_dict: dict[str, Any]):
-
     # consider using deepmerge instead of this
     for key, val in second_dict.items():
         if key not in first_dict:
diff --git a/benchmark/main.py b/benchmark/main.py
index 39e2847..8bf7fd1 100644
--- a/benchmark/main.py
+++ b/benchmark/main.py
@@ -33,14 +33,14 @@ def _summarize(
     Returns:
         _type_: _description_
     """
-    assert (
-        hpo is False and repeat is False
-    ), f"Error! both {repeat=} and {hpo=} must be False when summarizing results from multiple experiments."
+    assert hpo is False and repeat is False, (
+        f"Error! both {repeat=} and {hpo=} must be False when summarizing results from multiple experiments."
+    )
 
     list_of_experiment_names = config_init.list_of_experiment_names
-    assert isinstance(
-        list_of_experiment_names, list
-    ), f"Error! {list_of_experiment_names=} is not a list"
+    assert isinstance(list_of_experiment_names, list), (
+        f"Error! {list_of_experiment_names=} is not a list"
+    )
     for exp in list_of_experiment_names:
         assert isinstance(exp, str), f"Error! {exp=} is not a str"
 
@@ -58,9 +58,9 @@ def _summarize(
     assert isinstance(benchmark_name, str), f"Error! {benchmark_name=} is not a str"
 
     run_repetitions = config_init.run_repetitions
-    assert (
-        isinstance(run_repetitions, int) and run_repetitions > 0
-    ), f"Error! {run_repetitions=} is invalid"
+    assert isinstance(run_repetitions, int) and run_repetitions > 0, (
+        f"Error! {run_repetitions=} is invalid"
+    )
     # get results and parameters from mlflow logs
     results_and_parameters = get_results_and_parameters(
         benchmark_name=benchmark_name,
@@ -108,9 +108,9 @@ def _repeat_experiment(
     output: str | None = config_init.output_path
     if output is None:
         storage_uri_path = Path(storage_uri)
-        assert (
-            storage_uri_path.exists() and storage_uri_path.is_dir()
-        ), f"Error! Unable to create new output_path based on storage_uri_path because the latter does not exist: {storage_uri_path}"
+        assert storage_uri_path.exists() and storage_uri_path.is_dir(), (
+            f"Error! Unable to create new output_path based on storage_uri_path because the latter does not exist: {storage_uri_path}"
+        )
         output_path = storage_uri_path.parents[0] / "repeated_exp_output_csv"
         output_path.mkdir(parents=True, exist_ok=True)
         output_path = output_path / f"{experiment_name}_repeated_exp_mlflow.csv"
@@ -148,16 +148,16 @@ def _convert_config(args: Namespace):
     3. Calls the `generate_iterate_config` function from the `build_iterate_config` module, passing the input path, output path, prefix (if provided), and template (if provided).
     """
     input: str = args.input
-    assert input is not None and isinstance(
-        input, str
-    ), f"Error! Invalid value: {input=}"
+    assert input is not None and isinstance(input, str), (
+        f"Error! Invalid value: {input=}"
+    )
     input_path = Path(input)
     assert input_path.exists()
 
     output: str = args.output
-    assert output is not None and isinstance(
-        output, str
-    ), f"Error! Invalid value: {output=}"
+    assert output is not None and isinstance(output, str), (
+        f"Error! Invalid value: {output=}"
+    )
     output_path = Path(output)
     template: str | None = args.template
 
@@ -168,28 +168,28 @@ def _convert_config(args: Namespace):
         input=input_path, output=output_path, prefix=prefix, template=template
     )
 
-def main():
 
+def main():
     parser = ArgumentParser()
 
-    parser.add_argument('--defaults', type=Defaults)  # to ignore model
-    parser.add_argument('--optimization_space', type=dict)  # to ignore model
-    parser.add_argument('--experiment_name', type=str)  # to ignore model
-    parser.add_argument('--run_name', type=str)  # to ignore model
-    parser.add_argument('--save_models', type=bool)  # to ignore model
-    parser.add_argument('--storage_uri', type=str)  # to ignore model
-    parser.add_argument('--ray_storage_path', type=str)  # to ignore model
-    parser.add_argument('--n_trials', type=int)  # to ignore model
-    parser.add_argument('--run_repetitions', type=int)  # to ignore model
-    parser.add_argument('--tasks', type=list[Task])
+    parser.add_argument("--defaults", type=Defaults)  # to ignore model
+    parser.add_argument("--optimization_space", type=dict)  # to ignore model
+    parser.add_argument("--experiment_name", type=str)  # to ignore model
+    parser.add_argument("--run_name", type=str)  # to ignore model
+    parser.add_argument("--save_models", type=bool)  # to ignore model
+    parser.add_argument("--storage_uri", type=str)  # to ignore model
+    parser.add_argument("--ray_storage_path", type=str)  # to ignore model
+    parser.add_argument("--n_trials", type=int)  # to ignore model
+    parser.add_argument("--run_repetitions", type=int)  # to ignore model
+    parser.add_argument("--tasks", type=list[Task])
     parser.add_argument("--parent_run_id", type=str)
     parser.add_argument("--output_path", type=str)
     parser.add_argument("--logger", type=str)
     parser.add_argument("--config", type=str)
-    parser.add_argument('--custom_modules_path', type=str)
-    parser.add_argument('--report_on_best_val', type=bool, default=True)
-    parser.add_argument('--test_models', type=bool, default=False)
-    parser.add_argument('--bayesian_search', type=bool, default=True)
+    parser.add_argument("--custom_modules_path", type=str)
+    parser.add_argument("--report_on_best_val", type=bool, default=True)
+    parser.add_argument("--test_models", type=bool, default=False)
+    parser.add_argument("--bayesian_search", type=bool, default=True)
     parser.add_argument("--hpo", help="optimize hyperparameters", action="store_true")
     parser.add_argument("--repeat", help="repeat best experiments", action="store_true")
     parser.add_argument(
@@ -202,11 +202,11 @@ def main():
         help="summarize results from repeated experiments",
         action="store_true",
     )
-    parser.add_argument('--list_of_experiment_names', type=list[str])
-    parser.add_argument('--task_names', type=list[str])
-    parser.add_argument('--task_metrics', type=list[str])
+    parser.add_argument("--list_of_experiment_names", type=list[str])
+    parser.add_argument("--task_names", type=list[str])
+    parser.add_argument("--task_metrics", type=list[str])
     parser.add_argument(
-        '--benchmark_name',
+        "--benchmark_name",
         type=str,
         help="name of summarized results file",
     )
@@ -238,10 +238,7 @@ def main():
     )
 
     args = parser.parse_args()
-    if (
-            args.build_iterate_config is not None
-            and args.build_iterate_config is True
-        ):
+    if args.build_iterate_config is not None and args.build_iterate_config is True:
         _convert_config(args)
     else:
         config_path: str | None = args.config
@@ -252,11 +249,10 @@ def main():
             """
             print(msg)
         else:
-            assert isinstance(
-                config_path, str
-            ), f"Error! Unexpected config type: {config_path}"
+            assert isinstance(config_path, str), (
+                f"Error! Unexpected config type: {config_path}"
+            )
             config = parser.parse_path(config_path)
-        
 
             config_init: Namespace = parser.instantiate_classes(config)
 
@@ -268,9 +264,9 @@ def main():
             assert isinstance(hpo, bool), f"Error! {hpo=} is not a bool"
 
             continue_existing_experiments: bool = args.continue_existing_experiments
-            assert isinstance(
-                continue_existing_experiments, bool
-            ), f"Error! {continue_existing_experiments=} is not a bool"
+            assert isinstance(continue_existing_experiments, bool), (
+                f"Error! {continue_existing_experiments=} is not a bool"
+            )
 
             storage_uri = config_init.storage_uri
             assert isinstance(storage_uri, str), f"Error! {storage_uri=} is not a str"
@@ -299,28 +295,28 @@ def main():
                 )
 
             # optimize hyperparameters and/or do repeated runs for single experiments
-            assert (
-                hpo is True or repeat is True
-            ), f"Error! either {repeat=} or {hpo=} must be True"
+            assert hpo is True or repeat is True, (
+                f"Error! either {repeat=} or {hpo=} must be True"
+            )
             parent_run_id = args.parent_run_id
             if parent_run_id is not None:
-                assert isinstance(
-                    parent_run_id, str
-                ), f"Error! {parent_run_id=} is not a str"
+                assert isinstance(parent_run_id, str), (
+                    f"Error! {parent_run_id=} is not a str"
+                )
 
             # validate the objects
             experiment_name = config_init.experiment_name
-            assert isinstance(
-                experiment_name, str
-            ), f"Error! {experiment_name=} is not a str"
+            assert isinstance(experiment_name, str), (
+                f"Error! {experiment_name=} is not a str"
+            )
             run_name = config_init.run_name
             if run_name is not None:
                 assert isinstance(run_name, str), f"Error! {run_name=} is not a str"
             # validate defaults
             defaults = config_init.defaults
-            assert isinstance(
-                defaults, Defaults
-            ), f"Error! {defaults=} is not a Defaults"
+            assert isinstance(defaults, Defaults), (
+                f"Error! {defaults=} is not a Defaults"
+            )
 
             tasks = config_init.tasks
             assert isinstance(tasks, list), f"Error! {tasks=} is not a list"
@@ -332,27 +328,27 @@ def main():
             # defaults.trainer_args["max_epochs"] = 5
 
             optimization_space = config_init.optimization_space
-            assert isinstance(
-                optimization_space, dict
-            ), f"Error! {optimization_space=} is not a dict"
+            assert isinstance(optimization_space, dict), (
+                f"Error! {optimization_space=} is not a dict"
+            )
 
             # ray_storage_path is optional
             ray_storage_path = config_init.ray_storage_path
             if ray_storage_path is not None:
-                assert isinstance(
-                    ray_storage_path, str
-                ), f"Error! {ray_storage_path=} is not a str"
+                assert isinstance(ray_storage_path, str), (
+                    f"Error! {ray_storage_path=} is not a str"
+                )
 
             n_trials = config_init.n_trials
-            assert (
-                isinstance(n_trials, int) and n_trials > 0
-            ), f"Error! {n_trials=} is invalid"
+            assert isinstance(n_trials, int) and n_trials > 0, (
+                f"Error! {n_trials=} is invalid"
+            )
             run_repetitions = config_init.run_repetitions
 
             report_on_best_val = config_init.report_on_best_val
-            assert isinstance(
-                report_on_best_val, bool
-            ), f"Error! {ray_storage_path=} is not a bool"
+            assert isinstance(report_on_best_val, bool), (
+                f"Error! {ray_storage_path=} is not a bool"
+            )
 
             save_models = config_init.save_models
             assert isinstance(save_models, bool), f"Error! {save_models=} is not a bool"
@@ -361,16 +357,16 @@ def main():
             assert isinstance(test_models, bool), f"Error! {test_models=} is not a bool"
 
             bayesian_search = config_init.bayesian_search
-            assert isinstance(
-                bayesian_search, bool
-            ), f"Error! {bayesian_search=} is not a bool"
+            assert isinstance(bayesian_search, bool), (
+                f"Error! {bayesian_search=} is not a bool"
+            )
 
             # custom_modules_path is optional
             custom_modules_path = config_init.custom_modules_path
             if custom_modules_path is not None:
-                assert isinstance(
-                    custom_modules_path, str
-                ), f"Error! {custom_modules_path=} is not a str"
+                assert isinstance(custom_modules_path, str), (
+                    f"Error! {custom_modules_path=} is not a str"
+                )
                 import_custom_modules(
                     logger=logger, custom_modules_path=custom_modules_path
                 )
diff --git a/benchmark/model_fitting.py b/benchmark/model_fitting.py
index f2b7544..d8502cb 100644
--- a/benchmark/model_fitting.py
+++ b/benchmark/model_fitting.py
@@ -119,9 +119,9 @@ def __init__(self, *args, **kwargs):
 def inject_hparams(training_spec: TrainingSpec, config: dict):
     # treat batch size specially
     config_without_batch_size = copy.deepcopy(config)
-    assert isinstance(
-        config_without_batch_size, dict
-    ), f"Error! Unexpected config type: {config_without_batch_size}"
+    assert isinstance(config_without_batch_size, dict), (
+        f"Error! Unexpected config type: {config_without_batch_size}"
+    )
     batch_size: int | None = config_without_batch_size.pop("batch_size", None)  # type: ignore
     datamodule_with_generated_hparams = copy.deepcopy(training_spec.task.datamodule)
     if batch_size:
@@ -310,9 +310,9 @@ def launch_training(
             ["metric_name", "step"], verify_integrity=True
         )
         series_val_metrics = df_val_metrics["value"]
-        assert (
-            metric in series_val_metrics
-        ), f"Error! {metric} is not in {series_val_metrics}"
+        assert metric in series_val_metrics, (
+            f"Error! {metric} is not in {series_val_metrics}"
+        )
         if direction == "max":
             best_step = series_val_metrics[metric].idxmax()
         elif direction == "min":
@@ -351,9 +351,9 @@ def fit_model(
         PixelwiseRegressionTask,
     ]:
         task.terratorch_task["plot_on_val"] = False
-    assert isinstance(
-        task.terratorch_task, dict
-    ), f"Error! Invalid type: {task.terratorch_task}"
+    assert isinstance(task.terratorch_task, dict), (
+        f"Error! Invalid type: {task.terratorch_task}"
+    )
 
     lightning_task = lightning_task_class(**task.terratorch_task)
 
@@ -454,9 +454,7 @@ def fit_model_with_hparams(
         trial,
         save_models=save_models,
         test_models=test_models,
-    )[
-        0
-    ]  # return only the metric value for optuna
+    )[0]  # return only the metric value for optuna
 
 
 """
@@ -476,7 +474,6 @@ def ray_tune_model(
     backbone_import: str | None = None,
     searcher: Searcher | SearchAlgorithm | None = None,
 ) -> tune.ResultGrid:
-
     if not searcher:
         raise ValueError("searcher must be specified")
     trainable = tune.with_parameters(
diff --git a/benchmark/repeat_best_experiment.py b/benchmark/repeat_best_experiment.py
index 273132f..6fbcf3f 100644
--- a/benchmark/repeat_best_experiment.py
+++ b/benchmark/repeat_best_experiment.py
@@ -53,7 +53,6 @@ def remote_fit(
         run_name=f"{lightning_task_class.name}_{seed}",
         nested=True,
     ):
-
         training_spec_copy = copy.deepcopy(training_spec)
         training_spec_with_generated_hparams = inject_hparams(
             training_spec_copy, best_params
@@ -78,9 +77,7 @@ def remote_fit(
         # get callbacks (set to empty list if none defined) and extend with default ones
         training_spec_with_generated_hparams.trainer_args.setdefault(
             "callbacks", []
-        ).extend(
-            default_callbacks
-        )  # type: ignore
+        ).extend(default_callbacks)  # type: ignore
         if "enable_checkpointing" in training_spec_with_generated_hparams.trainer_args:
             warnings.warn(
                 "enable_checkpointing found. Will be overwritten to False as ray will be responsible for saving models."
@@ -105,8 +102,8 @@ def remote_fit(
 
         test_metric = (
             "test/" + task.metric.split("/")[1]
-            if '/' in task.metric
-            else 'test_' + task.metric.replace(task.metric.split('_')[0] + "_", '')
+            if "/" in task.metric
+            else "test_" + task.metric.replace(task.metric.split("_")[0] + "_", "")
         )
         mlflow.log_metric(f"test_{test_metric}", metrics[test_metric])
         return metrics[test_metric]
@@ -179,9 +176,7 @@ def non_remote_fit(
         # get callbacks (set to empty list if none defined) and extend with default ones
         training_spec_with_generated_hparams.trainer_args.setdefault(
             "callbacks", []
-        ).extend(
-            default_callbacks
-        )  # type: ignore
+        ).extend(default_callbacks)  # type: ignore
 
         trainer = Trainer(**training_spec_with_generated_hparams.trainer_args)
         trainer.logger = MLFlowLogger(
@@ -217,8 +212,8 @@ def non_remote_fit(
         #        return None
         test_metric = (
             "test/" + task.metric.split("/")[1]
-            if '/' in task.metric
-            else 'test_' + task.metric.replace(task.metric.split('_')[0] + "_", '')
+            if "/" in task.metric
+            else "test_" + task.metric.replace(task.metric.split("_")[0] + "_", "")
         )
         mlflow.log_metric(f"test_{test_metric}", metrics[test_metric])
         return metrics[test_metric]
@@ -303,7 +298,9 @@ def rerun_best_from_backbone(
     with mlflow.start_run(run_name=experiment_name, run_id=None) as run:
         for task in tasks:
             logger.info(f"\n\ntask: {task.name}")
-            matching_runs = [run for run in runs if run.info.run_name.endswith(task.name)]  # type: ignore
+            matching_runs = [
+                run for run in runs if run.info.run_name.endswith(task.name)
+            ]  # type: ignore
             if len(matching_runs) == 0:
                 msg = f"No runs found for task {task.name}. Skipping."
                 warnings.warn(msg)
@@ -442,7 +439,7 @@ def rerun_best_from_backbone(
                             )
                             existing_output.reset_index(inplace=True)
                             existing_output = existing_output.drop(
-                                columns=['index', 'level_0']
+                                columns=["index", "level_0"]
                             )
                             existing_output.to_csv(output_path, index=False)
                         else:
diff --git a/benchmark/utils.py b/benchmark/utils.py
index 8c8a7f8..0d010bb 100644
--- a/benchmark/utils.py
+++ b/benchmark/utils.py
@@ -71,9 +71,9 @@ def sync_mlflow_optuna(
         all_mlflow_runs_for_task.append(task_run_id)
         logger.info(f"task_run_id : {task_run_id}")
         experiment_info = client.get_experiment_by_name(experiment_name)
-        assert isinstance(
-            experiment_info, Experiment
-        ), f"Error! Unexpected type of {experiment_info=}"
+        assert isinstance(experiment_info, Experiment), (
+            f"Error! Unexpected type of {experiment_info=}"
+        )
         individual_run_data = client.search_runs(
             experiment_ids=[experiment_info.experiment_id],
             filter_string=f'tags."mlflow.parentRunId" LIKE "{task_run_id}"',
@@ -124,9 +124,9 @@ def sync_mlflow_optuna(
                 for item in all_mlflow_runs_for_task:
                     logger.info(f"deleting {item}")
                     client.delete_run(item)
-                    assert isinstance(
-                        experiment_info, Experiment
-                    ), f"Error! Unexpected type of {experiment_info=}"
+                    assert isinstance(experiment_info, Experiment), (
+                        f"Error! Unexpected type of {experiment_info=}"
+                    )
                     os.system(f"rm -r {experiment_info.artifact_location}/{item}")
                     task_run_id = None
     else:
@@ -135,9 +135,9 @@ def sync_mlflow_optuna(
             for item in all_mlflow_runs_for_task:
                 logger.info(f"deleting {item}")
                 client.delete_run(item)
-                assert isinstance(
-                    experiment_info, Experiment
-                ), f"Error! Unexpected type of {experiment_info=}"
+                assert isinstance(experiment_info, Experiment), (
+                    f"Error! Unexpected type of {experiment_info=}"
+                )
                 os.system(f"rm -r {experiment_info.artifact_location}/{item}")
             task_run_id = None
     return task_run_id
@@ -211,7 +211,7 @@ def extract_repeated_experiment_results(
                 seed = int(run.info.run_name.split("_")[-1])
                 if task in task_info:
                     metric_name = task_info[task]
-                    metric_name = 'test_test/' + metric_name.split("/")[-1]
+                    metric_name = "test_test/" + metric_name.split("/")[-1]
                 else:
                     continue
 
@@ -350,19 +350,19 @@ def extract_parameters(
             best_params["data_percentages"] = DATA_PARTITIONS[
                 best_params["partition_name"]
             ]
-            if 'optimizer_hparams' in best_params:
+            if "optimizer_hparams" in best_params:
                 logger.info(
                     f"optimizer_hparams: {best_params['optimizer_hparams'].items()}"
                 )
                 optimizer_hparams = {
-                    k: v for k, v in best_params['optimizer_hparams'].items()
+                    k: v for k, v in best_params["optimizer_hparams"].items()
                 }
                 best_params.update(optimizer_hparams)
-                del best_params['optimizer_hparams']
-            if 'model_args' in best_params:
-                model_args = {k: v for k, v in best_params['model_args'].items()}
+                del best_params["optimizer_hparams"]
+            if "model_args" in best_params:
+                model_args = {k: v for k, v in best_params["model_args"].items()}
                 best_params.update(model_args)
-                del best_params['model_args']
+                del best_params["model_args"]
 
             best_params = pd.DataFrame(best_params, index=[0])
             all_params.append(best_params)
@@ -421,11 +421,11 @@ def get_results_and_parameters(
         task_metrics=task_metrics,
     )
 
-    with open(f"{results_dir}/incomplete_experiments.txt", 'w') as f:
+    with open(f"{results_dir}/incomplete_experiments.txt", "w") as f:
         for line in incomplete_experiments:
             f.write(f"{line}\n")
     results_and_parameters = results.merge(
-        parameters, on=['experiment_name', 'dataset']
+        parameters, on=["experiment_name", "dataset"]
     )
     results_and_parameters.to_csv(
         f"{str(results_dir)}/results_and_parameters.csv", index=False
@@ -790,7 +790,7 @@ def get_logger(log_level="INFO", log_folder="./experiment_logs") -> logging.Root
     handler = logging.FileHandler(log_file)
     handler.setLevel(log_level)
     formatter = logging.Formatter(
-        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
     )
     handler.setFormatter(formatter)
     logger.addHandler(handler)
@@ -802,13 +802,10 @@ def import_custom_modules(
     logger: logging.RootLogger,
     custom_modules_path: str | Path | None = None,
 ) -> None:
-
     if custom_modules_path:
-
         custom_modules_path = Path(custom_modules_path)
 
         if custom_modules_path.is_dir():
-
             # Add 'custom_modules' folder to sys.path
             workdir = custom_modules_path.parents[0]
             module_dir = custom_modules_path.name
diff --git a/plotting/plot_results_mlflow.ipynb b/plotting/plot_results_mlflow.ipynb
index 407e568..5d3a752 100644
--- a/plotting/plot_results_mlflow.ipynb
+++ b/plotting/plot_results_mlflow.ipynb
@@ -19,16 +19,24 @@
    "outputs": [],
    "source": [
     "def add_means_to_df(df, classification_datasets, segmentation_datasets):\n",
-    "    class_means = df[df[\"Task\"].isin(classification_datasets)].groupby('Model', as_index=False).agg({'Best Score': 'mean'})\n",
-    "    seg_means = df[df[\"Task\"].isin(segmentation_datasets)].groupby('Model', as_index=False).agg({'Best Score': 'mean'})\n",
+    "    class_means = (\n",
+    "        df[df[\"Task\"].isin(classification_datasets)]\n",
+    "        .groupby(\"Model\", as_index=False)\n",
+    "        .agg({\"Best Score\": \"mean\"})\n",
+    "    )\n",
+    "    seg_means = (\n",
+    "        df[df[\"Task\"].isin(segmentation_datasets)]\n",
+    "        .groupby(\"Model\", as_index=False)\n",
+    "        .agg({\"Best Score\": \"mean\"})\n",
+    "    )\n",
     "\n",
-    "    class_means['Task'] = 'Classification Mean'\n",
-    "    class_means['Metric'] = 'Mean'  # You can adjust this as needed\n",
-    "    class_means['Hyperparameters'] = None  # Or fill with appropriate value\n",
+    "    class_means[\"Task\"] = \"Classification Mean\"\n",
+    "    class_means[\"Metric\"] = \"Mean\"  # You can adjust this as needed\n",
+    "    class_means[\"Hyperparameters\"] = None  # Or fill with appropriate value\n",
     "\n",
-    "    seg_means['Task'] = 'Segmentation Mean'\n",
-    "    seg_means['Metric'] = 'Mean'  # You can adjust this as needed\n",
-    "    seg_means['Hyperparameters'] = None  # Or fill with appropriate value\n",
+    "    seg_means[\"Task\"] = \"Segmentation Mean\"\n",
+    "    seg_means[\"Metric\"] = \"Mean\"  # You can adjust this as needed\n",
+    "    seg_means[\"Hyperparameters\"] = None  # Or fill with appropriate value\n",
     "\n",
     "    df = pd.concat([df, class_means, seg_means], ignore_index=True)\n",
     "    return df"
@@ -40,24 +48,60 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "results_prithvi_subset_10 = pd.read_json(\"results_table_prithvi_subset_10.json\", orient=\"split\")\n",
-    "results_prithvi_swin_b_new_subset_10 = pd.read_json(\"results_table_prithvi_b_subset_new.json\", orient=\"split\")\n",
-    "results_scratch_subset_10 = pd.read_json(\"results_table_scratch_subset_10.json\", orient=\"split\")\n",
-    "results_imagenet_subset_10 = pd.read_json(\"results_table_imagenet_subset_10.json\", orient=\"split\")\n",
-    "results_imagenet_resnet_subset_10 = pd.read_json(\"results_table_resnet_rgb.json\", orient=\"split\")\n",
-    "results_prithvi_l_subset_10_old = pd.read_json(\"results_table_prithvi_l_subset_fixed.json\", orient=\"split\")\n",
-    "results_prithvi_l_subset_10 = pd.read_json(\"results_table_prithvi_l_subset_fixed_new.json\", orient=\"split\")\n",
-    "results_prithvi_h_subset_10 = pd.read_json(\"results_table_prithvi_h_subset.json\", orient=\"split\")\n",
-    "results_prithvi_l_subset_10_mask = pd.read_json(\"results_table_prithvi_l_subset_mask.json\", orient=\"split\")\n",
-    "results_prithvi_l_subset_10_fp32 = pd.read_json(\"results_table_prithvi_l_subset_fp32.json\", orient=\"split\")\n",
-    "prithvi_l_subset_coords_10 = pd.read_json(\"results_table_prithvi_l_subset_coords_pre_no_ft.json\", orient=\"split\")\n",
-    "prithvi_swin_l_subset_10 = pd.read_json(\"results_table_prithvi_swin_l_subset.json\", orient=\"split\")\n",
-    "results_prithvi_l_full_pretrain_subset_10 = pd.read_json(\"results_table_prithvi_l_full_pretrain_subset.json\", orient=\"split\")\n",
-    "results_prithvi_b_subset_10 = pd.read_json(\"results_table_prithvi_vit_b_subset.json\", orient=\"split\")\n",
-    "results_prithvi_b_subset_10_new = pd.read_json(\"results_table_prithvi_vit_b_subset_new.json\", orient=\"split\")\n",
-    "results_prithvi_b_os_subset_10 = pd.read_json(\"results_table_vit_b_os_subset.json\", orient=\"split\")\n",
-    "results_prithvi_3d_subset_10 = pd.read_json(\"results_table_swin_3d_subset_10.json\", orient=\"split\")\n",
-    "results_scalemae_subset_10 = pd.read_json(\"results_table_scalemae_subset.json\", orient=\"split\")\n",
+    "results_prithvi_subset_10 = pd.read_json(\n",
+    "    \"results_table_prithvi_subset_10.json\", orient=\"split\"\n",
+    ")\n",
+    "results_prithvi_swin_b_new_subset_10 = pd.read_json(\n",
+    "    \"results_table_prithvi_b_subset_new.json\", orient=\"split\"\n",
+    ")\n",
+    "results_scratch_subset_10 = pd.read_json(\n",
+    "    \"results_table_scratch_subset_10.json\", orient=\"split\"\n",
+    ")\n",
+    "results_imagenet_subset_10 = pd.read_json(\n",
+    "    \"results_table_imagenet_subset_10.json\", orient=\"split\"\n",
+    ")\n",
+    "results_imagenet_resnet_subset_10 = pd.read_json(\n",
+    "    \"results_table_resnet_rgb.json\", orient=\"split\"\n",
+    ")\n",
+    "results_prithvi_l_subset_10_old = pd.read_json(\n",
+    "    \"results_table_prithvi_l_subset_fixed.json\", orient=\"split\"\n",
+    ")\n",
+    "results_prithvi_l_subset_10 = pd.read_json(\n",
+    "    \"results_table_prithvi_l_subset_fixed_new.json\", orient=\"split\"\n",
+    ")\n",
+    "results_prithvi_h_subset_10 = pd.read_json(\n",
+    "    \"results_table_prithvi_h_subset.json\", orient=\"split\"\n",
+    ")\n",
+    "results_prithvi_l_subset_10_mask = pd.read_json(\n",
+    "    \"results_table_prithvi_l_subset_mask.json\", orient=\"split\"\n",
+    ")\n",
+    "results_prithvi_l_subset_10_fp32 = pd.read_json(\n",
+    "    \"results_table_prithvi_l_subset_fp32.json\", orient=\"split\"\n",
+    ")\n",
+    "prithvi_l_subset_coords_10 = pd.read_json(\n",
+    "    \"results_table_prithvi_l_subset_coords_pre_no_ft.json\", orient=\"split\"\n",
+    ")\n",
+    "prithvi_swin_l_subset_10 = pd.read_json(\n",
+    "    \"results_table_prithvi_swin_l_subset.json\", orient=\"split\"\n",
+    ")\n",
+    "results_prithvi_l_full_pretrain_subset_10 = pd.read_json(\n",
+    "    \"results_table_prithvi_l_full_pretrain_subset.json\", orient=\"split\"\n",
+    ")\n",
+    "results_prithvi_b_subset_10 = pd.read_json(\n",
+    "    \"results_table_prithvi_vit_b_subset.json\", orient=\"split\"\n",
+    ")\n",
+    "results_prithvi_b_subset_10_new = pd.read_json(\n",
+    "    \"results_table_prithvi_vit_b_subset_new.json\", orient=\"split\"\n",
+    ")\n",
+    "results_prithvi_b_os_subset_10 = pd.read_json(\n",
+    "    \"results_table_vit_b_os_subset.json\", orient=\"split\"\n",
+    ")\n",
+    "results_prithvi_3d_subset_10 = pd.read_json(\n",
+    "    \"results_table_swin_3d_subset_10.json\", orient=\"split\"\n",
+    ")\n",
+    "results_scalemae_subset_10 = pd.read_json(\n",
+    "    \"results_table_scalemae_subset.json\", orient=\"split\"\n",
+    ")\n",
     "results_satlas_subset_10 = pd.read_json(\"results_satlas_subset_10.json\", orient=\"split\")"
    ]
   },
@@ -67,7 +111,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "results_prithvi_subset_10[\"Model\"] = \"Prithvi Swin B (Old Training Dataset - New Dataset model training currently)\""
+    "results_prithvi_subset_10[\"Model\"] = (\n",
+    "    \"Prithvi Swin B (Old Training Dataset - New Dataset model training currently)\"\n",
+    ")"
    ]
   },
   {
@@ -103,7 +149,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "results_prithvi_l_full_pretrain_subset_10[\"Model\"] = \"Prithvi ViT L (New Training Dataset)\""
+    "results_prithvi_l_full_pretrain_subset_10[\"Model\"] = (\n",
+    "    \"Prithvi ViT L (New Training Dataset)\"\n",
+    ")"
    ]
   },
   {
@@ -112,7 +160,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "results_prithvi_l_subset_10[\"Model\"] = \"Prithvi ViT L (1/3 pretraining, Mask ratio 0.75)\""
+    "results_prithvi_l_subset_10[\"Model\"] = (\n",
+    "    \"Prithvi ViT L (1/3 pretraining, Mask ratio 0.75)\"\n",
+    ")"
    ]
   },
   {
@@ -121,7 +171,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "results_prithvi_h_subset_10[\"Model\"] = \"Prithvi ViT H (1/3 pretraining, Mask ratio 0.75)\""
+    "results_prithvi_h_subset_10[\"Model\"] = (\n",
+    "    \"Prithvi ViT H (1/3 pretraining, Mask ratio 0.75)\"\n",
+    ")"
    ]
   },
   {
@@ -130,7 +182,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "results_prithvi_l_subset_10_mask[\"Model\"] = \"Prithvi ViT L (1/3 pretraining, Mask Ratio 0.9)\""
+    "results_prithvi_l_subset_10_mask[\"Model\"] = (\n",
+    "    \"Prithvi ViT L (1/3 pretraining, Mask Ratio 0.9)\"\n",
+    ")"
    ]
   },
   {
@@ -238,8 +292,25 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = pd.concat([results_scratch_subset_10, results_imagenet_subset_10, results_imagenet_resnet_subset_10, results_scalemae_subset_10, results_prithvi_subset_10, results_prithvi_swin_b_new_subset_10, prithvi_swin_l_subset_10, results_prithvi_3d_subset_10, results_prithvi_h_subset_10, results_prithvi_l_full_pretrain_subset_10, results_prithvi_l_subset_10_fp32, prithvi_l_subset_coords_10, results_prithvi_b_subset_10_new, results_prithvi_b_os_subset_10, results_satlas_subset_10\n",
-    "])"
+    "df = pd.concat(\n",
+    "    [\n",
+    "        results_scratch_subset_10,\n",
+    "        results_imagenet_subset_10,\n",
+    "        results_imagenet_resnet_subset_10,\n",
+    "        results_scalemae_subset_10,\n",
+    "        results_prithvi_subset_10,\n",
+    "        results_prithvi_swin_b_new_subset_10,\n",
+    "        prithvi_swin_l_subset_10,\n",
+    "        results_prithvi_3d_subset_10,\n",
+    "        results_prithvi_h_subset_10,\n",
+    "        results_prithvi_l_full_pretrain_subset_10,\n",
+    "        results_prithvi_l_subset_10_fp32,\n",
+    "        prithvi_l_subset_coords_10,\n",
+    "        results_prithvi_b_subset_10_new,\n",
+    "        results_prithvi_b_os_subset_10,\n",
+    "        results_satlas_subset_10,\n",
+    "    ]\n",
+    ")"
    ]
   },
   {
@@ -248,13 +319,35 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "name_mapping = {\"big_earth_net\": \"m-bigearthnet\", \"brick_kiln\": \"m-brick-kiln\", \"eurosat\": \"m-eurosat\", \"forestnet\": \"m-forestnet\", \"pv4ger\": \"m-pv4ger\", \"so2sat\": \"m-so2sat\", \"neontree\": \"m-NeonTree\", \"sa_crop_type\": \"m-SA-crop-type\", \"cashew\": \"m-cashew-plant\", \"chesapeake\": \"m-chesapeake\", \"nz_cattle\": \"m-nz-cattle\", \"pv4ger_seg\": \"m-pv4ger-seg\"}\n",
+    "name_mapping = {\n",
+    "    \"big_earth_net\": \"m-bigearthnet\",\n",
+    "    \"brick_kiln\": \"m-brick-kiln\",\n",
+    "    \"eurosat\": \"m-eurosat\",\n",
+    "    \"forestnet\": \"m-forestnet\",\n",
+    "    \"pv4ger\": \"m-pv4ger\",\n",
+    "    \"so2sat\": \"m-so2sat\",\n",
+    "    \"neontree\": \"m-NeonTree\",\n",
+    "    \"sa_crop_type\": \"m-SA-crop-type\",\n",
+    "    \"cashew\": \"m-cashew-plant\",\n",
+    "    \"chesapeake\": \"m-chesapeake\",\n",
+    "    \"nz_cattle\": \"m-nz-cattle\",\n",
+    "    \"pv4ger_seg\": \"m-pv4ger-seg\",\n",
+    "}\n",
     "df[\"Task\"] = df[\"Task\"].map(name_mapping)\n",
     "\n",
-    "classification_datasets = [\"m-bigearthnet\", \"m-brick-kiln\", \"m-eurosat\", \"m-forestnet\", \"m-pv4ger\", \"m-so2sat\"]\n",
+    "classification_datasets = [\n",
+    "    \"m-bigearthnet\",\n",
+    "    \"m-brick-kiln\",\n",
+    "    \"m-eurosat\",\n",
+    "    \"m-forestnet\",\n",
+    "    \"m-pv4ger\",\n",
+    "    \"m-so2sat\",\n",
+    "]\n",
     "# exclude bigearthnet for now\n",
     "# classification_datasets = [\"m-brick-kiln\", \"m-eurosat\", \"m-forestnet\", \"m-pv4ger\", \"m-so2sat\"]\n",
-    "segmentation_datasets = list(set(df[\"Task\"].unique().tolist()) - set(classification_datasets))\n"
+    "segmentation_datasets = list(\n",
+    "    set(df[\"Task\"].unique().tolist()) - set(classification_datasets)\n",
+    ")"
    ]
   },
   {
@@ -272,7 +365,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "\n",
     "dataset_res = {\n",
     "    \"m-bigearthnet\": \"10m\",\n",
     "    \"m-so2sat\": \"10m\",\n",
@@ -285,7 +377,7 @@
     "    \"m-cashew-plant\": \"10m\",\n",
     "    \"m-SA-crop-type\": \"10m\",\n",
     "    \"m-nz-cattle\": \"0.1m\",\n",
-    "    \"m-NeonTree\": \"0.1m\"\n",
+    "    \"m-NeonTree\": \"0.1m\",\n",
     "}\n",
     "\n",
     "dataset_instrument = {\n",
@@ -300,7 +392,7 @@
     "    \"m-cashew-plant\": \"S2\",\n",
     "    \"m-SA-crop-type\": \"S2\",\n",
     "    \"m-nz-cattle\": \"RGB\",\n",
-    "    \"m-NeonTree\": \"RGB + Hyper\"\n",
+    "    \"m-NeonTree\": \"RGB + Hyper\",\n",
     "}\n",
     "\n",
     "img_size = {\n",
@@ -315,14 +407,16 @@
     "    \"m-cashew-plant\": 256,\n",
     "    \"m-SA-crop-type\": 256,\n",
     "    \"m-nz-cattle\": 500,\n",
-    "    \"m-NeonTree\": 400\n",
+    "    \"m-NeonTree\": 400,\n",
     "}\n",
     "\n",
-    "dataset_name_map = {name: f\"{name}\\n {dataset_instrument[name]} @ {dataset_res[name]}\" for name in img_size.keys()}\n",
+    "dataset_name_map = {\n",
+    "    name: f\"{name}\\n {dataset_instrument[name]} @ {dataset_res[name]}\"\n",
+    "    for name in img_size.keys()\n",
+    "}\n",
     "dataset_name_map[\"Segmentation Mean\"] = \"Segmentation Mean\"\n",
     "dataset_name_map[\"Classification Mean\"] = \"Classification Mean\"\n",
-    "df[\"Task\"] = df[\"Task\"].map(dataset_name_map)\n",
-    "\n"
+    "df[\"Task\"] = df[\"Task\"].map(dataset_name_map)"
    ]
   },
   {
@@ -396,9 +490,15 @@
    ],
    "source": [
     "g = sns.catplot(\n",
-    "    data=df, kind=\"bar\",\n",
-    "    x=\"Task\", y=\"Best Score\", hue=\"Model\",\n",
-    "    errorbar=\"sd\", palette=\"dark\", alpha=.6, height=10\n",
+    "    data=df,\n",
+    "    kind=\"bar\",\n",
+    "    x=\"Task\",\n",
+    "    y=\"Best Score\",\n",
+    "    hue=\"Model\",\n",
+    "    errorbar=\"sd\",\n",
+    "    palette=\"dark\",\n",
+    "    alpha=0.6,\n",
+    "    height=10,\n",
     ")\n",
     "g.despine(left=True)\n",
     "g.set_axis_labels(\"Dataset\", \"Metric\")\n",
@@ -413,8 +513,18 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df_subset = pd.concat([results_imagenet_subset_10, results_imagenet_resnet_subset_10, results_scalemae_subset_10, results_prithvi_subset_10, results_prithvi_swin_b_new_subset_10, results_prithvi_l_full_pretrain_subset_10,results_prithvi_b_subset_10_new, results_satlas_subset_10\n",
-    "])\n",
+    "df_subset = pd.concat(\n",
+    "    [\n",
+    "        results_imagenet_subset_10,\n",
+    "        results_imagenet_resnet_subset_10,\n",
+    "        results_scalemae_subset_10,\n",
+    "        results_prithvi_subset_10,\n",
+    "        results_prithvi_swin_b_new_subset_10,\n",
+    "        results_prithvi_l_full_pretrain_subset_10,\n",
+    "        results_prithvi_b_subset_10_new,\n",
+    "        results_satlas_subset_10,\n",
+    "    ]\n",
+    ")\n",
     "df_subset[\"Task\"] = df_subset[\"Task\"].map(name_mapping)\n",
     "df_subset = add_means_to_df(df_subset, classification_datasets, segmentation_datasets)\n",
     "df_subset[\"Task\"] = df_subset[\"Task\"].map(dataset_name_map)"
@@ -456,9 +566,15 @@
    ],
    "source": [
     "g = sns.catplot(\n",
-    "    data=df_subset, kind=\"bar\",\n",
-    "    x=\"Task\", y=\"Best Score\", hue=\"Model\",\n",
-    "    errorbar=\"sd\", palette=\"dark\", alpha=.6, height=10\n",
+    "    data=df_subset,\n",
+    "    kind=\"bar\",\n",
+    "    x=\"Task\",\n",
+    "    y=\"Best Score\",\n",
+    "    hue=\"Model\",\n",
+    "    errorbar=\"sd\",\n",
+    "    palette=\"dark\",\n",
+    "    alpha=0.6,\n",
+    "    height=10,\n",
     ")\n",
     "g.despine(left=True)\n",
     "g.set_axis_labels(\"Dataset\", \"Metric\")\n",
diff --git a/plotting/plot_results_repeated_runs.ipynb b/plotting/plot_results_repeated_runs.ipynb
index 5e5080d..bcb8f85 100644
--- a/plotting/plot_results_repeated_runs.ipynb
+++ b/plotting/plot_results_repeated_runs.ipynb
@@ -38,8 +38,7 @@
     "from matplotlib.ticker import FormatStrFormatter\n",
     "import json\n",
     "from scipy.stats import trim_mean\n",
-    "import plot_tools\n",
-    "\n"
+    "import plot_tools"
    ]
   },
   {
@@ -48,7 +47,20 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "name_mapping = {\"big_earth_net\": \"m-bigearthnet\", \"brick_kiln\": \"m-brick-kiln\", \"eurosat\": \"m-eurosat\", \"forestnet\": \"m-forestnet\", \"pv4ger\": \"m-pv4ger\", \"so2sat\": \"m-so2sat\", \"neontree\": \"m-NeonTree\", \"sa_crop_type\": \"m-SA-crop-type\", \"cashew\": \"m-cashew-plant\", \"chesapeake\": \"m-chesapeake\", \"nz_cattle\": \"m-nz-cattle\", \"pv4ger_seg\": \"m-pv4ger-seg\"}"
+    "name_mapping = {\n",
+    "    \"big_earth_net\": \"m-bigearthnet\",\n",
+    "    \"brick_kiln\": \"m-brick-kiln\",\n",
+    "    \"eurosat\": \"m-eurosat\",\n",
+    "    \"forestnet\": \"m-forestnet\",\n",
+    "    \"pv4ger\": \"m-pv4ger\",\n",
+    "    \"so2sat\": \"m-so2sat\",\n",
+    "    \"neontree\": \"m-NeonTree\",\n",
+    "    \"sa_crop_type\": \"m-SA-crop-type\",\n",
+    "    \"cashew\": \"m-cashew-plant\",\n",
+    "    \"chesapeake\": \"m-chesapeake\",\n",
+    "    \"nz_cattle\": \"m-nz-cattle\",\n",
+    "    \"pv4ger_seg\": \"m-pv4ger-seg\",\n",
+    "}"
    ]
   },
   {
@@ -73,7 +85,9 @@
     "prithvi_os_results = pd.read_csv(\"prithvi_vit_os.csv\", index_col=\"Unnamed: 0\")\n",
     "prithvi_os_results[\"Backbone\"] = \"prithvi-eo-hls-100m-vit-os\"\n",
     "\n",
-    "prithvi_results = pd.concat([prithvi_results, prithvi_os_results, prithvi_global_results], ignore_index=True)"
+    "prithvi_results = pd.concat(\n",
+    "    [prithvi_results, prithvi_os_results, prithvi_global_results], ignore_index=True\n",
+    ")"
    ]
   },
   {
@@ -82,7 +96,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "prithvi_results = prithvi_results.rename(columns={\"Task\": \"dataset\", \"Backbone\": \"model\", \"Score\": \"test metric\"})\n",
+    "prithvi_results = prithvi_results.rename(\n",
+    "    columns={\"Task\": \"dataset\", \"Backbone\": \"model\", \"Score\": \"test metric\"}\n",
+    ")\n",
     "prithvi_results[\"partition name\"] = \"1.00x train\"\n",
     "prithvi_results[\"dataset\"] = prithvi_results[\"dataset\"].map(name_mapping)"
    ]
@@ -93,7 +109,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "prithvi_results_2 = prithvi_results_2.rename(columns={\"Task\": \"dataset\", \"Backbone\": \"model\", \"Score\": \"test metric\"})\n",
+    "prithvi_results_2 = prithvi_results_2.rename(\n",
+    "    columns={\"Task\": \"dataset\", \"Backbone\": \"model\", \"Score\": \"test metric\"}\n",
+    ")\n",
     "prithvi_results_2[\"partition name\"] = \"1.00x train\"\n",
     "prithvi_results_2[\"dataset\"] = prithvi_results_2[\"dataset\"].map(name_mapping)"
    ]
@@ -106,8 +124,12 @@
    "source": [
     "geobench_results_class = pd.read_csv(\"baseline_classification_results.csv\")\n",
     "df_1x = plot_tools.extract_1x_data(geobench_results_class)\n",
-    "model_order = \"prithvi-eo-hls-90m-swin-B,prithvi-eo-hls-100m-vit,prithvi-eo-hls-100m-vit-os,ResNet18-Rnd,ResNet18-timm,ResNet18-MoCo-S2,ResNet50-SECO-S2,ResNet50-MoCo-S2,ResNet50-timm,ConvNeXt-B-timm,ViT-T-timm,ViT-S-timm,SwinV2-T-timm\".split(\",\")\n",
-    "model_colors = dict( zip(model_order, sns.color_palette(\"tab20\", n_colors=len(model_order))))"
+    "model_order = \"prithvi-eo-hls-90m-swin-B,prithvi-eo-hls-100m-vit,prithvi-eo-hls-100m-vit-os,ResNet18-Rnd,ResNet18-timm,ResNet18-MoCo-S2,ResNet50-SECO-S2,ResNet50-MoCo-S2,ResNet50-timm,ConvNeXt-B-timm,ViT-T-timm,ViT-S-timm,SwinV2-T-timm\".split(\n",
+    "    \",\"\n",
+    ")\n",
+    "model_colors = dict(\n",
+    "    zip(model_order, sns.color_palette(\"tab20\", n_colors=len(model_order)))\n",
+    ")"
    ]
   },
   {
@@ -160,10 +182,19 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "classification_datasets = [\"m-bigearthnet\", \"m-brick-kiln\", \"m-eurosat\", \"m-forestnet\", \"m-pv4ger\", \"m-so2sat\"]\n",
+    "classification_datasets = [\n",
+    "    \"m-bigearthnet\",\n",
+    "    \"m-brick-kiln\",\n",
+    "    \"m-eurosat\",\n",
+    "    \"m-forestnet\",\n",
+    "    \"m-pv4ger\",\n",
+    "    \"m-so2sat\",\n",
+    "]\n",
     "# exclude bigearthnet for now\n",
     "# classification_datasets = [\"m-brick-kiln\", \"m-eurosat\", \"m-forestnet\", \"m-pv4ger\", \"m-so2sat\"]\n",
-    "segmentation_datasets = list(set(prithvi_results[\"dataset\"].unique().tolist()) - set(classification_datasets))\n",
+    "segmentation_datasets = list(\n",
+    "    set(prithvi_results[\"dataset\"].unique().tolist()) - set(classification_datasets)\n",
+    ")\n",
     "# segmentation_datasets.remove(\"m-bigearthnet\")\n",
     "# segmentation_datasets.remove(\"m-cashew-plant\")"
    ]
@@ -174,11 +205,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "prithvi_class = prithvi_results[prithvi_results[\"dataset\"].isin(classification_datasets)]\n",
+    "prithvi_class = prithvi_results[\n",
+    "    prithvi_results[\"dataset\"].isin(classification_datasets)\n",
+    "]\n",
     "prithvi_seg = prithvi_results[prithvi_results[\"dataset\"].isin(segmentation_datasets)]\n",
     "\n",
-    "prithvi_class_2 = prithvi_results_2[prithvi_results_2[\"dataset\"].isin(classification_datasets)]\n",
-    "prithvi_seg_2 = prithvi_results_2[prithvi_results_2[\"dataset\"].isin(segmentation_datasets)]"
+    "prithvi_class_2 = prithvi_results_2[\n",
+    "    prithvi_results_2[\"dataset\"].isin(classification_datasets)\n",
+    "]\n",
+    "prithvi_seg_2 = prithvi_results_2[\n",
+    "    prithvi_results_2[\"dataset\"].isin(segmentation_datasets)\n",
+    "]"
    ]
   },
   {
@@ -199,7 +236,7 @@
     "    \"m-cashew-plant\": \"10m\",\n",
     "    \"m-SA-crop-type\": \"10m\",\n",
     "    \"m-nz-cattle\": \"0.1m\",\n",
-    "    \"m-NeonTree\": \"0.1m\"\n",
+    "    \"m-NeonTree\": \"0.1m\",\n",
     "}\n",
     "\n",
     "dataset_instrument = {\n",
@@ -214,7 +251,7 @@
     "    \"m-cashew-plant\": \"S2\",\n",
     "    \"m-SA-crop-type\": \"S2\",\n",
     "    \"m-nz-cattle\": \"RGB\",\n",
-    "    \"m-NeonTree\": \"RGB + Hyper\"\n",
+    "    \"m-NeonTree\": \"RGB + Hyper\",\n",
     "}\n",
     "\n",
     "img_size = {\n",
@@ -229,10 +266,13 @@
     "    \"m-cashew-plant\": 256,\n",
     "    \"m-SA-crop-type\": 256,\n",
     "    \"m-nz-cattle\": 500,\n",
-    "    \"m-NeonTree\": 400\n",
+    "    \"m-NeonTree\": 400,\n",
     "}\n",
     "\n",
-    "dataset_name_map = {name: f\"{name}\\n {dataset_instrument[name]} @ {dataset_res[name]}\" for name in img_size.keys()}"
+    "dataset_name_map = {\n",
+    "    name: f\"{name}\\n {dataset_instrument[name]} @ {dataset_res[name]}\"\n",
+    "    for name in img_size.keys()\n",
+    "}"
    ]
   },
   {
@@ -241,7 +281,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "class_df = pd.concat([df_1x[[\"model\", \"dataset\", \"test metric\", \"partition name\"]], prithvi_class.drop(columns=[\"Metric\"])], ignore_index=True)\n",
+    "class_df = pd.concat(\n",
+    "    [\n",
+    "        df_1x[[\"model\", \"dataset\", \"test metric\", \"partition name\"]],\n",
+    "        prithvi_class.drop(columns=[\"Metric\"]),\n",
+    "    ],\n",
+    "    ignore_index=True,\n",
+    ")\n",
     "# class_df = pd.concat([class_df[[\"model\", \"dataset\", \"test metric\", \"partition name\"]], prithvi_class_2.drop(columns=[\"Metric\"])], ignore_index=True)\n",
     "# class_df[\"dataset\"] = class_df[\"dataset\"].map(lambda x: f'{x} ({dataset_instrument[x]} [{dataset_res[x]}])\\n{img_size[x]} x {img_size[x]}').astype(str)"
    ]
@@ -261,7 +307,9 @@
     }
    ],
    "source": [
-    "class_df.groupby([\"model\", \"dataset\"]).agg([\"mean\", \"std\"]).to_csv(\"table_classification.csv\")"
+    "class_df.groupby([\"model\", \"dataset\"]).agg([\"mean\", \"std\"]).to_csv(\n",
+    "    \"table_classification.csv\"\n",
+    ")"
    ]
   },
   {
@@ -312,7 +360,16 @@
    ],
    "source": [
     "class_df[\"dataset\"] = class_df[\"dataset\"].map(dataset_name_map)\n",
-    "plot_tools.plot_per_dataset(class_df, model_order, model_colors=model_colors, metric=\"test metric\", sharey=False, inner=\"points\", fig_size=(14, 3), n_legend_rows=2)\n",
+    "plot_tools.plot_per_dataset(\n",
+    "    class_df,\n",
+    "    model_order,\n",
+    "    model_colors=model_colors,\n",
+    "    metric=\"test metric\",\n",
+    "    sharey=False,\n",
+    "    inner=\"points\",\n",
+    "    fig_size=(14, 3),\n",
+    "    n_legend_rows=2,\n",
+    ")\n",
     "plt.savefig(\"classification_raw.png\", bbox_inches=\"tight\")"
    ]
   },
@@ -367,7 +424,15 @@
     }
    ],
    "source": [
-    "agg_class = plot_tools.normalize_bootstrap_and_plot(class_df, metric=\"test metric\",benchmark_name=\"classification_v1.0\", model_order=model_order, model_colors=model_colors, fig_size=(12,2.3), dataset_name_map=dataset_name_map)\n",
+    "agg_class = plot_tools.normalize_bootstrap_and_plot(\n",
+    "    class_df,\n",
+    "    metric=\"test metric\",\n",
+    "    benchmark_name=\"classification_v1.0\",\n",
+    "    model_order=model_order,\n",
+    "    model_colors=model_colors,\n",
+    "    fig_size=(12, 2.3),\n",
+    "    dataset_name_map=dataset_name_map,\n",
+    ")\n",
     "plt.savefig(\"classification_normalized.png\", bbox_inches=\"tight\")"
    ]
   },
@@ -386,7 +451,9 @@
     }
    ],
    "source": [
-    "agg_class.groupby([\"model\", \"dataset\"]).agg([\"mean\", \"std\"]).to_csv(\"class_with_aggregated.csv\")"
+    "agg_class.groupby([\"model\", \"dataset\"]).agg([\"mean\", \"std\"]).to_csv(\n",
+    "    \"class_with_aggregated.csv\"\n",
+    ")"
    ]
   },
   {
@@ -402,8 +469,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "model_order = 'prithvi-eo-hls-90m-swin-B,prithvi-eo-hls-100m-vit,prithvi-eo-hls-100m-vit-os,ResNet18-U-Net-timm,ResNet50-U-Net-timm,ResNet101-U-Net-timm,ResNet18 DeepLabV3-timm,ResNet50 DeepLabV3-timm,ResNet101 DeepLabV3-timm'.split(',')\n",
-    "model_colors = dict( zip(model_order, sns.color_palette(\"tab20\", n_colors=len(model_order))))"
+    "model_order = \"prithvi-eo-hls-90m-swin-B,prithvi-eo-hls-100m-vit,prithvi-eo-hls-100m-vit-os,ResNet18-U-Net-timm,ResNet50-U-Net-timm,ResNet101-U-Net-timm,ResNet18 DeepLabV3-timm,ResNet50 DeepLabV3-timm,ResNet101 DeepLabV3-timm\".split(\n",
+    "    \",\"\n",
+    ")\n",
+    "model_colors = dict(\n",
+    "    zip(model_order, sns.color_palette(\"tab20\", n_colors=len(model_order)))\n",
+    ")"
    ]
   },
   {
@@ -422,7 +493,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "seg_df = pd.concat([df_1x[[\"model\", \"dataset\", \"test metric\", \"partition name\"]], prithvi_seg.drop(columns=[\"Metric\"])], ignore_index=True)"
+    "seg_df = pd.concat(\n",
+    "    [\n",
+    "        df_1x[[\"model\", \"dataset\", \"test metric\", \"partition name\"]],\n",
+    "        prithvi_seg.drop(columns=[\"Metric\"]),\n",
+    "    ],\n",
+    "    ignore_index=True,\n",
+    ")"
    ]
   },
   {
@@ -880,7 +957,16 @@
    ],
    "source": [
     "seg_df[\"dataset\"] = seg_df[\"dataset\"].map(dataset_name_map)\n",
-    "plot_tools.plot_per_dataset(seg_df, model_order, model_colors=model_colors, metric=\"test metric\", sharey=False, inner=\"points\", fig_size=(14, 3), n_legend_rows=2)\n",
+    "plot_tools.plot_per_dataset(\n",
+    "    seg_df,\n",
+    "    model_order,\n",
+    "    model_colors=model_colors,\n",
+    "    metric=\"test metric\",\n",
+    "    sharey=False,\n",
+    "    inner=\"points\",\n",
+    "    fig_size=(14, 3),\n",
+    "    n_legend_rows=2,\n",
+    ")\n",
     "plt.savefig(\"segmentation_raw.png\", bbox_inches=\"tight\")"
    ]
   },
@@ -935,7 +1021,15 @@
     }
    ],
    "source": [
-    "agg_seg = plot_tools.normalize_bootstrap_and_plot(seg_df, metric=\"test metric\",benchmark_name=\"segmentation_v1.0\", model_order=model_order, model_colors=model_colors, fig_size=(12,2.3), dataset_name_map=dataset_name_map)\n",
+    "agg_seg = plot_tools.normalize_bootstrap_and_plot(\n",
+    "    seg_df,\n",
+    "    metric=\"test metric\",\n",
+    "    benchmark_name=\"segmentation_v1.0\",\n",
+    "    model_order=model_order,\n",
+    "    model_colors=model_colors,\n",
+    "    fig_size=(12, 2.3),\n",
+    "    dataset_name_map=dataset_name_map,\n",
+    ")\n",
     "\n",
     "plt.savefig(\"segmentation_normalized.png\", bbox_inches=\"tight\")"
    ]
@@ -993,7 +1087,15 @@
     }
    ],
    "source": [
-    "agg_seg = plot_tools.normalize_bootstrap_and_plot(seg_df[seg_df[\"dataset\"] != dataset_name_map[\"m-cashew-plant\"]], metric=\"test metric\",benchmark_name=\"segmentation_v1.0\", model_order=model_order, model_colors=model_colors, fig_size=(12,2.3), dataset_name_map=dataset_name_map)\n",
+    "agg_seg = plot_tools.normalize_bootstrap_and_plot(\n",
+    "    seg_df[seg_df[\"dataset\"] != dataset_name_map[\"m-cashew-plant\"]],\n",
+    "    metric=\"test metric\",\n",
+    "    benchmark_name=\"segmentation_v1.0\",\n",
+    "    model_order=model_order,\n",
+    "    model_colors=model_colors,\n",
+    "    fig_size=(12, 2.3),\n",
+    "    dataset_name_map=dataset_name_map,\n",
+    ")\n",
     "plt.savefig(\"segmentation_normalized_no_cashew.png\", bbox_inches=\"tight\")"
    ]
   },
@@ -1012,7 +1114,9 @@
     }
    ],
    "source": [
-    "agg_seg.groupby([\"model\", \"dataset\"]).agg([\"mean\", \"std\"]).to_csv(\"seg_with_aggregated.csv\")"
+    "agg_seg.groupby([\"model\", \"dataset\"]).agg([\"mean\", \"std\"]).to_csv(\n",
+    "    \"seg_with_aggregated.csv\"\n",
+    ")"
    ]
   },
   {
diff --git a/run_tests.py b/run_tests.py
index d98e08c..8851994 100644
--- a/run_tests.py
+++ b/run_tests.py
@@ -3,6 +3,13 @@
 from typing import Optional
 from tests.integration.test_main import get_test_ids
 import click
+import logging
+
+logging.basicConfig(
+    level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s"
+)
+
+logger = logging.getLogger(__name__)
 
 # rm geobench_v1_prithvi* && bsub -e ~/geobench_v1_prithvi.err -o ~/geobench_v1_prithvi.out -M 40G -gpu "num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB" terratorch iterate --hpo --config configs/geobench_v1_prithvi.yaml
 
@@ -15,7 +22,8 @@
 # Delete all files in logs dir
 for item in LOGS_DIR.iterdir():
     if item.is_file():
-        item.unlink()  
+        item.unlink()
+
 
 @click.group()
 def cli():
@@ -31,50 +39,52 @@ def submit_job(
     err_file = LOGS_DIR / stderr_file
     # delete file if it exists
     if err_file.exists():
-        print(f"Delete file {err_file}")
+        logger.info(f"Delete file {err_file}")
         err_file.unlink(missing_ok=True)
         assert not err_file.exists()
 
     out_file = LOGS_DIR / stdout_file
     # delete file if it exists
     if out_file.exists():
-        print(f"Delete file {out_file}")
+        logger.info(f"Delete file {out_file}")
         out_file.unlink(missing_ok=True)
         assert not out_file.exists()
     if tc_id is not None:
-        jbsub = f"bsub -e {err_file} -o {out_file} -M 40G -gpu \"num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB\" pytest -vv tests/integration/test_main.py::test_main[{tc_id}]"
+        jbsub = f'bsub -e {err_file} -o {out_file} -M 40G -gpu "num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB" pytest -vv tests/integration/test_main.py::test_main[{tc_id}]'
     elif config is not None:
-        jbsub = f"bsub -e {err_file} -o {out_file} -M 40G -gpu \"num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB\" terratorch iterate --hpo --config {config}"
+        jbsub = f'bsub -e {err_file} -o {out_file} -M 40G -gpu "num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB" terratorch iterate --hpo --config {config}'
     else:
         raise ValueError("Error! Either tc_id or config must be not None")
     cmd = jbsub.split()
     result = subprocess.run(cmd, capture_output=True)
     if result.returncode == 0:
-        print(f"Command executed successfully: {jbsub}")
+        logger.info(f"Command executed successfully: {jbsub}")
 
     else:
-        print(f"Command failed: {jbsub}")
-        print("Command failed with error code:", result.returncode)
-        print("stderr:", result.stderr)
+        logger.info(f"Command failed: {jbsub}")
+        logger.info("Command failed with error code:", result.returncode)
+        logger.info("stderr:", result.stderr)
 
 
-@click.command()
-@click.option('--test_id', default=None, help='test ID')
+@click.command("run tests")
+@click.option("--test_id", default=None, help="test ID")
 def run_tests(test_id: Optional[str] = None):
     if test_id is None:
         test_ids = get_test_ids()
     else:
         test_ids = [test_id]
     for tc_id in test_ids:
-        print(f"Running test case: tests/test_benchmark.py::test_run_benchmark {tc_id}")
+        logger.info(
+            f"Running test case: tests/test_benchmark.py::test_run_benchmark {tc_id}"
+        )
         stderr_file = f"{tc_id}.err"
         stdout_file = f"{tc_id}.out"
 
         submit_job(stderr_file=stderr_file, stdout_file=stdout_file, tc_id=tc_id)
 
 
-@click.command()
-@click.option('--config', default=None, help='path to config file')
+@click.command("run single job")
+@click.option("--config", default=None, help="path to config file")
 def run_job(config: str):
     home_dir = Path(__file__).parent
     config_path = home_dir / config
@@ -82,11 +92,12 @@ def run_job(config: str):
     stem = config_path.stem
     err_file = f"{stem}.err"
     out_file = f"{stem}.out"
+    logger.info(f"Running job with config: {config}")
     submit_job(stdout_file=out_file, stderr_file=err_file, config=config)
 
 
 cli.add_command(run_job)
 cli.add_command(run_tests)
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     cli()
diff --git a/tests/integration/test_main.py b/tests/integration/test_main.py
index 613b5f8..7219d4b 100644
--- a/tests/integration/test_main.py
+++ b/tests/integration/test_main.py
@@ -36,9 +36,9 @@ def validate_results(experiment_name: str, storage_uri: str, finished_run_id: st
     meta_yaml = "meta.yaml"
 
     meta_yaml_path = dir_path / meta_yaml
-    assert (
-        meta_yaml_path.exists()
-    ), f"Error! meta.yaml file {meta_yaml_path} does not exist"
+    assert meta_yaml_path.exists(), (
+        f"Error! meta.yaml file {meta_yaml_path} does not exist"
+    )
     # open file and check that the experiment name is the same
     with open(meta_yaml_path, mode="r") as f:
         # read all the lines
@@ -51,9 +51,9 @@ def validate_results(experiment_name: str, storage_uri: str, finished_run_id: st
                 experiment_name_found = True
             if finished_run_id in line:
                 experiment_id_found = True
-        assert (
-            experiment_name_found and experiment_id_found
-        ), f"Error! Both experiment name ({experiment_name=}) and finished run id ({finished_run_id=}) must be in the {meta_yaml_path=}: {experiment_id_found=} {experiment_name_found=}"
+        assert experiment_name_found and experiment_id_found, (
+            f"Error! Both experiment name ({experiment_name=}) and finished run id ({finished_run_id=}) must be in the {meta_yaml_path=}: {experiment_id_found=} {experiment_name_found=}"
+        )
     # TODO delete the directories that were created by this test case
 
 
@@ -69,12 +69,12 @@ def test_main(
     home_dir = Path(__file__).parent.parent.parent
     config_file: Path = home_dir / config
     assert config_file.exists()
-    with open(config_file, 'r') as file:
+    with open(config_file, "r") as file:
         config_data = yaml.safe_load(file)
     storage_uri: str = config_data["storage_uri"]
     # handling relative paths
     if storage_uri.startswith(".") or storage_uri.startswith(".."):
-        repo_home_dir = Path(__file__).parent.parent.parent 
+        repo_home_dir = Path(__file__).parent.parent.parent
         abs_path = repo_home_dir / storage_uri
         storage_uri = str(abs_path.resolve())
     experiment_name = config_data["experiment_name"]
diff --git a/tests/unit/test_build_geobench_configs.py b/tests/unit/test_build_geobench_configs.py
index e5744d5..bdb0337 100644
--- a/tests/unit/test_build_geobench_configs.py
+++ b/tests/unit/test_build_geobench_configs.py
@@ -5,7 +5,7 @@
 from deepdiff import DeepDiff
 import logging
 
-logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
+logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
 
 
 @pytest.mark.parametrize(
@@ -41,9 +41,7 @@
         ),
     ],
 )
-def test__generate_iterate_config(
-    input, output, template, prefix, oracle_config_file
-):
+def test__generate_iterate_config(input, output, template, prefix, oracle_config_file):
     # Get the absolute path of the current script file
     script_path = Path(__file__).resolve()
 
@@ -69,10 +67,10 @@ def test__generate_iterate_config(
         prefix=prefix,
     )
     if output_path.is_dir():
-        generated_config_files = list(output_path.glob(f'**/{prefix}*.yaml'))
+        generated_config_files = list(output_path.glob(f"**/{prefix}*.yaml"))
     else:
         generated_config_files = [output_path]
-        
+
     assert len(generated_config_files) > 0
 
     if oracle_config_file is not None:
@@ -81,7 +79,6 @@ def test__generate_iterate_config(
             oracle_config = yaml.safe_load(gt_file)
 
         for gen_config_file in generated_config_files:
-
             with open(gen_config_file, "r") as gen_file:
                 new_config = yaml.safe_load(gen_file)
 
diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py
index 71ed890..953ac53 100644
--- a/tests/unit/test_cli.py
+++ b/tests/unit/test_cli.py
@@ -2,5 +2,5 @@
 
 
 def test_cli():
-    exit_status = os.system('terratorch iterate --help')
+    exit_status = os.system("terratorch iterate --help")
     assert exit_status == 0

From b4320fe24f5d594714f8d0ab5d93a423f13ab91d Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Wed, 1 Oct 2025 17:10:17 -0300
Subject: [PATCH 31/40] add script that converts terratorch config to iterate
 config

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 benchmark/config_util/__init__.py             |   0
 benchmark/config_util/build_iterate_config.py | 253 ++++++++++++++++++
 2 files changed, 253 insertions(+)
 create mode 100644 benchmark/config_util/__init__.py
 create mode 100644 benchmark/config_util/build_iterate_config.py

diff --git a/benchmark/config_util/__init__.py b/benchmark/config_util/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/benchmark/config_util/build_iterate_config.py b/benchmark/config_util/build_iterate_config.py
new file mode 100644
index 0000000..700ee89
--- /dev/null
+++ b/benchmark/config_util/build_iterate_config.py
@@ -0,0 +1,253 @@
+from pathlib import Path
+import yaml
+import pandas as pd
+import click
+from benchmark.benchmark_types import (
+    TaskTypeEnum,
+)
+from copy import deepcopy
+
+DEFAULT_TEMPLATE = (
+    Path(__file__).parent.parent.parent / "configs/templates/template.yaml"
+)
+
+
+def _build_dataframe(config_files) -> pd.DataFrame:
+    """
+    build a pandas dataframe using the parameters of the specified config files
+    """
+    files = list()
+    dataset = list()
+    models = list()
+    for config_file in config_files:
+        try:
+            # extract dataset name from filename
+            ds = str(config_file).split("/")[-1].split("_")[0]
+            dataset.append(ds)
+            # append file path
+            files.append(str(config_file))
+        except KeyError as e:
+            msg = f"Error in file: {config_file}\n{e}"
+            print(msg)
+            raise KeyError(msg)
+
+    df = pd.DataFrame(data={"file": files, "dataset": dataset})
+    models = [
+        x.split("/")[-1].replace(y + "_", "").replace(".yaml", "")
+        for x, y in zip(df["file"].values, df["dataset"].values)
+    ]
+    df["model"] = models
+    return df
+
+
+def _create_task(
+    name: str,
+    datamodule: dict,
+    metric: str,
+    terratorch_task: dict,
+    task_type: TaskTypeEnum,
+    direction: str,
+    max_run_duration: str | None = None,
+    early_stop_patience: int | None = None,
+    early_prune: bool | None = None,
+) -> dict:
+    """instantiate Task dataclass and convert it to dict
+
+    Args:
+        name (str): name of the task - comes from terratorch config - data.init_args.cls
+        datamodule (dict): _description_
+        metric (str): _description_
+        terratorch_task (dict): _description_
+        task_type (TaskTypeEnum): type of task, e.g., regression, classification
+        direction (str): direction to optimize
+        max_run_duration (str | None, optional): _description_. Defaults to None.
+        early_stop_patience (int | None, optional): _description_. Defaults to None.
+        early_prune (bool, optional): _description_. Defaults to False.
+
+    Returns:
+        dict: _description_
+    """
+
+    task_dict = {
+        "name": name,
+        "datamodule": datamodule,
+        "type": task_type.value,
+        "direction": direction,
+        "metric": metric,
+        "terratorch_task": terratorch_task,
+    }
+    # set optional fields if they are not None
+    for k, v in [
+        ("max_run_duration", max_run_duration),
+        ("early_stop_patience", early_stop_patience),
+        ("early_prune", early_prune),
+    ]:
+        if v is not None:
+            task_dict[k] = v
+
+    return task_dict
+
+
+def _get_task_type(template: dict) -> TaskTypeEnum:
+    tasks = template["tasks"]
+    task = tasks[0]
+    task_type = task["type"]
+    assert isinstance(task_type, str)
+
+    return TaskTypeEnum(value=task_type)
+
+
+def _get_task_direction(template: dict) -> str:
+    """extract task direction from template
+
+    Args:
+        template (dict): template created by user
+
+    Returns:
+        str: direction of the optimization (max or min)
+    """
+    tasks = template["tasks"]
+    task = tasks[0]
+    direction = task["direction"]
+    assert isinstance(direction, str)
+    assert direction in ["min", "max"]
+    return direction
+
+
+def generate_iterate_config(
+    input: Path,
+    output: Path,
+    template: Path = DEFAULT_TEMPLATE,
+    prefix: str = "tt-iterate-",
+):
+    """generate the tt-iterate based on yaml files located within the specified directory, based
+    on previously defined template and save the result using specified output filename
+
+    Args:
+        input_dir (Path): contains all terratorch yaml files
+        output_dir (Path): filename of the result
+        template (Path): template file that contains pre-defined values
+        prefix (str): prefix for creating new config files
+    """
+    assert input.exists()
+    if input.is_dir():
+        config_files = input.glob("**/*.yaml")
+    elif input.is_file():
+        config_files = [input]
+    else:
+        ValueError(f"Error! {input=} is neither a file nor a directory")
+    files_df = _build_dataframe(config_files=config_files)
+
+    # set default values if necessary
+    if template is None:
+        template = DEFAULT_TEMPLATE
+    if prefix is None:
+        prefix = "tt-iterate-"
+
+    models = files_df["model"].unique()
+
+    with open(template, "r") as file:
+        template_dict: dict = yaml.safe_load(file)
+
+    # generate one config per model
+    for model in models:
+        model_specific_template = deepcopy(template_dict)
+        # create unique name for experiment
+        model_specific_template["experiment_name"] = f"{prefix}_{model}"
+        tasks = list()
+
+        # filter dataframe by model
+        single_model_df = files_df[files_df["model"].values == model]
+
+        for i in range(single_model_df.shape[0]):
+            # open terratorch config file
+            with open(single_model_df["file"].values[i], "r") as file:
+                data = yaml.safe_load(file)
+
+            name = single_model_df["dataset"].values[i]
+
+            model_args: dict = data["model"]["init_args"]["model_args"]
+            # framework is an optional field of terratorch config
+            if (
+                model_args.get("framework") is not None
+                and model_args.get("framework") == "faster-rcnn"
+            ):
+                metric = "val_map"
+            else:
+                metric = "val/loss"
+
+            # terratorchtask is extracted from the data.model.init_args of terratorch config file
+            terratorch_task = data["model"]["init_args"]
+            # create datamodule based on data field
+            datamodule = data["data"]
+            task_type = _get_task_type(template=template_dict)
+            task_direction = _get_task_direction(template=template_dict)
+            task = _create_task(
+                name=name,
+                datamodule=datamodule,
+                metric=metric,
+                terratorch_task=terratorch_task,
+                task_type=task_type,
+                direction=task_direction,
+            )
+            tasks.append(task)
+
+        model_specific_template["tasks"] = tasks
+        if output.is_dir():
+            path = output / f"{prefix}_{model}.yaml"
+        else:
+            path = output
+        if path.exists():
+            path.unlink()
+        with open(path, "w") as file:
+            yaml.dump(model_specific_template, file)
+            print(f"{path} file has been created")
+
+
+@click.command()
+@click.option(
+    "--input_dir",
+    prompt="Full path to the directory that contains all terratorch config yaml files",
+    help="Full path to the directory that contains all terratorch config yaml files",
+)
+@click.option(
+    "--output_dir",
+    prompt="Full path to the directory in which the new config files will be stored",
+    help="Full path to the directory in which the new config files will be stored",
+)
+@click.option(
+    "--template",
+    prompt="Full path to the template file",
+    help="Full path to the template file",
+)
+@click.option(
+    "--prefix",
+    prompt="Prefix of the config filename, e.g., my-config-",
+    help="Prefix of the config filename",
+)
+def generate_tt_iterate_config(
+    input_dir: str, output_dir: str, template: str, prefix: str
+):
+    directory_path = Path(input_dir)
+    assert directory_path.exists()
+    assert directory_path.is_dir
+
+    template_path = Path(template)
+    assert template_path.exists()
+    assert template_path.is_file
+
+    output_path = Path(output_dir)
+    assert output_path.exists()
+    assert output_path.is_dir
+
+    assert isinstance(prefix, str), f"Error! {type(prefix)} is not a str"
+    generate_iterate_config(
+        input=directory_path,
+        output=output_path,
+        template=template_path,
+        prefix=prefix,
+    )
+
+
+if __name__ == "__main__":
+    generate_tt_iterate_config()

From 012672dc279fde9ef2cda5836613c7ac86501f4b Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Wed, 1 Oct 2025 17:15:20 -0300
Subject: [PATCH 32/40] remove command name

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 run_tests.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/run_tests.py b/run_tests.py
index 8851994..9a31b8e 100644
--- a/run_tests.py
+++ b/run_tests.py
@@ -66,7 +66,7 @@ def submit_job(
         logger.info("stderr:", result.stderr)
 
 
-@click.command("run tests")
+@click.command()
 @click.option("--test_id", default=None, help="test ID")
 def run_tests(test_id: Optional[str] = None):
     if test_id is None:
@@ -83,7 +83,7 @@ def run_tests(test_id: Optional[str] = None):
         submit_job(stderr_file=stderr_file, stdout_file=stdout_file, tc_id=tc_id)
 
 
-@click.command("run single job")
+@click.command()
 @click.option("--config", default=None, help="path to config file")
 def run_job(config: str):
     home_dir = Path(__file__).parent

From 6bb9146e9bf552976c8191cedc14f419a67295e7 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Thu, 2 Oct 2025 17:02:37 -0300
Subject: [PATCH 33/40] fix invalid optuna path

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 benchmark/backbone_benchmark.py | 35 +++++++++++++++++++++++++--------
 benchmark/main.py               |  3 +++
 benchmark/model_fitting.py      | 15 +++++++-------
 benchmark/utils.py              |  6 +++++-
 4 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/benchmark/backbone_benchmark.py b/benchmark/backbone_benchmark.py
index b6cd52c..4c6a503 100644
--- a/benchmark/backbone_benchmark.py
+++ b/benchmark/backbone_benchmark.py
@@ -39,7 +39,7 @@
 
 
 def benchmark_backbone_on_task(
-    logger,
+    logger: logging.RootLogger,
     defaults: Defaults,
     task: Task,
     storage_uri: str,
@@ -52,7 +52,14 @@ def benchmark_backbone_on_task(
     sampler: BaseSampler | None = None,
     test_models: bool = False,
 ) -> tuple[float, str | list[str] | None, dict[str, Any]]:
-    optuna_db_path = Path(storage_uri).parents[0] / "optuna_db"
+    logger.info(
+        f"starting backbone benchmark on task {task.name} {task_run_id=} {experiment_name=}"
+    )
+    if storage_uri.startswith("http"):
+        optuna_db_path = Path(".") / "optuna_db"
+    else:
+        optuna_db_path = Path(storage_uri).parents[0] / "optuna_db"
+
     if not os.path.exists(optuna_db_path):
         os.makedirs(optuna_db_path)
     optuna_db_path = optuna_db_path / f"{experiment_name}_{experiment_run_id}"
@@ -67,8 +74,13 @@ def benchmark_backbone_on_task(
         n_trials=n_trials,
         logger=logger,
     )
-
-    with mlflow.start_run(run_name=task.name, nested=True, run_id=task_run_id) as run:
+    if task_run_id is not None:
+        # run_name is used only when run_id is unspecified.
+        run_name = None
+    else:
+        run_name = task.name
+    logger.info(f"start run: {run_name=} {task_run_id=}")
+    with mlflow.start_run(run_name=run_name, nested=True, run_id=task_run_id) as run:
         logger.info(f"starting task run with id: {run.info.run_id}")
         training_spec = combine_with_defaults(task, defaults)
         if "max_epochs" not in training_spec.trainer_args:
@@ -179,8 +191,8 @@ def parse_optimization_space(space: dict | None) -> optimization_space_type | No
 
 
 def _run_hpo(
-    run_name: str,
-    run_id: str,
+    run_name: str | None,
+    run_id: str | None,
     description: str,
     tasks: list,
     completed_task_run_names: list,
@@ -200,7 +212,12 @@ def _run_hpo(
     PATH_TO_JOB_TRACKING,
     logger,
 ) -> tuple[str, str]:
-    logger.info("Running hyperparameter optimization")
+    logger.info(
+        f"Running hyperparameter optimization: {run_name=} {run_id=} {description=}"
+    )
+    if run_id is not None:
+        run_name = None
+
     with mlflow.start_run(
         run_name=run_name, run_id=run_id, description=description
     ) as run:
@@ -286,7 +303,7 @@ def benchmark_backbone(
     run_id: str | None = None,
     description: str = "No description provided",
     bayesian_search: bool = True,
-    continue_existing_experiment: bool = False,
+    continue_existing_experiment: bool = True,
     test_models: bool = False,
     run_repetitions: int = REPEATED_SEEDS_DEFAULT,
     report_on_best_val: bool = True,
@@ -361,12 +378,14 @@ def benchmark_backbone(
             ):
                 logger.info("Continuing previous experiment parent run")
                 run_id = existing_experiments["incomplete_run_to_finish"]
+                logger.debug(f"incomplete_run_to_finish: {run_id=}")
                 experiment_id = existing_experiments["experiment_id"]
                 optimize_hyperparams = True
 
             if existing_experiments["finished_run"] is not None:
                 optimize_hyperparams = False
                 finished_run_id = existing_experiments["finished_run"]
+                logger.debug(f"finished_run: {run_id=}")
                 run_id = existing_experiments["finished_run"]
 
             # get previously completed tasks
diff --git a/benchmark/main.py b/benchmark/main.py
index 8bf7fd1..51ed939 100644
--- a/benchmark/main.py
+++ b/benchmark/main.py
@@ -1,3 +1,4 @@
+import os
 from jsonargparse import Namespace
 import logging
 from pathlib import Path
@@ -270,6 +271,7 @@ def main():
 
             storage_uri = config_init.storage_uri
             assert isinstance(storage_uri, str), f"Error! {storage_uri=} is not a str"
+            os.environ["MLFLOW_TRACKING_URI"] = storage_uri
             # handling relative paths
             if storage_uri.startswith(".") or storage_uri.startswith(".."):
                 repo_home_dir = Path(__file__).parent.parent
@@ -395,6 +397,7 @@ def main():
                     storage_uri=storage_uri,
                     ray_storage_path=ray_storage_path,
                     run_name=run_name,
+                    run_id=None,
                     optimization_space=optimization_space,
                     n_trials=n_trials,
                     run_repetitions=run_repetitions,
diff --git a/benchmark/model_fitting.py b/benchmark/model_fitting.py
index d8502cb..10402cf 100644
--- a/benchmark/model_fitting.py
+++ b/benchmark/model_fitting.py
@@ -6,6 +6,7 @@
 import copy
 import dataclasses
 import importlib
+import logging
 import os
 import shutil
 import types
@@ -445,13 +446,13 @@ def fit_model_with_hparams(
     )
     run_name = f"{run_name}_{trial.number}"
     return fit_model(
-        training_spec_with_generated_hparams,
-        lightning_task_class,
-        run_name,
-        experiment_name,
-        storage_uri,
-        parent_run_id,
-        trial,
+        training_spec=training_spec_with_generated_hparams,
+        lightning_task_class=lightning_task_class,
+        run_name=run_name,
+        experiment_name=experiment_name,
+        storage_uri=storage_uri,
+        parent_run_id=parent_run_id,
+        trial=trial,
         save_models=save_models,
         test_models=test_models,
     )[0]  # return only the metric value for optuna
diff --git a/benchmark/utils.py b/benchmark/utils.py
index 0d010bb..d2dd533 100644
--- a/benchmark/utils.py
+++ b/benchmark/utils.py
@@ -63,13 +63,16 @@ def sync_mlflow_optuna(
     Returns:
         task_run_id: run id of the task to be continued (if one exists) or None
     """
+    logger.info(
+        f"sync_mlflow_optuna - {optuna_db_path=} {storage_uri=} {task_run_id=} {experiment_name=} {task_run_id=}"
+    )
     # check number of successful mlflow runs in task
     client = mlflow.tracking.MlflowClient(tracking_uri=storage_uri)
     completed_in_mlflow_for_task = []
     all_mlflow_runs_for_task = []
     if task_run_id is not None:
         all_mlflow_runs_for_task.append(task_run_id)
-        logger.info(f"task_run_id : {task_run_id}")
+        logger.info(f"sync_mlflow_optuna - {task_run_id=}")
         experiment_info = client.get_experiment_by_name(experiment_name)
         assert isinstance(experiment_info, Experiment), (
             f"Error! Unexpected type of {experiment_info=}"
@@ -140,6 +143,7 @@ def sync_mlflow_optuna(
                 )
                 os.system(f"rm -r {experiment_info.artifact_location}/{item}")
             task_run_id = None
+    logging.info(f"sync_mlflow_optuna returns {task_run_id=}")
     return task_run_id
 
 

From 427bbcf12a8939c7869de1b5b0fa08ebf602eb7d Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Thu, 2 Oct 2025 19:01:38 -0300
Subject: [PATCH 34/40] minor modifications

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 run_tests.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/run_tests.py b/run_tests.py
index 9a31b8e..64489b0 100644
--- a/run_tests.py
+++ b/run_tests.py
@@ -4,9 +4,12 @@
 from tests.integration.test_main import get_test_ids
 import click
 import logging
+import sys
 
 logging.basicConfig(
-    level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s"
+    stream=sys.stdout,
+    level=logging.DEBUG,
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
 )
 
 logger = logging.getLogger(__name__)

From 1586f0e0fc88b64cb9709266ebc50767483d0b2a Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Fri, 3 Oct 2025 11:50:40 -0300
Subject: [PATCH 35/40] change benchmark to terratorch_iterate

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 benchmark/py.typed                            |  0
 .../resources/dataset_specifications/agb.yaml | 64 ------------------
 .../dataset_specifications/eurosat.yaml       | 28 --------
 .../dataset_specifications/fire_scars.yaml    | 56 ----------------
 .../multi_temporal_crop.yaml                  | 57 ----------------
 .../dataset_specifications/sen1floods11.yaml  | 59 ----------------
 .../sen1floods11_transforms.yaml              | 67 -------------------
 benchmark/tests/__init__.py                   |  0
 pyproject.toml                                |  1 -
 {benchmark => terratorch_iterate}/__init__.py |  0
 .../backbone_benchmark.py                     |  8 +--
 .../benchmark_ray.py                          |  6 +-
 .../config_util/__init__.py                   |  0
 .../config_util/build_iterate_config.py       |  2 +-
 .../iterate_types.py                          |  0
 {benchmark => terratorch_iterate}/main.py     | 10 +--
 .../model_fitting.py                          |  4 +-
 {benchmark => terratorch_iterate}/module.py   |  0
 .../plot_tools.py                             |  0
 .../repeat_best_experiment.py                 |  4 +-
 {benchmark => terratorch_iterate}/utils.py    |  5 +-
 tests/integration/test_main.py                |  2 +-
 tests/unit/test_build_geobench_configs.py     |  2 +-
 tests/unit/test_tasktypeenum.py               |  2 +-
 24 files changed, 22 insertions(+), 355 deletions(-)
 delete mode 100644 benchmark/py.typed
 delete mode 100644 benchmark/resources/dataset_specifications/agb.yaml
 delete mode 100644 benchmark/resources/dataset_specifications/eurosat.yaml
 delete mode 100644 benchmark/resources/dataset_specifications/fire_scars.yaml
 delete mode 100644 benchmark/resources/dataset_specifications/multi_temporal_crop.yaml
 delete mode 100644 benchmark/resources/dataset_specifications/sen1floods11.yaml
 delete mode 100644 benchmark/resources/dataset_specifications/sen1floods11_transforms.yaml
 delete mode 100644 benchmark/tests/__init__.py
 rename {benchmark => terratorch_iterate}/__init__.py (100%)
 rename {benchmark => terratorch_iterate}/backbone_benchmark.py (98%)
 rename {benchmark => terratorch_iterate}/benchmark_ray.py (97%)
 rename {benchmark => terratorch_iterate}/config_util/__init__.py (100%)
 rename {benchmark => terratorch_iterate}/config_util/build_iterate_config.py (99%)
 rename benchmark/benchmark_types.py => terratorch_iterate/iterate_types.py (100%)
 rename {benchmark => terratorch_iterate}/main.py (98%)
 rename {benchmark => terratorch_iterate}/model_fitting.py (99%)
 rename {benchmark => terratorch_iterate}/module.py (100%)
 rename {benchmark => terratorch_iterate}/plot_tools.py (100%)
 rename {benchmark => terratorch_iterate}/repeat_best_experiment.py (99%)
 rename {benchmark => terratorch_iterate}/utils.py (99%)

diff --git a/benchmark/py.typed b/benchmark/py.typed
deleted file mode 100644
index e69de29..0000000
diff --git a/benchmark/resources/dataset_specifications/agb.yaml b/benchmark/resources/dataset_specifications/agb.yaml
deleted file mode 100644
index 33e9c95..0000000
--- a/benchmark/resources/dataset_specifications/agb.yaml
+++ /dev/null
@@ -1,64 +0,0 @@
-class_path: terratorch.datamodules.GenericNonGeoPixelwiseRegressionDataModule
-init_args:
-  batch_size: 16
-  num_workers: 4
-  train_transform:
-    - class_path: albumentations.HorizontalFlip
-      init_args:
-        p: 0.5
-    - class_path: albumentations.augmentations.geometric.rotate.Rotate
-      init_args:
-        limit: 30
-        border_mode: 0 # cv2.BORDER_CONSTANT
-        # value: 0
-        # mask_value: 1
-        p: 0.5
-      dict_kwargs:
-        value: 0
-        mask_value: 1
-    - class_path: ToTensorV2
-  dataset_bands:
-    - 0
-    - BLUE
-    - GREEN
-    - RED
-    - NIR_NARROW
-    - SWIR_1
-    - SWIR_2
-    - 1
-    - 2
-    - 3
-    - 4
-  output_bands:
-    - BLUE
-    - GREEN
-    - RED
-    - NIR_NARROW
-    - SWIR_1
-    - SWIR_2
-  rgb_indices:
-    - 2
-    - 1
-    - 0
-  train_data_root: /dccstor/hhr-weather/latest_filters_all_agb_patches_tts_clipped_0_500/train_images
-  train_label_data_root: /dccstor/hhr-weather/latest_filters_all_agb_patches_tts_clipped_0_500/train_labels
-  val_data_root: /dccstor/hhr-weather/latest_filters_all_agb_patches_tts_clipped_0_500/val_images
-  val_label_data_root: /dccstor/hhr-weather/latest_filters_all_agb_patches_tts_clipped_0_500/val_labels
-  test_data_root: /dccstor/hhr-weather/latest_filters_all_agb_patches_tts_clipped_0_500/test_images
-  test_label_data_root: /dccstor/hhr-weather/latest_filters_all_agb_patches_tts_clipped_0_500/test_labels
-  # img_grep: "*.tif"
-  # label_grep: "*.tif"
-  means:
-    - 385.88501817
-    - 714.60615207
-    - 658.96267376
-    - 3314.57774238
-    - 2238.71812558
-    - 1250.00982518
-  stds:
-    - 264.62872
-    - 355.62848
-    - 504.54855
-    - 898.4953
-    - 947.22894
-    - 828.1297
diff --git a/benchmark/resources/dataset_specifications/eurosat.yaml b/benchmark/resources/dataset_specifications/eurosat.yaml
deleted file mode 100644
index 029ee51..0000000
--- a/benchmark/resources/dataset_specifications/eurosat.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-class_path: terratorch.datamodules.TorchNonGeoDataModule
-init_args:
-  transforms:
-    # a possible way to select bands:
-    # - class_path: SelectBands
-    #   init_args:
-    #     band_indices:
-    #       - 2
-    #       - 1
-    #       - 0
-    - class_path: albumentations.augmentations.geometric.resize.Resize
-      dict_kwargs:
-        height: 224
-        width: 224
-    - class_path: ToTensorV2
-  cls: torchgeo.datamodules.EuroSATDataModule
-  batch_size: 16
-  num_workers: 4
-dict_kwargs:
-  root: /dccstor/geofm-pre/EuroSat
-  download: True
-  bands:
-    - B02
-    - B03
-    - B04
-    - B08A
-    - B11
-    - B12
diff --git a/benchmark/resources/dataset_specifications/fire_scars.yaml b/benchmark/resources/dataset_specifications/fire_scars.yaml
deleted file mode 100644
index a2f50a1..0000000
--- a/benchmark/resources/dataset_specifications/fire_scars.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-class_path: terratorch.datamodules.GenericNonGeoSegmentationDataModule
-init_args:
-  batch_size: 4
-  num_workers: 8
-  dataset_bands:
-    - BLUE
-    - GREEN
-    - RED
-    - NIR_NARROW
-    - SWIR_1
-    - SWIR_2
-  output_bands:
-    - BLUE
-    - GREEN
-    - RED
-    - NIR_NARROW
-    - SWIR_1
-    - SWIR_2
-  rgb_indices:
-    - 2
-    - 1
-    - 0
-  train_transform:
-    - class_path: albumentations.RandomCrop
-      init_args:
-        height: 224
-        width: 224
-    - class_path: albumentations.HorizontalFlip
-      init_args:
-        p: 0.5
-    - class_path: ToTensorV2
-  no_data_replace: 0
-  no_label_replace: -1
-  train_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/training
-  train_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/training
-  val_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
-  val_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
-  test_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
-  test_label_data_root: /dccstor/geofm-finetuning/fire-scars/finetune-data/6_bands_no_replant_extended/validation
-  img_grep: "*_merged.tif"
-  label_grep: "*.mask.tif"
-  means:
-    - 0.033349706741586264
-    - 0.05701185520536176
-    - 0.05889748132001316
-    - 0.2323245113436119
-    - 0.1972854853760658
-    - 0.11944914225186566
-  stds:
-    - 0.02269135568823774
-    - 0.026807560223070237
-    - 0.04004109844362779
-    - 0.07791732423672691
-    - 0.08708738838140137
-    - 0.07241979477437814
-  num_classes: 2
diff --git a/benchmark/resources/dataset_specifications/multi_temporal_crop.yaml b/benchmark/resources/dataset_specifications/multi_temporal_crop.yaml
deleted file mode 100644
index bc30877..0000000
--- a/benchmark/resources/dataset_specifications/multi_temporal_crop.yaml
+++ /dev/null
@@ -1,57 +0,0 @@
-class_path: terratorch.datamodules.GenericNonGeoSegmentationDataModule
-init_args:
-  batch_size: 8
-  num_workers: 12
-  train_transform:
-    - class_path: FlattenTemporalIntoChannels
-    - class_path: albumentations.Flip
-    - class_path: ToTensorV2
-    - class_path: UnflattenTemporalFromChannels
-      init_args:
-        n_timesteps: 3
-  dataset_bands:
-      - BLUE
-      - GREEN
-      - RED
-      - NIR_NARROW
-      - SWIR_1
-      - SWIR_2
-  output_bands:
-    - BLUE
-    - GREEN
-    - RED
-    - NIR_NARROW
-    - SWIR_1
-    - SWIR_2
-  rgb_indices:
-    - 2
-    - 1
-    - 0
-  reduce_zero_label: True
-  expand_temporal_dimension: True
-  train_data_root: /dccstor/geofm-finetuning/hls_cdl_reclassed/training_chips
-  train_label_data_root: /dccstor/geofm-finetuning/hls_cdl_reclassed/training_chips
-  val_data_root: /dccstor/geofm-finetuning/hls_cdl_reclassed/validation_chips
-  val_label_data_root: /dccstor/geofm-finetuning/hls_cdl_reclassed/validation_chips
-  test_data_root: /dccstor/geofm-finetuning/hls_cdl_reclassed/validation_chips
-  test_label_data_root: /dccstor/geofm-finetuning/hls_cdl_reclassed/validation_chips
-  train_split: /dccstor/geofm-finetuning/hls_cdl_reclassed/training_chips/training_data.txt
-  test_split: /dccstor/geofm-finetuning/hls_cdl_reclassed/validation_chips/validation_data.txt
-  val_split: /dccstor/geofm-finetuning/hls_cdl_reclassed/validation_chips/validation_data.txt
-  img_grep: "*_merged.tif"
-  label_grep: "*.mask.tif"
-  means:
-    - 494.905781
-    - 815.239594
-    - 924.335066
-    - 2968.881459
-    - 2634.621962
-    - 1739.579917
-  stds:
-    - 284.925432
-    - 357.84876
-    - 575.566823
-    - 896.601013
-    - 951.900334
-    - 921.407808
-  num_classes: 13
diff --git a/benchmark/resources/dataset_specifications/sen1floods11.yaml b/benchmark/resources/dataset_specifications/sen1floods11.yaml
deleted file mode 100644
index d3201e1..0000000
--- a/benchmark/resources/dataset_specifications/sen1floods11.yaml
+++ /dev/null
@@ -1,59 +0,0 @@
-class_path: terratorch.datamodules.GenericNonGeoSegmentationDataModule
-init_args:
-  batch_size: 8
-  num_workers: 4
-  constant_scale: 0.0001
-  dataset_bands:
-      - COASTAL_AEROSOL
-      - BLUE
-      - GREEN
-      - RED
-      - RED_EDGE_1
-      - RED_EDGE_2
-      - RED_EDGE_3
-      - NIR_BROAD
-      - NIR_NARROW
-      - WATER_VAPOR
-      - CIRRUS
-      - SWIR_1
-      - SWIR_2
-  output_bands:
-    - BLUE
-    - GREEN
-    - RED
-    - NIR_NARROW
-    - SWIR_1
-    - SWIR_2
-  rgb_indices:
-    - 2
-    - 1
-    - 0
-  train_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/S2Hand/
-  train_label_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/LabelHand
-  val_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/S2Hand/
-  val_label_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/LabelHand
-  test_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/S2Hand/
-  test_label_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/LabelHand
-  # these must be obtained by running terratorch/examples/scripts/convert_sen1floods11_splits.py on the original split csv files
-  train_split: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/splits/flood_handlabeled/flood_train_data.txt
-  test_split: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/splits/flood_handlabeled/flood_test_data.txt
-  val_split: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/splits/flood_handlabeled/flood_valid_data.txt
-  img_grep: "*_S2Hand.tif"
-  label_grep: "*_LabelHand.tif"
-  no_label_replace: -1
-  no_data_replace: 0
-means:
-  - 0.1412956
-  - 0.13795798
-  - 0.12353792
-  - 0.30902815
-  - 0.2044958
-  - 0.11912015
-stds:
-  - 0.07406382
-  - 0.07370365
-  - 0.08692279
-  - 0.11798815
-  - 0.09772074
-  - 0.07659938
-num_classes: 2
diff --git a/benchmark/resources/dataset_specifications/sen1floods11_transforms.yaml b/benchmark/resources/dataset_specifications/sen1floods11_transforms.yaml
deleted file mode 100644
index ffea683..0000000
--- a/benchmark/resources/dataset_specifications/sen1floods11_transforms.yaml
+++ /dev/null
@@ -1,67 +0,0 @@
-class_path: terratorch.datamodules.GenericNonGeoSegmentationDataModule
-init_args:
-  batch_size: 8
-  num_workers: 4
-  constant_scale: 0.0001
-  dataset_bands:
-      - COASTAL_AEROSOL
-      - BLUE
-      - GREEN
-      - RED
-      - RED_EDGE_1
-      - RED_EDGE_2
-      - RED_EDGE_3
-      - NIR_BROAD
-      - NIR_NARROW
-      - WATER_VAPOR
-      - CIRRUS
-      - SWIR_1
-      - SWIR_2
-  output_bands:
-    - BLUE
-    - GREEN
-    - RED
-    - NIR_NARROW
-    - SWIR_1
-    - SWIR_2
-  rgb_indices:
-    - 2
-    - 1
-    - 0
-  train_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/S2Hand/
-  train_label_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/LabelHand
-  val_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/S2Hand/
-  val_label_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/LabelHand
-  test_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/S2Hand/
-  test_label_data_root: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/data/flood_events/HandLabeled/LabelHand
-  # these must be obtained by running terratorch/examples/scripts/convert_sen1floods11_splits.py on the original split csv files
-  train_split: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/splits/flood_handlabeled/flood_train_data.txt
-  test_split: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/splits/flood_handlabeled/flood_test_data.txt
-  val_split: /dccstor/geofm-finetuning/datasets/sen1floods11/v1.1/splits/flood_handlabeled/flood_valid_data.txt
-  img_grep: "*_S2Hand.tif"
-  label_grep: "*_LabelHand.tif"
-  no_label_replace: -1
-  no_data_replace: 0
-  train_transform:
-    - class_path: albumentations.HorizontalFlip
-      init_args:
-        p: 0.5
-    - class_path: albumentations.VerticalFlip
-      init_args:
-        p: 0.5
-    - class_path: ToTensorV2
-  means:
-    - 0.1412956
-    - 0.13795798
-    - 0.12353792
-    - 0.30902815
-    - 0.2044958
-    - 0.11912015
-  stds:
-    - 0.07406382
-    - 0.07370365
-    - 0.08692279
-    - 0.11798815
-    - 0.09772074
-    - 0.07659938
-  num_classes: 2
diff --git a/benchmark/tests/__init__.py b/benchmark/tests/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/pyproject.toml b/pyproject.toml
index 0ce4837..55d2302 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,6 @@ classifiers = [
 readme = "README.md"
 
 dependencies = [
-# ObjectDetection is not supported on terratorch==1.0.2, so iterate relies on main branch
 "terratorch>=1.1.0", 
 # "terratorch",
 # requests>=2.32.0 because of this vulnerability https://github.com/psf/requests/security/advisories/GHSA-9wx4-h78v-vm56
diff --git a/benchmark/__init__.py b/terratorch_iterate/__init__.py
similarity index 100%
rename from benchmark/__init__.py
rename to terratorch_iterate/__init__.py
diff --git a/benchmark/backbone_benchmark.py b/terratorch_iterate/backbone_benchmark.py
similarity index 98%
rename from benchmark/backbone_benchmark.py
rename to terratorch_iterate/backbone_benchmark.py
index 4c6a503..412d245 100644
--- a/benchmark/backbone_benchmark.py
+++ b/terratorch_iterate/backbone_benchmark.py
@@ -17,16 +17,16 @@
 from optuna.samplers import BaseSampler, RandomSampler
 from tabulate import tabulate
 import pickle
-from benchmark.benchmark_types import (
+from terratorch_iterate.iterate_types import (
     Defaults,
     ParameterBounds,
     Task,
     combine_with_defaults,
     optimization_space_type,
 )
-from benchmark.model_fitting import fit_model, fit_model_with_hparams
-from benchmark.repeat_best_experiment import rerun_best_from_backbone
-from benchmark.utils import (
+from terratorch_iterate.model_fitting import fit_model, fit_model_with_hparams
+from terratorch_iterate.repeat_best_experiment import rerun_best_from_backbone
+from terratorch_iterate.utils import (
     check_existing_task_parent_runs,
     check_existing_experiments,
     unflatten,
diff --git a/benchmark/benchmark_ray.py b/terratorch_iterate/benchmark_ray.py
similarity index 97%
rename from benchmark/benchmark_ray.py
rename to terratorch_iterate/benchmark_ray.py
index 4d3767e..0c7ea5c 100644
--- a/benchmark/benchmark_ray.py
+++ b/terratorch_iterate/benchmark_ray.py
@@ -14,15 +14,15 @@
 from ray.tune.search.optuna import OptunaSearch
 from tabulate import tabulate
 
-from benchmark.backbone_benchmark import parse_optimization_space
-from benchmark.benchmark_types import (
+from terratorch_iterate.backbone_benchmark import parse_optimization_space
+from terratorch_iterate.iterate_types import (
     Defaults,
     Task,
     TrainingSpec,
     combine_with_defaults,
     optimization_space_type,
 )
-from benchmark.model_fitting import fit_model, ray_tune_model, valid_task_types
+from terratorch_iterate.model_fitting import fit_model, ray_tune_model, valid_task_types
 
 
 def benchmark_backbone_on_task(
diff --git a/benchmark/config_util/__init__.py b/terratorch_iterate/config_util/__init__.py
similarity index 100%
rename from benchmark/config_util/__init__.py
rename to terratorch_iterate/config_util/__init__.py
diff --git a/benchmark/config_util/build_iterate_config.py b/terratorch_iterate/config_util/build_iterate_config.py
similarity index 99%
rename from benchmark/config_util/build_iterate_config.py
rename to terratorch_iterate/config_util/build_iterate_config.py
index 700ee89..e45d8c7 100644
--- a/benchmark/config_util/build_iterate_config.py
+++ b/terratorch_iterate/config_util/build_iterate_config.py
@@ -2,7 +2,7 @@
 import yaml
 import pandas as pd
 import click
-from benchmark.benchmark_types import (
+from terratorch_iterate.iterate_types import (
     TaskTypeEnum,
 )
 from copy import deepcopy
diff --git a/benchmark/benchmark_types.py b/terratorch_iterate/iterate_types.py
similarity index 100%
rename from benchmark/benchmark_types.py
rename to terratorch_iterate/iterate_types.py
diff --git a/benchmark/main.py b/terratorch_iterate/main.py
similarity index 98%
rename from benchmark/main.py
rename to terratorch_iterate/main.py
index 51ed939..8b23c4b 100644
--- a/benchmark/main.py
+++ b/terratorch_iterate/main.py
@@ -4,15 +4,15 @@
 from pathlib import Path
 from jsonargparse import ArgumentParser
 import pandas as pd
-from benchmark.backbone_benchmark import benchmark_backbone
-from benchmark.benchmark_types import Defaults, Task
-from benchmark.repeat_best_experiment import rerun_best_from_backbone
-from benchmark.utils import (
+from terratorch_iterate.backbone_benchmark import benchmark_backbone
+from terratorch_iterate.iterate_types import Defaults, Task
+from terratorch_iterate.repeat_best_experiment import rerun_best_from_backbone
+from terratorch_iterate.utils import (
     get_logger,
     import_custom_modules,
     get_results_and_parameters,
 )
-from benchmark.config_util import build_iterate_config
+from terratorch_iterate.config_util import build_iterate_config
 
 
 def _summarize(
diff --git a/benchmark/model_fitting.py b/terratorch_iterate/model_fitting.py
similarity index 99%
rename from benchmark/model_fitting.py
rename to terratorch_iterate/model_fitting.py
index 10402cf..5735c58 100644
--- a/benchmark/model_fitting.py
+++ b/terratorch_iterate/model_fitting.py
@@ -45,7 +45,7 @@
 from torchgeo.datamodules import BaseDataModule
 from torchgeo.trainers import BaseTask
 
-from benchmark.benchmark_types import (
+from terratorch_iterate.iterate_types import (
     ParameterBounds,
     ParameterTypeEnum,
     TrainingSpec,
@@ -55,7 +55,7 @@
 )
 
 
-from benchmark.utils import get_logger
+from terratorch_iterate.utils import get_logger
 
 LOGGER = get_logger()
 
diff --git a/benchmark/module.py b/terratorch_iterate/module.py
similarity index 100%
rename from benchmark/module.py
rename to terratorch_iterate/module.py
diff --git a/benchmark/plot_tools.py b/terratorch_iterate/plot_tools.py
similarity index 100%
rename from benchmark/plot_tools.py
rename to terratorch_iterate/plot_tools.py
diff --git a/benchmark/repeat_best_experiment.py b/terratorch_iterate/repeat_best_experiment.py
similarity index 99%
rename from benchmark/repeat_best_experiment.py
rename to terratorch_iterate/repeat_best_experiment.py
index 6fbcf3f..3a168dd 100644
--- a/benchmark/repeat_best_experiment.py
+++ b/terratorch_iterate/repeat_best_experiment.py
@@ -24,13 +24,13 @@
 
 from lightning.pytorch.loggers.mlflow import MLFlowLogger
 import time
-from benchmark.benchmark_types import (
+from terratorch_iterate.iterate_types import (
     Defaults,
     Task,
     TrainingSpec,
     combine_with_defaults,
 )
-from benchmark.model_fitting import (
+from terratorch_iterate.model_fitting import (
     get_default_callbacks,
     inject_hparams,
     valid_task_types,
diff --git a/benchmark/utils.py b/terratorch_iterate/utils.py
similarity index 99%
rename from benchmark/utils.py
rename to terratorch_iterate/utils.py
index d2dd533..1cf0e38 100644
--- a/benchmark/utils.py
+++ b/terratorch_iterate/utils.py
@@ -10,12 +10,11 @@
 from matplotlib import pyplot as plt
 from ast import literal_eval
 import optuna
-from benchmark.benchmark_types import Task
-from benchmark import plot_tools
+from terratorch_iterate.iterate_types import Task
+from terratorch_iterate import plot_tools
 import sys
 from mlflow.entities.experiment import Experiment
 import importlib
-import logging
 
 N_TRIALS_DEFAULT = 16
 REPEATED_SEEDS_DEFAULT = 10
diff --git a/tests/integration/test_main.py b/tests/integration/test_main.py
index 7219d4b..acc342d 100644
--- a/tests/integration/test_main.py
+++ b/tests/integration/test_main.py
@@ -2,7 +2,7 @@
 from pathlib import Path
 
 import yaml
-from benchmark.main import main
+from terratorch_iterate.main import main
 import pytest
 import sys
 
diff --git a/tests/unit/test_build_geobench_configs.py b/tests/unit/test_build_geobench_configs.py
index bdb0337..a75ae55 100644
--- a/tests/unit/test_build_geobench_configs.py
+++ b/tests/unit/test_build_geobench_configs.py
@@ -1,7 +1,7 @@
 from pathlib import Path
 import pytest
 import yaml
-from benchmark.config_util.build_iterate_config import generate_iterate_config
+from terratorch_iterate.config_util.build_iterate_config import generate_iterate_config
 from deepdiff import DeepDiff
 import logging
 
diff --git a/tests/unit/test_tasktypeenum.py b/tests/unit/test_tasktypeenum.py
index 24ce069..1dfb669 100644
--- a/tests/unit/test_tasktypeenum.py
+++ b/tests/unit/test_tasktypeenum.py
@@ -1,4 +1,4 @@
-from benchmark.benchmark_types import TaskTypeEnum
+from terratorch_iterate.iterate_types import TaskTypeEnum
 import pytest
 from terratorch.tasks.base_task import TerraTorchTask
 from terratorch.tasks.classification_tasks import ClassificationTask

From 0a741f9bcabba600e920fe452f57e630a68b3e25 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Fri, 3 Oct 2025 13:53:38 -0300
Subject: [PATCH 36/40] change version

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 pyproject.toml |  3 +--
 run_tests.py   | 16 ++++++++++------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 55d2302..c56db55 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,7 @@ include = ["benchmark*"]
 [project]
 
 name = "terratorch-iterate"
-version = "0.1.5"
+version = "0.2.0"
 requires-python = ">= 3.9"
 description = "A terratorch's plugin for benchmarking and hyperparameter optimization"
 authors = [
@@ -38,7 +38,6 @@ readme = "README.md"
 
 dependencies = [
 "terratorch>=1.1.0", 
-# "terratorch",
 # requests>=2.32.0 because of this vulnerability https://github.com/psf/requests/security/advisories/GHSA-9wx4-h78v-vm56
 "requests>=2.32.0",
 # Jinja2 vulnerability issue https://github.com/pallets/jinja/security/advisories/GHSA-h75v-3vvj-5mfj
diff --git a/run_tests.py b/run_tests.py
index 64489b0..60fa0b1 100644
--- a/run_tests.py
+++ b/run_tests.py
@@ -6,13 +6,17 @@
 import logging
 import sys
 
-logging.basicConfig(
-    stream=sys.stdout,
-    level=logging.DEBUG,
-    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-)
-
 logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)  # Set appropriate level
+
+# Create a StreamHandler that writes to stdout
+ch = logging.StreamHandler(sys.stdout)
+ch.setLevel(logging.DEBUG)  # Set appropriate level for the handler
+
+formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+ch.setFormatter(formatter)
+
+logger.addHandler(ch)
 
 # rm geobench_v1_prithvi* && bsub -e ~/geobench_v1_prithvi.err -o ~/geobench_v1_prithvi.out -M 40G -gpu "num=1/task:mode=exclusive_process:gmodel=NVIDIAA100_SXM4_80GB" terratorch iterate --hpo --config configs/geobench_v1_prithvi.yaml
 

From f3a4821893a01718f528acf913b4a851457c4381 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Fri, 3 Oct 2025 14:55:03 -0300
Subject: [PATCH 37/40] add tox file

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 pyproject.toml | 1 -
 tox.ini        | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index c56db55..3e9e4b3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -71,7 +71,6 @@ dependencies = [
 "configspace",
 "optuna-integration",
 "seaborn",
-"torchgeo",
 "psutil",
 ]
 
diff --git a/tox.ini b/tox.ini
index 68ae8fd..67af1c7 100644
--- a/tox.ini
+++ b/tox.ini
@@ -9,8 +9,8 @@ skip_missing_interpreters = false
 description = run code style
 skip_install = true
 deps =
-    black
-commands = black {posargs:.}
+    ruff
+commands = ruff format {posargs:.}
  
 [testenv:lint]
 description = run linters

From 67d092599e3c81ecd42e9c81344480cc4a45a0e5 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Fri, 3 Oct 2025 15:29:02 -0300
Subject: [PATCH 38/40] minor modifications

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 pyproject.toml                      | 8 ++++----
 terratorch_iterate/model_fitting.py | 1 -
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3e9e4b3..e36edb0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,7 +14,7 @@ include = ["benchmark*"]
 
 name = "terratorch-iterate"
 version = "0.2.0"
-requires-python = ">= 3.9"
+requires-python = ">= 3.10"
 description = "A terratorch's plugin for benchmarking and hyperparameter optimization"
 authors = [
       { name = "Carlos Gomes"},
@@ -62,7 +62,6 @@ dependencies = [
 "importlib-metadata",
 "numpy",
 "optuna",
-"tabulate",
 "types-tabulate",
 "ray",
 "gputil",
@@ -72,6 +71,7 @@ dependencies = [
 "optuna-integration",
 "seaborn",
 "psutil",
+"tabulate>=0.9.0",
 ]
 
 [project.urls]
@@ -105,12 +105,12 @@ nvidia = ["pynvml"]
 amd = ["pyrsmi"]
 
 [tool.black]
-target-version = ["py310"]
+target-version = ["py312"]
 line-length = 88
 skip-string-normalization = true
 
 [project.scripts]
-iterate = "benchmark.main:main"
+iterate = "terratorch_iterate.main:main"
 # ray_benchmark = "benchmark.benchmark_ray:main"
 # repeat_experiments = "benchmark.main:main"
 
diff --git a/terratorch_iterate/model_fitting.py b/terratorch_iterate/model_fitting.py
index 5735c58..8ef1819 100644
--- a/terratorch_iterate/model_fitting.py
+++ b/terratorch_iterate/model_fitting.py
@@ -6,7 +6,6 @@
 import copy
 import dataclasses
 import importlib
-import logging
 import os
 import shutil
 import types

From e6a84ad1bfe401a53aa4ab97cb191666aba6d092 Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Fri, 3 Oct 2025 15:39:37 -0300
Subject: [PATCH 39/40] torchgeo supports >=3.11

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 .github/workflows/python-package.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 31ca6a7..3da48e6 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.10", "3.11", "3.12"]
+        python-version: ["3.11", "3.12", "3.13"]
 
     steps:
     - uses: actions/checkout@v5

From dd1c11fa8b5b22cb43e09be917a195adacc5561a Mon Sep 17 00:00:00 2001
From: "Leonardo P. Tizzei" <ltizzei@br.ibm.com>
Date: Fri, 3 Oct 2025 15:45:01 -0300
Subject: [PATCH 40/40] update tox

Signed-off-by: Leonardo P. Tizzei <ltizzei@br.ibm.com>
---
 tox.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tox.ini b/tox.ini
index 67af1c7..21e49b6 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,7 +1,7 @@
 [tox]
 requires =
     tox>=4.23.0
-env_list = 3.1{2,1,0}, lint, style
+env_list = 3.1{3,2,1}, lint, style
 isolated_build = true
 skip_missing_interpreters = false