From 0b6e08b937b37feb91790489854af94c2a7ebe8f Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Wed, 8 Apr 2026 15:02:56 +0200
Subject: [PATCH 01/16] add complex hpo sets

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 configs/gridfm_graphkit_hpo.yaml |  47 +++++++
 docs/iterate2.md                 | 218 ++++++++++++++++++++++++++++---
 terratorch_iterate/iterate2.py   |  27 +++-
 3 files changed, 270 insertions(+), 22 deletions(-)
 create mode 100644 configs/gridfm_graphkit_hpo.yaml

diff --git a/configs/gridfm_graphkit_hpo.yaml b/configs/gridfm_graphkit_hpo.yaml
new file mode 100644
index 0000000..65073ed
--- /dev/null
+++ b/configs/gridfm_graphkit_hpo.yaml
@@ -0,0 +1,47 @@
+# HPO configuration for gridfm-graphkit HGNS PF case2000
+#
+# Hyperparameters:
+#   gpu_num   – number of GPUs to request from the WLM (launcher-level)
+#   bfloat16  – boolean flag (presence = --bfloat16, absence = flag omitted)
+#   tf32      – boolean flag (presence = --tf32, absence = flag omitted)
+#   compile   – torch.compile mode; null disables the flag entirely
+#   dataset   – group: selects config + data_path + exp_name together
+#
+# Static args:
+#   all other fixed CLI args
+
+hpo:
+  gpu_num:
+    type: categorical
+    choices: [1, 2, 4]
+
+  bfloat16:
+    type: flag               # store_true: true → --bfloat16, false → flag omitted
+
+  tf32:
+    type: flag               # store_true: true → --tf32, false → flag omitted
+
+  compile:
+    type: categorical
+    choices: ["max-autotune", "default", "reduce-overhead", null]
+    # null → --compile flag is omitted entirely
+
+  dataset:
+    type: group             # one choice selects all bundled args together
+    choices:
+      case2000:
+        config: ./examples/config/HGNS_PF_datakit_case2000.yaml
+        data_path: /dccstor/gridfm/powermodels_data/v4/finetuning/pf/
+        exp_name: case2000
+      case1000:
+        config: ./examples/config/HGNS_PF_datakit_case1000.yaml
+        data_path: /dccstor/gridfm/powermodels_data/v4/finetuning/pf/
+        exp_name: case1000
+
+static:
+  run_name: run1
+  log_dir: logs
+  dataset_wrapper: SharedMemoryCacheDataset
+  plugins: gridfm_graphkit_ee
+  num_workers: 32
+  dataset_wrapper_cache_dir: /dccstor/terratorch/case2000_ieeecache
diff --git a/docs/iterate2.md b/docs/iterate2.md
index 3429984..8922d52 100644
--- a/docs/iterate2.md
+++ b/docs/iterate2.md
@@ -1,6 +1,14 @@
 # iterate2 – HPO Launcher
 
-`iterate2` is a generic Optuna-based hyperparameter optimisation (HPO) launcher with a pluggable workload-manager backend. It submits one trial per Optuna suggestion, waits for the job to finish, extracts a scalar metric from the job's log file, and returns it to Optuna.
+`iterate2` is a generic Optuna-based hyperparameter optimisation (HPO) launcher with a pluggable workload-manager backend. It submits one trial per Optuna suggestion, waits for the job to finish, extracts one or more metrics from the job's log file, and returns them to Optuna.
+
+Key capabilities:
+
+- **Multi-objective optimisation** — extract and optimise several metrics simultaneously (Pareto front)
+- **Five HPO parameter types** — `float`, `int`, `categorical`, `flag` (store-true), `group` (bundled arg sets)
+- **Dynamic GPU count per trial** — `gpu_num` in the HPO space controls the WLM resource request per trial
+- **Null-omission** — `null` in a `categorical` choice causes the flag to be completely absent from the command line
+- **Workload manager backends** — LSF, Slurm, or direct local execution
 
 ## Quick start
 
@@ -73,7 +81,107 @@ static:
   dataset_path: /data/my_dataset
 ```
 
-Supported parameter types: `float`, `int`, `categorical`.
+Supported parameter types: `float`, `int`, `categorical`, `flag`, `group`.
+
+#### Parameter types
+
+##### `float`
+
+Suggests a floating-point value between `low` and `high`. Set `log: true` for log-uniform sampling.
+
+```yaml
+learning_rate:
+  type: float
+  low: 1e-5
+  high: 1e-2
+  log: true
+```
+
+Generates: `--learning-rate 0.0003`
+
+##### `int`
+
+Suggests an integer between `low` and `high` (inclusive).
+
+```yaml
+encoder_depth:
+  type: int
+  low: 2
+  high: 8
+```
+
+Generates: `--encoder-depth 4`
+
+##### `categorical`
+
+Suggests one value from a list of choices. Choices can be strings, numbers, or `null`.
+
+```yaml
+batch_size:
+  type: categorical
+  choices: [16, 32, 64]
+```
+
+Generates: `--batch-size 32`
+
+**`null` omits the flag entirely.** Useful for optional flags like `--compile`:
+
+```yaml
+compile:
+  type: categorical
+  choices: ["max-autotune", "default", null]
+  # null → --compile is completely absent from the command
+```
+
+##### `flag`
+
+Models a `store_true`-style flag that takes no value — its presence or absence is the parameter. `true` adds the flag; `false` omits it.
+
+```yaml
+bfloat16:
+  type: flag   # true → --bfloat16   false → (omitted)
+
+tf32:
+  type: flag   # true → --tf32       false → (omitted)
+```
+
+!!! note
+    Use unquoted YAML `true`/`false` for `flag` and for boolean values in `categorical.choices`.
+    Use **quoted** `"true"`/`"false"` when the wrapped script expects the literal string as a value (e.g. `--amp true`).
+
+##### `group`
+
+Bundles several CLI arguments together under a single Optuna categorical parameter. Optuna picks one group name; `iterate2` then injects all key/value pairs from that group into the trial's argument list. This is useful when multiple arguments are co-dependent (e.g. config file + dataset path + experiment name).
+
+```yaml
+dataset:
+  type: group
+  choices:
+    case2000:
+      config: ./examples/config/model_case2000.yaml
+      data_path: /data/pf/
+      exp_name: case2000
+    case1000:
+      config: ./examples/config/model_case1000.yaml
+      data_path: /data/pf/
+      exp_name: case1000
+```
+
+Optuna tracks the choice as a single categorical (`dataset = "case2000"`), but the wrapped script receives:
+
+```
+--config ./examples/config/model_case2000.yaml --data-path /data/pf/ --exp-name case2000
+```
+
+##### `gpu_num` — dynamic GPU count
+
+The special key `gpu_num` (as `categorical` or `int`) overrides `--gpu-count` for the **WLM resource request** of each individual trial. It is consumed by `iterate2` and never forwarded to the wrapped script.
+
+```yaml
+gpu_num:
+  type: categorical
+  choices: [1, 2, 4]
+```
 
 ### Static arguments
 
@@ -86,13 +194,41 @@ Arguments passed unchanged to every trial. Can be supplied inline or via file:
 
 If neither is provided, `iterate2` falls back to the `static` section of `--hpo-yaml`.
 
+Static boolean values follow the same rule as HPO values: unquoted `true` produces a bare flag (`--flag`), unquoted `false` omits it.
+
+```yaml
+static:
+  max_epochs: 50
+  tf32: true      # → --tf32  (store_true flag, always present)
+  debug: false    # → (omitted)
+```
+
 ### Metric extraction
 
 | Option | Default | Description |
 |---|---|
-| `--metric` | `val/F1_Score` | Metric name to extract from the trial's stdout log |
+| `--metrics` | `score_combined` | Comma-separated list of metric names to extract from the trial's stdout log |
+
+The **last** occurrence of the pattern `<metric_name>: <value>` or `<metric_name>= <value>` is used for each metric. If a metric is not found, it defaults to `0.0` with a warning.
+
+**Single metric (single-objective):**
 
-The last occurrence of the pattern `<metric_name>: <value>` or `<metric_name>= <value>` is used.
+```sh
+--metrics val_loss
+```
+
+**Multiple metrics (multi-objective, Pareto front):**
+
+```sh
+--metrics score_linear_acc,score_modality_leak,score_combined
+```
+
+All objectives are maximised. `iterate2` prints the Pareto-front trials at the end:
+
+```
+Trial 12: Values=[0.873, 0.041, 0.791]
+Trial 17: Values=[0.901, 0.038, 0.812]
+```
 
 ---
 
@@ -116,7 +252,7 @@ iterate2 --param-setter set ...
 | `--param-setter set` | `--set learning_rate 0.001 --set batch_size 32` |
 
 !!! note
-    In setter style, boolean parameters are passed explicitly as `--set flag true` / `--set flag false` rather than as bare flags (`--flag`), since there is no named flag to toggle.
+    In setter style, `flag` parameters are passed as `--set flag` (key only, no value) when `true`, and omitted when `false`.
 
 ---
 
@@ -183,37 +319,81 @@ Not yet implemented.
 
 ## Example HPO YAML
 
+Full example combining all parameter types:
+
 ```yaml
 # hpo_space.yaml
 hpo:
+  # float – log-uniform over [1e-5, 1e-2]
   learning_rate:
     type: float
     low: 1e-5
     high: 1e-2
     log: true
-  weight_decay:
-    type: float
-    low: 1e-6
-    high: 1e-3
-    log: true
+
+  # int – encoder depth
+  encoder_depth:
+    type: int
+    low: 2
+    high: 8
+
+  # categorical – batch size
+  batch_size:
+    type: categorical
+    choices: [16, 32, 64]
+
+  # categorical with null – compile mode (null omits --compile entirely)
+  compile:
+    type: categorical
+    choices: ["max-autotune", "default", null]
+
+  # flag – store_true style (--bfloat16 present or absent)
+  bfloat16:
+    type: flag
+
+  # flag – store_true style (--tf32 present or absent)
+  tf32:
+    type: flag
+
+  # gpu_num – controls WLM resource request per trial (not forwarded to script)
+  gpu_num:
+    type: categorical
+    choices: [1, 2, 4]
+
+  # group – bundles co-dependent args; Optuna picks one group by name
+  dataset:
+    type: group
+    choices:
+      case2000:
+        config: ./examples/config/model_case2000.yaml
+        data_path: /data/pf/
+        exp_name: case2000
+      case1000:
+        config: ./examples/config/model_case1000.yaml
+        data_path: /data/pf/
+        exp_name: case1000
 
 static:
-  max_epochs: 30
-  config: configs/my_model.yaml
+  max_epochs: 50
+  log_dir: logs
+  num_workers: 16
 ```
 
 Launch with:
 
 ```sh
 iterate2 \
-  --script terratorch_iterate/main.py \
+  --script gridfm_graphkit \
+  --interpreter "" \
+  --root-dir /path/to/project \
+  --venv /path/to/venv \
   --wlm lsf \
   --lsf-gpu-config-string "num=1:mode=exclusive_process:mps=yes:gmodel=NVIDIAA100_SXM4_80GB" \
-  --cpu-count 20 \
-  --mem-gb 512 \
-  --optuna-study-name geobench_hpo \
-  --optuna-db-path sqlite:///geobench_hpo.db \
-  --optuna-n-trials 40 \
+  --cpu-count 16 \
+  --mem-gb 64 \
+  --optuna-study-name my_study \
+  --optuna-db-path sqlite:///my_study.db \
+  --optuna-n-trials 50 \
   --hpo-yaml hpo_space.yaml \
-  --metric "val/F1_Score"
+  --metrics val_loss,val_f1
 ```
diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py
index eef5069..516bd2f 100644
--- a/terratorch_iterate/iterate2.py
+++ b/terratorch_iterate/iterate2.py
@@ -85,11 +85,18 @@ def build_shell_command(interpreter, root_dir, script_path, venv, script_args, p
     arg_list = [f"{interpreter} {script_path}"]
     for key, value in script_args.items():
         arg_name = key.replace("_", "-")
+        if value is None:
+            continue  # None means "omit this flag" (e.g. compile: null disables --compile)
         if param_setter:
-            arg_list.append(f"--{param_setter} {key} {value}")
+            if isinstance(value, bool):
+                if value: arg_list.append(f"--{param_setter} {key}")
+                # False → omit entirely
+            else:
+                arg_list.append(f"--{param_setter} {key} {value}")
         else:
             if isinstance(value, bool):
                 if value: arg_list.append(f"--{arg_name}")
+                # False → flag is simply absent
             else:
                 arg_list.append(f"--{arg_name} {value}")
     parts.append(" ".join(arg_list))
@@ -139,6 +146,12 @@ def suggest_from_spec(trial, name, spec):
     if t == "float": return trial.suggest_float(name, float(spec["low"]), float(spec["high"]), log=spec.get("log", False))
     if t == "int": return trial.suggest_int(name, int(spec["low"]), int(spec["high"]), log=spec.get("log", False))
     if t == "categorical": return trial.suggest_categorical(name, spec["choices"])
+    if t == "flag":
+        # store_true style: True → --name present, False → flag omitted entirely
+        return trial.suggest_categorical(name, [True, False])
+    if t == "group":
+        # Suggests one of the named group keys; caller expands the key → dict of args
+        return trial.suggest_categorical(name, list(spec["choices"].keys()))
     raise ValueError(f"Unknown param type: {t}")
 
 def main():
@@ -152,13 +165,21 @@ def main():
     def objective(trial):
         script_args = static_args.copy()
         for name, spec in hpo_space.items():
-            script_args[name] = suggest_from_spec(trial, name, spec)
+            val = suggest_from_spec(trial, name, spec)
+            if spec["type"] == "group":
+                # Expand the chosen group's key→value pairs directly into script_args
+                script_args.update(spec["choices"][val])
+            else:
+                script_args[name] = val
+
+        # gpu_num in hpo/static overrides the CLI --gpu-count for this trial's launcher
+        gpu_count = int(script_args.pop("gpu_num", args.gpu_count))
 
         out_file = f"trial_{trial.number}.out"
         err_file = f"trial_{trial.number}.err"
 
         shell_cmd = build_shell_command(args.interpreter, root_dir, script_path, args.venv, script_args, args.param_setter)
-        launcher_cmd = build_launcher_command(args.wlm, shell_cmd, trial.number, out_file, err_file, args.gpu_count, args.cpu_count, args.mem_gb, args.lsf_gpu_config_string)
+        launcher_cmd = build_launcher_command(args.wlm, shell_cmd, trial.number, out_file, err_file, gpu_count, args.cpu_count, args.mem_gb, args.lsf_gpu_config_string)
 
         print(f"Trial {trial.number}: Running...")
         subprocess.run(launcher_cmd, shell=True, check=True)

From b4fc64a42250b84db13ae629822003120a1d6152 Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Wed, 8 Apr 2026 17:05:41 +0200
Subject: [PATCH 02/16] add debug statements

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 examples/iterate_study.db      | Bin 0 -> 172032 bytes
 terratorch_iterate/iterate2.py | 158 ++++++++++++++++++++++++---------
 2 files changed, 117 insertions(+), 41 deletions(-)
 create mode 100644 examples/iterate_study.db

diff --git a/examples/iterate_study.db b/examples/iterate_study.db
new file mode 100644
index 0000000000000000000000000000000000000000..5d1050d429bc9c4cb8bbf47bf5b76bd9d6c499c9
GIT binary patch
literal 172032
zcmeHw2V51$`t~kx`Z>EcK*Rzl3JQ37LFFuRP*Frd>;+M<q9~y5uBJdtj3vF_bTuZX
zs!2>wOf=o(ruTMJG|4r^Xqp=Hy)(0`5f10xf4*Pl`<rjLx%0j=yFSm(GrQAvcFwr+
z()mrbX>;lu7F9Q;dAJT7&vQ~*8pp+zavawm{;&Uu;3Vh~u>2qVc3i;7Ti3~eP8N@%
zbGj1K&gm?fDZmt93NQtj0!#s>08@Y|z!YE#Fa?+bOo6{Zfn=KlP<e4b2mi4@rT|lb
zDZmt93NQtj0!#s>08@Y|z!YE#Fa=I;1(NxA&ev4i&`{k}-!OY#)x0J3Rg0?|tK3$|
z7cYLv!GG+JDZmt93NQtj0!#s>08@Y|z!YE#Fa?+bOo7u}f#i6fCEEC(y#GJUiHA?~
z46@5-3NQtj0!#s>08@Y|z!YE#Fa?+bOaZ0<Q{WT@tb78;C)>Sw!8|7!{~zSUgQuLr
zB1{3M08@Y|z!YE#Fa?+bOaZ0<Q-CSJ6krOR&I&kq1J}c19$h%8WOT`tVzU1KD985Y
zJDm%aT{}~NDZmt93NQtj0!#s>08@Y|z!YE#Fa_FGAR)nGwxIX_u<1YhV+t?@m;y`z
zrT|lbDZmt93NQtj0!#s>08`*^tN?od&&L0M;{s+`V+t?@m;y`zrT|lbDZmt93NQtj
z0!#s>04TuT|FheGDZmt93NQtj0!#s>08@Y|z!YE#Fa?+be}4to`~Sax8M7QR1(*U%
z0j2;`fGNNfU<xn=m;y`zrT|mG020KBKOrIdi${sWRACA*1(*U%0j2;`fGNNfU<xn=
zm;y`zrT|mmZ=yh-rI3-{A@u&g!SO679ut2Se-OVBzZ5?cKN8;+-w^*Pz92p=?h+pn
zcZ%D^JH=bYjpFs<)#7F1h2mOqwRo1Oip#_$;zDtbSS?NyCyEu~XmNyCBo>Gv(JKxV
z`-_=knwTmki5*3|XcFUu--KU;L&CShSHkDQ$HIHUo5Cx?i^4O)lft9I1HumBF5xy|
zlW>D@jc~bek+4oUS2$Z(DJ&P33U$I<VU{pmm?Vr7%7hXjB7}u}!6!I{0YYCPUFa?(
z3!MaqU>4#X#~r^q4m-YceC_zc@rmPo$6Jn99eW{@?2jqH6krN41(*U%0j2;`fGNNf
zU<#c63M9l^_y!a#MZppj)T3ZA3KpTD4h0KQumA<~Q7{h$b5Sq{1+^%sLBVVk%tApm
z3TC393I#JzFdYTcP%sq*Q&2D&1(Q%P5d{-aFdhY!C>V!=3KW#1U@Qv8pr8x|qft<b
zf>9_KiGmUoj6lI~6bwT_F$y9mC_=$d6cnOBLO~b>1t=JTg25;lgo1n&gisJfK>!7Q
z6!=i!MS%weZWOps;6%Yd6y%{G7X>*e7=VKQD9A=Z77F^Ipf3tCQP2kky-|>Xf^-z5
zp`aHEdZM5Q3c8~p6$RZ;kb;7)C`d*@5(*Mg&;<pZQP2qm9Z}E$1tJOr6gW^|M}Z9m
zRuouJU`Bxn1x6GYP>_Ivcof9N$6MI`|EK@6&vL*NU<xn=m;y`zrT|lbDZmt93NQtj
z0!)Gbr~v!^|36B^PGAZ!1(*U%0j2;`fGNNfU<xn=m;y`zrod^h0Nelnv@b%I0j2;`
zfGNNfU<xn=m;y`zrT|lbDZmt93j9X}*!%zgC=ok>DZmt93NQtj0!#s>08@Y|z!YE#
zFa?+br@aDb|9`89x5mpx`~Qmv#ea)mh#$iafUk=$kpriFCRqlU0!#s>08@Y|z!YE#
zFa?+bOaZ0<Q-CS(w^1OgWi;faUrP>zeOqi0W;QQ?Fr&Ey!t~}W2-BKP5cZk_LC**T
zJu=|^KWx@`1}7d74~YlF&&3a5=l{Lp9`RA}KG^ervv{3&xwrv#`(GtCiFINP?DJnK
zmWmN^FzoQ3BlZ!yi(O!E{{-Qfa9B71yZV11ydmrr_P~Dr_X&3jn}zFOC;tt?YGIYo
z1bg_`2-Ad0p%ixSA1ru<9H9^F+uud73ki;6uw(xL$LEd@9B;s0{d*jbI_`7a>DcVJ
z&T+Y8gJZR0m7~c~=csW^b5uG?9TCT1hu4wg=;P?_=;E+D670w9hwTUKpW8pMzhU2N
z-(!E&exLnL`)2!f_RH-X?5pjo>`nGMdyRdXz0zK4kJty>z4jb?AA5Iu7rWgaZ#!Z;
zX#3LkvF$C}OSV0>M{GN6TWvSluCiTdJJ+_#)@WN`tF}$Djj;{04Y7G`18f<#6k7+I
z$$H%SqxFFGGwXZSSFO)kpRn$<-euivz1DiEb*=Sm>k8{)Ypr#fb)0piwa^;04z%{Q
z_ON!b+N^PwUoGET_FF!(ylL5M*=>2)vfXl<<wnbumJ2MaEh{Yzmid;MmWh@!OR;6J
z#bfDjNw;*hh!&&yH}hfhzs;YT-!;Ete%AcB`F``A=1t~*m@hG(Z$8Vs+`PzKW1ecR
zFqfDmbHJQu&NO#7cQ#wiyy+Lyccy)&4^3~FUNk*rddRfRbgStG(-o!-rgKcHX{l+R
zsme6LG};s~4Kle+*`_p8vPm!*jK_?Jj9(l7Wqil@PvbMj$Bg$G?=WsOUSqu2xW;&<
zahb8sINLbISZ*9)3>*E%Tw@<&s<D&NV&n`z8@@GsVfeuCy5R-GlZFQk_ZYSqt~Xq6
zSZ`=CD2645xrP~r@rF`Eks;sUGGrNg8IlYRLqfvQgdY;VO86w<?Sz*To=$i);ogMX
z6K+npI^m*(^WbJ>e}4}J%n7_0*P!wfh@<f1i6irs*jcLb<A|g16~vMGa_lTo`LV=N
z_%Xzh`7-R(tNdu<D10e#WPTKO7OVV7;wXFxab$i3b{47paN;QZFyhF3F?Q-yK0+LY
zFCvc255>+xl`kZY!b`-F`7m}CsC)r&6n+SCWPUJq=BxZ5;wXGRab!M(op~xBB#y!d
zh$HiU?95epA8{1kOB|W^U}uiXyNRRlF5<|%6FapkKae;IpGO>-&&5uS%I6SA;Rg^$
z=KEu3w#sJ{N8z)GBlG>RGfU<B5=Y@Pi6irUuv4w_y@{jn8N`wKbnMJj`847vd@th2
zd{69DseBLOD13L~$b2exW~h8O;wXFyab&(LcBZR*GI10>i8wN!h@ELF--S2|-<dcv
z-w8WYRlXx}6utv-WM0J16qOf<qwo&m$h;jplU3eE9EG<MN9HZqnWXY&;wZd{I5Ka<
z&P0_r5J%w?h$Hjy*qNa6al}!0*rgYqf(bIuVQ0L`{Z1T(J5C&#`wcslDtC-H3U`z^
zGIs<!<5ccf;waoN#F4q5u~VUPKM_aaek6{}9mY<%${iw(!u>!TnLCJ`u`2gHaTM-5
z;>g^$*cqd8-w;RP4iHD?{*9e7mHV1F3ilOpWbRAsj8?h*#8J3?#F4o#uv4mXpA$#n
zJ|m9IeTtn?D)%qqDBLH+k-3ktGg9R~B96j+NF15_06Qfr_danH?mgnj+`HHrp>ppK
zN8#Qkj?BG<o#86?CUF$*4dTe$>)08la<36b;a(+<%)Nr0VwL+RaTM-l;>g@f*ommz
zUg9X+i^P$+7qC;La?cY-;hrOo%sq>pp(^(baTM-p;>g?{>=dfpZsI81Q^b+EC$S@`
z+%DoM+!Mr+xyP{+R=LNBqi~NBN9G>EPJzljOdN%Kh&VF$Aa;hR+ylf>xShn2x%;s*
zSmo{`j>6qb9GTmJok1$Moj3}&jW{xQ4|eiZ?r!2J++D<xxjV5FQn@>bqj0wqN9ML-
zC#Z6_5l7)}C63H(!A?NsZXu4sZ6=P)ZNiRU<u(#W;ch05%-w_?pUT}x9EH1qI5KxV
zcDyQg9dQ)yTH?suKd|Fbxoe1{a90yY=B~nyTjj1Kj>26*9GSZuJ1&*Gj5rE+DRE@(
z66`ot?qcF7+(pEZxeKu~P~|Qlj>2sqj?AscPM*rGBaXtYC63IUkDXkVTSFX$JC8Ur
zcP@5vRBknK6z&}2$Xp9{2B=&!aTM-s6I4X}0QoFz`>SUXtDHfsyb9ZFbtSQiO02A4
zo2AOcDl3SUmt)&cT}G_ZM6BG1ZC|y4SY;`(@)B$_)p}x;#l*^su<fJP5vwdDR$hQ@
zZ*@Mg$~<D_x!7i?bBI-HiIr=xO;=|VtIQ%+uEsV^ok^@xMXWpn+g|E)VwGvc%2ToJ
zsZJqQnM|xa3ELj(L}HZ*#LDBb?XFf5tBfO7uD~``EhknPORPKw+iq$ZvC3#-<x*@@
z)KSDLBZ-wuu<fdjAXXVptUL_cWVM)BB|@xRgl&>Klvt&ZSXsh0Q4JHT6c8&9!M2M!
zm{?^Hv2s4Poz)PrN|0DNfNdw$Ppsl2R`z1sQS}h3xQUfr*mh8z#3}=cmGiI_)m&ng
z9Af1G*a~WYVwG%S<t%I+YCmF?zQoFz*xJ=T#45dsl{2ulsp-TjX~fFCu(hf^iB)<K
zD|g4%qNWn7bR$+y!PcyHC00o$R!+j!q$U!pbRkymjIB}aM6A-0Sh)kX22~_h5r~x?
z*e0lUVig;)vK8BS)k3UdCRR3K8>bqHRSd++3C0AA$gA<#C~*deS!DG6e~tNa4&M8J
z3UB=Pi1&+I#H+;fMMYd7PKA;F5HU~eC3X;b;RoR}7~4NB>=bSlt`^n`s<047^P`0V
zVW5yEbQIzohhY5vrsEmM1CHAq*ErTWR>FvVyknRn;K*{MIIQ-gFg}0DzSDlA{e1gU
z`xJW-jKC9Z$8Gy<uh<^4ZM3boHQA=a2z#KdtBtpQZGGMPsPz`>g)oYqX)Uq3t*O=o
z%Qu#{EKk5l`4Wq2nQbYx_$)mwCiC|&8h*-r2V{i(F$I_cOaZ0<Q-CROS}KrOW8`?V
zDWxI2{NCR;C%VrbR#H|{S)4eVBrgrOj4-}G@WWkN@+^|PB<$;2^YMAxI4!xFB-e*4
zF8!(IsfYFCnIw5}xY0M@+l^=I*i|HXQCO)y>-y$3dh!gCTo*ob?C)Lk*6PXAN%F$*
z`QJ4@)At%lqnt*P7laplw)MzE&v9DvRFXVDJi#^cE9GK6c?wCM7w#R}!*4lp-IGc3
z+^{r!=cQ+D*3X_qlIMh-Q|zy8GV93`Npfwt@ru6n*-P}~2_(5DJnyH~_uQ1OCyyt|
zv%{-x$y*1+>&cZQc~*FO*fh>jpeK(b$<^W2AN<<%`bIsuf+WujFS+!*wR7G-TgzoR
zNv<mR<`>uZeL4N?u_Sp$c#`LftKJOi$zw?J^zgc=n}=P~Ne3w-$<xA@hPFNVddHa>
z_GpqkHEcdKd4K2xb}31o5?)a_Y2)&bbdXUbd2+a+{I#6dj_b)IN%Ev{%S-Vy-cH%2
zVV98PiQ)4v`Tl}U3nVRh1WBF{KKJH=$B#XsgA6Ch<HIcvy!=#Po1Q$3Bv*!4tapt}
zI*-#pib?Xg@a!MDK3Q^xo*W^`72!oc_y6Hovz}Z;lFP$OAKdcALtFIZp(J^1c*?D-
zmf8pF$%Q0&Ot}2Bi$C$t(32&STo&$JJEgGCE<HI+l1GPASDts@Ym%N^K$1(t?r%qJ
z*nQ&cAtZTJxc9E-`o4LSe)eFJJTlzt_j$XwrRm9oNODOyZsFF3*_@u7Pm)IzeED1O
zm0c&2LnL{4L5JxtNl-s&QUyu!umbP?g4<3e2S{>pLC5ubKRKD~C&`f^hi@MasXB;{
zBo`H=@7nQ)WG_h`I^=^FH=ay(cu=xArI1{K!%d=+7Il$mSc^JIv_OjvB+(&SG>=3F
zlW1Ztikeae75td{&x?ELm*>bKXXR`EXaI?Zv}k`44QkPB5)Ek4EE4r=(S9WA)1rMz
z)T>1^Nz|i7`;e$xi}of_mln++QKuG7C((ggG>t^_v}i99&DEkkNi;``_8`##TC_Wf
z_Sd4RB$};7yOC&?7EK}1ep<9EiT2f^$t0SoMUzOhj}}cN(cW6L3yEfE(at2Au0=bM
zXqp!7NTR*8Xa^GQsYOK+?V&{l678-<9VD8nMeQWoO^ez{G)0SANwljLh1a9#?n~C9
zW)e-(q9zhe)S^Zb?V?2uB-&YvCXi?+EgDav9kpm2iFVMUJc)`VioXA^aTM$C|GyAF
z65keI5uX>I5+8vT0C&M`z>VV7;w9ob@f=tKuuNPm&J|~hlf?>g6s!UmBKpOFVwRXL
zb`v|pIsk)kT=-e|UieD*RCpg|3tkeQ5uOkp6t)Yu3!7jqz!k!U!W!XhK@l2Y_Mk?X
zE=&-{2qT0-Az$#odVox!r;sdk5G*j8a1>SqeB;>X_}KA|<5kBCj@_^(;C{#5j$0i!
z!R*4Nj`grA;0(udN4;a7quMdWF%H%R6r;zR{V@fY0!#s>08@Y|z!YE#Fa?+bOo6{l
z0b>Gh<{N6TSvnh=C9|-puf}HaOl%faVN*8)n}yS{SuhQo`BSl(HwBxyld+jI37gu9
z*wjqGX7+e&W>sQSJr0|h71&gjV>4qcHq*yoGp!7psiU!(Qi{#wQP@lxiOs|kY$lAr
zX8dq$Du-b+t{9t&2sY(K*o+;D&6q-L$|P(?hp{Owz-H7CY(@^oreqK{Bl58s9>Qi=
z5S!uvHW5EIMLukXda)_=U?aJ)3A?Z<aAGrLAU1>Zuo;w#O@0nGp#j(g`(qQx#>St8
zjjtaz-oDs)GO=;@!N%1a8)pVK1Jkj|OT#9&7dAOPu^G?<oBrLg$xg*4s~a}`Qn2aU
z6`RatZ2BZ&(>oEHj4s%ucg7~I6E?j%V$-t&Ha$dax(nE(I<V<x$0o&wO;;;6$rfyq
z%-AHFu<2sNrn3Q?PVoL8ZqttN{vS*Sc>m9vEh5|h|1YZ*7%)?SDZmt93NQtj0!#s>
z08@Y|z!YE#Fa`eCD8Tmr|6fxS>_V6VOaZ0<Q-CSJ6krN41(*U%0j2;`fGO~oDc}%#
zayHXJPFxCW_xsu}c6?}yvlm<DSnoBDG95J5n6@N5Z%mA@PPmx=BtDD3fWP1`yA%e@
z6krN41(*U%0j9uTrNAVcG1-;EFPp!(rdFOWS2Z=vudZvXYHV6oqg2hWAyIo#MR8$e
zaau`Pq<B(V+q2Tfl%<g~)3Wq`ncdRWYD!K?;aA(5s%O>JlH(+_X?T@|LraTK{7b6^
z9|cw*sw^H}T#;5brZTN;d}(P;n$@y+*`iss4f-LJqEnrkR^3#aHleVhXhdN}RxtY%
zh#EN7G=EVoWUsoRDJ=qdsw^2@jId4|nq9wW30!UMDM!avl#DK{n4C7Mcyd~newEqi
z-whj6QCu>-3>`U{^|XrOVZ{~2Wkto~($GcDuWkH;FxKprk!Dl!ph5iV0azN6k4B}j
zsdiCSbrayvTGmwC*iN$L6!F?QDJ@Hvr!{PQ@)Dqv+M~>6EUZ;BPKfDqxp9`4SJy48
zty<7nzc{V3cv2-g@f3ZcUlQ=*e?7jeq<nnwNjY-TFw378bF{^nTs?^A@EzQ^v~GS=
zZB_NMrg|LLZsgV?G*-EyUuC1UMKYR_0|9<@uRoN{vc}qmHYL+4(G;azTAgyTR8BoW
zt4!Lk{xv01VlpOA55z8!Q!nL2d9+G4w~S0MCFkbytB3!gG-~EI)Xr|2U%yy;L{7#H
zQOe-t(UbC@NNZ2|6DORw`&(5dtsj?D>Y4gk3(y5Fuhna&Q;tS?SWeKj|CF4-{PRyr
z*kDW^mm9lWPF%@}A~~62ght4W8lvWD@uuX=Oul6$E|ulA4UJG5Rr43mssE#&a7t<X
z@dzr3KgQcDhQ`_RY8O>kX{YLHq8g`e{;Y=T2Ia&dy~cO@{8*q<L~ccePbeBuTr?^z
ztJR?)u54@ef;gxFQu$_4Zxm5sLmjSFCX1^V)oLm6S`M{J?L_<7_8*`oQcKnAK7^Cq
zZ0AkMsX$hL;sysM?aa7S1U&K2^{V}3Wqr#1h0ar|5c-Lm7ggP-oO6mi<R3u&Y|VC<
zj&g$GZccbyXms?mzXQA8FR^A>zB5lS#~ZIPJ#4t!IOVVEiZEQJ08@Y|z!dmz6==@x
zYDyk3fN!zj#-qBfcG0Z)v-LKyRW$XK#-!E3KQ_oP&^pm@cv`g^MPtgwRaO+j_$;jz
zLe-Lmv(rv#R`jM7@StTGl449QAJAUsqcxbTRYLcco=LbPcn)bN$rx;2Eezvk)mAq(
zwvMNqlG)ZrQEpqMwbuh7r?u14p5Q+*SZu$MZOeB`=UeA#)4Mxy;E#h!oCsse69=u?
z;pAAIJ96TQ>xIr;>y&PA)8Wotd7?47Hr!r8X^e}TY8w{S*35?ibrl{Do<N-6_E6F}
zNnK3I;V^&hAbg*;i_z|Z#3;oXh3I6twmtZ#3K&M@OY~mnDMi~BKW$j)m_j@lZhNe%
zNm+8TGZu)^6>Cp{*5d;0yqX;7RMsx}(>G0^voU#8X6&*)wRlOaOG|MlJOWrf`VS>a
zj*&|ve&;CVMvj~;Djfe)rE_9P6t##j?#WUiH2=eb(624(qneX;;sXAB0d)M&1+b*L
zp?cAY=ZnNTwd8al1(5oO0wBjo0T91qlmZ|}Pyt}?zm*=1y2w9hGqgB)sFmn7^HI$V
zvsq|(*#3->ejV+|e3JD~>pWVFHAzGii!w>n&VddsB?6hi%l?D<ZT@{b|5}VvCT;%h
z$>*c>q5nyF{kaLe=-1tT6%>5}kD$P)@;@|OC5~9LMFgeQw2<C<wMrIR!gexu*W(Yx
z((0tBfQ~3d((2!xERa(V&?=5LtnEB3|96GKzW@Jk``+w4rT|lbDZmt93NQtj0!#s>
z08@Y|z!YE#oW2UM@BdHVvSU}y6krN41(*U%0j2;`fGNNfU<xn=m;y|J|5gF?{+|~Q
zbMPPgV+t?@m;y`zrT|lbDZmt93NQtj0!#s>08`*^q=1!=>#^LE=gxEHRipR+Z2bQ>
zDqNNorT|lbDZmt93NQtj0!#s>08@Y|z!YE#XbRZkdsy`GKYIV4!A<1E^<t86voOGM
zzawma-agLup>3}9pmn9yV7bVWV%}uVH9cS|G`?sYZ}`|SKjBcq840HNOX5@GZiySn
zKg1VtF9FEOzorZmOe>h0$Ju3RMAI*gWzSr^G6QWRl98X5F|4k>x+(I9okB8TPuP>&
z;KC^@Gte$K;4EHNR|f~{>Q_L-mFEP1-u$`qP{irP(YpG%5Nm2!R(r-7P3an|O${Hj
z#rOMaQ5PDlX*#PS`QPqx@EgZ88mzr^R$1z_>Z0?nYI=|cYfp{Us!9V_UG&m-cYH&G
zwTI5CNW)hym3oJspuyT*XO*Q%GtQ~Z33Q{unyRr{R4F4+_2?yU6wqMprn4%Nt^St&
zZ{D9mgEd8Gm8JAqa@8jtdeUI+s<E0?ssFW$w)I-_91YfFomG*NtYecezG){7)+C)(
zmK<NFOuRBzroo!1v6@uL=BZlxXjd~0)-F1$B31VMU{%T5k7=-W)>&n#qUpwaGcu;o
zVC|%_8da&ob!Y52n)@RS){Z)>BH4X!Cco72Od6~mbXHk%nx{R^jiGWki5jaxm3n=n
z9{hl#l2<`zRiwn1&wTuo5jWCM*rBt^Qe^1Q&Ei2SwZC0sO@Q9oo4xHFN*ih9)uyv5
zk~ia#?fuSbron2}S!Kz_|6_ZPxJPKPS~S*pRq{0q9`f7Bf6-ty>#T~D`qS<Ee)LYJ
z!D`Z3Woht-Pu~2G0X7<}MvXO24IfT8<GG#Jt);<g&{-8J_vN=fxc=g2Xs{;etg>YO
zp}%?inO{?2ZHzZ@7BrGnr9ig%i4}2MX^_Tgq>41iIQ!eQL$A>w<uy`Sns{k9?#>h%
zj}00BS90Pyv5RnnkmcCn7;JyWKGyc0t;YJTRkp@iHdvC(H<|mJ?lTn_pEFh%J}}Hl
z_&z~RNQl2MzH8jZxEy{bFL5t$mHP0%DWcsd$%-`K(bq3I=<Q5FzNR9bRhBAWTHpKm
z&Kqd34%Jwb)bKCQUN^P->|7eGg*vMe{^{tpb3B=s(O{KyR#_T;MX!Pz-lNiBhc(tj
zRmz<3+{huxR0fF!I;#@?<=vEi&3zxEq3{r$RhIJJzd^mgnM#9ou*TX&4Il44+1>kf
zDw8^cbXG;OF8gffyW6_b$ZNjNDofpe+P?IPK2)C9kjC0sl??OFOSV0`l15&GIx9>$
z9JBs&LkW!*r757Z%94G|ZBv$+cGJkKUt{eA3SV;bx8_g*4OXAd3WEu^JvcFIIt^B@
z&MHd@eUcm#vtOdY>d{y`!jyw!VAW%_%V@B=byh_(zquy!&a*zF!RpdkWvO)PiYta)
zFpCDOQ)BI*N~P`@SE`Gsv^fKHRz(`Qz&`VHr5}yF=IN|3_3KScUl#PxV9nK7MO7M<
zGdA2@@eU2v9Gz8>itC?UoAK-%8mt3!R$0nv$t-{Rlj~`)_SaYiRZ1v(C98qkOM^9A
zXN7zCqlaF(=Yywdux9D3&>ueEIrhP{w`j2T(^ws<lsfgCUWE@7(qQeYvno=~>zfac
zcy0j=)=ZsMmb?c$=7meByn^eavD#Is<F{Y^v~SE18hP!lvnrBjSkdv<exsrAUo!rm
zqva$+mHe-zF7V#<8x8$xbyh{{H*9^?k;3gXSZj1vS&~LST(Rzj;WSuhYpm(2G{XD+
zt9|a6N`rNl&Z<bEPp-YM&%p^aSgUnbS?ZsCq~iK(sf?XwYOHChl=XVW-l{YzwQQBn
z3hmerD{?2P@6k|rhR!O7kEbRdTXX(I8m!Yb)?TXAbKOy{%S+GGV4bG3DpHTDZXNmU
zgH%SRQ*~BZ$`7tt_ThoKH1aw{W9_L*MMs21*|oROV4bY9D$=B9OV1zu(oq_$lXO;D
z>TsmfnyWiHX|PVzSbM0_xQeA$H942kV4a||DpFCWMSDAZzK{m%c%2nSY`1=uuzZ=H
z25Y6p+Fg~r??n#ro(dYQ<8)R~cz5{)o2sbHIalbc@Cx(uO*8N5_&SZemTRo3s#Fl^
za=*=c4Gq?@I;$cL9W;H$%(YZroQ~01Whv>)_vABgzMDo~%QV(*sx<y1)2oqR95h%*
z>#T}2w&IP1)$%JeSW9(Q7_m*6IlS|^`7~HZX{;$wOAJw3dhDaJq;I6osz?*d_7AVf
zNT-q45}j2JA99%vEeY^6SVw5AT~(=M$L^a_OPAAN9j>z~l6(JU`?kG!9}U)FI;$*Y
zb=uXv^a?8P!-_T5WK}Y?^w{&<sDl*pipKxx+*5z%{eOJ?hWNy|8{+!$+xQSb{6G3@
zYSvf&<f@WX+4t%eI;)(T^`$?#iqt9Plf9#NQF$Vo^|e2_vea#V+}c;~q0)pk>x+MK
zRB7V#xw|$U?oZ+FY-)zpKlnS697XaamN)06a#Uzx{STpqH*DQMe0ocHA1bu40tnL%
zfUhg+mB@o{&?#F^r-kKYw=8}$bKIjea(kA}Dod83pQeZ3y?_SmnHp<<RT3Xtl)2@w
zNQ3nZomG+2&$#xM%<)vl#H(~xS#oc9r|$ODcWLBxrN){K)$xJ}cVF_&=QLPVomG+I
zR~<a;dFl)rtcuPmOXZ0@XHM90E)7;$W6e^fuF|ndmmL^MgLQ?@s)P?7d(eJ-;`uaK
zm+P!B>u_M|w|D%wp9bqPjkTXD<$Ut_r*{`q`Q)HUXH}&9y~}@1I<$jEUK@2*S@K+e
z&AUy59;3n9pt1H<!#`A9o9+uzS;D+jXN46}yXPynp0l1tUYF>svQ)fv?o#`i-D$Aa
zYpj{7G`2Xh%R2EA8mx<TR``(ctzm;k)lym4u}EiyH}RjAmQC48r3X`|vG#!#QNMij
z@`Yc9Xehi;XN46}Pgm?s-;hXyb%D+btE(?9-@GS|io)|X*50brE&C^>?<4SO_+S4S
z`8=Ifk;XLqTm8B4BO0u8byitQeBsK?MGhJ*9U1>`<;1mOC*fM5kK-<f-~NPsgzZ(^
z6zgZ!I?IoiGc6|b#pV>#M$-V}ea0b%XAEN!-btvA|2n=g?pWNpaRPr8pT=$di--SB
z&H8GaAk2>}EPlDNPY#VzY1Wt91QjXSaqBbJ-}^lkT7A7uP?k*VQVX`fNoTyRFSrS)
z;h%T@dNgC-MO1RDueb>)QbEV{ExEI9qC%@Lxe3VOpU18#*ev#=LYtvI>3&t>HeQi8
z^%5F0eND~jIxQ?#o^<e?Yi^OK<W}Dv!!Juihur<-6UHM{X!ZRue9-+#8oS}~#3U-T
z`UV+3MH(@1=6mly(18lAzC(sjmh$@Ve{Mw$mFBs*yLL-?RVn`6_4hd6Iz}b8`W_ix
zSn6d<?{(#l@2Jq~n`C%psY}w^U;lD|$~xX=eU}W6Dy5fu*Qf5bQpv5pO@>F23Z1+A
zZ$5|0+pT7OpA3&I^}V6AW5PLospMAQD8sEvU7lGqd2Xdbg;w7w!>xpW-}bTe%D}s*
z(CS-dxMgW{(w3u3Z|qKmR^KbbrG^iMj*qRar1CniS>G(frAV&bPdC;bO{S7teYXsk
z96s8Vk-4nTW-7G$b{S6CR)O2pvWL$19L@TE8BX|ST6y=Jp*yH7)ovEF7RV_}d}w63
zy!HSUk@X!j2C9;E%dofSJhXxet-fW(Kt&q2SSmgK{sUBK^*u8N!Y6%i&N!ob-nCR{
z^-VMKREhuXz}nC*Ds%eH`mPyy(1o+bZOp%k&WO*f-N|`!_=gwY9DL^uG+Ln3X#B6Q
zdnr=Gzy7*4?wuf=5wX7TrAU#C#>u4{C(ya~^_4G0u%6YhE#<o;D&IXe>q}pTs^Q;W
z%IGqpxrWM<uCIL=3UiukB4;1*Q+cD+9MK-Rp|T|W-jwj-HI-CyTcptzs^RZfnr@S=
zS5TqVx2h<FO(UvC?-|ZtLxon~tD;brGAi$!<Jd}NMPReOS%suZ_8<J+cTbo>CAa!+
z6%u^w`P>apta~_<3a!3fg#??&%(>K6b{UnI?#=ps6=7BC+jHY1xnI#45f9cLps*s9
z`xZ7X@X;9~>N{41<?yeCzi(~X+lflq>RVP6s8X>0sBB82Guxr>Sy2EhDkeU(>h&kT
zqLN#E(~1IFiu-)~zUe#0QlZs%tr((8kpoArYwCL$6<U4UiXpJ5;_k<~HU3N|xB9*n
zL*(%HwTlfk1{0Os>Kj)KhRu?;zp`r3?738E^_?pQE8$;{)TUSM+eL*|-@0P3ECqLm
zGk?ySK!sM{yJ8T$!nwBNrn9~YQlZs1uNb6AaX#_JZ|kQ~q1AV<7z8Ud)27UCncss7
zZJyQw<*Smh>DdR<i|EYT>HAmYD^jl~pIWhLH<j;_n)MAV@@1)GUu8zYV^luXY}R+M
z2*G#wS3k6U$>-y#l&!vnMM#l4{Qk?3T??oz)@;t!ZmAILO!d!83txCPfkJN4_&>yL
z=ENt&GsVHe5#d2$x!`sD;MnF^?8vcyZNJSv$DRhe0bXt^vze_=Syx&8mhUaMTWTyl
z&F`2mHJ6%=rd=l0<TZY4yv;bv*xm4!;bOx`Lqfvi337rv{y_Ye_?hwD;@*h6Fm6O#
z9RDc4oOg0x1Lc4H7ePG{OOL|BV9}V-V@r!Gi``DQKiBEWbvx5s{(Sh)?aT}KJubH^
z+L53;5OlgCsAGXp0<lp1LAO8XkA@QPIh{d&1oaaTia!>LFBJ57qh5>O@ACyc5!CD>
z6kkjfcQE7+M7tJW&>IW}Bd8TeDBf5oUSGfyh>qg&yCFxYQAQ}9SSX&5(+{Ud(cKFK
zUVj9&u?WQ-6UFKG`J<wEf-aZK8$nGbLUF}HaeJI_A<?eI9YU!Q)B++DXDk$#*XMB}
zpf-1s+XYt>h@b`yp$v?L;`9eSA<WwPesp=90k0>5+9!mP*Di`HlotvGoc^fS;`I4l
zPIm+~HwYy+CW<={?cPQBL02&7jiA;8q2$Cu3Hm&4f7^Q(p}5>$w>N?s1B5al7D^!K
z4Me%?00s2&c>@ttw<DDPF;QGjSCqRBPyjm!&k?Gs5lVI}6u;N)4*A-YG@$rHAy>#B
zK{YQz$%=*I^LspjsJi&w0jE0{LF<zcO21er-k>K)?$T547q}K5T%q3`LCcL0O5d0$
zPLB&pqxHQDC|<Zkk2``^4<VGySSTKk-ygk<;AuzCG+GRVQ2N9|aeMu4U({>yxV^4m
zD1z4ZAe7#*P+S2hqPDdZ<jC#uxIKXgTB?IkGGd`PLr_OYEoql8;CFf=Xypw;Nso!*
zc0%P4O<iXo7=l^~Eu=vxY3-u~^FltXN|Z8!Yji^Wg4V?#lwPq=f+0BB)Am{bAD%^*
zJA#(EAe5dlQJ{oyrFm*y=L+TpgHEr@A3>{D5K50&C;@NKAFYf4#RY}ril9X(2&H>0
z6o0_!@wKhJ;aUP7U&!N$pfx23B{dcboEdOMx$A;?e!tTh@<q@R5ron$CJH<&(e9UE
zo(~=jpC^J=d?1vRSSVhP&lQYXMm{HWN_-KtfCHg)jfLWYQ=_y;pbHeV6RriV$3Q5_
zu~6JWuQS-TJpvSuH{|gKBWQUALP?5=;&gi9v2Jtk2J_thP!QSxv|0k8B*sF4n<C_l
zdM!{b_+6d|TKs@ey2L_p`n^7=z@uCX?(T&mXe|Rm>D)d_09wG18=jabC{8at8~zAd
znt)I`#YAyAp=Zz*1@NI}fL0ByG(adFW1&FP=yFF<H;@+$LF4O+poIenr9&(fAcR&Y
z+O-5dez)5bLF)hziWn2c?SiKz8cHA#^f>(y^jaUG2(eK7UY9pY-`f?)3&0KF@=E9}
zy(O`*&@rsAq_h>AH{avW^92JzSL=HPAMtzO<_}5eg*!s9M@4Xl@|@7IMs4UR{W)(w
z9C0~)Zogkb@6Hi|EjofL&kyZPoBlQ+z!4Zi_}mhD4UQ13Q4w68yg&$s8m;eDZ$2Ce
zdIC@?5_+SJ5G+v;-1%PU*SJD(d$h_0ITG^0|0MKs8X=gYBY2>f2L0OBN&*5L2?RVY
zkA&VcBLq_v1XsS#pXUzwp;O=L8t@U17iuV%WYQ3fQ4t^)P{9Y_?mV?DTqu8DPXHc$
z34J?_uE7u$!41z7AV9~rH3B*kf+0jmLZ3k+goNk_@Z<I&x8K?NiFf6@-FYtPYWsr{
z`l1;j#79AJ=KBJ9aQl1P^vcN*FI<INLLVn1gt)c{&S1VXm<QFUyY*eI9f1Ly&liU8
zj4cQu3P1oAh0o>l`&vEZPIAN#)s{C5Uksbu0dVI#eR*zZR9oLqPJG1Ug4@B1#{WTX
z8z(+2Hi>z{m%>J2nvmpp#j(y2u^+cTXm7CR*!J0OvQ4pdvA%3Q-#XNC%(Bz6#M0mV
zx%me3By%UzUekFd$#?|54Onc<GJI;d&M?8yA>oCD)d>ahzr^o|uZ!;+_etD8;ws@w
zf#>)ZelYhF5T5&A5C2a&F|rhC4)xKJcIzXz59$J^FVwb%g(J{s@VT6kB^pY7`zT&$
zGvI!2eFwo@1K@k0Ee%ELHI&6MQQ!t>Q!fDu;6oQ9<c%!WP!`2PL3KKeGERLS2#V7i
zf);s^hEf*`CE)i4qqG@tEzqd@AV+l?%EDMEez<j`^qK$#&5`(Ak%bz{f|w}IfYaH!
zro%acIvDEF$N~*zek>H9*9k+oXu5#3(9Mj@*HGrgLh(Xx#NW0D2RZUVuK{}e^E8yX
zu~0lg=mSJ8Bj`+dp`_<(D05<>xS*bI-G*RYJOLO_1tN1al-gJ*P@IA2>Oyl0<M>Fe
zhEfv?#T9@owe4ktE^fI0p*LBhq0Ekn0>d(Y>-Gfa$c3hZf|1!8%B)x@PH0l1J=0#O
z>0x5d6Pcx<RJV@;Z6G`>(Vl5QarvOti&SeUGh?B^WJ<72>jXIhd{59H@I+>6C{;00
zTz+q~GJ@VhzypJ%NR@^%BNj@)2d!<CyADvGr2W3g3=L&^EEIpx8T3Y}>!42q^EL1^
zPS;ST#YBNZiqcmHU7(nQ&?lXyp-hd1;)80@8Rf3?pk5-3s3KD}lqs=Lyxt)6?Az81
zfZ~Fu-5r^tp-hg2;_(Lq(NSR37lOKOvW7A#77A)}{84HtPabsleSu(Pl7=!d7K#go
z+0pJ@4?K-%vMn-ELzxg01u`G4mhwQq2yTL4WP*k=J{F48<AwRnXzDscFc1nw#%m~*
z?W4HS9Elf-qD`yj$%AgDGZc(eYAEAkp`bb{+Wi9fP)MN5I1QyDCW;fTCCYQ;hNlsp
z$55m~Ln)7i5^%!=f0WwW4PyhyX)sc*p^S}%0<DEF%6JO;t^t<^#$}PQ8p@bhC_dnG
zM=2wB9*n8H(2|VNP|9MVcwzpjbq^A~adGGQV2Z%&jg)C9qhq4LZ62*9ap!qqMj_ye
zjMh*}W1+xJ=XbX14wGwvj+zJF@sw&Pqhg^z9R)X6+q=#UPa~RFj*QY!M#e;ePG*$Z
zQ_uyb&qMxDWTb{t5(@?T@Mu=O%@Y8l6u4iU&Pa)dG9nfVyoG?<B`S*B4S9;7MXadT
zJiL7r7mTc+V;7|t1Y;>5+ybr$T9b-UhQ&m2c|veGZE^(oL3mjbjG!f^2&Fg{N)Vcn
zDDzk_77xP95>OYd7)2<NSST>U3BkZU%C*4Uh70C{(E?C}QWOgX-V#8y8w~~Kl%dj$
zp!J*xWoS$kn3{|BrU=b0cu_A1Ezd+Kg|SdzlEVpa5TfMB?}GNh9YL!t5sDNG1)gVk
z9HXJYGYz$A1TCIKDB)NrP!qxP9}UF|bKfxYkJdsWl!90&Ad4F&_@d;<gC>Q15wx@s
zp$v(I;(`e{n1hOf0yCEAjVoHIh)@Q{L~+4er6@I~D-R}kJ<yV%g@*`bP%IR<2;9v)
zWhM%0O!)nJcrOgPpml@@CBJ<XCu+N*y;ngfFo=bDa<nWEp@d?g1OX)!<=zE+m{s=r
zPowq!Xgr912(k=TO}{#3$jfpnh4m3l&1e9LX%&eVjkB{leMp5CjW{ta{Mcrn_4giH
z^G_<YXo!kvM}x?BPPmY7ptJrTjcGBh0$;>8|6WrbPbIf#5R7R-<kTBlZn?CU3N0En
zBid3BxrZfj!gc3Sp+&=NObg$dW*q;1{l|1x=%H~rriGm>u6OKN6YN7Jw`d@bXh(s_
zQyv)8X||gREgJb_S`gWN;oUzDrt-TH&1k9s)52zo!IwP?Uwn~DZqeKVq8+J*e|+kb
zv4P3IQ=vtZ5tvp9AG`R#+vnD&P@zS$7nl}APCPi}zH8|0Qi7&C5N!#F{6mG9e9KNM
zxkd9Mm=@~fZizQO|Aa_|7EP#NS`ayT#B@j3e^a4FGcbsD1c+??Zp7*wI(IUfvca^l
zYWl**etq^{I^PkaIUY<4b@GMWot`i0tPw|(LWp)a{95`!{~fEIqVoWuStLxWNQQgr
z&$#Z#O;m0vH0^|GVb%2IolBfouBJkZ=BW_vFu0TV?fB$hzsXc+(Zm*}1(9<u$#Z%a
zP@zRLVVD-alfD0?m(M&llnO1HIzzO@@SW_vU#<E!mrmt~=GrhVh@2SS?n-k~$t{|k
z!?Ymsqzl$Qm9dcuEt<_kv=I<__x(?9S@8iCS~UHKX+h-GA7(cEbR`v9G+&5mLF9im
z&HPta8lU-{KI8uuwE70=GZ{9I8S^jymtWmXqgYz3I;|4^@s4i?OLq*TLW|b<;M~gL
zA15EPuBiBe3N2b3glH#$$QM-h>N)ELDzs=t5vEnbhcC##@zs)_sL-NiNSGFO=-K*Y
zMpjN86<V}*3DHi3Zcy&8oxiw&%1#z7Xu%VvRl<k*-E*k-@x@efi&jZtT3O;uXXYer
z?n;FgmR4cf39!NRf=lAAfA3-{w6GwH&_Z=lYdkc0E0tNw7EY%Hkr#bbb=D4vMsAzW
z(rCv+buqv5UE{iLr9ykAP75M;{r$%0kI)&couSjh?_@Skzs&ggy;O3$N~5iWJ6Tz`
z{j$ObsnD*}X+h*)mfvx%zkv#^s?*BhpB}s7ykosyp+c)@wByw9!Trld9bQA_H=mki
zomL4SEt|LHg?K7IT-Llor-eKD#^FhK?e0xQ<mDP|1<c91RW6c6XG@7?IxUE7wjAB_
zbT2BoZPIDw@bB-xaZ$N9jtXs~Mq3Udmp|9x8ud*ov<*6~5<W7s{LB64hN;k^^}wii
zQ4S*S=(qB{edkc2MGJ@#?N|`G(-$w?RzYXmLA2r+(<<TL9$0Z(Mes~2xkXEqF)j2i
z&S=^E$g3q(XwjNxL^}pVzWVp!orlnQmx2~SV_GGA{Ptyc?3_)fFNIc3V_Fb7d2!mt
z*J$iyaT<;Pb2NQwVDlLB(NtAlMB^rG8KBcD;qTXdIOFm)FH@oIuhYVAkWbz?t@unT
z+n2RuYqYbWL+^bt<CYuf4A!!AS`c~EKKI!hs7#f&^wViU<ZVgg-+$ynDrMVOqn)LO
zk7o6WJoo*>RA@7GS|$8L#r~OJ5AR5YwvSFLhktoK@tVh%Qh6WI(p#gghC6w!?N#g7
zRQ9ZC$<S#*<Tb&2x7<W!L$8)}ofdw{BKh2R3!b`>O4+7qv@>BQtLLvD+n=9Ig|?SY
z3q#YH_pUqtsUcKod+M|>Cp){08{4Cb3T+RKwhDe^;ll?E|9<*yDzx2oT9}jdJ#c%j
zN@clFOR7#QhmYJB-hS%=Dx0IWbkk^O!0y8L4n1`7d%LKJjJ5_qoxT~+@NIguarycs
zRA|v&0+?0~AMA8bfv~cG3N6}v0MSl|I(d7cV@%mfDzs>~0!#}cr!QVtx0lYl6tuko
zriJQaTIBINx4l6nw`e~EL^};cw$2{au%F7S=@zts0;UC#J6G;~_?z3P<QDC?fN7!O
zGhKc6lef;JLfcWJoeEpiZ`-`(%SC&s(00&ip-xV|RNXOa3l&;Xr<KEp3j^m?jib}4
z5j5H<a3^1P_L`!v?NoB>&}l*B`;+#st-g*5tzD;uiSLe=Jl{Bh#w5#WH2&8Yq13~^
zrsLM_e4+cNREkAkgi;Ti$9gg|M?HE26<U1}N<DOg*4~|c$14v}q16|mEQW5-(H&E7
zxjdT+E&9q0)#8hx@em3=xv7fEmYXf;vp7sEhmZMg{$TA_d#L0VeMg6A7oin;p2ycO
z4pN~-AKqbF7@E#G$HE1RsL-M>^)M~$&^zw|>zXI2Ea+`PpY|czIyL;$f)(~x4%|Z}
zx9D3yOba4!TD)mM4wZMrE$HJxObd7Nl$M)9@77bvE&AFJ(Jq7*`i+!Z&&#CpgI+D@
z^F&MwD_I8ATh&EWeki>Keb0z#LF5O1I9C1PXeuHXXtWF9{r}+$zNq+-PNxrjF^O`!
zKneeR!M-~mJkWtkZqX-}m=?N0jyLO<B#oj%i@wQ3wDVz3_VMua>)x?Yq0QH6(RAUp
z3le)gO@%h3)54srV^4>j2k0zn3~IFV;D>Kt-CgE-m&yyhmVizRy^F>H?|*npH5HNl
zIxW0iSo;2&-~cK=eB0vFXy-ydx%$S<x38e`NlA-Wr-ka`)~V%_-=|Y2dvsd(B@0XT
zSzDCfsg$i-qn!hH@`@MVx@RhtAL46q>9o-B-SXLveHYPrfSful{3c7Mrnky=C{#oq
zsL|F!!}si_FV2l5Q=!e%Y2i-(?MPMK1+P(|&DCk4chNZ^-G9%Q6ll@--(q`}6W5BJ
zglmO9j=LOw`xEvNki`C&0!#s>08@Y|z!YE#Fa?+bOo9LH3N*Fodpk71nyeAGJ<+Wr
zm5;Ss^t~M#;Fl~{xvw0XbvuROOH+%!w?hN0Wck;x7mob+I2BrbZ-=EYG+lap__YJ+
zEV9t|c32A2g=3SNpa1$JD!JA7c33Kh58Z6vIcqtcHPHIr4olSV?^QzG+paNGa;xv{
zummPqCf;rMdU_2NT77SaCGehXLrp=qS7<C3LgRm4d>Hos|KDD*vFl?BFa?+bOaZ0<
yQ-CSJ6krN41(*U%0j9wJj|xQ4h8t-A|E2Bk{|~=+2fzLiZU28b0zUxY^8Ftl!#BYI

literal 0
HcmV?d00001

diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py
index 516bd2f..97381fc 100644
--- a/terratorch_iterate/iterate2.py
+++ b/terratorch_iterate/iterate2.py
@@ -2,6 +2,7 @@
 
 import argparse
 import json
+import logging
 import os
 import subprocess
 import re
@@ -11,6 +12,8 @@
 import optuna
 import yaml
 
+logger = logging.getLogger("iterate2")
+
 # ============================================================
 # CLI
 # ============================================================
@@ -58,6 +61,16 @@ def parse_args():
         help="Comma-separated metric names to extract (e.g. score_linear_acc,score_modality_leak,score_combined)",
     )
 
+    # ------------------------
+    # Logging
+    # ------------------------
+    parser.add_argument(
+        "--log-level",
+        default="INFO",
+        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
+        help="Logging verbosity (default: INFO)",
+    )
+
     return parser.parse_args()
 
 
@@ -67,58 +80,80 @@ def parse_args():
 
 def resolve_paths(script: str, root_dir: Optional[str]):
     if root_dir is None: root_dir = '.'
-    return script, Path(root_dir).resolve()
+    resolved = Path(root_dir).resolve()
+    logger.debug("Resolved root_dir '%s' → '%s'", root_dir, resolved)
+    return script, resolved
 
 def build_launcher_command(wlm, cmd, trial_id, out_file, err_file, gpu_count, cpu_count, mem_gb, lsf_gpu_config_string):
+    logger.debug("Building launcher command: wlm=%s gpu_count=%d cpu_count=%d mem_gb=%d", wlm, gpu_count, cpu_count, mem_gb)
     if wlm == "lsf":
         gpu_fragment = f"-gpu \"{lsf_gpu_config_string}\"" if lsf_gpu_config_string else (f"-gpu num={gpu_count}" if gpu_count > 0 else "")
-        return f"bsub {gpu_fragment} -K -o {out_file} -e {err_file} -R \"rusage[ngpus={gpu_count}, cpu={cpu_count}, mem={mem_gb}GB]\" -J hpo_trial_{trial_id} \"{cmd}\""
-    if wlm == "slurm":
-        return f"srun --gres=gpu:{gpu_count} --cpus-per-task={cpu_count} --mem={mem_gb}G --job-name=hpo_trial_{trial_id} --output={out_file} --error={err_file} bash -c \"{cmd}\""
-    if wlm == "none":
-        return f'bash -c "{cmd} > {out_file} 2> {err_file}"'
-    raise ValueError(f"Unknown WLM: {wlm}")
+        launcher = f"bsub {gpu_fragment} -K -o {out_file} -e {err_file} -R \"rusage[ngpus={gpu_count}, cpu={cpu_count}, mem={mem_gb}GB]\" -J hpo_trial_{trial_id} \"{cmd}\""
+    elif wlm == "slurm":
+        launcher = f"srun --gres=gpu:{gpu_count} --cpus-per-task={cpu_count} --mem={mem_gb}G --job-name=hpo_trial_{trial_id} --output={out_file} --error={err_file} bash -c \"{cmd}\""
+    elif wlm == "none":
+        launcher = f'bash -c "{cmd} > {out_file} 2> {err_file}"'
+    else:
+        raise ValueError(f"Unknown WLM: {wlm}")
+    logger.debug("Launcher command: %s", launcher)
+    return launcher
 
 def build_shell_command(interpreter, root_dir, script_path, venv, script_args, param_setter):
     parts = [f"cd {root_dir}"]
-    if venv: parts.append(f"source {venv}/bin/activate")
+    if venv:
+        parts.append(f"source {venv}/bin/activate")
+        logger.debug("Activating venv: %s", venv)
     arg_list = [f"{interpreter} {script_path}"]
     for key, value in script_args.items():
         arg_name = key.replace("_", "-")
         if value is None:
-            continue  # None means "omit this flag" (e.g. compile: null disables --compile)
+            logger.debug("Skipping arg '%s': value is None (flag omitted)", key)
+            continue
         if param_setter:
             if isinstance(value, bool):
-                if value: arg_list.append(f"--{param_setter} {key}")
-                # False → omit entirely
+                if value:
+                    arg_list.append(f"--{param_setter} {key}")
+                    logger.debug("Setter flag: --%s %s (store_true)", param_setter, key)
+                else:
+                    logger.debug("Skipping flag '%s': False → omitted", key)
             else:
                 arg_list.append(f"--{param_setter} {key} {value}")
+                logger.debug("Setter arg: --%s %s %s", param_setter, key, value)
         else:
             if isinstance(value, bool):
-                if value: arg_list.append(f"--{arg_name}")
-                # False → flag is simply absent
+                if value:
+                    arg_list.append(f"--{arg_name}")
+                    logger.debug("Flag present: --%s", arg_name)
+                else:
+                    logger.debug("Skipping flag '--%s': False → omitted", arg_name)
             else:
                 arg_list.append(f"--{arg_name} {value}")
-    parts.append(" ".join(arg_list))
-    return " && ".join(parts)
+                logger.debug("Arg: --%s %s", arg_name, value)
+    cmd = " && ".join(parts + [" ".join(arg_list)])
+    logger.debug("Shell command: %s", cmd)
+    return cmd
 
 # ============================================================
 # MULTI-METRIC EXTRACTION
 # ============================================================
 
 def extract_metrics_from_log(path: str, metric_names: List[str]) -> List[float]:
+    logger.debug("Extracting metrics %s from '%s'", metric_names, path)
     results = []
     with open(path, "r", encoding="utf-8", errors="ignore") as f:
         text = f.read()
-    
+    logger.debug("Log file '%s': %d characters read", path, len(text))
+
     for metric in metric_names:
         pattern = re.compile(rf"{re.escape(metric)}\s*[:=]\s*([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)")
         matches = pattern.findall(text)
         if not matches:
-            print(f"Warning: Metric '{metric}' not found in {path}. Defaulting to 0.0")
+            logger.warning("Metric '%s' not found in '%s' — defaulting to 0.0", metric, path)
             results.append(0.0)
         else:
-            results.append(float(matches[-1]))
+            value = float(matches[-1])
+            logger.debug("Metric '%s': found %d match(es), using last value %s", metric, len(matches), value)
+            results.append(value)
     return results
 
 # ============================================================
@@ -127,39 +162,71 @@ def extract_metrics_from_log(path: str, metric_names: List[str]) -> List[float]:
 
 def load_hpo_space(args):
     data = {}
-    if args.hpo_json: data = json.loads(args.hpo_json)
+    if args.hpo_json:
+        logger.debug("Loading HPO space from JSON string")
+        data = json.loads(args.hpo_json)
     elif args.hpo_yaml:
+        logger.debug("Loading HPO space from YAML file: %s", args.hpo_yaml)
         with open(args.hpo_yaml, "r") as f: data = yaml.safe_load(f)
-    return data.get("hpo", {})
+    space = data.get("hpo", {})
+    logger.info("HPO space loaded: %d parameter(s): %s", len(space), list(space.keys()))
+    return space
 
 def load_static_args(args):
     data = {}
-    if args.static_args_json: data = json.loads(args.static_args_json)
+    if args.static_args_json:
+        logger.debug("Loading static args from JSON string")
+        data = json.loads(args.static_args_json)
     elif args.static_args_yaml:
+        logger.debug("Loading static args from YAML file: %s", args.static_args_yaml)
         with open(args.static_args_yaml, "r") as f: data = yaml.safe_load(f)
     elif args.hpo_yaml:
+        logger.debug("Loading static args from HPO YAML file: %s", args.hpo_yaml)
         with open(args.hpo_yaml, "r") as f: data = yaml.safe_load(f)
-    return data.get("static", data if data else {})
+    static = data.get("static", data if data else {})
+    logger.info("Static args loaded: %d key(s): %s", len(static), list(static.keys()))
+    return static
 
 def suggest_from_spec(trial, name, spec):
     t = spec["type"]
-    if t == "float": return trial.suggest_float(name, float(spec["low"]), float(spec["high"]), log=spec.get("log", False))
-    if t == "int": return trial.suggest_int(name, int(spec["low"]), int(spec["high"]), log=spec.get("log", False))
-    if t == "categorical": return trial.suggest_categorical(name, spec["choices"])
-    if t == "flag":
-        # store_true style: True → --name present, False → flag omitted entirely
-        return trial.suggest_categorical(name, [True, False])
-    if t == "group":
-        # Suggests one of the named group keys; caller expands the key → dict of args
-        return trial.suggest_categorical(name, list(spec["choices"].keys()))
-    raise ValueError(f"Unknown param type: {t}")
+    if t == "float":
+        val = trial.suggest_float(name, float(spec["low"]), float(spec["high"]), log=spec.get("log", False))
+    elif t == "int":
+        val = trial.suggest_int(name, int(spec["low"]), int(spec["high"]), log=spec.get("log", False))
+    elif t == "categorical":
+        val = trial.suggest_categorical(name, spec["choices"])
+    elif t == "flag":
+        val = trial.suggest_categorical(name, [True, False])
+    elif t == "group":
+        val = trial.suggest_categorical(name, list(spec["choices"].keys()))
+    else:
+        raise ValueError(f"Unknown param type: {t}")
+    logger.debug("Suggested '%s' (%s) = %r", name, t, val)
+    return val
 
 def main():
     args = parse_args()
+
+    logging.basicConfig(
+        level=getattr(logging, args.log_level),
+        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+    # Suppress noisy optuna INFO logs unless user asked for DEBUG
+    logging.getLogger("optuna").setLevel(
+        logging.WARNING if args.log_level == "INFO" else getattr(logging, args.log_level)
+    )
+
+    logger.info("iterate2 starting")
+    logger.info("Log level: %s", args.log_level)
+    logger.info("WLM: %s | interpreter: %s | script: %s", args.wlm, args.interpreter, args.script)
+    logger.info("Optuna study: '%s' | db: %s | n_trials: %d", args.optuna_study_name, args.optuna_db_path, args.optuna_n_trials)
+
     hpo_space = load_hpo_space(args)
     static_args = load_static_args(args)
     metric_list = [m.strip() for m in args.metrics.split(",")]
-    
+    logger.info("Optimising metrics: %s", metric_list)
+
     script_path, root_dir = resolve_paths(args.script, args.root_dir)
 
     def objective(trial):
@@ -174,38 +241,47 @@ def objective(trial):
 
         # gpu_num in hpo/static overrides the CLI --gpu-count for this trial's launcher
         gpu_count = int(script_args.pop("gpu_num", args.gpu_count))
+        logger.debug("Trial %d: effective gpu_count=%d", trial.number, gpu_count)
+        logger.info("Trial %d: sampled parameters: %s", trial.number, script_args)
 
         out_file = f"trial_{trial.number}.out"
         err_file = f"trial_{trial.number}.err"
+        logger.debug("Trial %d: stdout → %s | stderr → %s", trial.number, out_file, err_file)
 
         shell_cmd = build_shell_command(args.interpreter, root_dir, script_path, args.venv, script_args, args.param_setter)
         launcher_cmd = build_launcher_command(args.wlm, shell_cmd, trial.number, out_file, err_file, gpu_count, args.cpu_count, args.mem_gb, args.lsf_gpu_config_string)
 
-        print(f"Trial {trial.number}: Running...")
+        logger.info("Trial %d: submitting → %s", trial.number, launcher_cmd)
         subprocess.run(launcher_cmd, shell=True, check=True)
+        logger.info("Trial %d: job finished", trial.number)
 
         values = extract_metrics_from_log(out_file, metric_list)
-        print(f"Trial {trial.number} results: {dict(zip(metric_list, values))}")
-        
+        logger.info("Trial %d: results %s", trial.number, dict(zip(metric_list, values)))
+
         return tuple(values)
 
     # Multi-objective direction
     directions = ["maximize"] * len(metric_list)
+    logger.info("Creating Optuna study (directions: %s)", directions)
+
+    storage = f"sqlite:///{args.optuna_db_path}" if "sqlite" not in args.optuna_db_path else args.optuna_db_path
+    logger.debug("Optuna storage: %s", storage)
 
     study = optuna.create_study(
         study_name=args.optuna_study_name,
-        storage=f"sqlite:///{args.optuna_db_path}" if "sqlite" not in args.optuna_db_path else args.optuna_db_path,
+        storage=storage,
         directions=directions,
         load_if_exists=True,
     )
+    logger.info("Study '%s' ready (existing trials: %d)", args.optuna_study_name, len(study.trials))
 
     study.optimize(objective, n_trials=args.optuna_n_trials)
 
-    print("\n" + "="*60)
-    print("OPTIMIZATION COMPLETE")
-    print(f"Pareto Front Trials: {len(study.best_trials)}")
+    logger.info("=" * 60)
+    logger.info("OPTIMIZATION COMPLETE")
+    logger.info("Pareto Front Trials: %d", len(study.best_trials))
     for t in study.best_trials:
-        print(f"Trial {t.number}: Values={t.values}")
+        logger.info("  Trial %d: Values=%s  Params=%s", t.number, t.values, t.params)
 
 if __name__ == "__main__":
     main()
\ No newline at end of file

From f995680eb1f8253cf2e578b40cd5c957712482ca Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Wed, 8 Apr 2026 17:30:54 +0200
Subject: [PATCH 03/16] add support for underscore parameters

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 terratorch_iterate/iterate2.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py
index 97381fc..c621fe0 100644
--- a/terratorch_iterate/iterate2.py
+++ b/terratorch_iterate/iterate2.py
@@ -36,6 +36,13 @@ def parse_args():
     parser.add_argument("--cpu-count", type=int, default=4)
     parser.add_argument("--mem-gb", type=int, default=128)
     parser.add_argument("--lsf-gpu-config-string", type=str, default=None)
+    parser.add_argument(
+        "--no-underscore-to-hyphen",
+        dest="underscore_to_hyphen",
+        action="store_false",
+        default=True,
+        help="Do not convert underscores to hyphens in arg names (default: convert)",
+    )
 
     # ------------------------
     # Optuna config
@@ -98,14 +105,14 @@ def build_launcher_command(wlm, cmd, trial_id, out_file, err_file, gpu_count, cp
     logger.debug("Launcher command: %s", launcher)
     return launcher
 
-def build_shell_command(interpreter, root_dir, script_path, venv, script_args, param_setter):
+def build_shell_command(interpreter, root_dir, script_path, venv, script_args, param_setter, underscore_to_hyphen=True):
     parts = [f"cd {root_dir}"]
     if venv:
         parts.append(f"source {venv}/bin/activate")
         logger.debug("Activating venv: %s", venv)
     arg_list = [f"{interpreter} {script_path}"]
     for key, value in script_args.items():
-        arg_name = key.replace("_", "-")
+        arg_name = key.replace("_", "-") if underscore_to_hyphen else key
         if value is None:
             logger.debug("Skipping arg '%s': value is None (flag omitted)", key)
             continue
@@ -248,7 +255,7 @@ def objective(trial):
         err_file = f"trial_{trial.number}.err"
         logger.debug("Trial %d: stdout → %s | stderr → %s", trial.number, out_file, err_file)
 
-        shell_cmd = build_shell_command(args.interpreter, root_dir, script_path, args.venv, script_args, args.param_setter)
+        shell_cmd = build_shell_command(args.interpreter, root_dir, script_path, args.venv, script_args, args.param_setter, args.underscore_to_hyphen)
         launcher_cmd = build_launcher_command(args.wlm, shell_cmd, trial.number, out_file, err_file, gpu_count, args.cpu_count, args.mem_gb, args.lsf_gpu_config_string)
 
         logger.info("Trial %d: submitting → %s", trial.number, launcher_cmd)

From 83468846cb5f6c9fe7f246accace4d5b64085a7c Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Fri, 10 Apr 2026 08:52:10 +0200
Subject: [PATCH 04/16] read metrics from stderr, extend regex

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 configs/gridfm_graphkit_hpo.yaml | 24 ++++++++++----------
 terratorch_iterate/iterate2.py   | 39 ++++++++++++++++++++++++++++----
 2 files changed, 46 insertions(+), 17 deletions(-)

diff --git a/configs/gridfm_graphkit_hpo.yaml b/configs/gridfm_graphkit_hpo.yaml
index 65073ed..334241f 100644
--- a/configs/gridfm_graphkit_hpo.yaml
+++ b/configs/gridfm_graphkit_hpo.yaml
@@ -1,4 +1,4 @@
-# HPO configuration for gridfm-graphkit HGNS PF case2000
+# HPO configuration for gridfm-graphkit HGNS PF case118
 #
 # Hyperparameters:
 #   gpu_num   – number of GPUs to request from the WLM (launcher-level)
@@ -9,6 +9,14 @@
 #
 # Static args:
 #   all other fixed CLI args
+#
+# Metrics:
+#   extracted from [performance] lines in trial output
+
+metrics:
+  - case118_ieee/layer_0_residual
+  - last epoch time
+  - last epoch it/s
 
 hpo:
   gpu_num:
@@ -29,19 +37,11 @@ hpo:
   dataset:
     type: group             # one choice selects all bundled args together
     choices:
-      case2000:
-        config: ./examples/config/HGNS_PF_datakit_case2000.yaml
-        data_path: /dccstor/gridfm/powermodels_data/v4/finetuning/pf/
-        exp_name: case2000
-      case1000:
-        config: ./examples/config/HGNS_PF_datakit_case1000.yaml
-        data_path: /dccstor/gridfm/powermodels_data/v4/finetuning/pf/
-        exp_name: case1000
+      case118:
+        config: ./examples/config/HGNS_PF_datakit_case118.yaml
+        data_path: /u/rkie/
 
 static:
   run_name: run1
   log_dir: logs
-  dataset_wrapper: SharedMemoryCacheDataset
-  plugins: gridfm_graphkit_ee
   num_workers: 32
-  dataset_wrapper_cache_dir: /dccstor/terratorch/case2000_ieeecache
diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py
index c621fe0..650298a 100644
--- a/terratorch_iterate/iterate2.py
+++ b/terratorch_iterate/iterate2.py
@@ -144,15 +144,27 @@ def build_shell_command(interpreter, root_dir, script_path, venv, script_args, p
 # MULTI-METRIC EXTRACTION
 # ============================================================
 
-def extract_metrics_from_log(path: str, metric_names: List[str]) -> List[float]:
+def extract_metrics_from_log(path: str, metric_names: List[str], err_path: Optional[str] = None) -> List[float]:
     logger.debug("Extracting metrics %s from '%s'", metric_names, path)
     results = []
     with open(path, "r", encoding="utf-8", errors="ignore") as f:
         text = f.read()
     logger.debug("Log file '%s': %d characters read", path, len(text))
+    # Also read stderr — Lightning/rich writes test result tables there
+    if err_path:
+        try:
+            with open(err_path, "r", encoding="utf-8", errors="ignore") as f:
+                err_text = f.read()
+            logger.debug("Err file '%s': %d characters read", err_path, len(err_text))
+            text = text + "\n" + err_text
+        except FileNotFoundError:
+            logger.debug("Err file '%s' not found, skipping", err_path)
 
     for metric in metric_names:
-        pattern = re.compile(rf"{re.escape(metric)}\s*[:=]\s*([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)")
+        # Matches: key: value | key=value | [performance] key : value | Lightning table │ key │ value │
+        pattern = re.compile(
+            rf"(?:\[\w+\]\s*)?{re.escape(metric)}\s*(?:[:=│])\s*([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)"
+        )
         matches = pattern.findall(text)
         if not matches:
             logger.warning("Metric '%s' not found in '%s' — defaulting to 0.0", metric, path)
@@ -179,6 +191,22 @@ def load_hpo_space(args):
     logger.info("HPO space loaded: %d parameter(s): %s", len(space), list(space.keys()))
     return space
 
+def load_metrics_from_yaml(args):
+    """Return metrics list from YAML 'metrics:' key, or None if not present."""
+    data = {}
+    if args.hpo_json:
+        data = json.loads(args.hpo_json)
+    elif args.hpo_yaml:
+        with open(args.hpo_yaml, "r") as f: data = yaml.safe_load(f)
+    elif args.static_args_yaml:
+        with open(args.static_args_yaml, "r") as f: data = yaml.safe_load(f)
+    metrics = data.get("metrics", None)
+    if metrics is None:
+        return None
+    if isinstance(metrics, list):
+        return [m.strip() for m in metrics]
+    return [m.strip() for m in str(metrics).split(",")]
+
 def load_static_args(args):
     data = {}
     if args.static_args_json:
@@ -231,8 +259,9 @@ def main():
 
     hpo_space = load_hpo_space(args)
     static_args = load_static_args(args)
-    metric_list = [m.strip() for m in args.metrics.split(",")]
-    logger.info("Optimising metrics: %s", metric_list)
+    yaml_metrics = load_metrics_from_yaml(args)
+    metric_list = yaml_metrics if yaml_metrics is not None else [m.strip() for m in args.metrics.split(",")]
+    logger.info("Optimising metrics: %s (source: %s)", metric_list, "yaml" if yaml_metrics else "cli")
 
     script_path, root_dir = resolve_paths(args.script, args.root_dir)
 
@@ -262,7 +291,7 @@ def objective(trial):
         subprocess.run(launcher_cmd, shell=True, check=True)
         logger.info("Trial %d: job finished", trial.number)
 
-        values = extract_metrics_from_log(out_file, metric_list)
+        values = extract_metrics_from_log(out_file, metric_list, err_path=err_file)
         logger.info("Trial %d: results %s", trial.number, dict(zip(metric_list, values)))
 
         return tuple(values)

From 1d359de05f88e6428b9b5fce99e1e531ef34ffc3 Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Fri, 10 Apr 2026 09:02:33 +0200
Subject: [PATCH 05/16] report performance in hpo run

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 configs/gridfm_graphkit_hpo.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/configs/gridfm_graphkit_hpo.yaml b/configs/gridfm_graphkit_hpo.yaml
index 334241f..9939893 100644
--- a/configs/gridfm_graphkit_hpo.yaml
+++ b/configs/gridfm_graphkit_hpo.yaml
@@ -45,3 +45,4 @@ static:
   run_name: run1
   log_dir: logs
   num_workers: 32
+  report_performance: true

From 44fd1f3eee952062d4b6131407347ef73e577383 Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Fri, 10 Apr 2026 10:28:09 +0200
Subject: [PATCH 06/16] add parallel execution within single iterate process

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 configs/gridfm_graphkit_hpo.yaml |  2 +-
 docs/iterate2.md                 | 51 +++++++++++++++++
 terratorch_iterate/iterate2.py   | 98 +++++++++++++++++++++++++++++++-
 3 files changed, 147 insertions(+), 4 deletions(-)

diff --git a/configs/gridfm_graphkit_hpo.yaml b/configs/gridfm_graphkit_hpo.yaml
index 9939893..6ee3a2e 100644
--- a/configs/gridfm_graphkit_hpo.yaml
+++ b/configs/gridfm_graphkit_hpo.yaml
@@ -45,4 +45,4 @@ static:
   run_name: run1
   log_dir: logs
   num_workers: 32
-  report_performance: true
+  report-performance: true
diff --git a/docs/iterate2.md b/docs/iterate2.md
index 8922d52..c70bef2 100644
--- a/docs/iterate2.md
+++ b/docs/iterate2.md
@@ -41,6 +41,7 @@ iterate2 \
 | `--cpu-count` | `4` | Number of CPUs per trial |
 | `--mem-gb` | `128` | Memory (GB) per trial |
 | `--lsf-gpu-config-string` | `None` | Optional verbatim LSF `-gpu` option string (see [GPU configuration](#gpu-configuration-on-lsf)) |
+| `--parallelism` | `1` | Number of trials to run in parallel (see [Parallel execution](#parallel-execution)) |
 
 ### Optuna options
 
@@ -297,6 +298,56 @@ bsub -n 20 -R "span[hosts=1]" \
 
 ---
 
+---
+
+## Parallel execution
+
+By default `iterate2` runs one trial at a time. Pass `--parallelism N` to run up to `N` trials simultaneously, each in its own thread.
+
+```sh
+iterate2 \
+  --parallelism 4 \
+  --wlm lsf \
+  ...
+```
+
+### How it works
+
+Each thread independently:
+
+1. Asks Optuna for a new set of hyperparameters (`study.ask()`)
+2. Builds and submits the launcher command (e.g. `bsub -K …`)
+3. Streams every output line to the main process stdout/stderr, prefixed with `[trial-N]`
+4. Reports the extracted metrics back to Optuna (`study.tell()`)
+
+Output from concurrent trials is prefixed so you can follow individual workers:
+
+```
+[trial-3] Epoch 1/10  ━━━━━━━━━━ 100/100 0:01:12
+[trial-5] Using bfloat16 precision
+[trial-3] [performance] val_loss : 0.0421
+[trial-5] Epoch 1/10  ━━━━━━━━━━ 100/100 0:01:15
+```
+
+### Output files
+
+| WLM | stdout | stderr |
+|---|---|---|
+| `none` | `trial_N.out` (written by iterate2) | `trial_N.err` (written by iterate2) |
+| `lsf` / `slurm` | `trial_N.out` (written by WLM on cluster) | `trial_N.err` (written by WLM on cluster) |
+
+For WLM backends the local WLM tool output (bsub/srun status messages) is written to `trial_N_wlm.out` / `trial_N_wlm.err` so the cluster-managed files are never overwritten.
+
+### SQLite and parallelism
+
+Optuna retries on SQLite locking errors automatically. Values up to `--parallelism 4` work well with SQLite. For higher concurrency use a PostgreSQL storage URL:
+
+```sh
+--optuna-db-path postgresql://user:pass@host/dbname
+```
+
+---
+
 ## Workload managers
 
 ### LSF
diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py
index 650298a..19823f4 100644
--- a/terratorch_iterate/iterate2.py
+++ b/terratorch_iterate/iterate2.py
@@ -5,7 +5,9 @@
 import logging
 import os
 import subprocess
+import sys
 import re
+import threading
 from pathlib import Path
 from typing import Dict, Any, Optional, Literal, List
 
@@ -36,6 +38,15 @@ def parse_args():
     parser.add_argument("--cpu-count", type=int, default=4)
     parser.add_argument("--mem-gb", type=int, default=128)
     parser.add_argument("--lsf-gpu-config-string", type=str, default=None)
+    parser.add_argument(
+        "--parallelism",
+        type=int,
+        default=1,
+        help="Number of trials to run in parallel (default: 1 = sequential). "
+             "Each parallel trial runs in its own thread. "
+             "For SQLite storage, values >4 may cause locking contention; "
+             "consider PostgreSQL for high parallelism.",
+    )
     parser.add_argument(
         "--no-underscore-to-hyphen",
         dest="underscore_to_hyphen",
@@ -99,7 +110,9 @@ def build_launcher_command(wlm, cmd, trial_id, out_file, err_file, gpu_count, cp
     elif wlm == "slurm":
         launcher = f"srun --gres=gpu:{gpu_count} --cpus-per-task={cpu_count} --mem={mem_gb}G --job-name=hpo_trial_{trial_id} --output={out_file} --error={err_file} bash -c \"{cmd}\""
     elif wlm == "none":
-        launcher = f'bash -c "{cmd} > {out_file} 2> {err_file}"'
+        # No embedded redirect: run_and_stream() captures stdout/stderr via PIPE
+        # and writes to out_file/err_file itself.
+        launcher = f'bash -c "{cmd}"'
     else:
         raise ValueError(f"Unknown WLM: {wlm}")
     logger.debug("Launcher command: %s", launcher)
@@ -140,6 +153,84 @@ def build_shell_command(interpreter, root_dir, script_path, venv, script_args, p
     logger.debug("Shell command: %s", cmd)
     return cmd
 
+# ============================================================
+# PARALLEL STREAMING RUNNER
+# ============================================================
+
+_print_lock = threading.Lock()
+
+def _stream_pipe(pipe, dest_file, trial_id: int, stream_name: str, dest_stream):
+    """Read lines from *pipe*, write to *dest_file* and print prefixed to *dest_stream*."""
+    prefix = f"[trial-{trial_id}]"
+    with open(dest_file, "w", encoding="utf-8", errors="replace") as fh:
+        for raw in pipe:
+            line = raw.decode("utf-8", errors="replace")
+            fh.write(line)
+            fh.flush()
+            with _print_lock:
+                dest_stream.write(f"{prefix} {line}")
+                dest_stream.flush()
+
+def run_and_stream(launcher_cmd: str, trial_id: int, out_file: str, err_file: str, wlm: str):
+    """
+    Run *launcher_cmd* in a shell.
+
+    For ``wlm='none'``: captures stdout and stderr via PIPE, streams every line
+    to the main process stdout/stderr (prefixed with ``[trial-N]``), and also
+    writes them to *out_file* / *err_file* for later metric extraction.
+
+    For WLM backends (lsf, slurm, …): the WLM tool itself manages the output
+    files on the cluster.  The local subprocess output (WLM status messages,
+    errors) is still streamed with the same prefix so parallel workers are
+    distinguishable.
+    """
+    logger.debug("Trial %d: run_and_stream wlm=%s cmd=%s", trial_id, wlm, launcher_cmd)
+    proc = subprocess.Popen(
+        launcher_cmd,
+        shell=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+    )
+
+    if wlm == "none":
+        # Full capture: write to files AND stream to console
+        t_out = threading.Thread(
+            target=_stream_pipe,
+            args=(proc.stdout, out_file, trial_id, "stdout", sys.stdout),
+            daemon=True,
+        )
+        t_err = threading.Thread(
+            target=_stream_pipe,
+            args=(proc.stderr, err_file, trial_id, "stderr", sys.stderr),
+            daemon=True,
+        )
+    else:
+        # WLM manages the cluster output files (out_file/err_file) itself.
+        # Stream only the local WLM tool output (bsub/srun status messages)
+        # to console; write it to separate local files to avoid clobbering the
+        # cluster-managed trial output files.
+        wlm_out = out_file.replace(".out", "_wlm.out")
+        wlm_err = err_file.replace(".err", "_wlm.err")
+        t_out = threading.Thread(
+            target=_stream_pipe,
+            args=(proc.stdout, wlm_out, trial_id, "wlm-stdout", sys.stdout),
+            daemon=True,
+        )
+        t_err = threading.Thread(
+            target=_stream_pipe,
+            args=(proc.stderr, wlm_err, trial_id, "wlm-stderr", sys.stderr),
+            daemon=True,
+        )
+
+    t_out.start()
+    t_err.start()
+    proc.wait()
+    t_out.join()
+    t_err.join()
+
+    if proc.returncode != 0:
+        raise subprocess.CalledProcessError(proc.returncode, launcher_cmd)
+
 # ============================================================
 # MULTI-METRIC EXTRACTION
 # ============================================================
@@ -288,7 +379,7 @@ def objective(trial):
         launcher_cmd = build_launcher_command(args.wlm, shell_cmd, trial.number, out_file, err_file, gpu_count, args.cpu_count, args.mem_gb, args.lsf_gpu_config_string)
 
         logger.info("Trial %d: submitting → %s", trial.number, launcher_cmd)
-        subprocess.run(launcher_cmd, shell=True, check=True)
+        run_and_stream(launcher_cmd, trial.number, out_file, err_file, args.wlm)
         logger.info("Trial %d: job finished", trial.number)
 
         values = extract_metrics_from_log(out_file, metric_list, err_path=err_file)
@@ -311,7 +402,8 @@ def objective(trial):
     )
     logger.info("Study '%s' ready (existing trials: %d)", args.optuna_study_name, len(study.trials))
 
-    study.optimize(objective, n_trials=args.optuna_n_trials)
+    logger.info("Parallelism: %d worker(s)", args.parallelism)
+    study.optimize(objective, n_trials=args.optuna_n_trials, n_jobs=args.parallelism)
 
     logger.info("=" * 60)
     logger.info("OPTIMIZATION COMPLETE")

From 7c28bcddd7f6f4aaeaba1be5614cb44c2996c47d Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Fri, 10 Apr 2026 10:34:21 +0200
Subject: [PATCH 07/16] support for JournalFile

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 docs/iterate2.md               | 10 ++++++++--
 terratorch_iterate/iterate2.py | 10 +++++++++-
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/docs/iterate2.md b/docs/iterate2.md
index c70bef2..f0da899 100644
--- a/docs/iterate2.md
+++ b/docs/iterate2.md
@@ -48,7 +48,7 @@ iterate2 \
 | Option | Default | Description |
 |---|---|---|
 | `--optuna-study-name` | *(required)* | Name of the Optuna study |
-| `--optuna-db-path` | *(required)* | Storage URL for the Optuna database, e.g. `sqlite:///hpo.db` |
+| `--optuna-db-path` | *(required)* | Storage URL. `sqlite:///hpo.db` for SQLite, `js:///path/journal.log` for JournalStorage, or any Optuna-supported URL |
 | `--optuna-n-trials` | `100` | Number of trials to run |
 
 ### HPO search space
@@ -340,12 +340,18 @@ For WLM backends the local WLM tool output (bsub/srun status messages) is writte
 
 ### SQLite and parallelism
 
-Optuna retries on SQLite locking errors automatically. Values up to `--parallelism 4` work well with SQLite. For higher concurrency use a PostgreSQL storage URL:
+Optuna retries on SQLite locking errors automatically. Values up to `--parallelism 4` work well with SQLite. For higher concurrency use PostgreSQL or **JournalStorage**:
 
 ```sh
+# PostgreSQL
 --optuna-db-path postgresql://user:pass@host/dbname
+
+# JournalStorage (file-based, lock-free, safe for parallel workers on a shared filesystem)
+--optuna-db-path js:///path/to/study_journal.log
 ```
 
+`js:///` is a custom `iterate2` scheme. The path after `js:///` is passed to Optuna's `JournalFileStorage`. JournalStorage serialises trials to an append-only log and is well-suited for NFS/GPFS shared filesystems where SQLite locking is unreliable.
+
 ---
 
 ## Workload managers
diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py
index 19823f4..f33d4a7 100644
--- a/terratorch_iterate/iterate2.py
+++ b/terratorch_iterate/iterate2.py
@@ -12,6 +12,7 @@
 from typing import Dict, Any, Optional, Literal, List
 
 import optuna
+from optuna.storages import JournalStorage, JournalFileStorage
 import yaml
 
 logger = logging.getLogger("iterate2")
@@ -391,7 +392,14 @@ def objective(trial):
     directions = ["maximize"] * len(metric_list)
     logger.info("Creating Optuna study (directions: %s)", directions)
 
-    storage = f"sqlite:///{args.optuna_db_path}" if "sqlite" not in args.optuna_db_path else args.optuna_db_path
+    if args.optuna_db_path.startswith("js:///"):
+        journal_path = args.optuna_db_path[len("js://"):]
+        logger.info("Using JournalStorage at '%s'", journal_path)
+        storage = JournalStorage(JournalFileStorage(journal_path))
+    elif "sqlite" in args.optuna_db_path:
+        storage = args.optuna_db_path
+    else:
+        storage = f"sqlite:///{args.optuna_db_path}"
     logger.debug("Optuna storage: %s", storage)
 
     study = optuna.create_study(

From 8bb7310751f7b04ab4c5c168b9e8013b2455ef1f Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Tue, 14 Apr 2026 14:51:58 +0200
Subject: [PATCH 08/16] add vela support

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 docs/iterate2.md                   |  77 ++++++-
 examples/run_vela_example.sh       |  52 +++++
 examples/vela_gridfm_template.yaml |  32 +++
 terratorch_iterate/iterate2.py     | 323 ++++++++++++++++++++++++++++-
 4 files changed, 476 insertions(+), 8 deletions(-)
 create mode 100644 examples/run_vela_example.sh
 create mode 100644 examples/vela_gridfm_template.yaml

diff --git a/docs/iterate2.md b/docs/iterate2.md
index f0da899..c21359e 100644
--- a/docs/iterate2.md
+++ b/docs/iterate2.md
@@ -36,13 +36,26 @@ iterate2 \
 | `--venv` | `.venv` | Virtual-environment directory to activate. Set to empty string to disable |
 | `--interpreter` | `python` | Python interpreter to invoke |
 | `--param-setter` | `None` | Use setter-style argument passing (see [Setter-style arguments](#setter-style-arguments)) |
-| `--wlm` | `none` | Workload manager: `lsf`, `slurm`, `openshift`, or `none` |
+| `--wlm` | `none` | Workload manager: `lsf`, `slurm`, `vela`, or `none` |
 | `--gpu-count` | `1` | Number of GPUs per trial |
 | `--cpu-count` | `4` | Number of CPUs per trial |
 | `--mem-gb` | `128` | Memory (GB) per trial |
 | `--lsf-gpu-config-string` | `None` | Optional verbatim LSF `-gpu` option string (see [GPU configuration](#gpu-configuration-on-lsf)) |
 | `--parallelism` | `1` | Number of trials to run in parallel (see [Parallel execution](#parallel-execution)) |
 
+### Vela (OpenShift) options
+
+Required when `--wlm vela`.
+
+| Option | Default | Description |
+|---|---|---|
+| `--vela-job-template` | *(required)* | Path to the Vela job YAML template. `{{HPO_COMMAND}}` in `setupCommands` is replaced per trial |
+| `--vela-chart-path` | *(required)* | Path to the `pytorchjob-generator` helm chart directory |
+| `--vela-namespace` | *(current context)* | OpenShift/Kubernetes namespace |
+| `--vela-cmd-placeholder` | `{{HPO_COMMAND}}` | String in `setupCommands` that is replaced with the HPO-parametrised CLI call |
+| `--vela-pod-ready-timeout` | `600` | Seconds to wait for the trial pod to reach Running state |
+| `--vela-job-timeout` | `86400` | Seconds to wait (streaming logs) for the job to complete |
+
 ### Optuna options
 
 | Option | Default | Description |
@@ -368,9 +381,67 @@ Uses `srun` with `--gres=gpu:<N>`, `--cpus-per-task`, and `--mem` flags.
 
 Runs the command directly in a local shell, redirecting stdout/stderr to `trial_<N>.out` / `trial_<N>.err`.
 
-### openshift
+### Vela (OpenShift / MLBatch)
+
+`--wlm vela` submits each trial as a [PyTorchJob](https://www.kubeflow.org/docs/components/training/pytorch/) via the [MLBatch `pytorchjob-generator`](https://github.com/project-codeflare/mlbatch) helm chart.
+
+#### Submission flow
+
+1. For each Optuna trial iterate2:
+    * Builds the CLI invocation from sampled + static args.
+    * Patches the **job template YAML**:
+        * appends `-trial-<N>` to `jobName` (unique resource per trial)
+        * sets `numGpusPerPod` from `gpu_num` (HPO or CLI `--gpu-count`)
+        * replaces the `{{HPO_COMMAND}}` placeholder in `setupCommands` with the generated CLI call
+    * Runs `helm template -f <patched.yaml> <chart> | oc create [-n <ns>] -f-`
+2. Polls until `<jobName>-master-0` pod appears, then streams `oc logs -f <pod>` — **this call blocks until the container exits**, so the trial behaves the same as other WLM backends.
+3. Pod exit code is checked; non-zero raises an error.
+4. The `PyTorchJob` resource is deleted.
+
+#### Job template
+
+Create a YAML file modelled on `examples/vela_gridfm_template.yaml`.  The only special requirement is the `{{HPO_COMMAND}}` placeholder somewhere in `setupCommands`:
+
+```yaml
+jobName: "my-project-hpo"       # iterate2 appends -trial-N
+numGpusPerPod: 1                # iterate2 overwrites with gpu_num
+numCpusPerPod: 32
+totalMemoryPerPod: "32Gi"
+
+volumes:
+  - name: "data-vol"
+    claimName: "my-pvc"
+    mountPath: "/mnt/data"
+
+setupCommands:
+  - "wget -q https://example.com/config.yaml"
+  - "{{HPO_COMMAND}}"           # ← iterate2 fills this in
+```
+
+#### Example invocation
 
-Not yet implemented.
+```sh
+iterate2 \
+  --script            "gridfm_graphkit train" \
+  --interpreter       "" \
+  --wlm               vela \
+  --vela-job-template examples/vela_gridfm_template.yaml \
+  --vela-chart-path   ../mlbatch/tools/pytorchjob-generator/chart \
+  --vela-namespace    my-namespace \
+  --gpu-count         1 \
+  --optuna-study-name gridfm_vela_hpo \
+  --optuna-db-path    sqlite:///gridfm_vela_hpo.db \
+  --optuna-n-trials   20 \
+  --hpo-yaml          configs/gridfm_graphkit_hpo.yaml
+```
+
+See `examples/run_vela_example.sh` for a complete ready-to-run script.
+
+!!! note
+    `--script` is the bare CLI entry-point (`gridfm_graphkit train`).  Set `--interpreter ""` to suppress the default `python` prefix.
+
+!!! tip
+    `gpu_num` in the HPO space controls both `numGpusPerPod` in the job YAML **and** the WLM resource request, just like with LSF/Slurm.
 
 ---
 
diff --git a/examples/run_vela_example.sh b/examples/run_vela_example.sh
new file mode 100644
index 0000000..41cdb06
--- /dev/null
+++ b/examples/run_vela_example.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# =============================================================================
+# Example: iterate2 with --wlm vela (OpenShift / MLBatch PyTorchJob)
+#
+# Prerequisites
+# -------------
+#   * helm CLI installed and on PATH
+#   * oc CLI logged in to the target cluster
+#   * mlbatch/tools/pytorchjob-generator/chart checked out locally
+#   * The gridfm HPO YAML (configs/gridfm_graphkit_hpo.yaml) present
+#
+# How it works
+# ------------
+# 1. For each Optuna trial iterate2:
+#    a. Samples hyperparameters from gridfm_graphkit_hpo.yaml
+#    b. Builds the gridfm_graphkit CLI invocation from static + sampled params
+#    c. Patches vela_gridfm_template.yaml:
+#         - appends  "-trial-<N>"  to jobName  (unique resource per trial)
+#         - sets     numGpusPerPod = gpu_num    (from the HPO space)
+#         - replaces {{HPO_COMMAND}}            (the actual CLI call)
+#    d. Runs:  helm template -f <patched.yaml> <chart> | oc create -f-
+#    e. Polls until <jobName>-master-0 pod is Running
+#    f. Streams:  oc logs -f <jobName>-master-0
+#       (blocks until container exits; output captured for metric extraction)
+#    g. Checks pod exit code; deletes the PyTorchJob resource
+# 2. Metrics are extracted from the captured log and returned to Optuna.
+# =============================================================================
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+# Path to the mlbatch pytorchjob-generator helm chart.
+# Clone mlbatch first:  git clone https://github.com/project-codeflare/mlbatch
+CHART_PATH="${MLBATCH_CHART_PATH:-${REPO_ROOT}/../mlbatch/tools/pytorchjob-generator/chart}"
+
+iterate2 \
+  --script            "gridfm_graphkit train" \
+  --interpreter       ""                                       \
+  --wlm               vela                                    \
+  --vela-job-template "${SCRIPT_DIR}/vela_gridfm_template.yaml" \
+  --vela-chart-path   "${CHART_PATH}"                         \
+  --vela-namespace    "${OC_NAMESPACE:-}"                     \
+  --vela-cmd-placeholder "{{HPO_COMMAND}}"                    \
+  --vela-pod-ready-timeout 600                                 \
+  --vela-job-timeout  86400                                    \
+  --gpu-count         1                                        \
+  --optuna-study-name  gridfm_vela_hpo                        \
+  --optuna-db-path     "sqlite:///gridfm_vela_hpo.db"         \
+  --optuna-n-trials    20                                      \
+  --hpo-yaml          "${REPO_ROOT}/configs/gridfm_graphkit_hpo.yaml"
diff --git a/examples/vela_gridfm_template.yaml b/examples/vela_gridfm_template.yaml
new file mode 100644
index 0000000..c30f5ec
--- /dev/null
+++ b/examples/vela_gridfm_template.yaml
@@ -0,0 +1,32 @@
+####################
+# Job Metadata
+# NOTE: iterate2 appends "-trial-<N>" to jobName for each HPO trial so
+#       every trial creates a unique Kubernetes resource.
+####################
+jobName: "romeokienzler-gridfm-hpo"
+containerImage: "us.icr.io/geodn/gridfm-graphkit:0.1"
+imagePullPolicy: "IfNotPresent"
+imagePullSecrets:
+  - name: "pullsecret-gridfm-geodn"
+
+##################################
+# Resource Requirements
+# NOTE: numGpusPerPod is overridden per-trial by the gpu_num HPO parameter.
+##################################
+numPods: 1
+numCpusPerPod: 32
+numGpusPerPod: 1          # placeholder – iterate2 overwrites this with gpu_num
+totalMemoryPerPod: "32Gi"
+
+volumes:
+  - name: "gridfm-storage"
+    claimName: "gridfm"
+    mountPath: "/mnt/data"
+
+########################
+# Workload Specification
+########################
+setupCommands:
+  - "wget -q https://raw.githubusercontent.com/gridfm/gridfm-graphkit/refs/heads/main/examples/config/HGNS_PF_datakit_case118.yaml"
+  # iterate2 replaces the placeholder below with the HPO-parametrised CLI call.
+  - "{{HPO_COMMAND}}"
diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py
index f33d4a7..a057e05 100644
--- a/terratorch_iterate/iterate2.py
+++ b/terratorch_iterate/iterate2.py
@@ -7,7 +7,9 @@
 import subprocess
 import sys
 import re
+import tempfile
 import threading
+import time
 from pathlib import Path
 from typing import Dict, Any, Optional, Literal, List
 
@@ -34,11 +36,51 @@ def parse_args():
     parser.add_argument("--venv", default=".venv", help="Virtualenv dir")
     parser.add_argument("--interpreter", default="python", help="Interpreter to use")
     parser.add_argument("--param-setter", type=str, default=None)
-    parser.add_argument("--wlm", choices=["lsf", "slurm", "openshift", "none"], default="none")
+    parser.add_argument("--wlm", choices=["lsf", "slurm", "openshift", "vela", "none"], default="none")
     parser.add_argument("--gpu-count", type=int, default=1)
     parser.add_argument("--cpu-count", type=int, default=4)
     parser.add_argument("--mem-gb", type=int, default=128)
     parser.add_argument("--lsf-gpu-config-string", type=str, default=None)
+
+    # ------------------------
+    # Vela / OpenShift options
+    # ------------------------
+    parser.add_argument(
+        "--vela-job-template",
+        type=str,
+        default=None,
+        help="Path to the Vela job YAML template (required when --wlm vela)",
+    )
+    parser.add_argument(
+        "--vela-chart-path",
+        type=str,
+        default=None,
+        help="Path to the helm chart directory (required when --wlm vela)",
+    )
+    parser.add_argument(
+        "--vela-namespace",
+        type=str,
+        default=None,
+        help="OpenShift/Kubernetes namespace (uses current context if omitted)",
+    )
+    parser.add_argument(
+        "--vela-cmd-placeholder",
+        type=str,
+        default="{{HPO_COMMAND}}",
+        help="String in the job template's setupCommands that is replaced with the HPO command (default: '{{HPO_COMMAND}}')",
+    )
+    parser.add_argument(
+        "--vela-pod-ready-timeout",
+        type=int,
+        default=600,
+        help="Seconds to wait for the trial pod to reach Running state (default: 600)",
+    )
+    parser.add_argument(
+        "--vela-job-timeout",
+        type=int,
+        default=86400,
+        help="Seconds to wait for the trial job to complete (default: 86400 = 24 h)",
+    )
     parser.add_argument(
         "--parallelism",
         type=int,
@@ -114,6 +156,9 @@ def build_launcher_command(wlm, cmd, trial_id, out_file, err_file, gpu_count, cp
         # No embedded redirect: run_and_stream() captures stdout/stderr via PIPE
         # and writes to out_file/err_file itself.
         launcher = f'bash -c "{cmd}"'
+    elif wlm in ("vela",):
+        # Vela uses a separate submission flow; this function is not called for it.
+        raise ValueError("build_launcher_command must not be called for wlm='vela'; use build_vela_job_yaml + run_vela_trial instead.")
     else:
         raise ValueError(f"Unknown WLM: {wlm}")
     logger.debug("Launcher command: %s", launcher)
@@ -154,6 +199,242 @@ def build_shell_command(interpreter, root_dir, script_path, venv, script_args, p
     logger.debug("Shell command: %s", cmd)
     return cmd
 
+
+def build_container_command(interpreter: str, script_path: str, script_args: dict, param_setter: Optional[str], underscore_to_hyphen: bool = True) -> str:
+    """Build a bare CLI invocation suitable for running inside a container.
+
+    Unlike :func:`build_shell_command` this function does **not** prepend
+    ``cd`` or ``source venv`` – those are not needed (or available) inside an
+    already-running container image.
+    """
+    prefix = f"{interpreter} " if interpreter else ""
+    arg_list = [f"{prefix}{script_path}".strip()]
+    for key, value in script_args.items():
+        arg_name = key.replace("_", "-") if underscore_to_hyphen else key
+        if value is None:
+            logger.debug("Container cmd: skipping '%s' (None)", key)
+            continue
+        if param_setter:
+            if isinstance(value, bool):
+                if value:
+                    arg_list.append(f"--{param_setter} {key}")
+                else:
+                    pass  # omit
+            else:
+                arg_list.append(f"--{param_setter} {key} {value}")
+        else:
+            if isinstance(value, bool):
+                if value:
+                    arg_list.append(f"--{arg_name}")
+                # else omit
+            else:
+                arg_list.append(f"--{arg_name} {value}")
+    cmd = " ".join(arg_list)
+    logger.debug("Container command: %s", cmd)
+    return cmd
+
+
+def build_vela_job_yaml(
+    template_path: str,
+    trial_id: int,
+    gpu_count: int,
+    container_cmd: str,
+    placeholder: str,
+) -> tuple[str, str]:
+    """Load *template_path*, inject HPO parameters, and return ``(yaml_str, job_name)``.
+
+    Changes applied to the template:
+    * The ``jobName`` field gets a ``-trial-{trial_id}`` suffix so each trial
+      creates a unique Kubernetes resource.
+    * ``numGpusPerPod`` is overwritten with *gpu_count*.
+    * Any entry in ``setupCommands`` that equals *placeholder* is replaced with
+      *container_cmd*.  If no entry matches the placeholder, the last entry is
+      replaced and a warning is emitted.
+    """
+    with open(template_path, "r") as fh:
+        data = yaml.safe_load(fh)
+
+    # Unique job name per trial
+    base_name = data.get("jobName", "hpo-job")
+    job_name = f"{base_name}-trial-{trial_id}"
+    data["jobName"] = job_name
+    logger.debug("Vela trial %d: jobName → %s", trial_id, job_name)
+
+    # GPU count
+    data["numGpusPerPod"] = gpu_count
+    logger.debug("Vela trial %d: numGpusPerPod → %d", trial_id, gpu_count)
+
+    # Inject command
+    setup_cmds = data.get("setupCommands", [])
+    replaced = False
+    for i, entry in enumerate(setup_cmds):
+        if placeholder in str(entry):
+            setup_cmds[i] = container_cmd
+            replaced = True
+            logger.debug("Vela trial %d: replaced setupCommands[%d] with HPO command", trial_id, i)
+            break
+    if not replaced:
+        logger.warning(
+            "Vela trial %d: placeholder '%s' not found in setupCommands – replacing last entry",
+            trial_id, placeholder,
+        )
+        if setup_cmds:
+            setup_cmds[-1] = container_cmd
+        else:
+            setup_cmds.append(container_cmd)
+    data["setupCommands"] = setup_cmds
+
+    return yaml.dump(data, default_flow_style=False, allow_unicode=True), job_name
+
+
+def _oc(*args, namespace: Optional[str] = None, check: bool = True, capture: bool = False):
+    """Run an ``oc`` sub-command, optionally capturing output."""
+    cmd = ["oc"] + list(args)
+    if namespace:
+        cmd += ["-n", namespace]
+    logger.debug("oc command: %s", " ".join(cmd))
+    if capture:
+        return subprocess.run(cmd, check=check, capture_output=True, text=True)
+    return subprocess.run(cmd, check=check)
+
+
+def run_vela_trial(
+    trial_id: int,
+    job_yaml: str,
+    chart_path: str,
+    job_name: str,
+    namespace: Optional[str],
+    out_file: str,
+    err_file: str,
+    pod_ready_timeout: int,
+    job_timeout: int,
+) -> None:
+    """Submit a Vela/OpenShift PyTorchJob, stream its logs, and wait for completion.
+
+    Steps
+    -----
+    1. Write *job_yaml* to a temp file.
+    2. ``helm template -f <tmp> <chart> | oc create [-n <ns>] -f-``
+    3. Poll until the master pod (``<job_name>-master-0``) appears.
+    4. ``oc logs -f <pod>`` – streams every line to stdout **and** *out_file*.
+    5. After streaming ends, check the pod's terminated exit-code.
+       Non-zero → raise :class:`subprocess.CalledProcessError`.
+    6. Cleanup: delete the PyTorchJob resource.
+    """
+    ns_args = ["-n", namespace] if namespace else []
+    prefix = f"[trial-{trial_id}]"
+
+    # Write temp YAML
+    with tempfile.NamedTemporaryFile(
+        mode="w",
+        suffix=".yaml",
+        prefix=f"vela_trial_{trial_id}_",
+        delete=False,
+    ) as fh:
+        fh.write(job_yaml)
+        tmp_yaml = fh.name
+    logger.debug("Vela trial %d: temp YAML written to %s", trial_id, tmp_yaml)
+
+    try:
+        # ── 1. Submit ──────────────────────────────────────────────────────────
+        ns_flag = f"-n {namespace}" if namespace else ""
+        create_cmd = (
+            f"helm template -f {tmp_yaml} {chart_path}"
+            f" | oc create {ns_flag} -f-"
+        )
+        logger.info("Trial %d: submitting Vela job → %s", trial_id, create_cmd)
+        result = subprocess.run(create_cmd, shell=True, capture_output=True, text=True)
+        with _print_lock:
+            sys.stdout.write(f"{prefix} {result.stdout}")
+            sys.stdout.flush()
+        if result.returncode != 0:
+            raise RuntimeError(
+                f"Vela trial {trial_id}: oc create failed (rc={result.returncode}):\n"
+                f"{result.stderr}"
+            )
+        logger.info("Trial %d: job '%s' created", trial_id, job_name)
+
+        # ── 2. Wait for master pod to appear ──────────────────────────────────
+        master_pod = f"{job_name}-master-0"
+        deadline = time.monotonic() + pod_ready_timeout
+        logger.info("Trial %d: waiting for pod '%s' to appear (timeout %ds)…", trial_id, master_pod, pod_ready_timeout)
+        while time.monotonic() < deadline:
+            r = subprocess.run(
+                ["oc", "get", "pod", master_pod, "--ignore-not-found"] + ns_args,
+                capture_output=True, text=True,
+            )
+            if master_pod in r.stdout:
+                logger.debug("Trial %d: pod '%s' found", trial_id, master_pod)
+                break
+            time.sleep(5)
+        else:
+            raise TimeoutError(
+                f"Vela trial {trial_id}: pod '{master_pod}' did not appear within {pod_ready_timeout}s"
+            )
+
+        # ── 3. Wait for pod to be Running/Succeeded ───────────────────────────
+        logger.info("Trial %d: waiting for pod '%s' to be Running…", trial_id, master_pod)
+        wait_cmd = (
+            ["oc", "wait", f"pod/{master_pod}",
+             "--for=condition=Ready",
+             f"--timeout={pod_ready_timeout}s"]
+            + ns_args
+        )
+        wr = subprocess.run(wait_cmd, capture_output=True, text=True)
+        # oc wait returns non-zero if the pod is already Completed (no Ready condition);
+        # that's fine – the logs are still accessible.
+        logger.debug("Trial %d: oc wait rc=%d stderr=%s", trial_id, wr.returncode, wr.stderr.strip())
+
+        # ── 4. Stream logs ────────────────────────────────────────────────────
+        log_cmd = ["oc", "logs", "-f", master_pod] + ns_args
+        logger.info("Trial %d: streaming logs from '%s'", trial_id, master_pod)
+        log_proc = subprocess.Popen(
+            log_cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        t_out = threading.Thread(
+            target=_stream_pipe,
+            args=(log_proc.stdout, out_file, trial_id, "stdout", sys.stdout),
+            daemon=True,
+        )
+        t_err = threading.Thread(
+            target=_stream_pipe,
+            args=(log_proc.stderr, err_file, trial_id, "stderr", sys.stderr),
+            daemon=True,
+        )
+        t_out.start()
+        t_err.start()
+        log_proc.wait(timeout=job_timeout)
+        t_out.join()
+        t_err.join()
+        logger.debug("Trial %d: log stream ended (rc=%d)", trial_id, log_proc.returncode)
+
+        # ── 5. Check pod exit code ────────────────────────────────────────────
+        ec_result = subprocess.run(
+            ["oc", "get", "pod", master_pod, "-o",
+             "jsonpath={.status.containerStatuses[0].state.terminated.exitCode}"]
+            + ns_args,
+            capture_output=True, text=True,
+        )
+        exit_code_str = ec_result.stdout.strip()
+        exit_code = int(exit_code_str) if exit_code_str.lstrip("-").isdigit() else 0
+        logger.info("Trial %d: pod exit code = %s", trial_id, exit_code)
+        if exit_code != 0:
+            raise subprocess.CalledProcessError(exit_code, log_cmd)
+
+    finally:
+        # ── 6. Cleanup – delete the job ───────────────────────────────────────
+        logger.debug("Trial %d: deleting PyTorchJob '%s'", trial_id, job_name)
+        subprocess.run(
+            ["oc", "delete", "pytorchjob", job_name, "--ignore-not-found"] + ns_args,
+            capture_output=True,
+        )
+        try:
+            os.unlink(tmp_yaml)
+        except OSError:
+            pass
+
 # ============================================================
 # PARALLEL STREAMING RUNNER
 # ============================================================
@@ -376,11 +657,43 @@ def objective(trial):
         err_file = f"trial_{trial.number}.err"
         logger.debug("Trial %d: stdout → %s | stderr → %s", trial.number, out_file, err_file)
 
-        shell_cmd = build_shell_command(args.interpreter, root_dir, script_path, args.venv, script_args, args.param_setter, args.underscore_to_hyphen)
-        launcher_cmd = build_launcher_command(args.wlm, shell_cmd, trial.number, out_file, err_file, gpu_count, args.cpu_count, args.mem_gb, args.lsf_gpu_config_string)
+        if args.wlm == "vela":
+            # ── Vela / OpenShift path ──────────────────────────────────────
+            if not args.vela_job_template:
+                raise ValueError("--vela-job-template is required when --wlm vela")
+            if not args.vela_chart_path:
+                raise ValueError("--vela-chart-path is required when --wlm vela")
+            container_cmd = build_container_command(
+                args.interpreter, script_path, script_args,
+                args.param_setter, args.underscore_to_hyphen,
+            )
+            logger.info("Trial %d: container command → %s", trial.number, container_cmd)
+            job_yaml, job_name = build_vela_job_yaml(
+                args.vela_job_template,
+                trial.number,
+                gpu_count,
+                container_cmd,
+                args.vela_cmd_placeholder,
+            )
+            logger.debug("Trial %d: job YAML (first 400 chars):\n%s", trial.number, job_yaml[:400])
+            run_vela_trial(
+                trial_id=trial.number,
+                job_yaml=job_yaml,
+                chart_path=args.vela_chart_path,
+                job_name=job_name,
+                namespace=args.vela_namespace,
+                out_file=out_file,
+                err_file=err_file,
+                pod_ready_timeout=args.vela_pod_ready_timeout,
+                job_timeout=args.vela_job_timeout,
+            )
+        else:
+            # ── Standard WLM path (lsf / slurm / none) ────────────────────
+            shell_cmd = build_shell_command(args.interpreter, root_dir, script_path, args.venv, script_args, args.param_setter, args.underscore_to_hyphen)
+            launcher_cmd = build_launcher_command(args.wlm, shell_cmd, trial.number, out_file, err_file, gpu_count, args.cpu_count, args.mem_gb, args.lsf_gpu_config_string)
+            logger.info("Trial %d: submitting → %s", trial.number, launcher_cmd)
+            run_and_stream(launcher_cmd, trial.number, out_file, err_file, args.wlm)
 
-        logger.info("Trial %d: submitting → %s", trial.number, launcher_cmd)
-        run_and_stream(launcher_cmd, trial.number, out_file, err_file, args.wlm)
         logger.info("Trial %d: job finished", trial.number)
 
         values = extract_metrics_from_log(out_file, metric_list, err_path=err_file)

From 5982a3aca4572162a957f85bd7a7db03b5bfbf69 Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Tue, 14 Apr 2026 15:19:37 +0200
Subject: [PATCH 09/16] add gpu perf test on vela

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 configs/gpu_perf_hpo.yaml             | 58 +++++++++++++++++++++++++++
 examples/run_vela_gpu_perf_example.sh | 46 +++++++++++++++++++++
 examples/vela_gpu_perf_template.yaml  | 37 +++++++++++++++++
 terratorch_iterate/iterate2.py        | 12 +++---
 4 files changed, 146 insertions(+), 7 deletions(-)
 create mode 100644 configs/gpu_perf_hpo.yaml
 create mode 100644 examples/run_vela_gpu_perf_example.sh
 create mode 100644 examples/vela_gpu_perf_template.yaml

diff --git a/configs/gpu_perf_hpo.yaml b/configs/gpu_perf_hpo.yaml
new file mode 100644
index 0000000..47d4fb1
--- /dev/null
+++ b/configs/gpu_perf_hpo.yaml
@@ -0,0 +1,58 @@
+# HPO configuration for GPU performance benchmarking with the CLAIMED
+# gpu_performance_test component on Vela (OpenShift / MLBatch).
+#
+# Hyperparameters:
+#   batch_size    – DataLoader / training batch size
+#   num_workers   – DataLoader worker count
+#   hidden_dim    – MLP hidden layer width (training/inference cost)
+#   depth         – MLP depth (training/inference cost)
+#   matrix_size   – Matrix multiplication size (raw GPU compute)
+#   gpu_num       – Number of GPUs to request per pod (launcher-level)
+#
+# Static args:
+#   All remaining fixed CLI flags for gpu_performance_test.py
+#
+# Metrics:
+#   Emitted as [performance] lines by the awk rename wrapper in the template.
+#   Using last-match semantics of extract_metrics_from_log each name is unique.
+
+metrics:
+  - dataloader_samples_per_sec
+  - training_samples_per_sec
+  - inference_samples_per_sec
+  - gflops
+
+hpo:
+  gpu_num:
+    type: categorical
+    choices: [1, 2]
+
+  batch_size:
+    type: categorical
+    choices: [32, 64, 128]
+
+  num_workers:
+    type: categorical
+    choices: [8, 16, 32]
+
+  hidden_dim:
+    type: categorical
+    choices: [500, 1000, 2000]
+
+  depth:
+    type: categorical
+    choices: [500, 1000, 2000]
+
+  matrix_size:
+    type: categorical
+    choices: [5000, 10000, 20000]
+
+static:
+  mode: single_gpu
+  dataset_size: 100000
+  steps: 1
+  input_dim: 1000000
+  num_classes: 100
+  materialize_dir: "."
+  cleanup: true      # flag – generates --cleanup (store_true)
+  iterations: 100
diff --git a/examples/run_vela_gpu_perf_example.sh b/examples/run_vela_gpu_perf_example.sh
new file mode 100644
index 0000000..91448c2
--- /dev/null
+++ b/examples/run_vela_gpu_perf_example.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# =============================================================================
+# Example: iterate2 GPU performance benchmark HPO on Vela (OpenShift / MLBatch)
+#
+# What iterate2 does per trial
+# ----------------------------
+# 1. Samples hyperparameters from configs/gpu_perf_hpo.yaml
+# 2. Builds the CLI call:
+#      python /app/.local/.../gpu_performance_test.py \
+#        --mode single_gpu --batch-size <N> --num-workers <N> ...
+# 3. Patches examples/vela_gpu_perf_template.yaml:
+#      - appends  -trial-<N>  to jobName
+#      - sets     numGpusPerPod = gpu_num
+#      - replaces {{HPO_COMMAND}} with the CLI call (awk wrapper stays intact)
+# 4. Submits:
+#      helm template -f <patched.yaml> <chart> | oc create [-n <ns>] -f-
+# 5. Streams:  oc logs -f <jobName>-master-0  (blocks until container exits)
+# 6. Extracts metrics from the renamed [performance] lines:
+#      dataloader_samples_per_sec, training_samples_per_sec,
+#      inference_samples_per_sec, gflops
+# 7. Deletes the PyTorchJob resource.
+# =============================================================================
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+# Path to the mlbatch pytorchjob-generator helm chart.
+# Clone first:  git clone https://github.com/project-codeflare/mlbatch
+CHART_PATH="${MLBATCH_CHART_PATH:-${REPO_ROOT}/../mlbatch/tools/pytorchjob-generator/chart}"
+
+iterate2 \
+  --script      "/app/.local/lib/python3.12/site-packages/claimed/components/util/gpu_performance_test.py" \
+  --interpreter "python"                                                   \
+  --wlm         vela                                                       \
+  --vela-job-template  "${SCRIPT_DIR}/vela_gpu_perf_template.yaml"        \
+  --vela-chart-path    "${CHART_PATH}"                                     \
+  --vela-namespace     "${OC_NAMESPACE:-}"                                 \
+  --vela-pod-ready-timeout 300                                             \
+  --vela-job-timeout       7200                                            \
+  --gpu-count              1                                               \
+  --optuna-study-name      gpu_perf_hpo                                    \
+  --optuna-db-path         "sqlite:///gpu_perf_hpo.db"                     \
+  --optuna-n-trials        30                                              \
+  --hpo-yaml               "${REPO_ROOT}/configs/gpu_perf_hpo.yaml"
diff --git a/examples/vela_gpu_perf_template.yaml b/examples/vela_gpu_perf_template.yaml
new file mode 100644
index 0000000..830b98d
--- /dev/null
+++ b/examples/vela_gpu_perf_template.yaml
@@ -0,0 +1,37 @@
+####################
+# Job Metadata
+# NOTE: iterate2 appends "-trial-<N>" to jobName so each trial is a unique resource.
+####################
+jobName: "romeokienzler-gpu-test-hpo"
+containerImage: "us.icr.io/geodn/gridfm-graphkit:0.1"
+imagePullPolicy: "IfNotPresent"
+imagePullSecrets:
+  - name: "pullsecret-gridfm-geodn"
+
+##################################
+# Resource Requirements
+# numGpusPerPod is overwritten per-trial by the gpu_num HPO parameter.
+##################################
+numPods: 1
+numCpusPerPod: 32
+numGpusPerPod: 1
+totalMemoryPerPod: "32Gi"
+
+########################
+# Workload Specification
+########################
+setupCommands:
+  - "pip install -U claimed"
+  # iterate2 replaces {{HPO_COMMAND}} with the HPO-parametrised CLI call.
+  # The awk wrapper renames duplicate "Samples/sec:" lines to unique
+  # [performance] metric names so iterate2 can extract all four metrics.
+  - |
+    {{HPO_COMMAND}} 2>&1 | awk '
+      /^--- DataLoader/  { sec="dataloader" }
+      /^--- Training/    { sec="training"   }
+      /^--- Inference/   { sec="inference"  }
+      /^--- GPU compute/ { sec="gpu"        }
+      /Samples\/sec:/    { print "[performance] " sec "_samples_per_sec: " $NF; next }
+      /GFLOPS:/          { print "[performance] gflops: " $NF; next }
+                         { print }
+    '
diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py
index a057e05..11c03d7 100644
--- a/terratorch_iterate/iterate2.py
+++ b/terratorch_iterate/iterate2.py
@@ -269,19 +269,17 @@ def build_vela_job_yaml(
     replaced = False
     for i, entry in enumerate(setup_cmds):
         if placeholder in str(entry):
-            setup_cmds[i] = container_cmd
+            # In-place substitution: keeps any wrapper (e.g. awk pipeline) around the placeholder
+            setup_cmds[i] = str(entry).replace(placeholder, container_cmd)
             replaced = True
-            logger.debug("Vela trial %d: replaced setupCommands[%d] with HPO command", trial_id, i)
+            logger.debug("Vela trial %d: substituted placeholder in setupCommands[%d]", trial_id, i)
             break
     if not replaced:
         logger.warning(
-            "Vela trial %d: placeholder '%s' not found in setupCommands – replacing last entry",
+            "Vela trial %d: placeholder '%s' not found in setupCommands – appending command as new entry",
             trial_id, placeholder,
         )
-        if setup_cmds:
-            setup_cmds[-1] = container_cmd
-        else:
-            setup_cmds.append(container_cmd)
+        setup_cmds.append(container_cmd)
     data["setupCommands"] = setup_cmds
 
     return yaml.dump(data, default_flow_style=False, allow_unicode=True), job_name

From 52a27446bf10ce0bd12dcd0620bba2802c020143 Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Wed, 15 Apr 2026 12:37:56 +0200
Subject: [PATCH 10/16] add vela support

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 configs/gpu_perf_hpo.yaml             |  13 ++--
 examples/run_lsf_gpu_example.sh       |   2 +-
 examples/run_vela_example.sh          |   2 +-
 examples/run_vela_gpu_perf_example.sh |  32 ++++----
 examples/vela_gpu_perf_template.yaml  |  18 ++---
 terratorch_iterate/iterate2.py        | 107 ++++++++++++++++++--------
 6 files changed, 107 insertions(+), 67 deletions(-)
 mode change 100644 => 100755 examples/run_vela_example.sh
 mode change 100644 => 100755 examples/run_vela_gpu_perf_example.sh

diff --git a/configs/gpu_perf_hpo.yaml b/configs/gpu_perf_hpo.yaml
index 47d4fb1..bfb12ca 100644
--- a/configs/gpu_perf_hpo.yaml
+++ b/configs/gpu_perf_hpo.yaml
@@ -13,14 +13,15 @@
 #   All remaining fixed CLI flags for gpu_performance_test.py
 #
 # Metrics:
-#   Emitted as [performance] lines by the awk rename wrapper in the template.
-#   Using last-match semantics of extract_metrics_from_log each name is unique.
+#   The gpu_performance_test script prints "Samples/sec:" three times (DataLoader,
+#   Training, Inference) and "GFLOPS:" once. iterate2 uses the  name#N  syntax
+#   to select the Nth occurrence (0-based) of a repeated metric name.
 
 metrics:
-  - dataloader_samples_per_sec
-  - training_samples_per_sec
-  - inference_samples_per_sec
-  - gflops
+  - "Samples/sec#0"   # DataLoader throughput (1st occurrence)
+  - "Samples/sec#1"   # Training throughput   (2nd occurrence)
+  - "Samples/sec#2"   # Inference throughput  (3rd occurrence)
+  - GFLOPS
 
 hpo:
   gpu_num:
diff --git a/examples/run_lsf_gpu_example.sh b/examples/run_lsf_gpu_example.sh
index 2913513..9808c92 100755
--- a/examples/run_lsf_gpu_example.sh
+++ b/examples/run_lsf_gpu_example.sh
@@ -22,7 +22,7 @@ set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
-iterate2 \
+iterate \
   --script        "${SCRIPT_DIR}/bumpy_function.py"  \
   --root-dir      "${SCRIPT_DIR}"                    \
   --venv          ""                                 \
diff --git a/examples/run_vela_example.sh b/examples/run_vela_example.sh
old mode 100644
new mode 100755
index 41cdb06..3327864
--- a/examples/run_vela_example.sh
+++ b/examples/run_vela_example.sh
@@ -35,7 +35,7 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
 # Clone mlbatch first:  git clone https://github.com/project-codeflare/mlbatch
 CHART_PATH="${MLBATCH_CHART_PATH:-${REPO_ROOT}/../mlbatch/tools/pytorchjob-generator/chart}"
 
-iterate2 \
+iterate \
   --script            "gridfm_graphkit train" \
   --interpreter       ""                                       \
   --wlm               vela                                    \
diff --git a/examples/run_vela_gpu_perf_example.sh b/examples/run_vela_gpu_perf_example.sh
old mode 100644
new mode 100755
index 91448c2..edf76b6
--- a/examples/run_vela_gpu_perf_example.sh
+++ b/examples/run_vela_gpu_perf_example.sh
@@ -11,13 +11,13 @@
 # 3. Patches examples/vela_gpu_perf_template.yaml:
 #      - appends  -trial-<N>  to jobName
 #      - sets     numGpusPerPod = gpu_num
-#      - replaces {{HPO_COMMAND}} with the CLI call (awk wrapper stays intact)
+#      - replaces {{HPO_COMMAND}} with the CLI call (plain single-line string)
 # 4. Submits:
 #      helm template -f <patched.yaml> <chart> | oc create [-n <ns>] -f-
 # 5. Streams:  oc logs -f <jobName>-master-0  (blocks until container exits)
-# 6. Extracts metrics from the renamed [performance] lines:
-#      dataloader_samples_per_sec, training_samples_per_sec,
-#      inference_samples_per_sec, gflops
+# 6. Extracts metrics using Nth-occurrence syntax (name#N, 0-based):
+#      Samples/sec#0 (DataLoader), Samples/sec#1 (Training),
+#      Samples/sec#2 (Inference), GFLOPS
 # 7. Deletes the PyTorchJob resource.
 # =============================================================================
 
@@ -28,19 +28,25 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
 
 # Path to the mlbatch pytorchjob-generator helm chart.
 # Clone first:  git clone https://github.com/project-codeflare/mlbatch
-CHART_PATH="${MLBATCH_CHART_PATH:-${REPO_ROOT}/../mlbatch/tools/pytorchjob-generator/chart}"
+CHART_PATH="${MLBATCH_CHART_PATH:-${HOME}/tmp/mlbatch/tools/pytorchjob-generator/chart}"
 
-iterate2 \
-  --script      "/app/.local/lib/python3.12/site-packages/claimed/components/util/gpu_performance_test.py" \
-  --interpreter "python"                                                   \
-  --wlm         vela                                                       \
-  --vela-job-template  "${SCRIPT_DIR}/vela_gpu_perf_template.yaml"        \
-  --vela-chart-path    "${CHART_PATH}"                                     \
-  --vela-namespace     "${OC_NAMESPACE:-}"                                 \
+# Only pass --vela-namespace when OC_NAMESPACE is set; an empty string causes
+# argparse to receive a bare "" token it treats as an unrecognised argument.
+NAMESPACE_ARG=()
+[[ -n "${OC_NAMESPACE:-}" ]] && NAMESPACE_ARG=(--vela-namespace "${OC_NAMESPACE}")
+
+iterate \
+  --script                 "/app/.local/lib/python3.12/site-packages/claimed/components/util/gpu_performance_test.py" \
+  --interpreter            "python"                                        \
+  --no-underscore-to-hyphen                                                \
+  --wlm                    vela                                            \
+  --vela-job-template      "${SCRIPT_DIR}/vela_gpu_perf_template.yaml"    \
+  --vela-chart-path        "${CHART_PATH}"                                 \
+  "${NAMESPACE_ARG[@]}"                                                    \
   --vela-pod-ready-timeout 300                                             \
   --vela-job-timeout       7200                                            \
   --gpu-count              1                                               \
   --optuna-study-name      gpu_perf_hpo                                    \
   --optuna-db-path         "sqlite:///gpu_perf_hpo.db"                     \
-  --optuna-n-trials        30                                              \
+   --optuna-n-trials        250                                             \
   --hpo-yaml               "${REPO_ROOT}/configs/gpu_perf_hpo.yaml"
diff --git a/examples/vela_gpu_perf_template.yaml b/examples/vela_gpu_perf_template.yaml
index 830b98d..476a5c7 100644
--- a/examples/vela_gpu_perf_template.yaml
+++ b/examples/vela_gpu_perf_template.yaml
@@ -15,7 +15,7 @@ imagePullSecrets:
 numPods: 1
 numCpusPerPod: 32
 numGpusPerPod: 1
-totalMemoryPerPod: "32Gi"
+totalMemoryPerPod: "64Gi"
 
 ########################
 # Workload Specification
@@ -23,15 +23,7 @@ totalMemoryPerPod: "32Gi"
 setupCommands:
   - "pip install -U claimed"
   # iterate2 replaces {{HPO_COMMAND}} with the HPO-parametrised CLI call.
-  # The awk wrapper renames duplicate "Samples/sec:" lines to unique
-  # [performance] metric names so iterate2 can extract all four metrics.
-  - |
-    {{HPO_COMMAND}} 2>&1 | awk '
-      /^--- DataLoader/  { sec="dataloader" }
-      /^--- Training/    { sec="training"   }
-      /^--- Inference/   { sec="inference"  }
-      /^--- GPU compute/ { sec="gpu"        }
-      /Samples\/sec:/    { print "[performance] " sec "_samples_per_sec: " $NF; next }
-      /GFLOPS:/          { print "[performance] gflops: " $NF; next }
-                         { print }
-    '
+  # Each setupCommands entry must be a single line (Helm chart constraint).
+  # Metrics are disambiguated by iterate2 using Samples/sec#N (Nth-occurrence)
+  # rather than awk renaming.
+  - "{{HPO_COMMAND}}"
diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py
index 11c03d7..c8c9033 100644
--- a/terratorch_iterate/iterate2.py
+++ b/terratorch_iterate/iterate2.py
@@ -241,48 +241,56 @@ def build_vela_job_yaml(
     container_cmd: str,
     placeholder: str,
 ) -> tuple[str, str]:
-    """Load *template_path*, inject HPO parameters, and return ``(yaml_str, job_name)``.
+    """Load *template_path* as raw text, inject HPO parameters, return ``(yaml_str, job_name)``.
 
-    Changes applied to the template:
-    * The ``jobName`` field gets a ``-trial-{trial_id}`` suffix so each trial
-      creates a unique Kubernetes resource.
+    All modifications are done via targeted regex/string substitutions on the raw
+    YAML text so that multi-line block scalars (e.g. awk pipelines), single-quoted
+    strings, and other constructs that PyYAML would mangle on a load→dump round-trip
+    are preserved exactly as written in the template.
+
+    Changes applied:
+    * ``jobName`` gets a ``-trial-{trial_id}`` suffix (unique Kubernetes resource).
     * ``numGpusPerPod`` is overwritten with *gpu_count*.
-    * Any entry in ``setupCommands`` that equals *placeholder* is replaced with
-      *container_cmd*.  If no entry matches the placeholder, the last entry is
-      replaced and a warning is emitted.
+    * The *placeholder* string inside ``setupCommands`` is replaced with
+      *container_cmd* in-place, preserving any surrounding wrapper (e.g. awk pipeline).
     """
     with open(template_path, "r") as fh:
-        data = yaml.safe_load(fh)
-
-    # Unique job name per trial
-    base_name = data.get("jobName", "hpo-job")
-    job_name = f"{base_name}-trial-{trial_id}"
-    data["jobName"] = job_name
+        text = fh.read()
+
+    # ── jobName ──────────────────────────────────────────────────────────────
+    job_name_match = re.search(r'^(jobName\s*:\s*["\']?)([^"\'#\n]+)(["\']?)', text, re.MULTILINE)
+    if not job_name_match:
+        raise ValueError(f"'jobName' key not found in template '{template_path}'")
+    raw_name = job_name_match.group(2).strip()
+    job_name = f"{raw_name}-trial-{trial_id}"
+    text = (
+        text[:job_name_match.start(2)]
+        + job_name_match.group(2).replace(raw_name, job_name)
+        + text[job_name_match.end(2):]
+    )
     logger.debug("Vela trial %d: jobName → %s", trial_id, job_name)
 
-    # GPU count
-    data["numGpusPerPod"] = gpu_count
+    # ── numGpusPerPod ────────────────────────────────────────────────────────
+    text = re.sub(
+        r'^(numGpusPerPod\s*:\s*)\S+',
+        lambda m: f"{m.group(1)}{gpu_count}",
+        text,
+        flags=re.MULTILINE,
+    )
     logger.debug("Vela trial %d: numGpusPerPod → %d", trial_id, gpu_count)
 
-    # Inject command
-    setup_cmds = data.get("setupCommands", [])
-    replaced = False
-    for i, entry in enumerate(setup_cmds):
-        if placeholder in str(entry):
-            # In-place substitution: keeps any wrapper (e.g. awk pipeline) around the placeholder
-            setup_cmds[i] = str(entry).replace(placeholder, container_cmd)
-            replaced = True
-            logger.debug("Vela trial %d: substituted placeholder in setupCommands[%d]", trial_id, i)
-            break
-    if not replaced:
+    # ── placeholder substitution ─────────────────────────────────────────────
+    if placeholder in text:
+        text = text.replace(placeholder, container_cmd)
+        logger.debug("Vela trial %d: substituted placeholder '%s'", trial_id, placeholder)
+    else:
         logger.warning(
-            "Vela trial %d: placeholder '%s' not found in setupCommands – appending command as new entry",
-            trial_id, placeholder,
+            "Vela trial %d: placeholder '%s' not found in template '%s' – appending command",
+            trial_id, placeholder, template_path,
         )
-        setup_cmds.append(container_cmd)
-    data["setupCommands"] = setup_cmds
+        text += f"\n  - {container_cmd}\n"
 
-    return yaml.dump(data, default_flow_style=False, allow_unicode=True), job_name
+    return text, job_name
 
 
 def _oc(*args, namespace: Optional[str] = None, check: bool = True, capture: bool = False):
@@ -419,7 +427,8 @@ def run_vela_trial(
         exit_code = int(exit_code_str) if exit_code_str.lstrip("-").isdigit() else 0
         logger.info("Trial %d: pod exit code = %s", trial_id, exit_code)
         if exit_code != 0:
-            raise subprocess.CalledProcessError(exit_code, log_cmd)
+            logger.warning("Trial %d: pod exited with code %d – marking trial as pruned", trial_id, exit_code)
+            raise optuna.exceptions.TrialPruned(f"pod exited with code {exit_code}")
 
     finally:
         # ── 6. Cleanup – delete the job ───────────────────────────────────────
@@ -516,6 +525,19 @@ def run_and_stream(launcher_cmd: str, trial_id: int, out_file: str, err_file: st
 # ============================================================
 
 def extract_metrics_from_log(path: str, metric_names: List[str], err_path: Optional[str] = None) -> List[float]:
+    """Extract metric values from a log file.
+
+    Each entry in *metric_names* is either a plain name (uses the **last**
+    match) or ``name#N`` to select the **N-th occurrence** (0-based). This
+    lets you disambiguate scripts that print the same metric key multiple
+    times, e.g.::
+
+        metrics:
+          - "Samples/sec#0"   # DataLoader throughput (first occurrence)
+          - "Samples/sec#1"   # Training throughput   (second occurrence)
+          - "Samples/sec#2"   # Inference throughput  (third occurrence)
+          - GFLOPS
+    """
     logger.debug("Extracting metrics %s from '%s'", metric_names, path)
     results = []
     with open(path, "r", encoding="utf-8", errors="ignore") as f:
@@ -532,14 +554,33 @@ def extract_metrics_from_log(path: str, metric_names: List[str], err_path: Optio
             logger.debug("Err file '%s' not found, skipping", err_path)
 
     for metric in metric_names:
+        # Support  name#N  syntax for Nth-occurrence selection (0-based)
+        occurrence: Optional[int] = None
+        bare_metric = metric
+        idx_match = re.fullmatch(r'(.+)#(\d+)', metric)
+        if idx_match:
+            bare_metric = idx_match.group(1)
+            occurrence = int(idx_match.group(2))
+
         # Matches: key: value | key=value | [performance] key : value | Lightning table │ key │ value │
         pattern = re.compile(
-            rf"(?:\[\w+\]\s*)?{re.escape(metric)}\s*(?:[:=│])\s*([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)"
+            rf"(?:\[\w+\]\s*)?{re.escape(bare_metric)}\s*(?:[:=│])\s*([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)"
         )
         matches = pattern.findall(text)
         if not matches:
             logger.warning("Metric '%s' not found in '%s' — defaulting to 0.0", metric, path)
             results.append(0.0)
+        elif occurrence is not None:
+            if occurrence >= len(matches):
+                logger.warning(
+                    "Metric '%s' occurrence #%d requested but only %d match(es) found — defaulting to 0.0",
+                    metric, occurrence, len(matches),
+                )
+                results.append(0.0)
+            else:
+                value = float(matches[occurrence])
+                logger.debug("Metric '%s': using occurrence #%d = %s", metric, occurrence, value)
+                results.append(value)
         else:
             value = float(matches[-1])
             logger.debug("Metric '%s': found %d match(es), using last value %s", metric, len(matches), value)

From d27124716510f6522083020ef0ed0855269940da Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Wed, 15 Apr 2026 19:48:21 +0200
Subject: [PATCH 11/16] improve vela config

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 configs/gridfm_graphkit_hpo.yaml   |  5 ++++-
 examples/vela_gridfm_template.yaml | 14 ++++++++------
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/configs/gridfm_graphkit_hpo.yaml b/configs/gridfm_graphkit_hpo.yaml
index 6ee3a2e..049466f 100644
--- a/configs/gridfm_graphkit_hpo.yaml
+++ b/configs/gridfm_graphkit_hpo.yaml
@@ -34,6 +34,10 @@ hpo:
     choices: ["max-autotune", "default", "reduce-overhead", null]
     # null → --compile flag is omitted entirely
 
+  num_workers:
+    type: categorical
+    choices: [8, 16, 24, 32]
+
   dataset:
     type: group             # one choice selects all bundled args together
     choices:
@@ -44,5 +48,4 @@ hpo:
 static:
   run_name: run1
   log_dir: logs
-  num_workers: 32
   report-performance: true
diff --git a/examples/vela_gridfm_template.yaml b/examples/vela_gridfm_template.yaml
index c30f5ec..9779f06 100644
--- a/examples/vela_gridfm_template.yaml
+++ b/examples/vela_gridfm_template.yaml
@@ -4,7 +4,7 @@
 #       every trial creates a unique Kubernetes resource.
 ####################
 jobName: "romeokienzler-gridfm-hpo"
-containerImage: "us.icr.io/geodn/gridfm-graphkit:0.1"
+containerImage: "us.icr.io/geodn/gridfm-graphkit:0.5"
 imagePullPolicy: "IfNotPresent"
 imagePullSecrets:
   - name: "pullsecret-gridfm-geodn"
@@ -14,9 +14,9 @@ imagePullSecrets:
 # NOTE: numGpusPerPod is overridden per-trial by the gpu_num HPO parameter.
 ##################################
 numPods: 1
-numCpusPerPod: 32
-numGpusPerPod: 1          # placeholder – iterate2 overwrites this with gpu_num
-totalMemoryPerPod: "32Gi"
+numCpusPerPod: 64
+numGpusPerPod: 2          # placeholder – iterate overrides this with gpu_num
+totalMemoryPerPod: "64Gi"
 
 volumes:
   - name: "gridfm-storage"
@@ -27,6 +27,8 @@ volumes:
 # Workload Specification
 ########################
 setupCommands:
-  - "wget -q https://raw.githubusercontent.com/gridfm/gridfm-graphkit/refs/heads/main/examples/config/HGNS_PF_datakit_case118.yaml"
-  # iterate2 replaces the placeholder below with the HPO-parametrised CLI call.
+  - "pip install -U git+https://github.com/gridfm/gridfm-graphkit.git@fix_multi_gpu_training"
+  - "wget https://raw.githubusercontent.com/gridfm/gridfm-graphkit/refs/heads/main/examples/config/HGNS_PF_datakit_case118.yaml"
+  - "sed -i 's/epochs: 200/epochs: 5/g' HGNS_PF_datakit_case118.yaml"
+  # iterate replaces the placeholder below with the HPO-parametrised CLI call.
   - "{{HPO_COMMAND}}"

From f61e8bbd85f7a4373888f16a24af507a8aec3e66 Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Thu, 16 Apr 2026 16:18:07 +0200
Subject: [PATCH 12/16] fix

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 examples/run_vela_example.sh          | 11 ++++++++---
 examples/run_vela_gpu_perf_example.sh |  5 +++--
 examples/vela_gridfm_template.yaml    |  2 +-
 terratorch_iterate/iterate2.py        |  2 +-
 4 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/examples/run_vela_example.sh b/examples/run_vela_example.sh
index 3327864..bb5cafc 100755
--- a/examples/run_vela_example.sh
+++ b/examples/run_vela_example.sh
@@ -33,7 +33,10 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
 
 # Path to the mlbatch pytorchjob-generator helm chart.
 # Clone mlbatch first:  git clone https://github.com/project-codeflare/mlbatch
-CHART_PATH="${MLBATCH_CHART_PATH:-${REPO_ROOT}/../mlbatch/tools/pytorchjob-generator/chart}"
+CHART_PATH="${MLBATCH_CHART_PATH:-${HOME}/tmp/mlbatch/tools/pytorchjob-generator/chart}"
+
+NAMESPACE_ARG=()
+[[ -n "${OC_NAMESPACE:-}" ]] && NAMESPACE_ARG=(--vela-namespace "${OC_NAMESPACE}")
 
 iterate \
   --script            "gridfm_graphkit train" \
@@ -41,12 +44,14 @@ iterate \
   --wlm               vela                                    \
   --vela-job-template "${SCRIPT_DIR}/vela_gridfm_template.yaml" \
   --vela-chart-path   "${CHART_PATH}"                         \
-  --vela-namespace    "${OC_NAMESPACE:-}"                     \
+  "${NAMESPACE_ARG[@]}"						 \
   --vela-cmd-placeholder "{{HPO_COMMAND}}"                    \
   --vela-pod-ready-timeout 600                                 \
   --vela-job-timeout  86400                                    \
+  --no-underscore-to-hyphen                                    \
   --gpu-count         1                                        \
   --optuna-study-name  gridfm_vela_hpo                        \
-  --optuna-db-path     "sqlite:///gridfm_vela_hpo.db"         \
+  --optuna-db-path     "js:///gridfm_vela_hpo.journal"        \
+  --parallelism        16                                      \
   --optuna-n-trials    20                                      \
   --hpo-yaml          "${REPO_ROOT}/configs/gridfm_graphkit_hpo.yaml"
diff --git a/examples/run_vela_gpu_perf_example.sh b/examples/run_vela_gpu_perf_example.sh
index edf76b6..bf845b6 100755
--- a/examples/run_vela_gpu_perf_example.sh
+++ b/examples/run_vela_gpu_perf_example.sh
@@ -47,6 +47,7 @@ iterate \
   --vela-job-timeout       7200                                            \
   --gpu-count              1                                               \
   --optuna-study-name      gpu_perf_hpo                                    \
-  --optuna-db-path         "sqlite:///gpu_perf_hpo.db"                     \
+  --optuna-db-path         "js:///gpu_perf_hpo.journal"                    \
    --optuna-n-trials        250                                             \
-  --hpo-yaml               "${REPO_ROOT}/configs/gpu_perf_hpo.yaml"
+  --hpo-yaml               "${REPO_ROOT}/configs/gpu_perf_hpo.yaml"        \
+  --parallelism          5                                               \
diff --git a/examples/vela_gridfm_template.yaml b/examples/vela_gridfm_template.yaml
index 9779f06..e75b3cd 100644
--- a/examples/vela_gridfm_template.yaml
+++ b/examples/vela_gridfm_template.yaml
@@ -29,6 +29,6 @@ volumes:
 setupCommands:
   - "pip install -U git+https://github.com/gridfm/gridfm-graphkit.git@fix_multi_gpu_training"
   - "wget https://raw.githubusercontent.com/gridfm/gridfm-graphkit/refs/heads/main/examples/config/HGNS_PF_datakit_case118.yaml"
-  - "sed -i 's/epochs: 200/epochs: 5/g' HGNS_PF_datakit_case118.yaml"
+#  - "sed -i 's/epochs: 200/epochs: 5/g' HGNS_PF_datakit_case118.yaml"
   # iterate replaces the placeholder below with the HPO-parametrised CLI call.
   - "{{HPO_COMMAND}}"
diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py
index c8c9033..86c8e11 100644
--- a/terratorch_iterate/iterate2.py
+++ b/terratorch_iterate/iterate2.py
@@ -745,7 +745,7 @@ def objective(trial):
     logger.info("Creating Optuna study (directions: %s)", directions)
 
     if args.optuna_db_path.startswith("js:///"):
-        journal_path = args.optuna_db_path[len("js://"):]
+        journal_path = args.optuna_db_path[len("js:///"):]
         logger.info("Using JournalStorage at '%s'", journal_path)
         storage = JournalStorage(JournalFileStorage(journal_path))
     elif "sqlite" in args.optuna_db_path:

From d190a8e23f928db6149f379492841ebbfa5add8a Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Thu, 16 Apr 2026 18:44:02 +0200
Subject: [PATCH 13/16] don't crash on fail

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 terratorch_iterate/iterate2.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py
index 86c8e11..658476d 100644
--- a/terratorch_iterate/iterate2.py
+++ b/terratorch_iterate/iterate2.py
@@ -763,7 +763,12 @@ def objective(trial):
     logger.info("Study '%s' ready (existing trials: %d)", args.optuna_study_name, len(study.trials))
 
     logger.info("Parallelism: %d worker(s)", args.parallelism)
-    study.optimize(objective, n_trials=args.optuna_n_trials, n_jobs=args.parallelism)
+    study.optimize(
+        objective,
+        n_trials=args.optuna_n_trials,
+        n_jobs=args.parallelism,
+        catch=(Exception,),   # mark trial as FAILED and continue; never crash the study
+    )
 
     logger.info("=" * 60)
     logger.info("OPTIMIZATION COMPLETE")

From b03fb3eb1e55c9bf4f6741c7b3bd10cfff627339 Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Thu, 16 Apr 2026 19:37:24 +0200
Subject: [PATCH 14/16] fix exit code handling

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 terratorch_iterate/iterate2.py | 60 ++++++++++++++++++++++++++++++----
 1 file changed, 53 insertions(+), 7 deletions(-)

diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py
index 658476d..d596791 100644
--- a/terratorch_iterate/iterate2.py
+++ b/terratorch_iterate/iterate2.py
@@ -416,14 +416,60 @@ def run_vela_trial(
         t_err.join()
         logger.debug("Trial %d: log stream ended (rc=%d)", trial_id, log_proc.returncode)
 
+        # ── 4b. If oc logs exited early (e.g. "unexpected EOF"), the pod may
+        #        still be running.  Re-attach the log stream and wait for it to
+        #        finish so we capture the full output and don't delete a live job.
+        if log_proc.returncode != 0:
+            logger.warning(
+                "Trial %d: oc logs exited with rc=%d (possible EOF disconnect) – "
+                "waiting for pod to terminate before reading exit code",
+                trial_id, log_proc.returncode,
+            )
+            # Wait for pod phase Succeeded or Failed (container terminated).
+            oc_wait_phase = subprocess.run(
+                ["oc", "wait", f"pod/{master_pod}",
+                 "--for=jsonpath={.status.phase}=Succeeded",
+                 f"--timeout={job_timeout}s"]
+                + ns_args,
+                capture_output=True, text=True,
+            )
+            if oc_wait_phase.returncode != 0:
+                # Pod may have Failed; try that phase too.
+                subprocess.run(
+                    ["oc", "wait", f"pod/{master_pod}",
+                     "--for=jsonpath={.status.phase}=Failed",
+                     f"--timeout=30s"]
+                    + ns_args,
+                    capture_output=True, text=True,
+                )
+            # Re-stream any log lines written after the disconnect into the same files.
+            catchup = subprocess.run(
+                ["oc", "logs", "--tail=-1", master_pod] + ns_args,
+                capture_output=True, text=True,
+            )
+            if catchup.stdout:
+                with open(out_file, "a", encoding="utf-8", errors="replace") as fh:
+                    fh.write(catchup.stdout)
+            if catchup.stderr:
+                with open(err_file, "a", encoding="utf-8", errors="replace") as fh:
+                    fh.write(catchup.stderr)
+
         # ── 5. Check pod exit code ────────────────────────────────────────────
-        ec_result = subprocess.run(
-            ["oc", "get", "pod", master_pod, "-o",
-             "jsonpath={.status.containerStatuses[0].state.terminated.exitCode}"]
-            + ns_args,
-            capture_output=True, text=True,
-        )
-        exit_code_str = ec_result.stdout.strip()
+        # Poll until the pod has a terminated exit code (handles the race
+        # between oc-logs EOF and pod termination being recorded in the API).
+        exit_code_str = ""
+        for _attempt in range(30):
+            ec_result = subprocess.run(
+                ["oc", "get", "pod", master_pod, "-o",
+                 "jsonpath={.status.containerStatuses[0].state.terminated.exitCode}"]
+                + ns_args,
+                capture_output=True, text=True,
+            )
+            exit_code_str = ec_result.stdout.strip()
+            if exit_code_str.lstrip("-").isdigit():
+                break
+            logger.debug("Trial %d: exit code not yet available, retrying in 5 s…", trial_id)
+            time.sleep(5)
         exit_code = int(exit_code_str) if exit_code_str.lstrip("-").isdigit() else 0
         logger.info("Trial %d: pod exit code = %s", trial_id, exit_code)
         if exit_code != 0:

From 989de122b80d2736123d29b06862a58897320c46 Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Fri, 17 Apr 2026 14:27:45 +0200
Subject: [PATCH 15/16] add lsf config example for gridfm

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 examples/run_lsf_gridfm_example.sh | 48 ++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100755 examples/run_lsf_gridfm_example.sh

diff --git a/examples/run_lsf_gridfm_example.sh b/examples/run_lsf_gridfm_example.sh
new file mode 100755
index 0000000..e140776
--- /dev/null
+++ b/examples/run_lsf_gridfm_example.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+# =============================================================================
+# Example: iterate with --wlm lsf for gridfm-graphkit HPO on an LSF cluster
+#
+# Prerequisites
+# -------------
+#   * LSF bsub/bjobs available on PATH
+#   * gridfm-graphkit installed in the venv (or via module load)
+#   * configs/gridfm_graphkit_hpo.yaml present
+#
+# How it works
+# ------------
+# 1. For each Optuna trial iterate:
+#    a. Samples hyperparameters from gridfm_graphkit_hpo.yaml
+#    b. Builds the gridfm_graphkit CLI invocation from static + sampled params
+#    c. Submits a bsub job (-K blocks until completion)
+#    d. Reads stdout/stderr from trial<N>.out / trial<N>.err
+#    e. Extracts metrics and reports them to Optuna
+#
+# Customise
+# ---------
+#   LSF_GPU_CONFIG   – full -gpu option string for bsub
+#   GPU_COUNT        – must match num= in LSF_GPU_CONFIG
+#   OC_NAMESPACE     – not used for LSF; kept as no-op for parity
+# =============================================================================
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+# Adjust to match your cluster's GPU model and scheduling policy.
+LSF_GPU_CONFIG="${LSF_GPU_CONFIG:-num=2:mode=exclusive_process:mps=no:gmodel=NVIDIAA100_SXM4_80GB}"
+
+iterate \
+  --script            "gridfm_graphkit train"                       \
+  --interpreter       ""                                            \
+  --wlm               lsf                                           \
+  --no-underscore-to-hyphen                                         \
+  --gpu-count         2                                             \
+  --cpu-count         32                                            \
+  --mem-gb            256                                           \
+  --lsf-gpu-config-string "${LSF_GPU_CONFIG}"                       \
+  --optuna-study-name gridfm_lsf_hpo                                \
+  --optuna-db-path    "js:///gridfm_lsf_hpo.journal"                \
+  --parallelism       4                                             \
+  --optuna-n-trials   20                                            \
+  --hpo-yaml          "${REPO_ROOT}/configs/gridfm_graphkit_hpo.yaml"

From f3283adab9ab9343ba08b18723489489e8254e04 Mon Sep 17 00:00:00 2001
From: Romeo Kienzler <romeo.kienzler1@ibm.com>
Date: Fri, 17 Apr 2026 15:44:38 +0200
Subject: [PATCH 16/16] add prerun commands

Signed-off-by: Romeo Kienzler <romeo.kienzler1@ibm.com>
---
 examples/run_lsf_gridfm_example.sh |  7 +++++++
 terratorch_iterate/iterate2.py     | 21 +++++++++++++++++----
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/examples/run_lsf_gridfm_example.sh b/examples/run_lsf_gridfm_example.sh
index e140776..5afb84e 100755
--- a/examples/run_lsf_gridfm_example.sh
+++ b/examples/run_lsf_gridfm_example.sh
@@ -32,10 +32,17 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
 # Adjust to match your cluster's GPU model and scheduling policy.
 LSF_GPU_CONFIG="${LSF_GPU_CONFIG:-num=2:mode=exclusive_process:mps=no:gmodel=NVIDIAA100_SXM4_80GB}"
 
+# Commands to run inside the bsub job before launching the training script.
+# source ~/.bashrc ensures module / mamba initialisation is available;
+# micromamba activate gridfm switches to the correct conda environment.
+PRE_RUN="${PRE_RUN_COMMANDS:-source ~/.bashrc && micromamba activate gridfm}"
+
 iterate \
   --script            "gridfm_graphkit train"                       \
   --interpreter       ""                                            \
+  --root-dir          "${GRIDFM_ROOT:-${HOME}/gitco/gridfm-graphkit}" \
   --wlm               lsf                                           \
+  --pre-run-commands  "${PRE_RUN}"                                  \
   --no-underscore-to-hyphen                                         \
   --gpu-count         2                                             \
   --cpu-count         32                                            \
diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py
index d596791..a22ad94 100644
--- a/terratorch_iterate/iterate2.py
+++ b/terratorch_iterate/iterate2.py
@@ -33,7 +33,17 @@ def parse_args():
     # ------------------------
     parser.add_argument("--script", required=True, help="Training script to execute")
     parser.add_argument("--root-dir", default=None, help="Root dir (derived if omitted)")
-    parser.add_argument("--venv", default=".venv", help="Virtualenv dir")
+    parser.add_argument("--venv", default=".venv", help="Virtualenv dir (shortcut for source <venv>/bin/activate)")
+    parser.add_argument(
+        "--pre-run-commands",
+        default=None,
+        help=(
+            "Shell commands to run before the training script, joined with ' && '. "
+            "Useful for sourcing bashrc, activating conda/mamba envs, loading modules, etc. "
+            "Example: 'source ~/.bashrc && micromamba activate gridfm'. "
+            "When set, --venv is ignored."
+        ),
+    )
     parser.add_argument("--interpreter", default="python", help="Interpreter to use")
     parser.add_argument("--param-setter", type=str, default=None)
     parser.add_argument("--wlm", choices=["lsf", "slurm", "openshift", "vela", "none"], default="none")
@@ -164,9 +174,12 @@ def build_launcher_command(wlm, cmd, trial_id, out_file, err_file, gpu_count, cp
     logger.debug("Launcher command: %s", launcher)
     return launcher
 
-def build_shell_command(interpreter, root_dir, script_path, venv, script_args, param_setter, underscore_to_hyphen=True):
+def build_shell_command(interpreter, root_dir, script_path, venv, script_args, param_setter, underscore_to_hyphen=True, pre_run_commands=None):
     parts = [f"cd {root_dir}"]
-    if venv:
+    if pre_run_commands:
+        parts.append(pre_run_commands)
+        logger.debug("Pre-run commands: %s", pre_run_commands)
+    elif venv:
         parts.append(f"source {venv}/bin/activate")
         logger.debug("Activating venv: %s", venv)
     arg_list = [f"{interpreter} {script_path}"]
@@ -774,7 +787,7 @@ def objective(trial):
             )
         else:
             # ── Standard WLM path (lsf / slurm / none) ────────────────────
-            shell_cmd = build_shell_command(args.interpreter, root_dir, script_path, args.venv, script_args, args.param_setter, args.underscore_to_hyphen)
+            shell_cmd = build_shell_command(args.interpreter, root_dir, script_path, args.venv, script_args, args.param_setter, args.underscore_to_hyphen, pre_run_commands=args.pre_run_commands)
             launcher_cmd = build_launcher_command(args.wlm, shell_cmd, trial.number, out_file, err_file, gpu_count, args.cpu_count, args.mem_gb, args.lsf_gpu_config_string)
             logger.info("Trial %d: submitting → %s", trial.number, launcher_cmd)
             run_and_stream(launcher_cmd, trial.number, out_file, err_file, args.wlm)