From 0b6e08b937b37feb91790489854af94c2a7ebe8f Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Wed, 8 Apr 2026 15:02:56 +0200 Subject: [PATCH 01/16] add complex hpo sets Signed-off-by: Romeo Kienzler --- configs/gridfm_graphkit_hpo.yaml | 47 +++++++ docs/iterate2.md | 218 ++++++++++++++++++++++++++++--- terratorch_iterate/iterate2.py | 27 +++- 3 files changed, 270 insertions(+), 22 deletions(-) create mode 100644 configs/gridfm_graphkit_hpo.yaml diff --git a/configs/gridfm_graphkit_hpo.yaml b/configs/gridfm_graphkit_hpo.yaml new file mode 100644 index 0000000..65073ed --- /dev/null +++ b/configs/gridfm_graphkit_hpo.yaml @@ -0,0 +1,47 @@ +# HPO configuration for gridfm-graphkit HGNS PF case2000 +# +# Hyperparameters: +# gpu_num – number of GPUs to request from the WLM (launcher-level) +# bfloat16 – boolean flag (presence = --bfloat16, absence = flag omitted) +# tf32 – boolean flag (presence = --tf32, absence = flag omitted) +# compile – torch.compile mode; null disables the flag entirely +# dataset – group: selects config + data_path + exp_name together +# +# Static args: +# all other fixed CLI args + +hpo: + gpu_num: + type: categorical + choices: [1, 2, 4] + + bfloat16: + type: flag # store_true: true → --bfloat16, false → flag omitted + + tf32: + type: flag # store_true: true → --tf32, false → flag omitted + + compile: + type: categorical + choices: ["max-autotune", "default", "reduce-overhead", null] + # null → --compile flag is omitted entirely + + dataset: + type: group # one choice selects all bundled args together + choices: + case2000: + config: ./examples/config/HGNS_PF_datakit_case2000.yaml + data_path: /dccstor/gridfm/powermodels_data/v4/finetuning/pf/ + exp_name: case2000 + case1000: + config: ./examples/config/HGNS_PF_datakit_case1000.yaml + data_path: /dccstor/gridfm/powermodels_data/v4/finetuning/pf/ + exp_name: case1000 + +static: + run_name: run1 + log_dir: logs + dataset_wrapper: SharedMemoryCacheDataset + plugins: gridfm_graphkit_ee + num_workers: 32 + dataset_wrapper_cache_dir: /dccstor/terratorch/case2000_ieeecache diff --git a/docs/iterate2.md b/docs/iterate2.md index 3429984..8922d52 100644 --- a/docs/iterate2.md +++ b/docs/iterate2.md @@ -1,6 +1,14 @@ # iterate2 – HPO Launcher -`iterate2` is a generic Optuna-based hyperparameter optimisation (HPO) launcher with a pluggable workload-manager backend. It submits one trial per Optuna suggestion, waits for the job to finish, extracts a scalar metric from the job's log file, and returns it to Optuna. +`iterate2` is a generic Optuna-based hyperparameter optimisation (HPO) launcher with a pluggable workload-manager backend. It submits one trial per Optuna suggestion, waits for the job to finish, extracts one or more metrics from the job's log file, and returns them to Optuna. + +Key capabilities: + +- **Multi-objective optimisation** — extract and optimise several metrics simultaneously (Pareto front) +- **Five HPO parameter types** — `float`, `int`, `categorical`, `flag` (store-true), `group` (bundled arg sets) +- **Dynamic GPU count per trial** — `gpu_num` in the HPO space controls the WLM resource request per trial +- **Null-omission** — `null` in a `categorical` choice causes the flag to be completely absent from the command line +- **Workload manager backends** — LSF, Slurm, or direct local execution ## Quick start @@ -73,7 +81,107 @@ static: dataset_path: /data/my_dataset ``` -Supported parameter types: `float`, `int`, `categorical`. +Supported parameter types: `float`, `int`, `categorical`, `flag`, `group`. + +#### Parameter types + +##### `float` + +Suggests a floating-point value between `low` and `high`. Set `log: true` for log-uniform sampling. + +```yaml +learning_rate: + type: float + low: 1e-5 + high: 1e-2 + log: true +``` + +Generates: `--learning-rate 0.0003` + +##### `int` + +Suggests an integer between `low` and `high` (inclusive). + +```yaml +encoder_depth: + type: int + low: 2 + high: 8 +``` + +Generates: `--encoder-depth 4` + +##### `categorical` + +Suggests one value from a list of choices. Choices can be strings, numbers, or `null`. + +```yaml +batch_size: + type: categorical + choices: [16, 32, 64] +``` + +Generates: `--batch-size 32` + +**`null` omits the flag entirely.** Useful for optional flags like `--compile`: + +```yaml +compile: + type: categorical + choices: ["max-autotune", "default", null] + # null → --compile is completely absent from the command +``` + +##### `flag` + +Models a `store_true`-style flag that takes no value — its presence or absence is the parameter. `true` adds the flag; `false` omits it. + +```yaml +bfloat16: + type: flag # true → --bfloat16 false → (omitted) + +tf32: + type: flag # true → --tf32 false → (omitted) +``` + +!!! note + Use unquoted YAML `true`/`false` for `flag` and for boolean values in `categorical.choices`. + Use **quoted** `"true"`/`"false"` when the wrapped script expects the literal string as a value (e.g. `--amp true`). + +##### `group` + +Bundles several CLI arguments together under a single Optuna categorical parameter. Optuna picks one group name; `iterate2` then injects all key/value pairs from that group into the trial's argument list. This is useful when multiple arguments are co-dependent (e.g. config file + dataset path + experiment name). + +```yaml +dataset: + type: group + choices: + case2000: + config: ./examples/config/model_case2000.yaml + data_path: /data/pf/ + exp_name: case2000 + case1000: + config: ./examples/config/model_case1000.yaml + data_path: /data/pf/ + exp_name: case1000 +``` + +Optuna tracks the choice as a single categorical (`dataset = "case2000"`), but the wrapped script receives: + +``` +--config ./examples/config/model_case2000.yaml --data-path /data/pf/ --exp-name case2000 +``` + +##### `gpu_num` — dynamic GPU count + +The special key `gpu_num` (as `categorical` or `int`) overrides `--gpu-count` for the **WLM resource request** of each individual trial. It is consumed by `iterate2` and never forwarded to the wrapped script. + +```yaml +gpu_num: + type: categorical + choices: [1, 2, 4] +``` ### Static arguments @@ -86,13 +194,41 @@ Arguments passed unchanged to every trial. Can be supplied inline or via file: If neither is provided, `iterate2` falls back to the `static` section of `--hpo-yaml`. +Static boolean values follow the same rule as HPO values: unquoted `true` produces a bare flag (`--flag`), unquoted `false` omits it. + +```yaml +static: + max_epochs: 50 + tf32: true # → --tf32 (store_true flag, always present) + debug: false # → (omitted) +``` + ### Metric extraction | Option | Default | Description | |---|---| -| `--metric` | `val/F1_Score` | Metric name to extract from the trial's stdout log | +| `--metrics` | `score_combined` | Comma-separated list of metric names to extract from the trial's stdout log | + +The **last** occurrence of the pattern `: ` or `= ` is used for each metric. If a metric is not found, it defaults to `0.0` with a warning. + +**Single metric (single-objective):** -The last occurrence of the pattern `: ` or `= ` is used. +```sh +--metrics val_loss +``` + +**Multiple metrics (multi-objective, Pareto front):** + +```sh +--metrics score_linear_acc,score_modality_leak,score_combined +``` + +All objectives are maximised. `iterate2` prints the Pareto-front trials at the end: + +``` +Trial 12: Values=[0.873, 0.041, 0.791] +Trial 17: Values=[0.901, 0.038, 0.812] +``` --- @@ -116,7 +252,7 @@ iterate2 --param-setter set ... | `--param-setter set` | `--set learning_rate 0.001 --set batch_size 32` | !!! note - In setter style, boolean parameters are passed explicitly as `--set flag true` / `--set flag false` rather than as bare flags (`--flag`), since there is no named flag to toggle. + In setter style, `flag` parameters are passed as `--set flag` (key only, no value) when `true`, and omitted when `false`. --- @@ -183,37 +319,81 @@ Not yet implemented. ## Example HPO YAML +Full example combining all parameter types: + ```yaml # hpo_space.yaml hpo: + # float – log-uniform over [1e-5, 1e-2] learning_rate: type: float low: 1e-5 high: 1e-2 log: true - weight_decay: - type: float - low: 1e-6 - high: 1e-3 - log: true + + # int – encoder depth + encoder_depth: + type: int + low: 2 + high: 8 + + # categorical – batch size + batch_size: + type: categorical + choices: [16, 32, 64] + + # categorical with null – compile mode (null omits --compile entirely) + compile: + type: categorical + choices: ["max-autotune", "default", null] + + # flag – store_true style (--bfloat16 present or absent) + bfloat16: + type: flag + + # flag – store_true style (--tf32 present or absent) + tf32: + type: flag + + # gpu_num – controls WLM resource request per trial (not forwarded to script) + gpu_num: + type: categorical + choices: [1, 2, 4] + + # group – bundles co-dependent args; Optuna picks one group by name + dataset: + type: group + choices: + case2000: + config: ./examples/config/model_case2000.yaml + data_path: /data/pf/ + exp_name: case2000 + case1000: + config: ./examples/config/model_case1000.yaml + data_path: /data/pf/ + exp_name: case1000 static: - max_epochs: 30 - config: configs/my_model.yaml + max_epochs: 50 + log_dir: logs + num_workers: 16 ``` Launch with: ```sh iterate2 \ - --script terratorch_iterate/main.py \ + --script gridfm_graphkit \ + --interpreter "" \ + --root-dir /path/to/project \ + --venv /path/to/venv \ --wlm lsf \ --lsf-gpu-config-string "num=1:mode=exclusive_process:mps=yes:gmodel=NVIDIAA100_SXM4_80GB" \ - --cpu-count 20 \ - --mem-gb 512 \ - --optuna-study-name geobench_hpo \ - --optuna-db-path sqlite:///geobench_hpo.db \ - --optuna-n-trials 40 \ + --cpu-count 16 \ + --mem-gb 64 \ + --optuna-study-name my_study \ + --optuna-db-path sqlite:///my_study.db \ + --optuna-n-trials 50 \ --hpo-yaml hpo_space.yaml \ - --metric "val/F1_Score" + --metrics val_loss,val_f1 ``` diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py index eef5069..516bd2f 100644 --- a/terratorch_iterate/iterate2.py +++ b/terratorch_iterate/iterate2.py @@ -85,11 +85,18 @@ def build_shell_command(interpreter, root_dir, script_path, venv, script_args, p arg_list = [f"{interpreter} {script_path}"] for key, value in script_args.items(): arg_name = key.replace("_", "-") + if value is None: + continue # None means "omit this flag" (e.g. compile: null disables --compile) if param_setter: - arg_list.append(f"--{param_setter} {key} {value}") + if isinstance(value, bool): + if value: arg_list.append(f"--{param_setter} {key}") + # False → omit entirely + else: + arg_list.append(f"--{param_setter} {key} {value}") else: if isinstance(value, bool): if value: arg_list.append(f"--{arg_name}") + # False → flag is simply absent else: arg_list.append(f"--{arg_name} {value}") parts.append(" ".join(arg_list)) @@ -139,6 +146,12 @@ def suggest_from_spec(trial, name, spec): if t == "float": return trial.suggest_float(name, float(spec["low"]), float(spec["high"]), log=spec.get("log", False)) if t == "int": return trial.suggest_int(name, int(spec["low"]), int(spec["high"]), log=spec.get("log", False)) if t == "categorical": return trial.suggest_categorical(name, spec["choices"]) + if t == "flag": + # store_true style: True → --name present, False → flag omitted entirely + return trial.suggest_categorical(name, [True, False]) + if t == "group": + # Suggests one of the named group keys; caller expands the key → dict of args + return trial.suggest_categorical(name, list(spec["choices"].keys())) raise ValueError(f"Unknown param type: {t}") def main(): @@ -152,13 +165,21 @@ def main(): def objective(trial): script_args = static_args.copy() for name, spec in hpo_space.items(): - script_args[name] = suggest_from_spec(trial, name, spec) + val = suggest_from_spec(trial, name, spec) + if spec["type"] == "group": + # Expand the chosen group's key→value pairs directly into script_args + script_args.update(spec["choices"][val]) + else: + script_args[name] = val + + # gpu_num in hpo/static overrides the CLI --gpu-count for this trial's launcher + gpu_count = int(script_args.pop("gpu_num", args.gpu_count)) out_file = f"trial_{trial.number}.out" err_file = f"trial_{trial.number}.err" shell_cmd = build_shell_command(args.interpreter, root_dir, script_path, args.venv, script_args, args.param_setter) - launcher_cmd = build_launcher_command(args.wlm, shell_cmd, trial.number, out_file, err_file, args.gpu_count, args.cpu_count, args.mem_gb, args.lsf_gpu_config_string) + launcher_cmd = build_launcher_command(args.wlm, shell_cmd, trial.number, out_file, err_file, gpu_count, args.cpu_count, args.mem_gb, args.lsf_gpu_config_string) print(f"Trial {trial.number}: Running...") subprocess.run(launcher_cmd, shell=True, check=True) From b4fc64a42250b84db13ae629822003120a1d6152 Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Wed, 8 Apr 2026 17:05:41 +0200 Subject: [PATCH 02/16] add debug statements Signed-off-by: Romeo Kienzler --- examples/iterate_study.db | Bin 0 -> 172032 bytes terratorch_iterate/iterate2.py | 158 ++++++++++++++++++++++++--------- 2 files changed, 117 insertions(+), 41 deletions(-) create mode 100644 examples/iterate_study.db diff --git a/examples/iterate_study.db b/examples/iterate_study.db new file mode 100644 index 0000000000000000000000000000000000000000..5d1050d429bc9c4cb8bbf47bf5b76bd9d6c499c9 GIT binary patch literal 172032 zcmeHw2V51$`t~kx`Z>EcK*Rzl3JQ37LFFuRP*Frd>;+MwOf=o(ruTMJG|4r^Xqp=Hy)(0`5f10xf4*Pl`;lu7F9Q;dAJT7&vQ~*8pp+zavawm{;&Uu;3Vh~u>2qVc3i;7Ti3~eP8N@% zbGj1K&gm?fDZmt93NQtj0!#s>08@Y|z!YE#Fa?+bOo6{Zfn=KlP08@Y|z!YE#Fa=I;1(NxA&ev4i&`{k}-!OY#)x0J3Rg0?|tK3$| z7cYLv!GG+JDZmt93NQtj0!#s>08@Y|z!YE#Fa?+bOo7u}f#i6fCEEC(y#GJUiHA?~ z46@5-3NQtj0!#s>08@Y|z!YE#Fa?+bOaZ0Sw!8|7!{~zSUgQuLr zB1{3M08@Y|z!YE#Fa?+bOaZ008@Y|z!YE#Fa_FGAR)nGwxIX_u<1YhV+t?@m;y`z zrT|lbDZmt93NQtj0!#s>08`*^tN?od&&L0M;{s+`V+t?@m;y`zrT|lbDZmt93NQtj z0!#s>04TuT|FheGDZmt93NQtj0!#s>08@Y|z!YE#Fa?+be}4to`~Sax8M7QR1(*U% z0j2;`fGNNfUGv(JKxV z`-_=knwTmki5*3|XcFUu--KU;L&CShSHkDQ$HIHUo5Cx?i^4O)lft9I1HumBF5xy| zlW>D@jc~bek+4oUS2$Z(DJ&P33U$I4#X#~r^q4m-YceC_zc@rmPo$6Jn99eW{@?2jqH6krN41(*U%0j2;`fGNNf zU<#c63M9l^_y!a#MZppj)T3ZA3KpTD4h0KQumA<~Q7{h$b5Sq{1+^%sLBVVk%tApm z3TC393I#JzFdYTcP%sq*Q&2D&1(Q%P5d{-aFdhY!C>V!=3KW#1U@Qv8pr8x|qft9_KiGmUoj6lI~6bwT_F$y9mC_=$d6cnOBLO~b>1t=JTg25;lgo1n&gisJfK>!7Q z6!=i!MS%weZWOps;6%Yd6y%{G7X>*e7=VKQD9A=Z77F^Ipf3tCQP2kky-|>Xf^-z5 zp`aHEdZM5Q3c8~p6$RZ;kb;7)C`d*@5(*Mg&;_Ivcof9N$6MI`|EK@6&vL*NUDZmt93NQtj0!#s>08@Y|z!YE# zFa?+br@aDb|9`89x5mpx`~Qmv#ea)mh#$iafUk=$kpriFCRqlU0!#s>08@Y|z!YE# zFa?+bOaZ0zeOqi0W;QQ?Fr&Ey!t~}W2-BKP5cZk_LC**T zJu=|^KWx@`1}7d74~YlF&&3a5=l{Lp9`RA}KG^ervv{3&xwrv#`(GtCiFINP?DJnK zmWmN^FzoQ3BlZ!yi(O!E{{-Qfa9B71yZV11ydmrr_P~Dr_X&3jn}zFOC;tt?YGIYo z1bg_`2-Ad0p%ixSA1ru<9H9^F+uud73ki;6uw(xL$LEd@9B;s0{d*jbI_`7a>DcVJ z&T+Y8gJZR0m7~c~=csW^b5uG?9TCT1hu4wg=;P?_=;E+D670w9hwTUKpW8pMzhU2N z-(!E&exLnL`)2!f_RH-X?5pjo>`nGMdyRdXz0zK4kJty>z4jb?AA5Iu7rWgaZ#!Z; zX#3LkvF$C}OSV0>M{GN6TWvSluCiTdJJ+_#)@WN`tF}$Djj;{04Y7G`18f<#6k7+I z$$H%SqxFFGGwXZSSFO)kpRn$<-euivz1DiEb*=Sm>k8{)Ypr#fb)0piwa^;04z%{Q z_ON!b+N^PwUoGET_FF!(ylL5M*=>2)vfXl<*E%Tw@<&se}4}J%n7_0*P!wfh@+xl`kZY!b`-F`7m}CsC)r&6n+SCWPUJq=BxZ5;wXGRab!M(op~xBB#y!d zh$HiU?95epA8{1kOB|W^U}uiXyNRRlF5<|%6FapkKae;IpGO>-&&5uS%I6SA;Rg^$ z=KEu3w#sJ{N8z)GBlG>RGfUi8wN!h@ELF--S2|-!TnLCJ`u`2gHaTM-5 z;>g^$*cqd8-w;RP4iHD?{*9e7mHV1F3ilOpWbRAsj8?h*#8J3?#F4o#uv4mXpA$#n zJ|m9IeTtn?D)%qqDBLH+k-3ktGg9R~B96j+NF15_06Qfr_danH?mgnj+`HHrp>ppK zN8#Qkj?BG)08la<36b;a(+<%)Nr0VwL+RaTM-l;>g@f*ommz zUg9X+i^P$+7qC;La?cY-;hrOo%sq>pp(^(baTM-p;>g?{>=dfpZsI81Q^b+EC$S@` z+%DoM+!Mr+xyP{+R=LNBqi~NBN9G>EPJzljOdN%Kh&VF$Aa;hR+ylf>xShn2x%;s* zSmo{`j>6qb9GTmJok1$Moj3}&jW{xQ4|eiZ?r!2J++Dbqj0wqN9ML- zC#Z6_5l7)}C63H(!A?NsZXu4sZ6=P)ZNiRU26*9GSZuJ1&*Gj5rE+DRE@( z66`ot?qcF7+(pEZxeKu~P~|Qlj>2sqj?AscPM*rGBaXtYC63IUkDXkVTSFX$JC8Ur zcP@5vRBknK6z&}2$Xp9{2B=&!aTM-s6I4X}0QoFz`>SUXtDHfsyb9ZFbtSQiO02A4 zo2AOcDl3SUmt)&cT}G_ZM6BG1ZC|y4SY;`(@)B$_)p}x;#l*^suKlvt&ZSXsh0Q4JHT6c8&9!M2M! zm{?^Hv2s4Poz)PrN|0DNfNdw$Ppsl2R`z1sQS}h3xQUfr*mh8z#3}=cmGiI_)m&ng z9Af1G*a~WYVwG%SbqHRSd++3C0AA$gA<#C~*deS!DG6e~tNa4&M8J z3UB=Pi1&+I#H+;fMMYd7PKA;F5HU~eC3X;b;RoR}7~4NB>=bSlt`^n`s<047^P`0V zVW5yEbQIzohhY5vrsEmM1CHAq*ErTWR>FvVyknRn;K*{MIIQ-gFg}0DzSDlA{e1gU z`xJW-jKC9Z$8Gyg)oYqX)Uq3t*O=o z%Qu#{EKk5l`4Wq2nQbYx_$)mwCiC|&8h*-r2V{i(F$I_cOaZ0-y$3dh!gCTo*ob?C)Lk*6PXAN%F$* z`QJ4@)At%lqnt*P7laplw)MzE&v9DvRFXVDJi#^cE9GK6c?wCM7w#R}!*4lp-IGc3 z+^{r!=cQ+D*3X_qlIMh-Q|zy8GV93`Npfwt@ru6n*-P}~2_(5DJnyH~_uQ1OCyyt| zv%{-x$y*1+>&cZQc~*FO*fh>jpeK(b$<^W2AN<<%`bIsuf+WujFS+!*wR7G-TgzoR zNvuZeL4N?u_Sp$c#`LftKJOi$zw?J^zgc=n}=P~Ne3w-$ z_GpqkHEcdKd4K2xb}31o5?)a_Y2)&bbdXUbd2+a+{I#6dj_b)IN%Ev{%S-Vy-cH%2 zVV98PiQ)4v`Tl}U3nVRh1WBF{KKJH=$B#XsgA6ChpLC5ubKRKD~C&`f^hi@MasXB;{ zBo`H=@7nQ)WG_h`I^=^FH=ay(cu=xArI1{K!%d=+7Il$mSc^JIv_OjvB+(&SG>=3F zlW1Ztikeae75td{&x?ELm*>bKXXR`EXaI?Zv}k`44QkPB5)Ek4EE4r=(S9WA)1rMz z)T>1^Nz|i7`;e$xi}of_mln++QKuG7C((ggG>t^_v}i99&DEkkNi;``_8`##TC_Wf z_Sd4RB$};7yOC&?7EK}1ep<9EiT2f^$t0SoMUzOhj}}cN(cW6L3yEfE(at2Au0=bM zXqp!7NTR*8Xa^GQsYOK+?V&{l678-<9VD8nMeQWoO^ez{G)0SANwljLh1a9#?n~C9 zW)e-(q9zhe)S^Zb?V?2uB-&YvCXi?+EgDav9kpm2iFVMUJc)`VioXA^aTM$C|GyAF z65keI5uX>I5+8vT0C&M`z>VV7;w9ob@f=tKuuNPm&J|~hlf?>g6s!UmBKpOFVwRXL zb`v|pIsk)kT=-e|UieD*RCpg|3tkeQ5uOkp6t)Yu3!7jqz!k!U!W!XhK@l2Y_Mk?X zE=&-{2qT0-Az$#odVox!r;sdk5G*j8a1>SqeB;>X_}KA|<5kBCj@_^(;C{#5j$0i! z!R*4Nj`grA;0(udN4;a7quMdWF%H%R6r;zR{V@fY0!#s>08@Y|z!YE#Fa?+bOo6{l z0b>Gh<{N6TSvnh=C9|-puf}HaOl%faVN*8)n}yS{SuhQo`BSl(HwBxyld+jI37gu9 z*wjqGX7+e&W>sQSJr0|h71&gjV>4qcHq*yoGp!7psiU!(Qi{#wQP@lxiOs|kY$lAr zX8dq$Du-b+t{9t&2sY(K*o+;D&6q-L$|P(?hp{Owz-H7CY(@^oreqK{Bl58s9>Qi= z5S!uvHW5EIMLukXda)_=U?aJ)3A?ZZuo;w#O@0nGp#j(g`(qQx#>St8 zjjtaz-oDs)GO=;@!N%1a8)pVK1Jkj|OT#9&7dAOPu^G?oEHj4s%ucg7~I6E?j%V$-t&Ha$dax(nE(Im9vEh5|h|1YZ*7%)?SDZmt93NQtj0!#s> z08@Y|z!YE#Fa`eCD8Tmr|6fxS>_V6VOaZ0JlH(+_X?T@|LraTK{7b6^ z9|cw*sw^H}T#;5brZTN;d}(P;n$@y+*`iss4f-LJqEnrkR^3#aHleVhXhdN}RxtY% zh#EN7G=EVoWUsoRDJ=qdsw^2@jId4|nq9wW30!UMDM!avl#DK{n4C7Mcyd~newEqi z-whj6QCu>-3>`U{^|XrOVZ{~2Wkto~($GcDuWkH;FxKprk!Dl!ph5iV0azN6k4B}j zsdiCSbrayvTGmwC*iN$L6!F?QDJ@Hvr!{PQ@)Dqv+M~>6EUZ;BPKfDqxp9`4SJy48 zty<7nzc{V3cv2-g@f3ZcUlQ=*e?7jeqY4gk3(y5Fuhna&Q;tS?SWeKj|CF4-{PRyr z*kDW^mm9lWPF%@}A~~62ght4W8lvWD@uuX=Oul6$E|ulA4UJG5Rr43mssE#&a7tkX{YLHq8g`e{;Y=T2Ia&dy~cO@{8*qzB5lS#~ZIPJ#4t!IOVVEiZEQJ08@Y|z!dmz6==@x zYDyk3fN!zj#-qBfcG0Z)v-LKyRW$XK#-!E3KQ_oP&^pm@cv`g^MPtgwRaO+j_$;jz zLe-Lmv(rv#R`jM7@StTGl449QAJAUsqcxbTRYLcco=LbPcn)bN$rx;2Eezvk)mAq( zwvMNqlG)ZrQEpqMwbuh7r?u14p5Q+*SZu$MZOeB`=UeA#)4Mxy;E#h!oCsse69=u? z;pAAIJ96TQ>xIr;>y&PA)8Wotd7?47Hr!r8X^e}TY8w{S*35?ibrl{DoCp{*5d;0yqX;7RMsx}(>G0^voU#8X6&*)wRlOaOG|MlJOWrf`VS>a zj*&|ve&;CVMvj~;Djfe)rE_9P6t##j?#WUiH2=eb(624(qneX;;sXAB0d)M&1+b*L zp?cAY=ZnNTwd8al1(5oO0wBjo0T91qlmZ|}Pyt}?zm*=1y2w9hGqgB)sFmn7^HI$V zvsq|(*#3->ejV+|e3JD~>pWVFHAzGii!w>n&VddsB?6hi%l?D*c>q5nyF{kaLe=-1tT6%>5}kD$P)@;@|OC5~9LMFgeQw2 z08@Y|z!YE#oW2UM@BdHVvSU}y6krN41(*U%0j2;`fGNNfU08`*^q=1!=>#^LE=gxEHRipR+Z2bQ> zDqNNorT|lbDZmt93NQtj0!#s>08@Y|z!YE#XbRZkdsy`GKYIV4!A<1E^3}9pmn9yV7bVWV%}uVH9cS|G`?sYZ}`|SKjBcq840HNOX5@GZiySn zKg1VtF9FEOzorZmOe>h0$Ju3RMAI*gWzSr^G6QWRl98X5F|4k>x+(I9okB8TPuP>& z;KC^@Gte$K;4EHNR|f~{>Q_L-mFEP1-u$`qP{irP(YpG%5Nm2!R(r-7P3an|O${Hj z#rOMaQ5PDlX*#PS`QPqx@EgZ88mzr^R$1z_>Z0?nYI=|cYfp{Us!9V_UG&m-cYH&G zwTI5CNW)hym3oJspuyT*XO*Q%GtQ~Z33Q{unyRr{R4F4+_2?yU6wqMprn4%Nt^St& zZ{D9mgEd8Gm8JAqa@8jtdeUI+s&n#qUpwaGcu;o zVC|%_8da&ob!Y52n)@RS){Z)>BH4X!Cco72Od6~mbXHk%nx{R^jiGWki5jaxm3n=n z9{hl#l2<`zRiwn1&wTuo5jWCM*rBt^Qe^1Q&Ei2SwZC0sO@Q9oo4xHFN*ih9)uyv5 zk~ia#?fuSbron2}S!Kz_|6_ZPxJPKPS~S*pRq{0q9`f7Bf6-ty>#T~D`qSYO zp}%?inO{?2ZHzZ@7BrGnr9ig%i4}2MX^_Tgq>41iIQ!eQL$A>wh% zj}00BS90Pyv5RnnkmcCn7;JyWKGyc0t;YJTRkp@iHdvC(H<|mJ?lTn_pEFh%J}}Hl z_&z~RNQl2MzH8jZxEy{bFL5t$mHP0%DWcsd$%-`K(bq3I=|7eGg*vMe{^{tpb3B=s(O{KyR#_T;MX!Pz-lNiBhc(tj zRmz<3+{huxR0fF!I;#@?<=vEi&3zxEq3{r$RhIJJzd^mgnM#9ou*TX&4Il44+1>kf zDw8^cbXG;OF8gffyW6_b$ZNjNDofpe+P?IPK2)C9kjC0sl??OFOSV0`l15&GIx9>$ z9JBs&LkW!*r757Z%94G|ZBv$+cGJkKUt{eA3SV;bx8_g*4OXAd3WEu^JvcFIIt^B@ z&MHd@eUcm#vtOdY>d{y`!jyw!VAW%_%V@B=byh_(zquy!&a*zF!RpdkWvO)PiYta) zFpCDOQ)BI*N~P`@SE`Gsv^fKHRz(`Qz&`VHr5}yF=IN|3_3KScUl#PxV9nK7MO7M< zGdA2@@eU2v9Gz8>itC?UoAK-%8mt3!R$0nv$t-{Rlj~`)_SaYiRZ1v(C98qkOM^9A zXN7zCqlaF(=Yywdux9D3&>ueEIrhP{w`j2T(^wszfac zcy0j=)=ZsMmb?c$=7meByn^eavD#IsTsmfnyWiHX|PVzSbM0_xQeA$H942kV4a||DpFCWMSDAZzK{m%c%2nSY`1=uuzZ=H z25Y6p+Fg~r??n#ro(dYQ<8)R~cz5{)o2sbHIalbc@Cx(uO*8N5_&SZemTRo3s#Fl^ za=*=c4Gq?@I;$cL9W;H$%(YZroQ~01Whv>)_vABgzMDo~%QV(*sx#T}2w&IP1)$%JeSW9(Q7_m*6IlS|^`7~HZX{;$wOAJw3dhDaJq;I6osz?*d_7AVf zNT-q45}j2JA99%vEeY^6SVw5AT~(=M$L^a_OPAAN9j>z~l6(JU`?kG!9}U)FI;$*Y zb=uXv^a?8P!-_T5WK}Y?^w{&x+MK zRB7V#xw|$U?oZ+FY-)zpKlnS697XaamN)06a#Uzx{STpqH*DQMe0ocHA1bu40tnL% zfUhg+mB@o{&?#F^r-kKYw=8}$bKIjea(kA}Dod83pQeZ3y?_SmnHp<u_M|w|D%wp9bqPjkTXD<$Ut_r*{`q`Q)HUXH}&9y~}@1I<$jEUK@2*S@K+e z&AUy59;3n9pt1HsvQ)fv?o#`i-D$Aa zYpj{7G`2Xh%R2EA8mx?<4SO_+S4S z`8=Ifk;XLqTm8B4BO0u8byitQeBsK?MGhJ*9U1>`<;1mOC*fM5kK-#M$-V}ea0b%XAEN!-btvA|2n=g?pWNpaRPr8pT=$di--SB z&H8GaAk2>}EPlDNPY#VzY1Wt91QjXSaqBbJ-}^lkT7A7uP?k*VQVX`fNoTyRFSrS) z;h%T@dNgC-MO1RDueb>)QbEV{ExEI9qC%@Lxe3VOpU18#*ev#=LYtvI>3&t>HeQi8 z^%5F0eND~jIxQ?#o^F23Z1+A zZ$5|0+pT7OpA3&I^}V6AW5PLospMAQD8sEvU7lGqd2Xdbg;w7w!>xpW-}bTe%D}s* z(CS-dxMgW{(w3u3Z|qKmR^KbbrG^iMj*qRar1CniS>G(frAV&bPdC;bO{S7teYXsk z96s8Vk-4nTW-7G$b{S6CR)O2pvWL$19L@TE8BX|ST6y=Jp*yH7)ovEF7RV_}d}w63 zy!HSUk@X!j2C9;E%dofSJhXxet-fW(Kt&q2SSmgK{sUBK^*u8N!Y6%i&N!ob-nCR{ z^-VMKREhuXz}nC*Ds%eH`mPyy(1o+bZOp%k&WO*f-N|`!_=gwY9DL^uG+Ln3X#B6Q zdnr=Gzy7*4?wuf=5wX7TrAU#C#>u4{C(ya~^_4G0u%6YhE#q}pTs^Q;W z%IGqpxrWMCAa!+ z6%u^w`P>apta~_<3a!3fg#??&%(>K6b{UnI?#=ps6=7BC+jHY1xnI#45f9cLps*s9 z`xZ7X@X;9~>N{41RVP6s8X>0sBB82Guxr>Sy2EhDkeU(>h&kT zqLN#E(~1IFiu-)~zUe#0QlZs%tr((8kpoArYwCL$6Kj)KhRu?;zp`r3?738E^_?pQE8$;{)TUSM+eL*|-@0P3ECqLm zGk?ySK!sM{yJ8T$!nwBNrn9~YQlZs1uNb6AaX#_JZ|kQ~q1AV<7z8Ud)27UCncss7 zZJyQw<*Smh>DdRHAmYD^jl~pIWhLH-y#l&!vnMM#l4{Qk?3T??oz)@;t!ZmAILO!d!83txCPfkJN4_&>yL z=ENt&GsVHe5#d2$x!`sD;MnF^?8vcyZNJSv$DRhe0bXt^vze_=Syx&8mhUaMTWTyl z&F`2mHJ6%=rd=l0LFBJ57qh5>O@ACyc5!CD> z6kkjfcQE7+M7tJW&>IW}Bd8TeDBf5oUSGfyh>qg&yCFxYQAQ}9SSX&5(+{Ud(cKFK zUVj9&u?WQ-6UFKG`Jh@b`yp$v?L;`9eSA5+9!mP*Di`HlotvGoc^fS;`I4l zPIm+~HwYy+CW<={?cPQBL02&7jiA;8q2$Cu3Hm&4f7^Q(p}5>$w>N?s1B5al7D^!K z4Me%?00s2&c>@ttw#B zK{YQz$%=*I^LspjsJi&w0jE0{LFK)?$T547q}K5T%q3`LCcL0O5d0$ zPLB&pqxHQDC|yxV^4m zD1z4ZAe7#*P+S2hqPDdZ{D5K50&C;@NKAFYf4#RY}ril9X(2&H>0 z6o0_!@wKhJ;aUP7U&!N$pfx23B{dcboEdOMx$A;?e!tTh@D)d_09wG18=jabC{8at8~zAd znt)I`#YAyAp=Zz*1@NI}fL0ByG(adFW1&FP=yFF(y^jaUG2(eK7UY9pY-`f?)3&0KF@=E9} zy(O`*&@rsAq_h>AH{avW^92JzSL=HPAMtzO<_}5eg*!s9M@4Xl@|@7IMs4UR{W)(w z9C0~)Zogkb@6Hi|EjofL&kyZPoBlQ+z!4Zi_}mhD4UQ13Q4w68yg&$s8m;eDZ$2Ce zdIC@?5_+SJ5G+v;-1%PU*SJD(d$h_0ITG^0|0MKs8X=gYBY2>f2L0OBN&*5L2?RVY zkA&VcBLq_v1XsS#pXUzwp;O=L8t@U17iuV%WYQ3fQ4t^)P{9Y_?mV?DTqu8DPXHc$ z34J?_uE7u$!41z7AV9~rH3B*kf+0jmLZ3k+goNk_@Zl`&vEZPIAN#)s{C5Uksbu0dVI#eR*zZR9oLqPJG1Ug4@B1#{WTX z8z(+2Hi>z{m%>J2nvmpp#j(y2u^+cTXm7CR*!J0OvQ4pdvA%3Q-#XNC%(Bz6#M0mV zx%me3By%UzUekFd$#?|54OncoCD)d>ahzr^o|uZ!;+_etD8;ws@w zf#>)ZelYhF5T5&A5C2a&F|rhC4)xKJcIzXz59$J^FVwb%g(J{s@VT6kB^pY7`zT&$ zGvI!2eFwo@1K@k0Ee%ELHI&6MQQ!t>Q!fDu;6oQ9H9*9k+oXu5#3(9Mj@*HGrgLh(Xx#NW0D2RZUVuK{}e^E8yX zu~0lg=mSJ8Bj`+dp`_<(D05<>xS*bI-G*RYJOLO_1tN1al-gJ*P@IA2>Oyl0Fe zhEfv?#T9@owe4ktE^fI0p*LBhq0Ekn0>d(Y>-Gfa$c3hZf|1!8%B)x@PH0l1J=0#O z>0x5d6Pcx(Vl5QarvOti&SeUGh?B^WJ<72>jXIhd{59H@I+>6C{;00 zTz+q~GJ@VhzypJ%NR@^%BNj@)2d!!42q^EL1^ zPS;ST#YBNZiqcmHU7(nQ&?lXyp-hd1;)80@8Rf3?pk5-3s3KD}lqs=Lyxt)6?Az81 zfZ~Fu-5r^tp-hg2;_(Lq(NSR37lOKOvW7A#77A)}{84HtPabsleSu(Pl7=!d7K#go z+0pJ@4?K-%vMn-ELzxg01u`G4mhwQq2yTL4WP*k=J{F485+ybr$T9b-UhQ&m2c|veGZE^(oL3mjbjG!f^2&Fg{N)Vcn zDDzk_77xP95>OYd7)2U3BkZU%C*4Uh70C{(E?C}QWOgX-V#8y8w~~Kl%dj$ zp!J*xWoS$kn3{|BrU=b0cu_A1Ezd+Kg|SdzlEVpa5TfMB?}GNh9YL!t5sDNG1)gVk z9HXJYGYz$A1TCIKDB)NrP!qxP9}UF|bKfxYkJdsWl!90&Ad4F&_@d;Q15wx@s zp$v(I;(`e{n1hOf0yCEAjVoHIh)@Q{L~+4er6@I~D-R}kJ8|6WrbPbIf#5R7R-u6OKN6YN7Jw`d@bXh(s_ zQyv)8X||gREgJb_S`gWN;oUzDrt-TH&1k9s)52zo!IwP?Uwn~DZqeKVq8+J*e|+kb zv4P3IQ=vtZ5tvp9AG`R#+vnD&P@zS$7nl}APCPi}zH8|0Qi7&C5N!#F{6mG9e9KNM zxkd9Mm=@~fZizQO|Aa_|7EP#NS`ayT#B@j3e^a4FGcbsD1c+??Zp7*wI(IUfvca^l zYWl**etq^{I^PkaIUY<4b@GMWot`i0tPw|(LWp)a{95`!{~fEIqVoWuStLxWNQQgr z&$#Z#O;m0vH0^|GVb%2IolBfouBJkZ=BW_vFu0TV?fB$hzsXc+(Zm*}1(9N)ELDzs=t5vEnbhcC##@zs)_sL-NiNSGFO=-K*Y zMpjN86RA@7GS|$8L#r~OJ5AR5YwvSFLhktoK@tVh%Qh6WI(p#gghC6w!?N#g7 zRQ9ZC$O4+7qv@>BQtLLvD+n=9Ig|?SY z3q#YH_pUqtsUcKod+M|>Cp){08{4Cb3T+RKwhDe^;ll?E|9<*yDzx2oT9}jdJ#c%j zN@clFOR7#QhmYJB-hS%=Dx0IWbkk^O!0y8L4n1`7d%LKJjJ5_qoxT~+@NIguarycs zRA|v&0+?0~AMA8bfv~cG3N6}v0MSl|I(d7cV@%mfDzs>~0!#}cr!QVtx0lYl6tuko zriJQaTIBINx4l6nw`e~EL^};cw$2{au%F7S=@zts0;UC#J6G;~_?z3P&}l*B`;+#st-g*5tzD;uiSLe=Jl{Bh#w5#WH2&8Yq13~^ zrsLM_e4+cNREkAkgi;Ti$9gg|M?HE26^)M~$&^zw|>zXI2Ea+`PpY|czIyL;$f)(~x4%|Z} zx9D3yOba4!TD)mM4wZMrE$HJxObd7Nl$M)9@77bvE&AFJ(Jq7*`i+!Z&&#CpgI+D@ z^F&MwD_I8ATh&EWeki>Keb0z#LF5O1I9C1PXeuHXXtWF9{r}+$zNq+-PNxrjF^O`! zKneeR!M-~mJkWtkZqX-}m=?N0jyLO)x?Yq0QH6(RAUp z3le)gO@%h3)54srV^4>j2k0zn3~IFV;D>Kt-CgE-m&yyhmVizRy^F>H?|*npH5HNl zIxW0iSo;2&-~cK=eB0vFXy-ydx%$S9o-B-SXLveHYPrfSful{3c7Mrnky=C{#oq zsL|F!!}si_FV2l5Q=!e%Y2i-(?MPMK1+P(|&DCk4chNZ^-G9%Q6ll@--(q`}6W5BJ zglmO9j=LOw`xEvNki`C&0!#s>08@Y|z!YE#Fa?+bOo9LH3N*Fodpk71nyeAGJ<+Wr zm5;Ss^t~M#;Fl~{xvw0XbvuROOH+%!w?hN0Wck;x7mob+I2BrbZ-=EYG+lap__YJ+ zEV9t|c32A2g=3SNpa1$JD!JA7c33Kh58Z6vIcqtcHPHIr4olSV?^QzG+paNGa;xv{ zummPqCf;rMdU_2NT77SaCGehXLrp=qS7Hos|KDD*vFl?BFa?+bOaZ0< yQ-CSJ6krN41(*U%0j9wJj|xQ4h8t-A|E2Bk{|~=+2fzLiZU28b0zUxY^8Ftl!#BYI literal 0 HcmV?d00001 diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py index 516bd2f..97381fc 100644 --- a/terratorch_iterate/iterate2.py +++ b/terratorch_iterate/iterate2.py @@ -2,6 +2,7 @@ import argparse import json +import logging import os import subprocess import re @@ -11,6 +12,8 @@ import optuna import yaml +logger = logging.getLogger("iterate2") + # ============================================================ # CLI # ============================================================ @@ -58,6 +61,16 @@ def parse_args(): help="Comma-separated metric names to extract (e.g. score_linear_acc,score_modality_leak,score_combined)", ) + # ------------------------ + # Logging + # ------------------------ + parser.add_argument( + "--log-level", + default="INFO", + choices=["DEBUG", "INFO", "WARNING", "ERROR"], + help="Logging verbosity (default: INFO)", + ) + return parser.parse_args() @@ -67,58 +80,80 @@ def parse_args(): def resolve_paths(script: str, root_dir: Optional[str]): if root_dir is None: root_dir = '.' - return script, Path(root_dir).resolve() + resolved = Path(root_dir).resolve() + logger.debug("Resolved root_dir '%s' → '%s'", root_dir, resolved) + return script, resolved def build_launcher_command(wlm, cmd, trial_id, out_file, err_file, gpu_count, cpu_count, mem_gb, lsf_gpu_config_string): + logger.debug("Building launcher command: wlm=%s gpu_count=%d cpu_count=%d mem_gb=%d", wlm, gpu_count, cpu_count, mem_gb) if wlm == "lsf": gpu_fragment = f"-gpu \"{lsf_gpu_config_string}\"" if lsf_gpu_config_string else (f"-gpu num={gpu_count}" if gpu_count > 0 else "") - return f"bsub {gpu_fragment} -K -o {out_file} -e {err_file} -R \"rusage[ngpus={gpu_count}, cpu={cpu_count}, mem={mem_gb}GB]\" -J hpo_trial_{trial_id} \"{cmd}\"" - if wlm == "slurm": - return f"srun --gres=gpu:{gpu_count} --cpus-per-task={cpu_count} --mem={mem_gb}G --job-name=hpo_trial_{trial_id} --output={out_file} --error={err_file} bash -c \"{cmd}\"" - if wlm == "none": - return f'bash -c "{cmd} > {out_file} 2> {err_file}"' - raise ValueError(f"Unknown WLM: {wlm}") + launcher = f"bsub {gpu_fragment} -K -o {out_file} -e {err_file} -R \"rusage[ngpus={gpu_count}, cpu={cpu_count}, mem={mem_gb}GB]\" -J hpo_trial_{trial_id} \"{cmd}\"" + elif wlm == "slurm": + launcher = f"srun --gres=gpu:{gpu_count} --cpus-per-task={cpu_count} --mem={mem_gb}G --job-name=hpo_trial_{trial_id} --output={out_file} --error={err_file} bash -c \"{cmd}\"" + elif wlm == "none": + launcher = f'bash -c "{cmd} > {out_file} 2> {err_file}"' + else: + raise ValueError(f"Unknown WLM: {wlm}") + logger.debug("Launcher command: %s", launcher) + return launcher def build_shell_command(interpreter, root_dir, script_path, venv, script_args, param_setter): parts = [f"cd {root_dir}"] - if venv: parts.append(f"source {venv}/bin/activate") + if venv: + parts.append(f"source {venv}/bin/activate") + logger.debug("Activating venv: %s", venv) arg_list = [f"{interpreter} {script_path}"] for key, value in script_args.items(): arg_name = key.replace("_", "-") if value is None: - continue # None means "omit this flag" (e.g. compile: null disables --compile) + logger.debug("Skipping arg '%s': value is None (flag omitted)", key) + continue if param_setter: if isinstance(value, bool): - if value: arg_list.append(f"--{param_setter} {key}") - # False → omit entirely + if value: + arg_list.append(f"--{param_setter} {key}") + logger.debug("Setter flag: --%s %s (store_true)", param_setter, key) + else: + logger.debug("Skipping flag '%s': False → omitted", key) else: arg_list.append(f"--{param_setter} {key} {value}") + logger.debug("Setter arg: --%s %s %s", param_setter, key, value) else: if isinstance(value, bool): - if value: arg_list.append(f"--{arg_name}") - # False → flag is simply absent + if value: + arg_list.append(f"--{arg_name}") + logger.debug("Flag present: --%s", arg_name) + else: + logger.debug("Skipping flag '--%s': False → omitted", arg_name) else: arg_list.append(f"--{arg_name} {value}") - parts.append(" ".join(arg_list)) - return " && ".join(parts) + logger.debug("Arg: --%s %s", arg_name, value) + cmd = " && ".join(parts + [" ".join(arg_list)]) + logger.debug("Shell command: %s", cmd) + return cmd # ============================================================ # MULTI-METRIC EXTRACTION # ============================================================ def extract_metrics_from_log(path: str, metric_names: List[str]) -> List[float]: + logger.debug("Extracting metrics %s from '%s'", metric_names, path) results = [] with open(path, "r", encoding="utf-8", errors="ignore") as f: text = f.read() - + logger.debug("Log file '%s': %d characters read", path, len(text)) + for metric in metric_names: pattern = re.compile(rf"{re.escape(metric)}\s*[:=]\s*([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)") matches = pattern.findall(text) if not matches: - print(f"Warning: Metric '{metric}' not found in {path}. Defaulting to 0.0") + logger.warning("Metric '%s' not found in '%s' — defaulting to 0.0", metric, path) results.append(0.0) else: - results.append(float(matches[-1])) + value = float(matches[-1]) + logger.debug("Metric '%s': found %d match(es), using last value %s", metric, len(matches), value) + results.append(value) return results # ============================================================ @@ -127,39 +162,71 @@ def extract_metrics_from_log(path: str, metric_names: List[str]) -> List[float]: def load_hpo_space(args): data = {} - if args.hpo_json: data = json.loads(args.hpo_json) + if args.hpo_json: + logger.debug("Loading HPO space from JSON string") + data = json.loads(args.hpo_json) elif args.hpo_yaml: + logger.debug("Loading HPO space from YAML file: %s", args.hpo_yaml) with open(args.hpo_yaml, "r") as f: data = yaml.safe_load(f) - return data.get("hpo", {}) + space = data.get("hpo", {}) + logger.info("HPO space loaded: %d parameter(s): %s", len(space), list(space.keys())) + return space def load_static_args(args): data = {} - if args.static_args_json: data = json.loads(args.static_args_json) + if args.static_args_json: + logger.debug("Loading static args from JSON string") + data = json.loads(args.static_args_json) elif args.static_args_yaml: + logger.debug("Loading static args from YAML file: %s", args.static_args_yaml) with open(args.static_args_yaml, "r") as f: data = yaml.safe_load(f) elif args.hpo_yaml: + logger.debug("Loading static args from HPO YAML file: %s", args.hpo_yaml) with open(args.hpo_yaml, "r") as f: data = yaml.safe_load(f) - return data.get("static", data if data else {}) + static = data.get("static", data if data else {}) + logger.info("Static args loaded: %d key(s): %s", len(static), list(static.keys())) + return static def suggest_from_spec(trial, name, spec): t = spec["type"] - if t == "float": return trial.suggest_float(name, float(spec["low"]), float(spec["high"]), log=spec.get("log", False)) - if t == "int": return trial.suggest_int(name, int(spec["low"]), int(spec["high"]), log=spec.get("log", False)) - if t == "categorical": return trial.suggest_categorical(name, spec["choices"]) - if t == "flag": - # store_true style: True → --name present, False → flag omitted entirely - return trial.suggest_categorical(name, [True, False]) - if t == "group": - # Suggests one of the named group keys; caller expands the key → dict of args - return trial.suggest_categorical(name, list(spec["choices"].keys())) - raise ValueError(f"Unknown param type: {t}") + if t == "float": + val = trial.suggest_float(name, float(spec["low"]), float(spec["high"]), log=spec.get("log", False)) + elif t == "int": + val = trial.suggest_int(name, int(spec["low"]), int(spec["high"]), log=spec.get("log", False)) + elif t == "categorical": + val = trial.suggest_categorical(name, spec["choices"]) + elif t == "flag": + val = trial.suggest_categorical(name, [True, False]) + elif t == "group": + val = trial.suggest_categorical(name, list(spec["choices"].keys())) + else: + raise ValueError(f"Unknown param type: {t}") + logger.debug("Suggested '%s' (%s) = %r", name, t, val) + return val def main(): args = parse_args() + + logging.basicConfig( + level=getattr(logging, args.log_level), + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + # Suppress noisy optuna INFO logs unless user asked for DEBUG + logging.getLogger("optuna").setLevel( + logging.WARNING if args.log_level == "INFO" else getattr(logging, args.log_level) + ) + + logger.info("iterate2 starting") + logger.info("Log level: %s", args.log_level) + logger.info("WLM: %s | interpreter: %s | script: %s", args.wlm, args.interpreter, args.script) + logger.info("Optuna study: '%s' | db: %s | n_trials: %d", args.optuna_study_name, args.optuna_db_path, args.optuna_n_trials) + hpo_space = load_hpo_space(args) static_args = load_static_args(args) metric_list = [m.strip() for m in args.metrics.split(",")] - + logger.info("Optimising metrics: %s", metric_list) + script_path, root_dir = resolve_paths(args.script, args.root_dir) def objective(trial): @@ -174,38 +241,47 @@ def objective(trial): # gpu_num in hpo/static overrides the CLI --gpu-count for this trial's launcher gpu_count = int(script_args.pop("gpu_num", args.gpu_count)) + logger.debug("Trial %d: effective gpu_count=%d", trial.number, gpu_count) + logger.info("Trial %d: sampled parameters: %s", trial.number, script_args) out_file = f"trial_{trial.number}.out" err_file = f"trial_{trial.number}.err" + logger.debug("Trial %d: stdout → %s | stderr → %s", trial.number, out_file, err_file) shell_cmd = build_shell_command(args.interpreter, root_dir, script_path, args.venv, script_args, args.param_setter) launcher_cmd = build_launcher_command(args.wlm, shell_cmd, trial.number, out_file, err_file, gpu_count, args.cpu_count, args.mem_gb, args.lsf_gpu_config_string) - print(f"Trial {trial.number}: Running...") + logger.info("Trial %d: submitting → %s", trial.number, launcher_cmd) subprocess.run(launcher_cmd, shell=True, check=True) + logger.info("Trial %d: job finished", trial.number) values = extract_metrics_from_log(out_file, metric_list) - print(f"Trial {trial.number} results: {dict(zip(metric_list, values))}") - + logger.info("Trial %d: results %s", trial.number, dict(zip(metric_list, values))) + return tuple(values) # Multi-objective direction directions = ["maximize"] * len(metric_list) + logger.info("Creating Optuna study (directions: %s)", directions) + + storage = f"sqlite:///{args.optuna_db_path}" if "sqlite" not in args.optuna_db_path else args.optuna_db_path + logger.debug("Optuna storage: %s", storage) study = optuna.create_study( study_name=args.optuna_study_name, - storage=f"sqlite:///{args.optuna_db_path}" if "sqlite" not in args.optuna_db_path else args.optuna_db_path, + storage=storage, directions=directions, load_if_exists=True, ) + logger.info("Study '%s' ready (existing trials: %d)", args.optuna_study_name, len(study.trials)) study.optimize(objective, n_trials=args.optuna_n_trials) - print("\n" + "="*60) - print("OPTIMIZATION COMPLETE") - print(f"Pareto Front Trials: {len(study.best_trials)}") + logger.info("=" * 60) + logger.info("OPTIMIZATION COMPLETE") + logger.info("Pareto Front Trials: %d", len(study.best_trials)) for t in study.best_trials: - print(f"Trial {t.number}: Values={t.values}") + logger.info(" Trial %d: Values=%s Params=%s", t.number, t.values, t.params) if __name__ == "__main__": main() \ No newline at end of file From f995680eb1f8253cf2e578b40cd5c957712482ca Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Wed, 8 Apr 2026 17:30:54 +0200 Subject: [PATCH 03/16] add support for underscore parameters Signed-off-by: Romeo Kienzler --- terratorch_iterate/iterate2.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py index 97381fc..c621fe0 100644 --- a/terratorch_iterate/iterate2.py +++ b/terratorch_iterate/iterate2.py @@ -36,6 +36,13 @@ def parse_args(): parser.add_argument("--cpu-count", type=int, default=4) parser.add_argument("--mem-gb", type=int, default=128) parser.add_argument("--lsf-gpu-config-string", type=str, default=None) + parser.add_argument( + "--no-underscore-to-hyphen", + dest="underscore_to_hyphen", + action="store_false", + default=True, + help="Do not convert underscores to hyphens in arg names (default: convert)", + ) # ------------------------ # Optuna config @@ -98,14 +105,14 @@ def build_launcher_command(wlm, cmd, trial_id, out_file, err_file, gpu_count, cp logger.debug("Launcher command: %s", launcher) return launcher -def build_shell_command(interpreter, root_dir, script_path, venv, script_args, param_setter): +def build_shell_command(interpreter, root_dir, script_path, venv, script_args, param_setter, underscore_to_hyphen=True): parts = [f"cd {root_dir}"] if venv: parts.append(f"source {venv}/bin/activate") logger.debug("Activating venv: %s", venv) arg_list = [f"{interpreter} {script_path}"] for key, value in script_args.items(): - arg_name = key.replace("_", "-") + arg_name = key.replace("_", "-") if underscore_to_hyphen else key if value is None: logger.debug("Skipping arg '%s': value is None (flag omitted)", key) continue @@ -248,7 +255,7 @@ def objective(trial): err_file = f"trial_{trial.number}.err" logger.debug("Trial %d: stdout → %s | stderr → %s", trial.number, out_file, err_file) - shell_cmd = build_shell_command(args.interpreter, root_dir, script_path, args.venv, script_args, args.param_setter) + shell_cmd = build_shell_command(args.interpreter, root_dir, script_path, args.venv, script_args, args.param_setter, args.underscore_to_hyphen) launcher_cmd = build_launcher_command(args.wlm, shell_cmd, trial.number, out_file, err_file, gpu_count, args.cpu_count, args.mem_gb, args.lsf_gpu_config_string) logger.info("Trial %d: submitting → %s", trial.number, launcher_cmd) From 83468846cb5f6c9fe7f246accace4d5b64085a7c Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Fri, 10 Apr 2026 08:52:10 +0200 Subject: [PATCH 04/16] read metrics from stderr, extend regex Signed-off-by: Romeo Kienzler --- configs/gridfm_graphkit_hpo.yaml | 24 ++++++++++---------- terratorch_iterate/iterate2.py | 39 ++++++++++++++++++++++++++++---- 2 files changed, 46 insertions(+), 17 deletions(-) diff --git a/configs/gridfm_graphkit_hpo.yaml b/configs/gridfm_graphkit_hpo.yaml index 65073ed..334241f 100644 --- a/configs/gridfm_graphkit_hpo.yaml +++ b/configs/gridfm_graphkit_hpo.yaml @@ -1,4 +1,4 @@ -# HPO configuration for gridfm-graphkit HGNS PF case2000 +# HPO configuration for gridfm-graphkit HGNS PF case118 # # Hyperparameters: # gpu_num – number of GPUs to request from the WLM (launcher-level) @@ -9,6 +9,14 @@ # # Static args: # all other fixed CLI args +# +# Metrics: +# extracted from [performance] lines in trial output + +metrics: + - case118_ieee/layer_0_residual + - last epoch time + - last epoch it/s hpo: gpu_num: @@ -29,19 +37,11 @@ hpo: dataset: type: group # one choice selects all bundled args together choices: - case2000: - config: ./examples/config/HGNS_PF_datakit_case2000.yaml - data_path: /dccstor/gridfm/powermodels_data/v4/finetuning/pf/ - exp_name: case2000 - case1000: - config: ./examples/config/HGNS_PF_datakit_case1000.yaml - data_path: /dccstor/gridfm/powermodels_data/v4/finetuning/pf/ - exp_name: case1000 + case118: + config: ./examples/config/HGNS_PF_datakit_case118.yaml + data_path: /u/rkie/ static: run_name: run1 log_dir: logs - dataset_wrapper: SharedMemoryCacheDataset - plugins: gridfm_graphkit_ee num_workers: 32 - dataset_wrapper_cache_dir: /dccstor/terratorch/case2000_ieeecache diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py index c621fe0..650298a 100644 --- a/terratorch_iterate/iterate2.py +++ b/terratorch_iterate/iterate2.py @@ -144,15 +144,27 @@ def build_shell_command(interpreter, root_dir, script_path, venv, script_args, p # MULTI-METRIC EXTRACTION # ============================================================ -def extract_metrics_from_log(path: str, metric_names: List[str]) -> List[float]: +def extract_metrics_from_log(path: str, metric_names: List[str], err_path: Optional[str] = None) -> List[float]: logger.debug("Extracting metrics %s from '%s'", metric_names, path) results = [] with open(path, "r", encoding="utf-8", errors="ignore") as f: text = f.read() logger.debug("Log file '%s': %d characters read", path, len(text)) + # Also read stderr — Lightning/rich writes test result tables there + if err_path: + try: + with open(err_path, "r", encoding="utf-8", errors="ignore") as f: + err_text = f.read() + logger.debug("Err file '%s': %d characters read", err_path, len(err_text)) + text = text + "\n" + err_text + except FileNotFoundError: + logger.debug("Err file '%s' not found, skipping", err_path) for metric in metric_names: - pattern = re.compile(rf"{re.escape(metric)}\s*[:=]\s*([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)") + # Matches: key: value | key=value | [performance] key : value | Lightning table │ key │ value │ + pattern = re.compile( + rf"(?:\[\w+\]\s*)?{re.escape(metric)}\s*(?:[:=│])\s*([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)" + ) matches = pattern.findall(text) if not matches: logger.warning("Metric '%s' not found in '%s' — defaulting to 0.0", metric, path) @@ -179,6 +191,22 @@ def load_hpo_space(args): logger.info("HPO space loaded: %d parameter(s): %s", len(space), list(space.keys())) return space +def load_metrics_from_yaml(args): + """Return metrics list from YAML 'metrics:' key, or None if not present.""" + data = {} + if args.hpo_json: + data = json.loads(args.hpo_json) + elif args.hpo_yaml: + with open(args.hpo_yaml, "r") as f: data = yaml.safe_load(f) + elif args.static_args_yaml: + with open(args.static_args_yaml, "r") as f: data = yaml.safe_load(f) + metrics = data.get("metrics", None) + if metrics is None: + return None + if isinstance(metrics, list): + return [m.strip() for m in metrics] + return [m.strip() for m in str(metrics).split(",")] + def load_static_args(args): data = {} if args.static_args_json: @@ -231,8 +259,9 @@ def main(): hpo_space = load_hpo_space(args) static_args = load_static_args(args) - metric_list = [m.strip() for m in args.metrics.split(",")] - logger.info("Optimising metrics: %s", metric_list) + yaml_metrics = load_metrics_from_yaml(args) + metric_list = yaml_metrics if yaml_metrics is not None else [m.strip() for m in args.metrics.split(",")] + logger.info("Optimising metrics: %s (source: %s)", metric_list, "yaml" if yaml_metrics else "cli") script_path, root_dir = resolve_paths(args.script, args.root_dir) @@ -262,7 +291,7 @@ def objective(trial): subprocess.run(launcher_cmd, shell=True, check=True) logger.info("Trial %d: job finished", trial.number) - values = extract_metrics_from_log(out_file, metric_list) + values = extract_metrics_from_log(out_file, metric_list, err_path=err_file) logger.info("Trial %d: results %s", trial.number, dict(zip(metric_list, values))) return tuple(values) From 1d359de05f88e6428b9b5fce99e1e531ef34ffc3 Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Fri, 10 Apr 2026 09:02:33 +0200 Subject: [PATCH 05/16] report performance in hpo run Signed-off-by: Romeo Kienzler --- configs/gridfm_graphkit_hpo.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/configs/gridfm_graphkit_hpo.yaml b/configs/gridfm_graphkit_hpo.yaml index 334241f..9939893 100644 --- a/configs/gridfm_graphkit_hpo.yaml +++ b/configs/gridfm_graphkit_hpo.yaml @@ -45,3 +45,4 @@ static: run_name: run1 log_dir: logs num_workers: 32 + report_performance: true From 44fd1f3eee952062d4b6131407347ef73e577383 Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Fri, 10 Apr 2026 10:28:09 +0200 Subject: [PATCH 06/16] add parallel execution within single iterate process Signed-off-by: Romeo Kienzler --- configs/gridfm_graphkit_hpo.yaml | 2 +- docs/iterate2.md | 51 +++++++++++++++++ terratorch_iterate/iterate2.py | 98 +++++++++++++++++++++++++++++++- 3 files changed, 147 insertions(+), 4 deletions(-) diff --git a/configs/gridfm_graphkit_hpo.yaml b/configs/gridfm_graphkit_hpo.yaml index 9939893..6ee3a2e 100644 --- a/configs/gridfm_graphkit_hpo.yaml +++ b/configs/gridfm_graphkit_hpo.yaml @@ -45,4 +45,4 @@ static: run_name: run1 log_dir: logs num_workers: 32 - report_performance: true + report-performance: true diff --git a/docs/iterate2.md b/docs/iterate2.md index 8922d52..c70bef2 100644 --- a/docs/iterate2.md +++ b/docs/iterate2.md @@ -41,6 +41,7 @@ iterate2 \ | `--cpu-count` | `4` | Number of CPUs per trial | | `--mem-gb` | `128` | Memory (GB) per trial | | `--lsf-gpu-config-string` | `None` | Optional verbatim LSF `-gpu` option string (see [GPU configuration](#gpu-configuration-on-lsf)) | +| `--parallelism` | `1` | Number of trials to run in parallel (see [Parallel execution](#parallel-execution)) | ### Optuna options @@ -297,6 +298,56 @@ bsub -n 20 -R "span[hosts=1]" \ --- +--- + +## Parallel execution + +By default `iterate2` runs one trial at a time. Pass `--parallelism N` to run up to `N` trials simultaneously, each in its own thread. + +```sh +iterate2 \ + --parallelism 4 \ + --wlm lsf \ + ... +``` + +### How it works + +Each thread independently: + +1. Asks Optuna for a new set of hyperparameters (`study.ask()`) +2. Builds and submits the launcher command (e.g. `bsub -K …`) +3. Streams every output line to the main process stdout/stderr, prefixed with `[trial-N]` +4. Reports the extracted metrics back to Optuna (`study.tell()`) + +Output from concurrent trials is prefixed so you can follow individual workers: + +``` +[trial-3] Epoch 1/10 ━━━━━━━━━━ 100/100 0:01:12 +[trial-5] Using bfloat16 precision +[trial-3] [performance] val_loss : 0.0421 +[trial-5] Epoch 1/10 ━━━━━━━━━━ 100/100 0:01:15 +``` + +### Output files + +| WLM | stdout | stderr | +|---|---|---| +| `none` | `trial_N.out` (written by iterate2) | `trial_N.err` (written by iterate2) | +| `lsf` / `slurm` | `trial_N.out` (written by WLM on cluster) | `trial_N.err` (written by WLM on cluster) | + +For WLM backends the local WLM tool output (bsub/srun status messages) is written to `trial_N_wlm.out` / `trial_N_wlm.err` so the cluster-managed files are never overwritten. + +### SQLite and parallelism + +Optuna retries on SQLite locking errors automatically. Values up to `--parallelism 4` work well with SQLite. For higher concurrency use a PostgreSQL storage URL: + +```sh +--optuna-db-path postgresql://user:pass@host/dbname +``` + +--- + ## Workload managers ### LSF diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py index 650298a..19823f4 100644 --- a/terratorch_iterate/iterate2.py +++ b/terratorch_iterate/iterate2.py @@ -5,7 +5,9 @@ import logging import os import subprocess +import sys import re +import threading from pathlib import Path from typing import Dict, Any, Optional, Literal, List @@ -36,6 +38,15 @@ def parse_args(): parser.add_argument("--cpu-count", type=int, default=4) parser.add_argument("--mem-gb", type=int, default=128) parser.add_argument("--lsf-gpu-config-string", type=str, default=None) + parser.add_argument( + "--parallelism", + type=int, + default=1, + help="Number of trials to run in parallel (default: 1 = sequential). " + "Each parallel trial runs in its own thread. " + "For SQLite storage, values >4 may cause locking contention; " + "consider PostgreSQL for high parallelism.", + ) parser.add_argument( "--no-underscore-to-hyphen", dest="underscore_to_hyphen", @@ -99,7 +110,9 @@ def build_launcher_command(wlm, cmd, trial_id, out_file, err_file, gpu_count, cp elif wlm == "slurm": launcher = f"srun --gres=gpu:{gpu_count} --cpus-per-task={cpu_count} --mem={mem_gb}G --job-name=hpo_trial_{trial_id} --output={out_file} --error={err_file} bash -c \"{cmd}\"" elif wlm == "none": - launcher = f'bash -c "{cmd} > {out_file} 2> {err_file}"' + # No embedded redirect: run_and_stream() captures stdout/stderr via PIPE + # and writes to out_file/err_file itself. + launcher = f'bash -c "{cmd}"' else: raise ValueError(f"Unknown WLM: {wlm}") logger.debug("Launcher command: %s", launcher) @@ -140,6 +153,84 @@ def build_shell_command(interpreter, root_dir, script_path, venv, script_args, p logger.debug("Shell command: %s", cmd) return cmd +# ============================================================ +# PARALLEL STREAMING RUNNER +# ============================================================ + +_print_lock = threading.Lock() + +def _stream_pipe(pipe, dest_file, trial_id: int, stream_name: str, dest_stream): + """Read lines from *pipe*, write to *dest_file* and print prefixed to *dest_stream*.""" + prefix = f"[trial-{trial_id}]" + with open(dest_file, "w", encoding="utf-8", errors="replace") as fh: + for raw in pipe: + line = raw.decode("utf-8", errors="replace") + fh.write(line) + fh.flush() + with _print_lock: + dest_stream.write(f"{prefix} {line}") + dest_stream.flush() + +def run_and_stream(launcher_cmd: str, trial_id: int, out_file: str, err_file: str, wlm: str): + """ + Run *launcher_cmd* in a shell. + + For ``wlm='none'``: captures stdout and stderr via PIPE, streams every line + to the main process stdout/stderr (prefixed with ``[trial-N]``), and also + writes them to *out_file* / *err_file* for later metric extraction. + + For WLM backends (lsf, slurm, …): the WLM tool itself manages the output + files on the cluster. The local subprocess output (WLM status messages, + errors) is still streamed with the same prefix so parallel workers are + distinguishable. + """ + logger.debug("Trial %d: run_and_stream wlm=%s cmd=%s", trial_id, wlm, launcher_cmd) + proc = subprocess.Popen( + launcher_cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + + if wlm == "none": + # Full capture: write to files AND stream to console + t_out = threading.Thread( + target=_stream_pipe, + args=(proc.stdout, out_file, trial_id, "stdout", sys.stdout), + daemon=True, + ) + t_err = threading.Thread( + target=_stream_pipe, + args=(proc.stderr, err_file, trial_id, "stderr", sys.stderr), + daemon=True, + ) + else: + # WLM manages the cluster output files (out_file/err_file) itself. + # Stream only the local WLM tool output (bsub/srun status messages) + # to console; write it to separate local files to avoid clobbering the + # cluster-managed trial output files. + wlm_out = out_file.replace(".out", "_wlm.out") + wlm_err = err_file.replace(".err", "_wlm.err") + t_out = threading.Thread( + target=_stream_pipe, + args=(proc.stdout, wlm_out, trial_id, "wlm-stdout", sys.stdout), + daemon=True, + ) + t_err = threading.Thread( + target=_stream_pipe, + args=(proc.stderr, wlm_err, trial_id, "wlm-stderr", sys.stderr), + daemon=True, + ) + + t_out.start() + t_err.start() + proc.wait() + t_out.join() + t_err.join() + + if proc.returncode != 0: + raise subprocess.CalledProcessError(proc.returncode, launcher_cmd) + # ============================================================ # MULTI-METRIC EXTRACTION # ============================================================ @@ -288,7 +379,7 @@ def objective(trial): launcher_cmd = build_launcher_command(args.wlm, shell_cmd, trial.number, out_file, err_file, gpu_count, args.cpu_count, args.mem_gb, args.lsf_gpu_config_string) logger.info("Trial %d: submitting → %s", trial.number, launcher_cmd) - subprocess.run(launcher_cmd, shell=True, check=True) + run_and_stream(launcher_cmd, trial.number, out_file, err_file, args.wlm) logger.info("Trial %d: job finished", trial.number) values = extract_metrics_from_log(out_file, metric_list, err_path=err_file) @@ -311,7 +402,8 @@ def objective(trial): ) logger.info("Study '%s' ready (existing trials: %d)", args.optuna_study_name, len(study.trials)) - study.optimize(objective, n_trials=args.optuna_n_trials) + logger.info("Parallelism: %d worker(s)", args.parallelism) + study.optimize(objective, n_trials=args.optuna_n_trials, n_jobs=args.parallelism) logger.info("=" * 60) logger.info("OPTIMIZATION COMPLETE") From 7c28bcddd7f6f4aaeaba1be5614cb44c2996c47d Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Fri, 10 Apr 2026 10:34:21 +0200 Subject: [PATCH 07/16] support for JournalFile Signed-off-by: Romeo Kienzler --- docs/iterate2.md | 10 ++++++++-- terratorch_iterate/iterate2.py | 10 +++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/docs/iterate2.md b/docs/iterate2.md index c70bef2..f0da899 100644 --- a/docs/iterate2.md +++ b/docs/iterate2.md @@ -48,7 +48,7 @@ iterate2 \ | Option | Default | Description | |---|---|---| | `--optuna-study-name` | *(required)* | Name of the Optuna study | -| `--optuna-db-path` | *(required)* | Storage URL for the Optuna database, e.g. `sqlite:///hpo.db` | +| `--optuna-db-path` | *(required)* | Storage URL. `sqlite:///hpo.db` for SQLite, `js:///path/journal.log` for JournalStorage, or any Optuna-supported URL | | `--optuna-n-trials` | `100` | Number of trials to run | ### HPO search space @@ -340,12 +340,18 @@ For WLM backends the local WLM tool output (bsub/srun status messages) is writte ### SQLite and parallelism -Optuna retries on SQLite locking errors automatically. Values up to `--parallelism 4` work well with SQLite. For higher concurrency use a PostgreSQL storage URL: +Optuna retries on SQLite locking errors automatically. Values up to `--parallelism 4` work well with SQLite. For higher concurrency use PostgreSQL or **JournalStorage**: ```sh +# PostgreSQL --optuna-db-path postgresql://user:pass@host/dbname + +# JournalStorage (file-based, lock-free, safe for parallel workers on a shared filesystem) +--optuna-db-path js:///path/to/study_journal.log ``` +`js:///` is a custom `iterate2` scheme. The path after `js:///` is passed to Optuna's `JournalFileStorage`. JournalStorage serialises trials to an append-only log and is well-suited for NFS/GPFS shared filesystems where SQLite locking is unreliable. + --- ## Workload managers diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py index 19823f4..f33d4a7 100644 --- a/terratorch_iterate/iterate2.py +++ b/terratorch_iterate/iterate2.py @@ -12,6 +12,7 @@ from typing import Dict, Any, Optional, Literal, List import optuna +from optuna.storages import JournalStorage, JournalFileStorage import yaml logger = logging.getLogger("iterate2") @@ -391,7 +392,14 @@ def objective(trial): directions = ["maximize"] * len(metric_list) logger.info("Creating Optuna study (directions: %s)", directions) - storage = f"sqlite:///{args.optuna_db_path}" if "sqlite" not in args.optuna_db_path else args.optuna_db_path + if args.optuna_db_path.startswith("js:///"): + journal_path = args.optuna_db_path[len("js://"):] + logger.info("Using JournalStorage at '%s'", journal_path) + storage = JournalStorage(JournalFileStorage(journal_path)) + elif "sqlite" in args.optuna_db_path: + storage = args.optuna_db_path + else: + storage = f"sqlite:///{args.optuna_db_path}" logger.debug("Optuna storage: %s", storage) study = optuna.create_study( From 8bb7310751f7b04ab4c5c168b9e8013b2455ef1f Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Tue, 14 Apr 2026 14:51:58 +0200 Subject: [PATCH 08/16] add vela support Signed-off-by: Romeo Kienzler --- docs/iterate2.md | 77 ++++++- examples/run_vela_example.sh | 52 +++++ examples/vela_gridfm_template.yaml | 32 +++ terratorch_iterate/iterate2.py | 323 ++++++++++++++++++++++++++++- 4 files changed, 476 insertions(+), 8 deletions(-) create mode 100644 examples/run_vela_example.sh create mode 100644 examples/vela_gridfm_template.yaml diff --git a/docs/iterate2.md b/docs/iterate2.md index f0da899..c21359e 100644 --- a/docs/iterate2.md +++ b/docs/iterate2.md @@ -36,13 +36,26 @@ iterate2 \ | `--venv` | `.venv` | Virtual-environment directory to activate. Set to empty string to disable | | `--interpreter` | `python` | Python interpreter to invoke | | `--param-setter` | `None` | Use setter-style argument passing (see [Setter-style arguments](#setter-style-arguments)) | -| `--wlm` | `none` | Workload manager: `lsf`, `slurm`, `openshift`, or `none` | +| `--wlm` | `none` | Workload manager: `lsf`, `slurm`, `vela`, or `none` | | `--gpu-count` | `1` | Number of GPUs per trial | | `--cpu-count` | `4` | Number of CPUs per trial | | `--mem-gb` | `128` | Memory (GB) per trial | | `--lsf-gpu-config-string` | `None` | Optional verbatim LSF `-gpu` option string (see [GPU configuration](#gpu-configuration-on-lsf)) | | `--parallelism` | `1` | Number of trials to run in parallel (see [Parallel execution](#parallel-execution)) | +### Vela (OpenShift) options + +Required when `--wlm vela`. + +| Option | Default | Description | +|---|---|---| +| `--vela-job-template` | *(required)* | Path to the Vela job YAML template. `{{HPO_COMMAND}}` in `setupCommands` is replaced per trial | +| `--vela-chart-path` | *(required)* | Path to the `pytorchjob-generator` helm chart directory | +| `--vela-namespace` | *(current context)* | OpenShift/Kubernetes namespace | +| `--vela-cmd-placeholder` | `{{HPO_COMMAND}}` | String in `setupCommands` that is replaced with the HPO-parametrised CLI call | +| `--vela-pod-ready-timeout` | `600` | Seconds to wait for the trial pod to reach Running state | +| `--vela-job-timeout` | `86400` | Seconds to wait (streaming logs) for the job to complete | + ### Optuna options | Option | Default | Description | @@ -368,9 +381,67 @@ Uses `srun` with `--gres=gpu:`, `--cpus-per-task`, and `--mem` flags. Runs the command directly in a local shell, redirecting stdout/stderr to `trial_.out` / `trial_.err`. -### openshift +### Vela (OpenShift / MLBatch) + +`--wlm vela` submits each trial as a [PyTorchJob](https://www.kubeflow.org/docs/components/training/pytorch/) via the [MLBatch `pytorchjob-generator`](https://github.com/project-codeflare/mlbatch) helm chart. + +#### Submission flow + +1. For each Optuna trial iterate2: + * Builds the CLI invocation from sampled + static args. + * Patches the **job template YAML**: + * appends `-trial-` to `jobName` (unique resource per trial) + * sets `numGpusPerPod` from `gpu_num` (HPO or CLI `--gpu-count`) + * replaces the `{{HPO_COMMAND}}` placeholder in `setupCommands` with the generated CLI call + * Runs `helm template -f | oc create [-n ] -f-` +2. Polls until `-master-0` pod appears, then streams `oc logs -f ` — **this call blocks until the container exits**, so the trial behaves the same as other WLM backends. +3. Pod exit code is checked; non-zero raises an error. +4. The `PyTorchJob` resource is deleted. + +#### Job template + +Create a YAML file modelled on `examples/vela_gridfm_template.yaml`. The only special requirement is the `{{HPO_COMMAND}}` placeholder somewhere in `setupCommands`: + +```yaml +jobName: "my-project-hpo" # iterate2 appends -trial-N +numGpusPerPod: 1 # iterate2 overwrites with gpu_num +numCpusPerPod: 32 +totalMemoryPerPod: "32Gi" + +volumes: + - name: "data-vol" + claimName: "my-pvc" + mountPath: "/mnt/data" + +setupCommands: + - "wget -q https://example.com/config.yaml" + - "{{HPO_COMMAND}}" # ← iterate2 fills this in +``` + +#### Example invocation -Not yet implemented. +```sh +iterate2 \ + --script "gridfm_graphkit train" \ + --interpreter "" \ + --wlm vela \ + --vela-job-template examples/vela_gridfm_template.yaml \ + --vela-chart-path ../mlbatch/tools/pytorchjob-generator/chart \ + --vela-namespace my-namespace \ + --gpu-count 1 \ + --optuna-study-name gridfm_vela_hpo \ + --optuna-db-path sqlite:///gridfm_vela_hpo.db \ + --optuna-n-trials 20 \ + --hpo-yaml configs/gridfm_graphkit_hpo.yaml +``` + +See `examples/run_vela_example.sh` for a complete ready-to-run script. + +!!! note + `--script` is the bare CLI entry-point (`gridfm_graphkit train`). Set `--interpreter ""` to suppress the default `python` prefix. + +!!! tip + `gpu_num` in the HPO space controls both `numGpusPerPod` in the job YAML **and** the WLM resource request, just like with LSF/Slurm. --- diff --git a/examples/run_vela_example.sh b/examples/run_vela_example.sh new file mode 100644 index 0000000..41cdb06 --- /dev/null +++ b/examples/run_vela_example.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# ============================================================================= +# Example: iterate2 with --wlm vela (OpenShift / MLBatch PyTorchJob) +# +# Prerequisites +# ------------- +# * helm CLI installed and on PATH +# * oc CLI logged in to the target cluster +# * mlbatch/tools/pytorchjob-generator/chart checked out locally +# * The gridfm HPO YAML (configs/gridfm_graphkit_hpo.yaml) present +# +# How it works +# ------------ +# 1. For each Optuna trial iterate2: +# a. Samples hyperparameters from gridfm_graphkit_hpo.yaml +# b. Builds the gridfm_graphkit CLI invocation from static + sampled params +# c. Patches vela_gridfm_template.yaml: +# - appends "-trial-" to jobName (unique resource per trial) +# - sets numGpusPerPod = gpu_num (from the HPO space) +# - replaces {{HPO_COMMAND}} (the actual CLI call) +# d. Runs: helm template -f | oc create -f- +# e. Polls until -master-0 pod is Running +# f. Streams: oc logs -f -master-0 +# (blocks until container exits; output captured for metric extraction) +# g. Checks pod exit code; deletes the PyTorchJob resource +# 2. Metrics are extracted from the captured log and returned to Optuna. +# ============================================================================= + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +# Path to the mlbatch pytorchjob-generator helm chart. +# Clone mlbatch first: git clone https://github.com/project-codeflare/mlbatch +CHART_PATH="${MLBATCH_CHART_PATH:-${REPO_ROOT}/../mlbatch/tools/pytorchjob-generator/chart}" + +iterate2 \ + --script "gridfm_graphkit train" \ + --interpreter "" \ + --wlm vela \ + --vela-job-template "${SCRIPT_DIR}/vela_gridfm_template.yaml" \ + --vela-chart-path "${CHART_PATH}" \ + --vela-namespace "${OC_NAMESPACE:-}" \ + --vela-cmd-placeholder "{{HPO_COMMAND}}" \ + --vela-pod-ready-timeout 600 \ + --vela-job-timeout 86400 \ + --gpu-count 1 \ + --optuna-study-name gridfm_vela_hpo \ + --optuna-db-path "sqlite:///gridfm_vela_hpo.db" \ + --optuna-n-trials 20 \ + --hpo-yaml "${REPO_ROOT}/configs/gridfm_graphkit_hpo.yaml" diff --git a/examples/vela_gridfm_template.yaml b/examples/vela_gridfm_template.yaml new file mode 100644 index 0000000..c30f5ec --- /dev/null +++ b/examples/vela_gridfm_template.yaml @@ -0,0 +1,32 @@ +#################### +# Job Metadata +# NOTE: iterate2 appends "-trial-" to jobName for each HPO trial so +# every trial creates a unique Kubernetes resource. +#################### +jobName: "romeokienzler-gridfm-hpo" +containerImage: "us.icr.io/geodn/gridfm-graphkit:0.1" +imagePullPolicy: "IfNotPresent" +imagePullSecrets: + - name: "pullsecret-gridfm-geodn" + +################################## +# Resource Requirements +# NOTE: numGpusPerPod is overridden per-trial by the gpu_num HPO parameter. +################################## +numPods: 1 +numCpusPerPod: 32 +numGpusPerPod: 1 # placeholder – iterate2 overwrites this with gpu_num +totalMemoryPerPod: "32Gi" + +volumes: + - name: "gridfm-storage" + claimName: "gridfm" + mountPath: "/mnt/data" + +######################## +# Workload Specification +######################## +setupCommands: + - "wget -q https://raw.githubusercontent.com/gridfm/gridfm-graphkit/refs/heads/main/examples/config/HGNS_PF_datakit_case118.yaml" + # iterate2 replaces the placeholder below with the HPO-parametrised CLI call. + - "{{HPO_COMMAND}}" diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py index f33d4a7..a057e05 100644 --- a/terratorch_iterate/iterate2.py +++ b/terratorch_iterate/iterate2.py @@ -7,7 +7,9 @@ import subprocess import sys import re +import tempfile import threading +import time from pathlib import Path from typing import Dict, Any, Optional, Literal, List @@ -34,11 +36,51 @@ def parse_args(): parser.add_argument("--venv", default=".venv", help="Virtualenv dir") parser.add_argument("--interpreter", default="python", help="Interpreter to use") parser.add_argument("--param-setter", type=str, default=None) - parser.add_argument("--wlm", choices=["lsf", "slurm", "openshift", "none"], default="none") + parser.add_argument("--wlm", choices=["lsf", "slurm", "openshift", "vela", "none"], default="none") parser.add_argument("--gpu-count", type=int, default=1) parser.add_argument("--cpu-count", type=int, default=4) parser.add_argument("--mem-gb", type=int, default=128) parser.add_argument("--lsf-gpu-config-string", type=str, default=None) + + # ------------------------ + # Vela / OpenShift options + # ------------------------ + parser.add_argument( + "--vela-job-template", + type=str, + default=None, + help="Path to the Vela job YAML template (required when --wlm vela)", + ) + parser.add_argument( + "--vela-chart-path", + type=str, + default=None, + help="Path to the helm chart directory (required when --wlm vela)", + ) + parser.add_argument( + "--vela-namespace", + type=str, + default=None, + help="OpenShift/Kubernetes namespace (uses current context if omitted)", + ) + parser.add_argument( + "--vela-cmd-placeholder", + type=str, + default="{{HPO_COMMAND}}", + help="String in the job template's setupCommands that is replaced with the HPO command (default: '{{HPO_COMMAND}}')", + ) + parser.add_argument( + "--vela-pod-ready-timeout", + type=int, + default=600, + help="Seconds to wait for the trial pod to reach Running state (default: 600)", + ) + parser.add_argument( + "--vela-job-timeout", + type=int, + default=86400, + help="Seconds to wait for the trial job to complete (default: 86400 = 24 h)", + ) parser.add_argument( "--parallelism", type=int, @@ -114,6 +156,9 @@ def build_launcher_command(wlm, cmd, trial_id, out_file, err_file, gpu_count, cp # No embedded redirect: run_and_stream() captures stdout/stderr via PIPE # and writes to out_file/err_file itself. launcher = f'bash -c "{cmd}"' + elif wlm in ("vela",): + # Vela uses a separate submission flow; this function is not called for it. + raise ValueError("build_launcher_command must not be called for wlm='vela'; use build_vela_job_yaml + run_vela_trial instead.") else: raise ValueError(f"Unknown WLM: {wlm}") logger.debug("Launcher command: %s", launcher) @@ -154,6 +199,242 @@ def build_shell_command(interpreter, root_dir, script_path, venv, script_args, p logger.debug("Shell command: %s", cmd) return cmd + +def build_container_command(interpreter: str, script_path: str, script_args: dict, param_setter: Optional[str], underscore_to_hyphen: bool = True) -> str: + """Build a bare CLI invocation suitable for running inside a container. + + Unlike :func:`build_shell_command` this function does **not** prepend + ``cd`` or ``source venv`` – those are not needed (or available) inside an + already-running container image. + """ + prefix = f"{interpreter} " if interpreter else "" + arg_list = [f"{prefix}{script_path}".strip()] + for key, value in script_args.items(): + arg_name = key.replace("_", "-") if underscore_to_hyphen else key + if value is None: + logger.debug("Container cmd: skipping '%s' (None)", key) + continue + if param_setter: + if isinstance(value, bool): + if value: + arg_list.append(f"--{param_setter} {key}") + else: + pass # omit + else: + arg_list.append(f"--{param_setter} {key} {value}") + else: + if isinstance(value, bool): + if value: + arg_list.append(f"--{arg_name}") + # else omit + else: + arg_list.append(f"--{arg_name} {value}") + cmd = " ".join(arg_list) + logger.debug("Container command: %s", cmd) + return cmd + + +def build_vela_job_yaml( + template_path: str, + trial_id: int, + gpu_count: int, + container_cmd: str, + placeholder: str, +) -> tuple[str, str]: + """Load *template_path*, inject HPO parameters, and return ``(yaml_str, job_name)``. + + Changes applied to the template: + * The ``jobName`` field gets a ``-trial-{trial_id}`` suffix so each trial + creates a unique Kubernetes resource. + * ``numGpusPerPod`` is overwritten with *gpu_count*. + * Any entry in ``setupCommands`` that equals *placeholder* is replaced with + *container_cmd*. If no entry matches the placeholder, the last entry is + replaced and a warning is emitted. + """ + with open(template_path, "r") as fh: + data = yaml.safe_load(fh) + + # Unique job name per trial + base_name = data.get("jobName", "hpo-job") + job_name = f"{base_name}-trial-{trial_id}" + data["jobName"] = job_name + logger.debug("Vela trial %d: jobName → %s", trial_id, job_name) + + # GPU count + data["numGpusPerPod"] = gpu_count + logger.debug("Vela trial %d: numGpusPerPod → %d", trial_id, gpu_count) + + # Inject command + setup_cmds = data.get("setupCommands", []) + replaced = False + for i, entry in enumerate(setup_cmds): + if placeholder in str(entry): + setup_cmds[i] = container_cmd + replaced = True + logger.debug("Vela trial %d: replaced setupCommands[%d] with HPO command", trial_id, i) + break + if not replaced: + logger.warning( + "Vela trial %d: placeholder '%s' not found in setupCommands – replacing last entry", + trial_id, placeholder, + ) + if setup_cmds: + setup_cmds[-1] = container_cmd + else: + setup_cmds.append(container_cmd) + data["setupCommands"] = setup_cmds + + return yaml.dump(data, default_flow_style=False, allow_unicode=True), job_name + + +def _oc(*args, namespace: Optional[str] = None, check: bool = True, capture: bool = False): + """Run an ``oc`` sub-command, optionally capturing output.""" + cmd = ["oc"] + list(args) + if namespace: + cmd += ["-n", namespace] + logger.debug("oc command: %s", " ".join(cmd)) + if capture: + return subprocess.run(cmd, check=check, capture_output=True, text=True) + return subprocess.run(cmd, check=check) + + +def run_vela_trial( + trial_id: int, + job_yaml: str, + chart_path: str, + job_name: str, + namespace: Optional[str], + out_file: str, + err_file: str, + pod_ready_timeout: int, + job_timeout: int, +) -> None: + """Submit a Vela/OpenShift PyTorchJob, stream its logs, and wait for completion. + + Steps + ----- + 1. Write *job_yaml* to a temp file. + 2. ``helm template -f | oc create [-n ] -f-`` + 3. Poll until the master pod (``-master-0``) appears. + 4. ``oc logs -f `` – streams every line to stdout **and** *out_file*. + 5. After streaming ends, check the pod's terminated exit-code. + Non-zero → raise :class:`subprocess.CalledProcessError`. + 6. Cleanup: delete the PyTorchJob resource. + """ + ns_args = ["-n", namespace] if namespace else [] + prefix = f"[trial-{trial_id}]" + + # Write temp YAML + with tempfile.NamedTemporaryFile( + mode="w", + suffix=".yaml", + prefix=f"vela_trial_{trial_id}_", + delete=False, + ) as fh: + fh.write(job_yaml) + tmp_yaml = fh.name + logger.debug("Vela trial %d: temp YAML written to %s", trial_id, tmp_yaml) + + try: + # ── 1. Submit ────────────────────────────────────────────────────────── + ns_flag = f"-n {namespace}" if namespace else "" + create_cmd = ( + f"helm template -f {tmp_yaml} {chart_path}" + f" | oc create {ns_flag} -f-" + ) + logger.info("Trial %d: submitting Vela job → %s", trial_id, create_cmd) + result = subprocess.run(create_cmd, shell=True, capture_output=True, text=True) + with _print_lock: + sys.stdout.write(f"{prefix} {result.stdout}") + sys.stdout.flush() + if result.returncode != 0: + raise RuntimeError( + f"Vela trial {trial_id}: oc create failed (rc={result.returncode}):\n" + f"{result.stderr}" + ) + logger.info("Trial %d: job '%s' created", trial_id, job_name) + + # ── 2. Wait for master pod to appear ────────────────────────────────── + master_pod = f"{job_name}-master-0" + deadline = time.monotonic() + pod_ready_timeout + logger.info("Trial %d: waiting for pod '%s' to appear (timeout %ds)…", trial_id, master_pod, pod_ready_timeout) + while time.monotonic() < deadline: + r = subprocess.run( + ["oc", "get", "pod", master_pod, "--ignore-not-found"] + ns_args, + capture_output=True, text=True, + ) + if master_pod in r.stdout: + logger.debug("Trial %d: pod '%s' found", trial_id, master_pod) + break + time.sleep(5) + else: + raise TimeoutError( + f"Vela trial {trial_id}: pod '{master_pod}' did not appear within {pod_ready_timeout}s" + ) + + # ── 3. Wait for pod to be Running/Succeeded ─────────────────────────── + logger.info("Trial %d: waiting for pod '%s' to be Running…", trial_id, master_pod) + wait_cmd = ( + ["oc", "wait", f"pod/{master_pod}", + "--for=condition=Ready", + f"--timeout={pod_ready_timeout}s"] + + ns_args + ) + wr = subprocess.run(wait_cmd, capture_output=True, text=True) + # oc wait returns non-zero if the pod is already Completed (no Ready condition); + # that's fine – the logs are still accessible. + logger.debug("Trial %d: oc wait rc=%d stderr=%s", trial_id, wr.returncode, wr.stderr.strip()) + + # ── 4. Stream logs ──────────────────────────────────────────────────── + log_cmd = ["oc", "logs", "-f", master_pod] + ns_args + logger.info("Trial %d: streaming logs from '%s'", trial_id, master_pod) + log_proc = subprocess.Popen( + log_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + t_out = threading.Thread( + target=_stream_pipe, + args=(log_proc.stdout, out_file, trial_id, "stdout", sys.stdout), + daemon=True, + ) + t_err = threading.Thread( + target=_stream_pipe, + args=(log_proc.stderr, err_file, trial_id, "stderr", sys.stderr), + daemon=True, + ) + t_out.start() + t_err.start() + log_proc.wait(timeout=job_timeout) + t_out.join() + t_err.join() + logger.debug("Trial %d: log stream ended (rc=%d)", trial_id, log_proc.returncode) + + # ── 5. Check pod exit code ──────────────────────────────────────────── + ec_result = subprocess.run( + ["oc", "get", "pod", master_pod, "-o", + "jsonpath={.status.containerStatuses[0].state.terminated.exitCode}"] + + ns_args, + capture_output=True, text=True, + ) + exit_code_str = ec_result.stdout.strip() + exit_code = int(exit_code_str) if exit_code_str.lstrip("-").isdigit() else 0 + logger.info("Trial %d: pod exit code = %s", trial_id, exit_code) + if exit_code != 0: + raise subprocess.CalledProcessError(exit_code, log_cmd) + + finally: + # ── 6. Cleanup – delete the job ─────────────────────────────────────── + logger.debug("Trial %d: deleting PyTorchJob '%s'", trial_id, job_name) + subprocess.run( + ["oc", "delete", "pytorchjob", job_name, "--ignore-not-found"] + ns_args, + capture_output=True, + ) + try: + os.unlink(tmp_yaml) + except OSError: + pass + # ============================================================ # PARALLEL STREAMING RUNNER # ============================================================ @@ -376,11 +657,43 @@ def objective(trial): err_file = f"trial_{trial.number}.err" logger.debug("Trial %d: stdout → %s | stderr → %s", trial.number, out_file, err_file) - shell_cmd = build_shell_command(args.interpreter, root_dir, script_path, args.venv, script_args, args.param_setter, args.underscore_to_hyphen) - launcher_cmd = build_launcher_command(args.wlm, shell_cmd, trial.number, out_file, err_file, gpu_count, args.cpu_count, args.mem_gb, args.lsf_gpu_config_string) + if args.wlm == "vela": + # ── Vela / OpenShift path ────────────────────────────────────── + if not args.vela_job_template: + raise ValueError("--vela-job-template is required when --wlm vela") + if not args.vela_chart_path: + raise ValueError("--vela-chart-path is required when --wlm vela") + container_cmd = build_container_command( + args.interpreter, script_path, script_args, + args.param_setter, args.underscore_to_hyphen, + ) + logger.info("Trial %d: container command → %s", trial.number, container_cmd) + job_yaml, job_name = build_vela_job_yaml( + args.vela_job_template, + trial.number, + gpu_count, + container_cmd, + args.vela_cmd_placeholder, + ) + logger.debug("Trial %d: job YAML (first 400 chars):\n%s", trial.number, job_yaml[:400]) + run_vela_trial( + trial_id=trial.number, + job_yaml=job_yaml, + chart_path=args.vela_chart_path, + job_name=job_name, + namespace=args.vela_namespace, + out_file=out_file, + err_file=err_file, + pod_ready_timeout=args.vela_pod_ready_timeout, + job_timeout=args.vela_job_timeout, + ) + else: + # ── Standard WLM path (lsf / slurm / none) ──────────────────── + shell_cmd = build_shell_command(args.interpreter, root_dir, script_path, args.venv, script_args, args.param_setter, args.underscore_to_hyphen) + launcher_cmd = build_launcher_command(args.wlm, shell_cmd, trial.number, out_file, err_file, gpu_count, args.cpu_count, args.mem_gb, args.lsf_gpu_config_string) + logger.info("Trial %d: submitting → %s", trial.number, launcher_cmd) + run_and_stream(launcher_cmd, trial.number, out_file, err_file, args.wlm) - logger.info("Trial %d: submitting → %s", trial.number, launcher_cmd) - run_and_stream(launcher_cmd, trial.number, out_file, err_file, args.wlm) logger.info("Trial %d: job finished", trial.number) values = extract_metrics_from_log(out_file, metric_list, err_path=err_file) From 5982a3aca4572162a957f85bd7a7db03b5bfbf69 Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Tue, 14 Apr 2026 15:19:37 +0200 Subject: [PATCH 09/16] add gpu perf test on vela Signed-off-by: Romeo Kienzler --- configs/gpu_perf_hpo.yaml | 58 +++++++++++++++++++++++++++ examples/run_vela_gpu_perf_example.sh | 46 +++++++++++++++++++++ examples/vela_gpu_perf_template.yaml | 37 +++++++++++++++++ terratorch_iterate/iterate2.py | 12 +++--- 4 files changed, 146 insertions(+), 7 deletions(-) create mode 100644 configs/gpu_perf_hpo.yaml create mode 100644 examples/run_vela_gpu_perf_example.sh create mode 100644 examples/vela_gpu_perf_template.yaml diff --git a/configs/gpu_perf_hpo.yaml b/configs/gpu_perf_hpo.yaml new file mode 100644 index 0000000..47d4fb1 --- /dev/null +++ b/configs/gpu_perf_hpo.yaml @@ -0,0 +1,58 @@ +# HPO configuration for GPU performance benchmarking with the CLAIMED +# gpu_performance_test component on Vela (OpenShift / MLBatch). +# +# Hyperparameters: +# batch_size – DataLoader / training batch size +# num_workers – DataLoader worker count +# hidden_dim – MLP hidden layer width (training/inference cost) +# depth – MLP depth (training/inference cost) +# matrix_size – Matrix multiplication size (raw GPU compute) +# gpu_num – Number of GPUs to request per pod (launcher-level) +# +# Static args: +# All remaining fixed CLI flags for gpu_performance_test.py +# +# Metrics: +# Emitted as [performance] lines by the awk rename wrapper in the template. +# Using last-match semantics of extract_metrics_from_log each name is unique. + +metrics: + - dataloader_samples_per_sec + - training_samples_per_sec + - inference_samples_per_sec + - gflops + +hpo: + gpu_num: + type: categorical + choices: [1, 2] + + batch_size: + type: categorical + choices: [32, 64, 128] + + num_workers: + type: categorical + choices: [8, 16, 32] + + hidden_dim: + type: categorical + choices: [500, 1000, 2000] + + depth: + type: categorical + choices: [500, 1000, 2000] + + matrix_size: + type: categorical + choices: [5000, 10000, 20000] + +static: + mode: single_gpu + dataset_size: 100000 + steps: 1 + input_dim: 1000000 + num_classes: 100 + materialize_dir: "." + cleanup: true # flag – generates --cleanup (store_true) + iterations: 100 diff --git a/examples/run_vela_gpu_perf_example.sh b/examples/run_vela_gpu_perf_example.sh new file mode 100644 index 0000000..91448c2 --- /dev/null +++ b/examples/run_vela_gpu_perf_example.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +# ============================================================================= +# Example: iterate2 GPU performance benchmark HPO on Vela (OpenShift / MLBatch) +# +# What iterate2 does per trial +# ---------------------------- +# 1. Samples hyperparameters from configs/gpu_perf_hpo.yaml +# 2. Builds the CLI call: +# python /app/.local/.../gpu_performance_test.py \ +# --mode single_gpu --batch-size --num-workers ... +# 3. Patches examples/vela_gpu_perf_template.yaml: +# - appends -trial- to jobName +# - sets numGpusPerPod = gpu_num +# - replaces {{HPO_COMMAND}} with the CLI call (awk wrapper stays intact) +# 4. Submits: +# helm template -f | oc create [-n ] -f- +# 5. Streams: oc logs -f -master-0 (blocks until container exits) +# 6. Extracts metrics from the renamed [performance] lines: +# dataloader_samples_per_sec, training_samples_per_sec, +# inference_samples_per_sec, gflops +# 7. Deletes the PyTorchJob resource. +# ============================================================================= + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +# Path to the mlbatch pytorchjob-generator helm chart. +# Clone first: git clone https://github.com/project-codeflare/mlbatch +CHART_PATH="${MLBATCH_CHART_PATH:-${REPO_ROOT}/../mlbatch/tools/pytorchjob-generator/chart}" + +iterate2 \ + --script "/app/.local/lib/python3.12/site-packages/claimed/components/util/gpu_performance_test.py" \ + --interpreter "python" \ + --wlm vela \ + --vela-job-template "${SCRIPT_DIR}/vela_gpu_perf_template.yaml" \ + --vela-chart-path "${CHART_PATH}" \ + --vela-namespace "${OC_NAMESPACE:-}" \ + --vela-pod-ready-timeout 300 \ + --vela-job-timeout 7200 \ + --gpu-count 1 \ + --optuna-study-name gpu_perf_hpo \ + --optuna-db-path "sqlite:///gpu_perf_hpo.db" \ + --optuna-n-trials 30 \ + --hpo-yaml "${REPO_ROOT}/configs/gpu_perf_hpo.yaml" diff --git a/examples/vela_gpu_perf_template.yaml b/examples/vela_gpu_perf_template.yaml new file mode 100644 index 0000000..830b98d --- /dev/null +++ b/examples/vela_gpu_perf_template.yaml @@ -0,0 +1,37 @@ +#################### +# Job Metadata +# NOTE: iterate2 appends "-trial-" to jobName so each trial is a unique resource. +#################### +jobName: "romeokienzler-gpu-test-hpo" +containerImage: "us.icr.io/geodn/gridfm-graphkit:0.1" +imagePullPolicy: "IfNotPresent" +imagePullSecrets: + - name: "pullsecret-gridfm-geodn" + +################################## +# Resource Requirements +# numGpusPerPod is overwritten per-trial by the gpu_num HPO parameter. +################################## +numPods: 1 +numCpusPerPod: 32 +numGpusPerPod: 1 +totalMemoryPerPod: "32Gi" + +######################## +# Workload Specification +######################## +setupCommands: + - "pip install -U claimed" + # iterate2 replaces {{HPO_COMMAND}} with the HPO-parametrised CLI call. + # The awk wrapper renames duplicate "Samples/sec:" lines to unique + # [performance] metric names so iterate2 can extract all four metrics. + - | + {{HPO_COMMAND}} 2>&1 | awk ' + /^--- DataLoader/ { sec="dataloader" } + /^--- Training/ { sec="training" } + /^--- Inference/ { sec="inference" } + /^--- GPU compute/ { sec="gpu" } + /Samples\/sec:/ { print "[performance] " sec "_samples_per_sec: " $NF; next } + /GFLOPS:/ { print "[performance] gflops: " $NF; next } + { print } + ' diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py index a057e05..11c03d7 100644 --- a/terratorch_iterate/iterate2.py +++ b/terratorch_iterate/iterate2.py @@ -269,19 +269,17 @@ def build_vela_job_yaml( replaced = False for i, entry in enumerate(setup_cmds): if placeholder in str(entry): - setup_cmds[i] = container_cmd + # In-place substitution: keeps any wrapper (e.g. awk pipeline) around the placeholder + setup_cmds[i] = str(entry).replace(placeholder, container_cmd) replaced = True - logger.debug("Vela trial %d: replaced setupCommands[%d] with HPO command", trial_id, i) + logger.debug("Vela trial %d: substituted placeholder in setupCommands[%d]", trial_id, i) break if not replaced: logger.warning( - "Vela trial %d: placeholder '%s' not found in setupCommands – replacing last entry", + "Vela trial %d: placeholder '%s' not found in setupCommands – appending command as new entry", trial_id, placeholder, ) - if setup_cmds: - setup_cmds[-1] = container_cmd - else: - setup_cmds.append(container_cmd) + setup_cmds.append(container_cmd) data["setupCommands"] = setup_cmds return yaml.dump(data, default_flow_style=False, allow_unicode=True), job_name From 52a27446bf10ce0bd12dcd0620bba2802c020143 Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Wed, 15 Apr 2026 12:37:56 +0200 Subject: [PATCH 10/16] add vela support Signed-off-by: Romeo Kienzler --- configs/gpu_perf_hpo.yaml | 13 ++-- examples/run_lsf_gpu_example.sh | 2 +- examples/run_vela_example.sh | 2 +- examples/run_vela_gpu_perf_example.sh | 32 ++++---- examples/vela_gpu_perf_template.yaml | 18 ++--- terratorch_iterate/iterate2.py | 107 ++++++++++++++++++-------- 6 files changed, 107 insertions(+), 67 deletions(-) mode change 100644 => 100755 examples/run_vela_example.sh mode change 100644 => 100755 examples/run_vela_gpu_perf_example.sh diff --git a/configs/gpu_perf_hpo.yaml b/configs/gpu_perf_hpo.yaml index 47d4fb1..bfb12ca 100644 --- a/configs/gpu_perf_hpo.yaml +++ b/configs/gpu_perf_hpo.yaml @@ -13,14 +13,15 @@ # All remaining fixed CLI flags for gpu_performance_test.py # # Metrics: -# Emitted as [performance] lines by the awk rename wrapper in the template. -# Using last-match semantics of extract_metrics_from_log each name is unique. +# The gpu_performance_test script prints "Samples/sec:" three times (DataLoader, +# Training, Inference) and "GFLOPS:" once. iterate2 uses the name#N syntax +# to select the Nth occurrence (0-based) of a repeated metric name. metrics: - - dataloader_samples_per_sec - - training_samples_per_sec - - inference_samples_per_sec - - gflops + - "Samples/sec#0" # DataLoader throughput (1st occurrence) + - "Samples/sec#1" # Training throughput (2nd occurrence) + - "Samples/sec#2" # Inference throughput (3rd occurrence) + - GFLOPS hpo: gpu_num: diff --git a/examples/run_lsf_gpu_example.sh b/examples/run_lsf_gpu_example.sh index 2913513..9808c92 100755 --- a/examples/run_lsf_gpu_example.sh +++ b/examples/run_lsf_gpu_example.sh @@ -22,7 +22,7 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -iterate2 \ +iterate \ --script "${SCRIPT_DIR}/bumpy_function.py" \ --root-dir "${SCRIPT_DIR}" \ --venv "" \ diff --git a/examples/run_vela_example.sh b/examples/run_vela_example.sh old mode 100644 new mode 100755 index 41cdb06..3327864 --- a/examples/run_vela_example.sh +++ b/examples/run_vela_example.sh @@ -35,7 +35,7 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" # Clone mlbatch first: git clone https://github.com/project-codeflare/mlbatch CHART_PATH="${MLBATCH_CHART_PATH:-${REPO_ROOT}/../mlbatch/tools/pytorchjob-generator/chart}" -iterate2 \ +iterate \ --script "gridfm_graphkit train" \ --interpreter "" \ --wlm vela \ diff --git a/examples/run_vela_gpu_perf_example.sh b/examples/run_vela_gpu_perf_example.sh old mode 100644 new mode 100755 index 91448c2..edf76b6 --- a/examples/run_vela_gpu_perf_example.sh +++ b/examples/run_vela_gpu_perf_example.sh @@ -11,13 +11,13 @@ # 3. Patches examples/vela_gpu_perf_template.yaml: # - appends -trial- to jobName # - sets numGpusPerPod = gpu_num -# - replaces {{HPO_COMMAND}} with the CLI call (awk wrapper stays intact) +# - replaces {{HPO_COMMAND}} with the CLI call (plain single-line string) # 4. Submits: # helm template -f | oc create [-n ] -f- # 5. Streams: oc logs -f -master-0 (blocks until container exits) -# 6. Extracts metrics from the renamed [performance] lines: -# dataloader_samples_per_sec, training_samples_per_sec, -# inference_samples_per_sec, gflops +# 6. Extracts metrics using Nth-occurrence syntax (name#N, 0-based): +# Samples/sec#0 (DataLoader), Samples/sec#1 (Training), +# Samples/sec#2 (Inference), GFLOPS # 7. Deletes the PyTorchJob resource. # ============================================================================= @@ -28,19 +28,25 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" # Path to the mlbatch pytorchjob-generator helm chart. # Clone first: git clone https://github.com/project-codeflare/mlbatch -CHART_PATH="${MLBATCH_CHART_PATH:-${REPO_ROOT}/../mlbatch/tools/pytorchjob-generator/chart}" +CHART_PATH="${MLBATCH_CHART_PATH:-${HOME}/tmp/mlbatch/tools/pytorchjob-generator/chart}" -iterate2 \ - --script "/app/.local/lib/python3.12/site-packages/claimed/components/util/gpu_performance_test.py" \ - --interpreter "python" \ - --wlm vela \ - --vela-job-template "${SCRIPT_DIR}/vela_gpu_perf_template.yaml" \ - --vela-chart-path "${CHART_PATH}" \ - --vela-namespace "${OC_NAMESPACE:-}" \ +# Only pass --vela-namespace when OC_NAMESPACE is set; an empty string causes +# argparse to receive a bare "" token it treats as an unrecognised argument. +NAMESPACE_ARG=() +[[ -n "${OC_NAMESPACE:-}" ]] && NAMESPACE_ARG=(--vela-namespace "${OC_NAMESPACE}") + +iterate \ + --script "/app/.local/lib/python3.12/site-packages/claimed/components/util/gpu_performance_test.py" \ + --interpreter "python" \ + --no-underscore-to-hyphen \ + --wlm vela \ + --vela-job-template "${SCRIPT_DIR}/vela_gpu_perf_template.yaml" \ + --vela-chart-path "${CHART_PATH}" \ + "${NAMESPACE_ARG[@]}" \ --vela-pod-ready-timeout 300 \ --vela-job-timeout 7200 \ --gpu-count 1 \ --optuna-study-name gpu_perf_hpo \ --optuna-db-path "sqlite:///gpu_perf_hpo.db" \ - --optuna-n-trials 30 \ + --optuna-n-trials 250 \ --hpo-yaml "${REPO_ROOT}/configs/gpu_perf_hpo.yaml" diff --git a/examples/vela_gpu_perf_template.yaml b/examples/vela_gpu_perf_template.yaml index 830b98d..476a5c7 100644 --- a/examples/vela_gpu_perf_template.yaml +++ b/examples/vela_gpu_perf_template.yaml @@ -15,7 +15,7 @@ imagePullSecrets: numPods: 1 numCpusPerPod: 32 numGpusPerPod: 1 -totalMemoryPerPod: "32Gi" +totalMemoryPerPod: "64Gi" ######################## # Workload Specification @@ -23,15 +23,7 @@ totalMemoryPerPod: "32Gi" setupCommands: - "pip install -U claimed" # iterate2 replaces {{HPO_COMMAND}} with the HPO-parametrised CLI call. - # The awk wrapper renames duplicate "Samples/sec:" lines to unique - # [performance] metric names so iterate2 can extract all four metrics. - - | - {{HPO_COMMAND}} 2>&1 | awk ' - /^--- DataLoader/ { sec="dataloader" } - /^--- Training/ { sec="training" } - /^--- Inference/ { sec="inference" } - /^--- GPU compute/ { sec="gpu" } - /Samples\/sec:/ { print "[performance] " sec "_samples_per_sec: " $NF; next } - /GFLOPS:/ { print "[performance] gflops: " $NF; next } - { print } - ' + # Each setupCommands entry must be a single line (Helm chart constraint). + # Metrics are disambiguated by iterate2 using Samples/sec#N (Nth-occurrence) + # rather than awk renaming. + - "{{HPO_COMMAND}}" diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py index 11c03d7..c8c9033 100644 --- a/terratorch_iterate/iterate2.py +++ b/terratorch_iterate/iterate2.py @@ -241,48 +241,56 @@ def build_vela_job_yaml( container_cmd: str, placeholder: str, ) -> tuple[str, str]: - """Load *template_path*, inject HPO parameters, and return ``(yaml_str, job_name)``. + """Load *template_path* as raw text, inject HPO parameters, return ``(yaml_str, job_name)``. - Changes applied to the template: - * The ``jobName`` field gets a ``-trial-{trial_id}`` suffix so each trial - creates a unique Kubernetes resource. + All modifications are done via targeted regex/string substitutions on the raw + YAML text so that multi-line block scalars (e.g. awk pipelines), single-quoted + strings, and other constructs that PyYAML would mangle on a load→dump round-trip + are preserved exactly as written in the template. + + Changes applied: + * ``jobName`` gets a ``-trial-{trial_id}`` suffix (unique Kubernetes resource). * ``numGpusPerPod`` is overwritten with *gpu_count*. - * Any entry in ``setupCommands`` that equals *placeholder* is replaced with - *container_cmd*. If no entry matches the placeholder, the last entry is - replaced and a warning is emitted. + * The *placeholder* string inside ``setupCommands`` is replaced with + *container_cmd* in-place, preserving any surrounding wrapper (e.g. awk pipeline). """ with open(template_path, "r") as fh: - data = yaml.safe_load(fh) - - # Unique job name per trial - base_name = data.get("jobName", "hpo-job") - job_name = f"{base_name}-trial-{trial_id}" - data["jobName"] = job_name + text = fh.read() + + # ── jobName ────────────────────────────────────────────────────────────── + job_name_match = re.search(r'^(jobName\s*:\s*["\']?)([^"\'#\n]+)(["\']?)', text, re.MULTILINE) + if not job_name_match: + raise ValueError(f"'jobName' key not found in template '{template_path}'") + raw_name = job_name_match.group(2).strip() + job_name = f"{raw_name}-trial-{trial_id}" + text = ( + text[:job_name_match.start(2)] + + job_name_match.group(2).replace(raw_name, job_name) + + text[job_name_match.end(2):] + ) logger.debug("Vela trial %d: jobName → %s", trial_id, job_name) - # GPU count - data["numGpusPerPod"] = gpu_count + # ── numGpusPerPod ──────────────────────────────────────────────────────── + text = re.sub( + r'^(numGpusPerPod\s*:\s*)\S+', + lambda m: f"{m.group(1)}{gpu_count}", + text, + flags=re.MULTILINE, + ) logger.debug("Vela trial %d: numGpusPerPod → %d", trial_id, gpu_count) - # Inject command - setup_cmds = data.get("setupCommands", []) - replaced = False - for i, entry in enumerate(setup_cmds): - if placeholder in str(entry): - # In-place substitution: keeps any wrapper (e.g. awk pipeline) around the placeholder - setup_cmds[i] = str(entry).replace(placeholder, container_cmd) - replaced = True - logger.debug("Vela trial %d: substituted placeholder in setupCommands[%d]", trial_id, i) - break - if not replaced: + # ── placeholder substitution ───────────────────────────────────────────── + if placeholder in text: + text = text.replace(placeholder, container_cmd) + logger.debug("Vela trial %d: substituted placeholder '%s'", trial_id, placeholder) + else: logger.warning( - "Vela trial %d: placeholder '%s' not found in setupCommands – appending command as new entry", - trial_id, placeholder, + "Vela trial %d: placeholder '%s' not found in template '%s' – appending command", + trial_id, placeholder, template_path, ) - setup_cmds.append(container_cmd) - data["setupCommands"] = setup_cmds + text += f"\n - {container_cmd}\n" - return yaml.dump(data, default_flow_style=False, allow_unicode=True), job_name + return text, job_name def _oc(*args, namespace: Optional[str] = None, check: bool = True, capture: bool = False): @@ -419,7 +427,8 @@ def run_vela_trial( exit_code = int(exit_code_str) if exit_code_str.lstrip("-").isdigit() else 0 logger.info("Trial %d: pod exit code = %s", trial_id, exit_code) if exit_code != 0: - raise subprocess.CalledProcessError(exit_code, log_cmd) + logger.warning("Trial %d: pod exited with code %d – marking trial as pruned", trial_id, exit_code) + raise optuna.exceptions.TrialPruned(f"pod exited with code {exit_code}") finally: # ── 6. Cleanup – delete the job ─────────────────────────────────────── @@ -516,6 +525,19 @@ def run_and_stream(launcher_cmd: str, trial_id: int, out_file: str, err_file: st # ============================================================ def extract_metrics_from_log(path: str, metric_names: List[str], err_path: Optional[str] = None) -> List[float]: + """Extract metric values from a log file. + + Each entry in *metric_names* is either a plain name (uses the **last** + match) or ``name#N`` to select the **N-th occurrence** (0-based). This + lets you disambiguate scripts that print the same metric key multiple + times, e.g.:: + + metrics: + - "Samples/sec#0" # DataLoader throughput (first occurrence) + - "Samples/sec#1" # Training throughput (second occurrence) + - "Samples/sec#2" # Inference throughput (third occurrence) + - GFLOPS + """ logger.debug("Extracting metrics %s from '%s'", metric_names, path) results = [] with open(path, "r", encoding="utf-8", errors="ignore") as f: @@ -532,14 +554,33 @@ def extract_metrics_from_log(path: str, metric_names: List[str], err_path: Optio logger.debug("Err file '%s' not found, skipping", err_path) for metric in metric_names: + # Support name#N syntax for Nth-occurrence selection (0-based) + occurrence: Optional[int] = None + bare_metric = metric + idx_match = re.fullmatch(r'(.+)#(\d+)', metric) + if idx_match: + bare_metric = idx_match.group(1) + occurrence = int(idx_match.group(2)) + # Matches: key: value | key=value | [performance] key : value | Lightning table │ key │ value │ pattern = re.compile( - rf"(?:\[\w+\]\s*)?{re.escape(metric)}\s*(?:[:=│])\s*([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)" + rf"(?:\[\w+\]\s*)?{re.escape(bare_metric)}\s*(?:[:=│])\s*([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)" ) matches = pattern.findall(text) if not matches: logger.warning("Metric '%s' not found in '%s' — defaulting to 0.0", metric, path) results.append(0.0) + elif occurrence is not None: + if occurrence >= len(matches): + logger.warning( + "Metric '%s' occurrence #%d requested but only %d match(es) found — defaulting to 0.0", + metric, occurrence, len(matches), + ) + results.append(0.0) + else: + value = float(matches[occurrence]) + logger.debug("Metric '%s': using occurrence #%d = %s", metric, occurrence, value) + results.append(value) else: value = float(matches[-1]) logger.debug("Metric '%s': found %d match(es), using last value %s", metric, len(matches), value) From d27124716510f6522083020ef0ed0855269940da Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Wed, 15 Apr 2026 19:48:21 +0200 Subject: [PATCH 11/16] improve vela config Signed-off-by: Romeo Kienzler --- configs/gridfm_graphkit_hpo.yaml | 5 ++++- examples/vela_gridfm_template.yaml | 14 ++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/configs/gridfm_graphkit_hpo.yaml b/configs/gridfm_graphkit_hpo.yaml index 6ee3a2e..049466f 100644 --- a/configs/gridfm_graphkit_hpo.yaml +++ b/configs/gridfm_graphkit_hpo.yaml @@ -34,6 +34,10 @@ hpo: choices: ["max-autotune", "default", "reduce-overhead", null] # null → --compile flag is omitted entirely + num_workers: + type: categorical + choices: [8, 16, 24, 32] + dataset: type: group # one choice selects all bundled args together choices: @@ -44,5 +48,4 @@ hpo: static: run_name: run1 log_dir: logs - num_workers: 32 report-performance: true diff --git a/examples/vela_gridfm_template.yaml b/examples/vela_gridfm_template.yaml index c30f5ec..9779f06 100644 --- a/examples/vela_gridfm_template.yaml +++ b/examples/vela_gridfm_template.yaml @@ -4,7 +4,7 @@ # every trial creates a unique Kubernetes resource. #################### jobName: "romeokienzler-gridfm-hpo" -containerImage: "us.icr.io/geodn/gridfm-graphkit:0.1" +containerImage: "us.icr.io/geodn/gridfm-graphkit:0.5" imagePullPolicy: "IfNotPresent" imagePullSecrets: - name: "pullsecret-gridfm-geodn" @@ -14,9 +14,9 @@ imagePullSecrets: # NOTE: numGpusPerPod is overridden per-trial by the gpu_num HPO parameter. ################################## numPods: 1 -numCpusPerPod: 32 -numGpusPerPod: 1 # placeholder – iterate2 overwrites this with gpu_num -totalMemoryPerPod: "32Gi" +numCpusPerPod: 64 +numGpusPerPod: 2 # placeholder – iterate overrides this with gpu_num +totalMemoryPerPod: "64Gi" volumes: - name: "gridfm-storage" @@ -27,6 +27,8 @@ volumes: # Workload Specification ######################## setupCommands: - - "wget -q https://raw.githubusercontent.com/gridfm/gridfm-graphkit/refs/heads/main/examples/config/HGNS_PF_datakit_case118.yaml" - # iterate2 replaces the placeholder below with the HPO-parametrised CLI call. + - "pip install -U git+https://github.com/gridfm/gridfm-graphkit.git@fix_multi_gpu_training" + - "wget https://raw.githubusercontent.com/gridfm/gridfm-graphkit/refs/heads/main/examples/config/HGNS_PF_datakit_case118.yaml" + - "sed -i 's/epochs: 200/epochs: 5/g' HGNS_PF_datakit_case118.yaml" + # iterate replaces the placeholder below with the HPO-parametrised CLI call. - "{{HPO_COMMAND}}" From f61e8bbd85f7a4373888f16a24af507a8aec3e66 Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Thu, 16 Apr 2026 16:18:07 +0200 Subject: [PATCH 12/16] fix Signed-off-by: Romeo Kienzler --- examples/run_vela_example.sh | 11 ++++++++--- examples/run_vela_gpu_perf_example.sh | 5 +++-- examples/vela_gridfm_template.yaml | 2 +- terratorch_iterate/iterate2.py | 2 +- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/examples/run_vela_example.sh b/examples/run_vela_example.sh index 3327864..bb5cafc 100755 --- a/examples/run_vela_example.sh +++ b/examples/run_vela_example.sh @@ -33,7 +33,10 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" # Path to the mlbatch pytorchjob-generator helm chart. # Clone mlbatch first: git clone https://github.com/project-codeflare/mlbatch -CHART_PATH="${MLBATCH_CHART_PATH:-${REPO_ROOT}/../mlbatch/tools/pytorchjob-generator/chart}" +CHART_PATH="${MLBATCH_CHART_PATH:-${HOME}/tmp/mlbatch/tools/pytorchjob-generator/chart}" + +NAMESPACE_ARG=() +[[ -n "${OC_NAMESPACE:-}" ]] && NAMESPACE_ARG=(--vela-namespace "${OC_NAMESPACE}") iterate \ --script "gridfm_graphkit train" \ @@ -41,12 +44,14 @@ iterate \ --wlm vela \ --vela-job-template "${SCRIPT_DIR}/vela_gridfm_template.yaml" \ --vela-chart-path "${CHART_PATH}" \ - --vela-namespace "${OC_NAMESPACE:-}" \ + "${NAMESPACE_ARG[@]}" \ --vela-cmd-placeholder "{{HPO_COMMAND}}" \ --vela-pod-ready-timeout 600 \ --vela-job-timeout 86400 \ + --no-underscore-to-hyphen \ --gpu-count 1 \ --optuna-study-name gridfm_vela_hpo \ - --optuna-db-path "sqlite:///gridfm_vela_hpo.db" \ + --optuna-db-path "js:///gridfm_vela_hpo.journal" \ + --parallelism 16 \ --optuna-n-trials 20 \ --hpo-yaml "${REPO_ROOT}/configs/gridfm_graphkit_hpo.yaml" diff --git a/examples/run_vela_gpu_perf_example.sh b/examples/run_vela_gpu_perf_example.sh index edf76b6..bf845b6 100755 --- a/examples/run_vela_gpu_perf_example.sh +++ b/examples/run_vela_gpu_perf_example.sh @@ -47,6 +47,7 @@ iterate \ --vela-job-timeout 7200 \ --gpu-count 1 \ --optuna-study-name gpu_perf_hpo \ - --optuna-db-path "sqlite:///gpu_perf_hpo.db" \ + --optuna-db-path "js:///gpu_perf_hpo.journal" \ --optuna-n-trials 250 \ - --hpo-yaml "${REPO_ROOT}/configs/gpu_perf_hpo.yaml" + --hpo-yaml "${REPO_ROOT}/configs/gpu_perf_hpo.yaml" \ + --parallelism 5 \ diff --git a/examples/vela_gridfm_template.yaml b/examples/vela_gridfm_template.yaml index 9779f06..e75b3cd 100644 --- a/examples/vela_gridfm_template.yaml +++ b/examples/vela_gridfm_template.yaml @@ -29,6 +29,6 @@ volumes: setupCommands: - "pip install -U git+https://github.com/gridfm/gridfm-graphkit.git@fix_multi_gpu_training" - "wget https://raw.githubusercontent.com/gridfm/gridfm-graphkit/refs/heads/main/examples/config/HGNS_PF_datakit_case118.yaml" - - "sed -i 's/epochs: 200/epochs: 5/g' HGNS_PF_datakit_case118.yaml" +# - "sed -i 's/epochs: 200/epochs: 5/g' HGNS_PF_datakit_case118.yaml" # iterate replaces the placeholder below with the HPO-parametrised CLI call. - "{{HPO_COMMAND}}" diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py index c8c9033..86c8e11 100644 --- a/terratorch_iterate/iterate2.py +++ b/terratorch_iterate/iterate2.py @@ -745,7 +745,7 @@ def objective(trial): logger.info("Creating Optuna study (directions: %s)", directions) if args.optuna_db_path.startswith("js:///"): - journal_path = args.optuna_db_path[len("js://"):] + journal_path = args.optuna_db_path[len("js:///"):] logger.info("Using JournalStorage at '%s'", journal_path) storage = JournalStorage(JournalFileStorage(journal_path)) elif "sqlite" in args.optuna_db_path: From d190a8e23f928db6149f379492841ebbfa5add8a Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Thu, 16 Apr 2026 18:44:02 +0200 Subject: [PATCH 13/16] don't crash on fail Signed-off-by: Romeo Kienzler --- terratorch_iterate/iterate2.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py index 86c8e11..658476d 100644 --- a/terratorch_iterate/iterate2.py +++ b/terratorch_iterate/iterate2.py @@ -763,7 +763,12 @@ def objective(trial): logger.info("Study '%s' ready (existing trials: %d)", args.optuna_study_name, len(study.trials)) logger.info("Parallelism: %d worker(s)", args.parallelism) - study.optimize(objective, n_trials=args.optuna_n_trials, n_jobs=args.parallelism) + study.optimize( + objective, + n_trials=args.optuna_n_trials, + n_jobs=args.parallelism, + catch=(Exception,), # mark trial as FAILED and continue; never crash the study + ) logger.info("=" * 60) logger.info("OPTIMIZATION COMPLETE") From b03fb3eb1e55c9bf4f6741c7b3bd10cfff627339 Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Thu, 16 Apr 2026 19:37:24 +0200 Subject: [PATCH 14/16] fix exit code handling Signed-off-by: Romeo Kienzler --- terratorch_iterate/iterate2.py | 60 ++++++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 7 deletions(-) diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py index 658476d..d596791 100644 --- a/terratorch_iterate/iterate2.py +++ b/terratorch_iterate/iterate2.py @@ -416,14 +416,60 @@ def run_vela_trial( t_err.join() logger.debug("Trial %d: log stream ended (rc=%d)", trial_id, log_proc.returncode) + # ── 4b. If oc logs exited early (e.g. "unexpected EOF"), the pod may + # still be running. Re-attach the log stream and wait for it to + # finish so we capture the full output and don't delete a live job. + if log_proc.returncode != 0: + logger.warning( + "Trial %d: oc logs exited with rc=%d (possible EOF disconnect) – " + "waiting for pod to terminate before reading exit code", + trial_id, log_proc.returncode, + ) + # Wait for pod phase Succeeded or Failed (container terminated). + oc_wait_phase = subprocess.run( + ["oc", "wait", f"pod/{master_pod}", + "--for=jsonpath={.status.phase}=Succeeded", + f"--timeout={job_timeout}s"] + + ns_args, + capture_output=True, text=True, + ) + if oc_wait_phase.returncode != 0: + # Pod may have Failed; try that phase too. + subprocess.run( + ["oc", "wait", f"pod/{master_pod}", + "--for=jsonpath={.status.phase}=Failed", + f"--timeout=30s"] + + ns_args, + capture_output=True, text=True, + ) + # Re-stream any log lines written after the disconnect into the same files. + catchup = subprocess.run( + ["oc", "logs", "--tail=-1", master_pod] + ns_args, + capture_output=True, text=True, + ) + if catchup.stdout: + with open(out_file, "a", encoding="utf-8", errors="replace") as fh: + fh.write(catchup.stdout) + if catchup.stderr: + with open(err_file, "a", encoding="utf-8", errors="replace") as fh: + fh.write(catchup.stderr) + # ── 5. Check pod exit code ──────────────────────────────────────────── - ec_result = subprocess.run( - ["oc", "get", "pod", master_pod, "-o", - "jsonpath={.status.containerStatuses[0].state.terminated.exitCode}"] - + ns_args, - capture_output=True, text=True, - ) - exit_code_str = ec_result.stdout.strip() + # Poll until the pod has a terminated exit code (handles the race + # between oc-logs EOF and pod termination being recorded in the API). + exit_code_str = "" + for _attempt in range(30): + ec_result = subprocess.run( + ["oc", "get", "pod", master_pod, "-o", + "jsonpath={.status.containerStatuses[0].state.terminated.exitCode}"] + + ns_args, + capture_output=True, text=True, + ) + exit_code_str = ec_result.stdout.strip() + if exit_code_str.lstrip("-").isdigit(): + break + logger.debug("Trial %d: exit code not yet available, retrying in 5 s…", trial_id) + time.sleep(5) exit_code = int(exit_code_str) if exit_code_str.lstrip("-").isdigit() else 0 logger.info("Trial %d: pod exit code = %s", trial_id, exit_code) if exit_code != 0: From 989de122b80d2736123d29b06862a58897320c46 Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Fri, 17 Apr 2026 14:27:45 +0200 Subject: [PATCH 15/16] add lsf config example for gridfm Signed-off-by: Romeo Kienzler --- examples/run_lsf_gridfm_example.sh | 48 ++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100755 examples/run_lsf_gridfm_example.sh diff --git a/examples/run_lsf_gridfm_example.sh b/examples/run_lsf_gridfm_example.sh new file mode 100755 index 0000000..e140776 --- /dev/null +++ b/examples/run_lsf_gridfm_example.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +# ============================================================================= +# Example: iterate with --wlm lsf for gridfm-graphkit HPO on an LSF cluster +# +# Prerequisites +# ------------- +# * LSF bsub/bjobs available on PATH +# * gridfm-graphkit installed in the venv (or via module load) +# * configs/gridfm_graphkit_hpo.yaml present +# +# How it works +# ------------ +# 1. For each Optuna trial iterate: +# a. Samples hyperparameters from gridfm_graphkit_hpo.yaml +# b. Builds the gridfm_graphkit CLI invocation from static + sampled params +# c. Submits a bsub job (-K blocks until completion) +# d. Reads stdout/stderr from trial.out / trial.err +# e. Extracts metrics and reports them to Optuna +# +# Customise +# --------- +# LSF_GPU_CONFIG – full -gpu option string for bsub +# GPU_COUNT – must match num= in LSF_GPU_CONFIG +# OC_NAMESPACE – not used for LSF; kept as no-op for parity +# ============================================================================= + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +# Adjust to match your cluster's GPU model and scheduling policy. +LSF_GPU_CONFIG="${LSF_GPU_CONFIG:-num=2:mode=exclusive_process:mps=no:gmodel=NVIDIAA100_SXM4_80GB}" + +iterate \ + --script "gridfm_graphkit train" \ + --interpreter "" \ + --wlm lsf \ + --no-underscore-to-hyphen \ + --gpu-count 2 \ + --cpu-count 32 \ + --mem-gb 256 \ + --lsf-gpu-config-string "${LSF_GPU_CONFIG}" \ + --optuna-study-name gridfm_lsf_hpo \ + --optuna-db-path "js:///gridfm_lsf_hpo.journal" \ + --parallelism 4 \ + --optuna-n-trials 20 \ + --hpo-yaml "${REPO_ROOT}/configs/gridfm_graphkit_hpo.yaml" From f3283adab9ab9343ba08b18723489489e8254e04 Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Fri, 17 Apr 2026 15:44:38 +0200 Subject: [PATCH 16/16] add prerun commands Signed-off-by: Romeo Kienzler --- examples/run_lsf_gridfm_example.sh | 7 +++++++ terratorch_iterate/iterate2.py | 21 +++++++++++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/examples/run_lsf_gridfm_example.sh b/examples/run_lsf_gridfm_example.sh index e140776..5afb84e 100755 --- a/examples/run_lsf_gridfm_example.sh +++ b/examples/run_lsf_gridfm_example.sh @@ -32,10 +32,17 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" # Adjust to match your cluster's GPU model and scheduling policy. LSF_GPU_CONFIG="${LSF_GPU_CONFIG:-num=2:mode=exclusive_process:mps=no:gmodel=NVIDIAA100_SXM4_80GB}" +# Commands to run inside the bsub job before launching the training script. +# source ~/.bashrc ensures module / mamba initialisation is available; +# micromamba activate gridfm switches to the correct conda environment. +PRE_RUN="${PRE_RUN_COMMANDS:-source ~/.bashrc && micromamba activate gridfm}" + iterate \ --script "gridfm_graphkit train" \ --interpreter "" \ + --root-dir "${GRIDFM_ROOT:-${HOME}/gitco/gridfm-graphkit}" \ --wlm lsf \ + --pre-run-commands "${PRE_RUN}" \ --no-underscore-to-hyphen \ --gpu-count 2 \ --cpu-count 32 \ diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py index d596791..a22ad94 100644 --- a/terratorch_iterate/iterate2.py +++ b/terratorch_iterate/iterate2.py @@ -33,7 +33,17 @@ def parse_args(): # ------------------------ parser.add_argument("--script", required=True, help="Training script to execute") parser.add_argument("--root-dir", default=None, help="Root dir (derived if omitted)") - parser.add_argument("--venv", default=".venv", help="Virtualenv dir") + parser.add_argument("--venv", default=".venv", help="Virtualenv dir (shortcut for source /bin/activate)") + parser.add_argument( + "--pre-run-commands", + default=None, + help=( + "Shell commands to run before the training script, joined with ' && '. " + "Useful for sourcing bashrc, activating conda/mamba envs, loading modules, etc. " + "Example: 'source ~/.bashrc && micromamba activate gridfm'. " + "When set, --venv is ignored." + ), + ) parser.add_argument("--interpreter", default="python", help="Interpreter to use") parser.add_argument("--param-setter", type=str, default=None) parser.add_argument("--wlm", choices=["lsf", "slurm", "openshift", "vela", "none"], default="none") @@ -164,9 +174,12 @@ def build_launcher_command(wlm, cmd, trial_id, out_file, err_file, gpu_count, cp logger.debug("Launcher command: %s", launcher) return launcher -def build_shell_command(interpreter, root_dir, script_path, venv, script_args, param_setter, underscore_to_hyphen=True): +def build_shell_command(interpreter, root_dir, script_path, venv, script_args, param_setter, underscore_to_hyphen=True, pre_run_commands=None): parts = [f"cd {root_dir}"] - if venv: + if pre_run_commands: + parts.append(pre_run_commands) + logger.debug("Pre-run commands: %s", pre_run_commands) + elif venv: parts.append(f"source {venv}/bin/activate") logger.debug("Activating venv: %s", venv) arg_list = [f"{interpreter} {script_path}"] @@ -774,7 +787,7 @@ def objective(trial): ) else: # ── Standard WLM path (lsf / slurm / none) ──────────────────── - shell_cmd = build_shell_command(args.interpreter, root_dir, script_path, args.venv, script_args, args.param_setter, args.underscore_to_hyphen) + shell_cmd = build_shell_command(args.interpreter, root_dir, script_path, args.venv, script_args, args.param_setter, args.underscore_to_hyphen, pre_run_commands=args.pre_run_commands) launcher_cmd = build_launcher_command(args.wlm, shell_cmd, trial.number, out_file, err_file, gpu_count, args.cpu_count, args.mem_gb, args.lsf_gpu_config_string) logger.info("Trial %d: submitting → %s", trial.number, launcher_cmd) run_and_stream(launcher_cmd, trial.number, out_file, err_file, args.wlm)