From 6440cb13c80962fd670a87d19fb3f3da9ae37979 Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Fri, 24 Apr 2026 11:31:46 +0200 Subject: [PATCH 01/12] Refactor WLM integration in iterate2: Introduce plugin system for LSF and Vela - Removed hardcoded WLM options and parameters from the argument parser. - Added support for a user-defined WLM plugin via `--wlm-plugin` argument. - Implemented `load_wlm_config` function to read WLM settings from HPO YAML. - Created reference implementations for LSF and Vela plugins in `examples/wlm_plugins/`. - Updated `run_and_stream` to handle local execution and WLM plugin invocation. - Enhanced logging to provide clearer feedback on trial execution and WLM interactions. - Cleaned up unused functions and parameters related to previous WLM handling. Signed-off-by: Romeo Kienzler --- configs/gridfm_graphkit_hpo.yaml | 23 + docs/iterate2.md | 126 ++--- examples/bumpy_hpo.yaml | 10 + examples/run_lsf_gridfm_example_postgres.sh | 91 ++-- examples/run_setter_example.sh | 1 - examples/run_vela_example.sh | 66 +-- examples/wlm_plugins/README.md | 85 ++++ examples/wlm_plugins/lsf_plugin.sh | 88 ++++ examples/wlm_plugins/vela_plugin.py | 251 ++++++++++ terratorch_iterate/iterate2/_iterate2.py | 504 ++++---------------- 10 files changed, 683 insertions(+), 562 deletions(-) create mode 100644 examples/wlm_plugins/README.md create mode 100755 examples/wlm_plugins/lsf_plugin.sh create mode 100755 examples/wlm_plugins/vela_plugin.py diff --git a/configs/gridfm_graphkit_hpo.yaml b/configs/gridfm_graphkit_hpo.yaml index d578e19..26047dc 100644 --- a/configs/gridfm_graphkit_hpo.yaml +++ b/configs/gridfm_graphkit_hpo.yaml @@ -52,3 +52,26 @@ static: run_name: run1 log_dir: logs report-performance: true + +# ======================== +# WLM plugin configuration +# Keys are forwarded to the WLM plugin as ITERATE_WLM_ env vars. +# Uncomment and adapt the block that matches your plugin. +# ======================== +# +# --- LSF (examples/wlm_plugins/lsf_plugin.sh) --- +# wlm: +# gpu-count: 1 +# cpu-count: 16 +# mem-gb: 32 +# lsf-gpu-config: "num=1:mode=exclusive_process:mps=no:gmodel=NVIDIAA100_SXM4_80GB" +# # queue: normal +# +# --- Vela / OpenShift (examples/wlm_plugins/vela_plugin.py) --- +# wlm: +# job-template: examples/vela_gridfm_template.yaml +# chart-path: ~/tmp/mlbatch/tools/pytorchjob-generator/chart +# namespace: my-openshift-project +# cmd-placeholder: "{{HPO_COMMAND}}" +# pod-ready-timeout: 600 +# job-timeout: 86400 diff --git a/docs/iterate2.md b/docs/iterate2.md index c21359e..74d9e44 100644 --- a/docs/iterate2.md +++ b/docs/iterate2.md @@ -6,19 +6,27 @@ Key capabilities: - **Multi-objective optimisation** — extract and optimise several metrics simultaneously (Pareto front) - **Five HPO parameter types** — `float`, `int`, `categorical`, `flag` (store-true), `group` (bundled arg sets) -- **Dynamic GPU count per trial** — `gpu_num` in the HPO space controls the WLM resource request per trial +- **Dynamic GPU count per trial** — `gpu_num` in the HPO space is passed to the WLM plugin via `ITERATE_WLM_GPU_COUNT` - **Null-omission** — `null` in a `categorical` choice causes the flag to be completely absent from the command line -- **Workload manager backends** — LSF, Slurm, or direct local execution +- **WLM plugin system** — any executable (bash, Python, …) can be used as a workload-manager backend; reference implementations for LSF and Vela/OpenShift are in `examples/wlm_plugins/` ## Quick start ```sh iterate2 \ --script train.py \ - --wlm lsf \ - --gpu-count 1 \ - --cpu-count 20 \ - --mem-gb 512 \ + --wlm-plugin examples/wlm_plugins/lsf_plugin.sh \ + --optuna-study-name my_study \ + --optuna-db-path sqlite:///hpo.db \ + --optuna-n-trials 50 \ + --hpo-yaml hpo_space.yaml # wlm: section sets gpu-count, cpu-count, … +``` + +For local execution (no cluster) simply omit `--wlm-plugin`: + +```sh +iterate2 \ + --script train.py \ --optuna-study-name my_study \ --optuna-db-path sqlite:///hpo.db \ --optuna-n-trials 50 \ @@ -36,26 +44,9 @@ iterate2 \ | `--venv` | `.venv` | Virtual-environment directory to activate. Set to empty string to disable | | `--interpreter` | `python` | Python interpreter to invoke | | `--param-setter` | `None` | Use setter-style argument passing (see [Setter-style arguments](#setter-style-arguments)) | -| `--wlm` | `none` | Workload manager: `lsf`, `slurm`, `vela`, or `none` | -| `--gpu-count` | `1` | Number of GPUs per trial | -| `--cpu-count` | `4` | Number of CPUs per trial | -| `--mem-gb` | `128` | Memory (GB) per trial | -| `--lsf-gpu-config-string` | `None` | Optional verbatim LSF `-gpu` option string (see [GPU configuration](#gpu-configuration-on-lsf)) | +| `--wlm-plugin` | *(local)* | Path to an executable WLM plugin script. When omitted, trials run locally in the current process | | `--parallelism` | `1` | Number of trials to run in parallel (see [Parallel execution](#parallel-execution)) | -### Vela (OpenShift) options - -Required when `--wlm vela`. - -| Option | Default | Description | -|---|---|---| -| `--vela-job-template` | *(required)* | Path to the Vela job YAML template. `{{HPO_COMMAND}}` in `setupCommands` is replaced per trial | -| `--vela-chart-path` | *(required)* | Path to the `pytorchjob-generator` helm chart directory | -| `--vela-namespace` | *(current context)* | OpenShift/Kubernetes namespace | -| `--vela-cmd-placeholder` | `{{HPO_COMMAND}}` | String in `setupCommands` that is replaced with the HPO-parametrised CLI call | -| `--vela-pod-ready-timeout` | `600` | Seconds to wait for the trial pod to reach Running state | -| `--vela-job-timeout` | `86400` | Seconds to wait (streaming logs) for the job to complete | - ### Optuna options | Option | Default | Description | @@ -189,7 +180,10 @@ Optuna tracks the choice as a single categorical (`dataset = "case2000"`), but t ##### `gpu_num` — dynamic GPU count -The special key `gpu_num` (as `categorical` or `int`) overrides `--gpu-count` for the **WLM resource request** of each individual trial. It is consumed by `iterate2` and never forwarded to the wrapped script. +The special key `gpu_num` (as `categorical` or `int`) is automatically extracted +from the sampled parameters and forwarded to the WLM plugin as +`ITERATE_WLM_GPU_COUNT`. It does **not** appear in the wrapped script's command +line. The WLM plugin uses it to set the cluster resource request for the trial. ```yaml gpu_num: @@ -197,6 +191,9 @@ gpu_num: choices: [1, 2, 4] ``` +Alternatively, set a fixed `gpu-count` in the `wlm:` section of the HPO YAML +when all trials use the same number of GPUs. + ### Static arguments Arguments passed unchanged to every trial. Can be supplied inline or via file: @@ -270,46 +267,59 @@ iterate2 --param-setter set ... --- -## GPU configuration on LSF +## WLM plugin system + +iteate2 has no built-in knowledge of any workload manager. Instead it calls a +user-supplied **plugin script** once per trial. The plugin can be any +executable (bash, Python, …). -When `--wlm lsf` is selected, `iterate2` constructs a `bsub` command for each trial. +### Plugin interface -### Default behaviour +iterate2 calls the plugin with no positional arguments. All information is +delivered through environment variables: -| `--gpu-count` | Generated fragment | +| Variable | Description | |---|---| -| `> 0` (default `1`) | `-gpu num=` | -| `0` | *(no `-gpu` flag, CPU-only job)* | +| `ITERATE_TRIAL_NUMBER` | Integer trial ID | +| `ITERATE_TRIAL_CMD` | Full shell command (with `cd`, `source venv`) – suited for HPC WLMs | +| `ITERATE_TRIAL_CONTAINER_CMD` | Bare CLI invocation (no `cd`/`source`) – suited for container-based systems | +| `ITERATE_OUT_FILE` | File where **stdout** must be written | +| `ITERATE_ERR_FILE` | File where **stderr** must be written | +| `ITERATE_WLM_` | Every key from the YAML `wlm:` section (uppercased, hyphens → underscores) | -### `--lsf-gpu-config-string` +The plugin must exit **0** on success; any other exit code marks the trial as +failed in Optuna. -For advanced LSF GPU scheduling you can supply the full value of the `-gpu` option as a string. When set, it **completely replaces** the auto-generated `-gpu num=` fragment. +### WLM configuration in the HPO YAML -```sh -iterate2 \ - --wlm lsf \ - --lsf-gpu-config-string "num=1:mode=exclusive_process:mps=yes:gmodel=NVIDIAA100_SXM4_80GB" \ - --cpu-count 20 \ - --mem-gb 512 \ - ... -``` +All WLM-specific parameters (GPU count, memory, queue, job template path, …) +live in an optional `wlm:` section of the HPO YAML: -This produces a `bsub` submission resembling: +```yaml +hpo: + lr: { type: float, low: 1e-5, high: 1e-2, log: true } -```sh -bsub -n 20 -R "span[hosts=1]" \ - -gpu "num=1:mode=exclusive_process:mps=yes:gmodel=NVIDIAA100_SXM4_80GB" \ - -M 512G -J hpo_trial_0 \ - "cd /my/root && source .venv/bin/activate && python train.py ..." +static: + epochs: 50 + +# WLM config – forwarded as ITERATE_WLM_* env vars to the plugin +wlm: + gpu-count: 1 + cpu-count: 8 + mem-gb: 32 + lsf-gpu-config: "num=1:mode=exclusive_process:mps=no:gmodel=NVIDIAA100_SXM4_80GB" ``` -!!! note - `--gpu-count` is still used for the `rusage` memory/CPU reservation string even when `--lsf-gpu-config-string` is set. Set it to match the `num=` value in your GPU string. +### Reference plugins -!!! tip - Use exclusive process mode (`mode=exclusive_process`) together with MPS (`mps=yes`) to share a single A100 across multiple MPS clients while still pinning the job to one physical GPU. +See `examples/wlm_plugins/` for fully documented reference implementations: ---- +| Plugin | WLM | +|---|---| +| `lsf_plugin.sh` | IBM Spectrum LSF (`bsub -K`) | +| `vela_plugin.py` | OpenShift / MLBatch PyTorchJob (`helm template \| oc create`) | + +Writing a SLURM plugin follows the same pattern as `lsf_plugin.sh`. --- @@ -320,7 +330,7 @@ By default `iterate2` runs one trial at a time. Pass `--parallelism N` to run up ```sh iterate2 \ --parallelism 4 \ - --wlm lsf \ + --wlm-plugin examples/wlm_plugins/lsf_plugin.sh \ ... ``` @@ -344,12 +354,12 @@ Output from concurrent trials is prefixed so you can follow individual workers: ### Output files -| WLM | stdout | stderr | -|---|---|---| -| `none` | `trial_N.out` (written by iterate2) | `trial_N.err` (written by iterate2) | -| `lsf` / `slurm` | `trial_N.out` (written by WLM on cluster) | `trial_N.err` (written by WLM on cluster) | +iteate2 tells the plugin where to write output via `ITERATE_OUT_FILE` / +`ITERATE_ERR_FILE`. The plugin is responsible for directing its job's +stdout/stderr to those files. iterate2 extracts metrics from them after the +plugin exits. -For WLM backends the local WLM tool output (bsub/srun status messages) is written to `trial_N_wlm.out` / `trial_N_wlm.err` so the cluster-managed files are never overwritten. +For local execution (no plugin) iterate2 writes them directly: ### SQLite and parallelism diff --git a/examples/bumpy_hpo.yaml b/examples/bumpy_hpo.yaml index 70a16bf..0ceee54 100644 --- a/examples/bumpy_hpo.yaml +++ b/examples/bumpy_hpo.yaml @@ -4,6 +4,16 @@ static: global-mu: 23 42 66 + +# ======================= +# WLM plugin configuration +# Keys are forwarded to the WLM plugin as ITERATE_WLM_ env vars. +# Omit this section entirely when running locally (no --wlm-plugin). +# ======================= +# wlm: +# gpu-count: 1 +# cpu-count: 4 +# mem-gb: 32 # ======================== # Training hyperparameters - evaluated by optuna and passed to the underlying training script diff --git a/examples/run_lsf_gridfm_example_postgres.sh b/examples/run_lsf_gridfm_example_postgres.sh index 197998e..c1a1f3d 100755 --- a/examples/run_lsf_gridfm_example_postgres.sh +++ b/examples/run_lsf_gridfm_example_postgres.sh @@ -1,42 +1,33 @@ #!/usr/bin/env bash # ============================================================================= -# Example: iterate --wlm lsf with PostgreSQL coordinator for gridfm-graphkit HPO +# Example: iterate2 with the LSF WLM plugin and PostgreSQL coordinator # -# Each Optuna trial is submitted as an LSF job that looks like: +# Each Optuna trial is submitted as an LSF job by the plugin script +# examples/wlm_plugins/lsf_plugin.sh. That script reads all LSF +# resource settings from ITERATE_WLM_* env vars which are populated from +# the ``wlm:`` section in the HPO YAML (configs/gridfm_graphkit_hpo.yaml). # -# bsub -gpu "num=1:mode=exclusive_process:mps=no:gmodel=NVIDIAA100_SXM4_80GB" \ -# -K -o trial.out -e trial.err \ -# -R "rusage[ngpus=1, cpu=16, mem=32GB]" \ -# -J hpo_trial_ \ -# "export PATH='/opt/share/cuda-12.8.1/bin:$PATH' && \ -# export CUDA_HOME='/opt/share/cuda-12.8.1/' && \ -# export LD_LIBRARY_PATH='/opt/share/cuda-12.8.1/lib64:$LD_LIBRARY_PATH' && \ -# cd /dccstor/terratorch/users/rkie/gitco/gridfm-graphkit && \ -# source /u/rkie/venvs/venv_gridfm-graphkit/bin/activate && \ -# gridfm_graphkit train " +# What changed vs the old --wlm lsf approach +# ------------------------------------------ +# * --wlm lsf, --gpu-count, --cpu-count, --mem-gb, --lsf-gpu-config-string +# are gone; all of these now live in the wlm: block of the HPO YAML. +# * --wlm-plugin points to the LSF plugin script (user-owned). +# * The plugin can be customised freely without touching iterate2 itself. # # Prerequisites # ------------- # * LSF bsub/bjobs available on PATH # * gridfm-graphkit installed in the venv below -# * configs/gridfm_graphkit_hpo.yaml present +# * configs/gridfm_graphkit_hpo.yaml present (with wlm: section filled in) # * psycopg2-binary installed: pip install 'terratorch-iterate[postgresql]' # * POSTGRES_URL set (or hard-code it in --optuna-db-path below) # # PostgreSQL coordinator # ---------------------- -# Using PostgreSQL instead of SQLite / JournalFS is the recommended backend for -# high-parallelism HPO on a cluster: multiple bsub jobs can safely write trial -# results concurrently without lock contention. -# -# Set the connection URL as an env-var to avoid embedding credentials in scripts -# that may end up in version control: +# Using PostgreSQL instead of SQLite / JournalFS is the recommended backend +# for high-parallelism HPO on a cluster. # # export POSTGRES_URL="postgresql://user:password@host:5432/optuna_studies" -# -# or pass it inline: -# -# POSTGRES_URL="postgresql://..." bash run_lsf_gridfm_example_postgres.sh # ============================================================================= set -euo pipefail @@ -56,20 +47,9 @@ GRIDFM_ROOT="${GRIDFM_ROOT:-/dccstor/terratorch/users/rkie/gitco/gridfm-graphkit GRIDFM_VENV="${GRIDFM_VENV:-/u/rkie/venvs/venv_gridfm-graphkit}" CUDA_BASE="${CUDA_BASE:-/opt/share/cuda-12.8.1}" DATA_PATH="${DATA_PATH:-/u/rkie/}" -LOG_DIR="${LOG_DIR:-logs}" - -# --------------------------------------------------------------------------- -# LSF GPU resource string -# Adjust gmodel to the GPU type available on your cluster. -# --------------------------------------------------------------------------- -LSF_GPU_CONFIG="${LSF_GPU_CONFIG:-num=1:mode=exclusive_process:mps=no:gmodel=NVIDIAA100_SXM4_80GB}" # --------------------------------------------------------------------------- # Pre-run commands executed inside every bsub job before the training script. -# Order matters: -# 1. Export CUDA paths so the GPU driver / toolkit is visible. -# 2. cd into the project root so relative config paths resolve correctly. -# 3. Activate the project venv. # --------------------------------------------------------------------------- PRE_RUN="\ export PATH='${CUDA_BASE}/bin:\$PATH' && \ @@ -79,31 +59,20 @@ cd '${GRIDFM_ROOT}' && \ source '${GRIDFM_VENV}/bin/activate'" # --------------------------------------------------------------------------- -# Static training arguments (not part of the HPO search space). -# These are appended verbatim after the sampled hyperparameters. -# --------------------------------------------------------------------------- -STATIC_ARGS_JSON='{ - "log_dir": "'"${LOG_DIR}"'", - "report-performance": true -}' - -# --------------------------------------------------------------------------- -# Launch iterate +# Launch iterate2 +# +# Resource settings (gpu-count, cpu-count, mem-gb, lsf-gpu-config) are +# defined in the wlm: section of gridfm_graphkit_hpo.yaml – not here. # --------------------------------------------------------------------------- -iterate \ - --script "gridfm_graphkit train" \ - --interpreter "" \ - --root-dir "${GRIDFM_ROOT}" \ - --wlm lsf \ - --pre-run-commands "${PRE_RUN}" \ - --no-underscore-to-hyphen \ - --gpu-count 1 \ - --cpu-count 16 \ - --mem-gb 32 \ - #--lsf-gpu-config-string "${LSF_GPU_CONFIG}" \ - --optuna-study-name gridfm_lsf_postgres_hpo \ - --optuna-db-path "${POSTGRES_URL}" \ - --parallelism 4 \ - --optuna-n-trials 20 \ - --hpo-yaml "${REPO_ROOT}/configs/gridfm_graphkit_hpo.yaml" \ - --static-args-json "${STATIC_ARGS_JSON}" +iterate2 \ + --script "gridfm_graphkit train" \ + --interpreter "" \ + --root-dir "${GRIDFM_ROOT}" \ + --wlm-plugin "${SCRIPT_DIR}/wlm_plugins/lsf_plugin.sh" \ + --pre-run-commands "${PRE_RUN}" \ + --no-underscore-to-hyphen \ + --optuna-study-name gridfm_lsf_postgres_hpo \ + --optuna-db-path "${POSTGRES_URL}" \ + --parallelism 4 \ + --optuna-n-trials 20 \ + --hpo-yaml "${REPO_ROOT}/configs/gridfm_graphkit_hpo.yaml" diff --git a/examples/run_setter_example.sh b/examples/run_setter_example.sh index cb62e01..81d56d4 100755 --- a/examples/run_setter_example.sh +++ b/examples/run_setter_example.sh @@ -27,7 +27,6 @@ iterate2 \ --root-dir "${SCRIPT_DIR}" \ --venv "" \ --param-setter set \ - --wlm none \ --optuna-study-name bumpy_setter_study \ --optuna-db-path "sqlite:///bumpy_setter_hpo.db" \ --optuna-n-trials 20 \ diff --git a/examples/run_vela_example.sh b/examples/run_vela_example.sh index bb5cafc..e6f3234 100755 --- a/examples/run_vela_example.sh +++ b/examples/run_vela_example.sh @@ -1,6 +1,19 @@ #!/usr/bin/env bash # ============================================================================= -# Example: iterate2 with --wlm vela (OpenShift / MLBatch PyTorchJob) +# Example: iterate2 with the Vela WLM plugin (OpenShift / MLBatch PyTorchJob) +# +# Each Optuna trial is submitted as a PyTorchJob by the plugin script +# examples/wlm_plugins/vela_plugin.py. That script reads all Vela/oc +# settings from ITERATE_WLM_* env vars which are populated from the +# ``wlm:`` section in the HPO YAML. +# +# What changed vs the old --wlm vela approach +# ------------------------------------------- +# * --wlm vela, --vela-job-template, --vela-chart-path, --vela-namespace, +# --vela-cmd-placeholder, --vela-pod-ready-timeout, --vela-job-timeout +# are gone; all of these now live in the wlm: block of the HPO YAML. +# * --wlm-plugin points to the Vela plugin script (user-owned). +# * The plugin can be customised freely without touching iterate2 itself. # # Prerequisites # ------------- @@ -8,22 +21,7 @@ # * oc CLI logged in to the target cluster # * mlbatch/tools/pytorchjob-generator/chart checked out locally # * The gridfm HPO YAML (configs/gridfm_graphkit_hpo.yaml) present -# -# How it works -# ------------ -# 1. For each Optuna trial iterate2: -# a. Samples hyperparameters from gridfm_graphkit_hpo.yaml -# b. Builds the gridfm_graphkit CLI invocation from static + sampled params -# c. Patches vela_gridfm_template.yaml: -# - appends "-trial-" to jobName (unique resource per trial) -# - sets numGpusPerPod = gpu_num (from the HPO space) -# - replaces {{HPO_COMMAND}} (the actual CLI call) -# d. Runs: helm template -f | oc create -f- -# e. Polls until -master-0 pod is Running -# f. Streams: oc logs -f -master-0 -# (blocks until container exits; output captured for metric extraction) -# g. Checks pod exit code; deletes the PyTorchJob resource -# 2. Metrics are extracted from the captured log and returned to Optuna. +# and the wlm: section filled in (see configs/gridfm_graphkit_hpo.yaml) # ============================================================================= set -euo pipefail @@ -31,27 +29,13 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" -# Path to the mlbatch pytorchjob-generator helm chart. -# Clone mlbatch first: git clone https://github.com/project-codeflare/mlbatch -CHART_PATH="${MLBATCH_CHART_PATH:-${HOME}/tmp/mlbatch/tools/pytorchjob-generator/chart}" - -NAMESPACE_ARG=() -[[ -n "${OC_NAMESPACE:-}" ]] && NAMESPACE_ARG=(--vela-namespace "${OC_NAMESPACE}") - -iterate \ - --script "gridfm_graphkit train" \ - --interpreter "" \ - --wlm vela \ - --vela-job-template "${SCRIPT_DIR}/vela_gridfm_template.yaml" \ - --vela-chart-path "${CHART_PATH}" \ - "${NAMESPACE_ARG[@]}" \ - --vela-cmd-placeholder "{{HPO_COMMAND}}" \ - --vela-pod-ready-timeout 600 \ - --vela-job-timeout 86400 \ - --no-underscore-to-hyphen \ - --gpu-count 1 \ - --optuna-study-name gridfm_vela_hpo \ - --optuna-db-path "js:///gridfm_vela_hpo.journal" \ - --parallelism 16 \ - --optuna-n-trials 20 \ - --hpo-yaml "${REPO_ROOT}/configs/gridfm_graphkit_hpo.yaml" +iterate2 \ + --script "gridfm_graphkit train" \ + --interpreter "" \ + --wlm-plugin "${SCRIPT_DIR}/wlm_plugins/vela_plugin.py" \ + --no-underscore-to-hyphen \ + --optuna-study-name gridfm_vela_hpo \ + --optuna-db-path "js:///gridfm_vela_hpo.journal" \ + --parallelism 16 \ + --optuna-n-trials 20 \ + --hpo-yaml "${REPO_ROOT}/configs/gridfm_graphkit_hpo.yaml" diff --git a/examples/wlm_plugins/README.md b/examples/wlm_plugins/README.md new file mode 100644 index 0000000..1dcf41c --- /dev/null +++ b/examples/wlm_plugins/README.md @@ -0,0 +1,85 @@ +# WLM Plugins + +This directory contains reference implementations of **iterate2 WLM plugins** – +executable scripts that submit, wait for, and validate individual Optuna trials +on different workload managers. + +## How the plugin system works + +When you pass `--wlm-plugin /path/to/plugin` to `iterate2`, it is invoked +**once per trial** with a set of environment variables that describe the work +to be done. The plugin is responsible for: + +1. Submitting the trial to the cluster / WLM. +2. Waiting until the job completes. +3. Ensuring trial stdout is written to `$ITERATE_OUT_FILE` and stderr to + `$ITERATE_ERR_FILE` (iterate2 extracts metrics from these files). +4. Exiting **0** on success, non-zero on failure. + +iterate2 marks the Optuna trial as **FAILED** on a non-zero exit code. + +### Environment variables provided by iterate2 + +| Variable | Description | +|---|---| +| `ITERATE_TRIAL_NUMBER` | Integer trial ID | +| `ITERATE_TRIAL_CMD` | Full shell command (with `cd`, `source venv`, etc.) – use for SSH / HPC WLMs | +| `ITERATE_TRIAL_CONTAINER_CMD` | Bare CLI invocation (no `cd`/`source`) – use for container-based WLMs (Vela/k8s) | +| `ITERATE_OUT_FILE` | Path where **stdout** must be written | +| `ITERATE_ERR_FILE` | Path where **stderr** must be written | +| `ITERATE_WLM_` | Every key from the `wlm:` YAML section, uppercased with hyphens→underscores | + +### WLM configuration in the HPO YAML + +All WLM-specific settings (GPU count, queue, job template path, …) live in the +`wlm:` section of the HPO YAML. This keeps the launch script clean: + +```yaml +# my_hpo.yaml +hpo: + lr: + type: float + low: 1e-5 + high: 1e-2 + log: true + +static: + epochs: 50 + +wlm: # keys forwarded as ITERATE_WLM_* env vars + gpu-count: 1 + cpu-count: 8 + mem-gb: 32 +``` + +The corresponding launch script only needs: + +```bash +iterate2 \ + --script train.py \ + --wlm-plugin wlm_plugins/lsf_plugin.sh \ + --hpo-yaml my_hpo.yaml \ + ... +``` + +## Provided plugins + +| Plugin | WLM | Notes | +|---|---|---| +| [`lsf_plugin.sh`](lsf_plugin.sh) | IBM Spectrum LSF | Uses `bsub -K`; reads `gpu-count`, `cpu-count`, `mem-gb`, `lsf-gpu-config`, `queue` from the `wlm:` section | +| [`vela_plugin.py`](vela_plugin.py) | OpenShift / MLBatch PyTorchJob | Uses `helm template \| oc create`; reads `job-template`, `chart-path`, `namespace`, `cmd-placeholder`, `pod-ready-timeout`, `job-timeout` from the `wlm:` section | + +## Writing your own plugin + +Any executable (shell script, Python script, compiled binary) works. Minimal +example that runs the trial locally: + +```bash +#!/usr/bin/env bash +# trivial_plugin.sh – run trial locally, redirect output to the log files +bash -c "${ITERATE_TRIAL_CMD}" \ + > "${ITERATE_OUT_FILE}" 2> "${ITERATE_ERR_FILE}" +``` + +For a SLURM example you can follow the same pattern as `lsf_plugin.sh`, +replacing `bsub` with `srun` / `sbatch`. diff --git a/examples/wlm_plugins/lsf_plugin.sh b/examples/wlm_plugins/lsf_plugin.sh new file mode 100755 index 0000000..2af092f --- /dev/null +++ b/examples/wlm_plugins/lsf_plugin.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash +# ============================================================================= +# iterate2 WLM plugin – IBM Spectrum LSF +# +# Submits each HPO trial as an LSF job via ``bsub -K`` (blocking). +# +# Environment variables provided by iterate2 +# ------------------------------------------ +# ITERATE_TRIAL_NUMBER integer trial ID +# ITERATE_TRIAL_CMD full shell command to execute (includes cd, venv activation, etc.) +# ITERATE_OUT_FILE path where trial stdout must be written +# ITERATE_ERR_FILE path where trial stderr must be written +# +# WLM configuration (from the ``wlm:`` section in the HPO YAML) +# ------------------------------------------------------------- +# All keys from the ``wlm:`` block are available as +# ``ITERATE_WLM_`` (hyphens → underscores). +# Recognised keys (with defaults): +# +# gpu-count (ITERATE_WLM_GPU_COUNT) default: 1 +# cpu-count (ITERATE_WLM_CPU_COUNT) default: 4 +# mem-gb (ITERATE_WLM_MEM_GB) default: 32 +# lsf-gpu-config (ITERATE_WLM_LSF_GPU_CONFIG) default: auto from gpu-count +# queue (ITERATE_WLM_QUEUE) optional +# +# Usage in HPO YAML +# ----------------- +# wlm: +# gpu-count: 1 +# cpu-count: 16 +# mem-gb: 32 +# lsf-gpu-config: "num=1:mode=exclusive_process:mps=no:gmodel=NVIDIAA100_SXM4_80GB" +# # queue: normal # uncomment to specify an LSF queue +# +# Usage in the launch script +# -------------------------- +# iterate2 \ +# --wlm-plugin "$(dirname "$0")/wlm_plugins/lsf_plugin.sh" \ +# --hpo-yaml my_hpo.yaml \ +# ... +# +# Exit code +# --------- +# The script exits 0 on success, non-zero on failure. +# iterate2 marks the Optuna trial as FAILED when exit code != 0. +# ============================================================================= + +set -euo pipefail + +# ── Read iterate2 env vars ──────────────────────────────────────────────────── +TRIAL_NUMBER="${ITERATE_TRIAL_NUMBER:?ITERATE_TRIAL_NUMBER not set}" +TRIAL_CMD="${ITERATE_TRIAL_CMD:?ITERATE_TRIAL_CMD not set}" +OUT_FILE="${ITERATE_OUT_FILE:?ITERATE_OUT_FILE not set}" +ERR_FILE="${ITERATE_ERR_FILE:?ITERATE_ERR_FILE not set}" + +# ── WLM config from YAML wlm: section ──────────────────────────────────────── +GPU_COUNT="${ITERATE_WLM_GPU_COUNT:-1}" +CPU_COUNT="${ITERATE_WLM_CPU_COUNT:-4}" +MEM_GB="${ITERATE_WLM_MEM_GB:-32}" +QUEUE="${ITERATE_WLM_QUEUE:-}" +# If a custom GPU resource string is provided use it; otherwise derive from gpu-count. +if [[ -n "${ITERATE_WLM_LSF_GPU_CONFIG:-}" ]]; then + GPU_FRAGMENT="-gpu \"${ITERATE_WLM_LSF_GPU_CONFIG}\"" +elif [[ "${GPU_COUNT}" -gt 0 ]]; then + GPU_FRAGMENT="-gpu num=${GPU_COUNT}" +else + GPU_FRAGMENT="" +fi + +QUEUE_FRAGMENT="" +if [[ -n "${QUEUE}" ]]; then + QUEUE_FRAGMENT="-q ${QUEUE}" +fi + +# ── Build bsub command ──────────────────────────────────────────────────────── +# -K : block until the job finishes (iterate2 calls this per-trial in a thread) +# -o/-e: LSF writes stdout/stderr directly to these files +BSUB_CMD="bsub ${GPU_FRAGMENT} ${QUEUE_FRAGMENT} \ + -K \ + -o ${OUT_FILE} \ + -e ${ERR_FILE} \ + -R \"rusage[ngpus=${GPU_COUNT}, cpu=${CPU_COUNT}, mem=${MEM_GB}GB]\" \ + -J hpo_trial_${TRIAL_NUMBER} \ + \"${TRIAL_CMD}\"" + +echo "[lsf_plugin] Trial ${TRIAL_NUMBER}: submitting → ${BSUB_CMD}" +eval "${BSUB_CMD}" +echo "[lsf_plugin] Trial ${TRIAL_NUMBER}: job finished" diff --git a/examples/wlm_plugins/vela_plugin.py b/examples/wlm_plugins/vela_plugin.py new file mode 100755 index 0000000..67bb1ff --- /dev/null +++ b/examples/wlm_plugins/vela_plugin.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python3 +""" +iterate2 WLM plugin – Vela / OpenShift PyTorchJob (MLBatch) + +Submits each HPO trial as a PyTorchJob on an OpenShift cluster via +``helm template | oc create``, streams pod logs, checks the exit code, +and cleans up the job resource. + +Environment variables provided by iterate2 +------------------------------------------ + ITERATE_TRIAL_NUMBER integer trial ID + ITERATE_TRIAL_CONTAINER_CMD bare CLI invocation for inside the container + (no ``cd``, no ``source venv`` – use this + one, not ITERATE_TRIAL_CMD) + ITERATE_OUT_FILE file to write trial stdout + ITERATE_ERR_FILE file to write trial stderr + +WLM configuration (from the ``wlm:`` section in the HPO YAML) +------------------------------------------------------------- +All keys from ``wlm:`` are available as ``ITERATE_WLM_`` +(hyphens → underscores). Recognised keys: + + job-template (ITERATE_WLM_JOB_TEMPLATE) REQUIRED path to PyTorchJob helm values YAML + chart-path (ITERATE_WLM_CHART_PATH) REQUIRED path to pytorchjob-generator helm chart + namespace (ITERATE_WLM_NAMESPACE) optional; uses current oc context if omitted + cmd-placeholder (ITERATE_WLM_CMD_PLACEHOLDER) default: {{HPO_COMMAND}} + gpu-count (ITERATE_WLM_GPU_COUNT) default: 1 + pod-ready-timeout (ITERATE_WLM_POD_READY_TIMEOUT) seconds; default: 600 + job-timeout (ITERATE_WLM_JOB_TIMEOUT) seconds; default: 86400 + +Usage in HPO YAML +----------------- + wlm: + job-template: examples/vela_gridfm_template.yaml + chart-path: ~/tmp/mlbatch/tools/pytorchjob-generator/chart + namespace: my-project + cmd-placeholder: "{{HPO_COMMAND}}" + gpu-count: 1 + pod-ready-timeout: 600 + job-timeout: 86400 + +Usage in the launch script +-------------------------- + iterate2 \\ + --wlm-plugin "$(dirname "$0")/wlm_plugins/vela_plugin.py" \\ + --hpo-yaml my_hpo.yaml \\ + --no-underscore-to-hyphen \\ + ... + +Exit code +--------- +Exits 0 on success, 1 on failure. iterate2 marks the Optuna trial as +FAILED on any non-zero exit. +""" + +import os +import re +import subprocess +import sys +import tempfile +import threading +import time +from pathlib import Path +from typing import Optional + + +# ── helpers ─────────────────────────────────────────────────────────────────── + +def env(key: str, default: Optional[str] = None, required: bool = False) -> str: + val = os.environ.get(key, default) + if required and not val: + sys.exit(f"[vela_plugin] ERROR: required env var '{key}' is not set") + return val or "" + + +def patch_job_yaml(template_path: str, trial_id: int, gpu_count: int, + container_cmd: str, placeholder: str) -> tuple[str, str]: + """Patch the helm values YAML; return (patched_text, job_name).""" + with open(template_path) as fh: + text = fh.read() + + # jobName → append -trial- + m = re.search(r'^(jobName\s*:\s*["\']?)([^"\'#\n]+)(["\']?)', text, re.MULTILINE) + if not m: + sys.exit(f"[vela_plugin] ERROR: 'jobName' key not found in '{template_path}'") + raw_name = m.group(2).strip() + job_name = f"{raw_name}-trial-{trial_id}" + text = text[:m.start(2)] + m.group(2).replace(raw_name, job_name) + text[m.end(2):] + + # numGpusPerPod → overwrite + text = re.sub( + r'^(numGpusPerPod\s*:\s*)\S+', + lambda m2: f"{m2.group(1)}{gpu_count}", + text, flags=re.MULTILINE, + ) + + # placeholder → container_cmd + if placeholder in text: + text = text.replace(placeholder, container_cmd) + else: + print(f"[vela_plugin] WARNING: placeholder '{placeholder}' not found in template – appending") + text += f"\n - {container_cmd}\n" + + return text, job_name + + +def stream_pipe(pipe, dest_file: str, prefix: str, dest_stream): + with open(dest_file, "w", encoding="utf-8", errors="replace") as fh: + for raw in pipe: + line = raw.decode("utf-8", errors="replace") + fh.write(line) + fh.flush() + dest_stream.write(f"{prefix} {line}") + dest_stream.flush() + + +# ── main ────────────────────────────────────────────────────────────────────── + +def main(): + trial_id = int(env("ITERATE_TRIAL_NUMBER", required=True)) + cmd = env("ITERATE_TRIAL_CONTAINER_CMD", required=True) + out_file = env("ITERATE_OUT_FILE", required=True) + err_file = env("ITERATE_ERR_FILE", required=True) + + template = env("ITERATE_WLM_JOB_TEMPLATE", required=True) + chart = env("ITERATE_WLM_CHART_PATH", required=True) + namespace = env("ITERATE_WLM_NAMESPACE", "") + placeholder = env("ITERATE_WLM_CMD_PLACEHOLDER", "{{HPO_COMMAND}}") + gpu_count = int(env("ITERATE_WLM_GPU_COUNT", "1")) + pod_timeout = int(env("ITERATE_WLM_POD_READY_TIMEOUT", "600")) + job_timeout = int(env("ITERATE_WLM_JOB_TIMEOUT", "86400")) + + ns_args = ["-n", namespace] if namespace else [] + prefix = f"[trial-{trial_id}]" + + # Resolve ~ in paths + template = str(Path(template).expanduser()) + chart = str(Path(chart).expanduser()) + + print(f"{prefix} Patching template {template}") + job_yaml, job_name = patch_job_yaml(template, trial_id, gpu_count, cmd, placeholder) + print(f"{prefix} Job name: {job_name}") + + # Write patched values to a temp file + with tempfile.NamedTemporaryFile( + mode="w", suffix=".yaml", prefix=f"vela_trial_{trial_id}_", delete=False + ) as fh: + fh.write(job_yaml) + tmp_yaml = fh.name + + try: + # ── Submit ──────────────────────────────────────────────────────────── + ns_flag = f"-n {namespace}" if namespace else "" + create_cmd = f"helm template -f {tmp_yaml} {chart} | oc create {ns_flag} -f-" + print(f"{prefix} Submitting: {create_cmd}") + result = subprocess.run(create_cmd, shell=True, capture_output=True, text=True) + sys.stdout.write(result.stdout) + if result.returncode != 0: + sys.stderr.write(result.stderr) + sys.exit(f"{prefix} ERROR: oc create failed (rc={result.returncode})") + + master_pod = f"{job_name}-master-0" + + # ── Wait for pod to appear ──────────────────────────────────────────── + deadline = time.monotonic() + pod_timeout + print(f"{prefix} Waiting for pod {master_pod} …") + while time.monotonic() < deadline: + r = subprocess.run( + ["oc", "get", "pod", master_pod, "--ignore-not-found"] + ns_args, + capture_output=True, text=True, + ) + if master_pod in r.stdout: + break + time.sleep(5) + else: + sys.exit(f"{prefix} ERROR: pod '{master_pod}' did not appear within {pod_timeout}s") + + # ── Wait for pod Ready (best-effort) ───────────────────────────────── + subprocess.run( + ["oc", "wait", f"pod/{master_pod}", "--for=condition=Ready", + f"--timeout={pod_timeout}s"] + ns_args, + capture_output=True, text=True, + ) + + # ── Stream logs ─────────────────────────────────────────────────────── + print(f"{prefix} Streaming logs from {master_pod}") + log_proc = subprocess.Popen( + ["oc", "logs", "-f", master_pod] + ns_args, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + ) + t_out = threading.Thread( + target=stream_pipe, + args=(log_proc.stdout, out_file, prefix, sys.stdout), daemon=True, + ) + t_err = threading.Thread( + target=stream_pipe, + args=(log_proc.stderr, err_file, prefix, sys.stderr), daemon=True, + ) + t_out.start(); t_err.start() + try: + log_proc.wait(timeout=job_timeout) + except subprocess.TimeoutExpired: + log_proc.kill() + t_out.join(); t_err.join() + + # Catch-up read in case of early EOF disconnect + if log_proc.returncode != 0: + catchup = subprocess.run( + ["oc", "logs", "--tail=-1", master_pod] + ns_args, + capture_output=True, text=True, + ) + if catchup.stdout: + with open(out_file, "a") as fh: + fh.write(catchup.stdout) + if catchup.stderr: + with open(err_file, "a") as fh: + fh.write(catchup.stderr) + + # ── Check exit code ─────────────────────────────────────────────────── + exit_code_str = "" + for _ in range(30): + ec = subprocess.run( + ["oc", "get", "pod", master_pod, "-o", + "jsonpath={.status.containerStatuses[0].state.terminated.exitCode}"] + + ns_args, + capture_output=True, text=True, + ) + exit_code_str = ec.stdout.strip() + if exit_code_str.lstrip("-").isdigit(): + break + time.sleep(5) + + exit_code = int(exit_code_str) if exit_code_str.lstrip("-").isdigit() else 0 + print(f"{prefix} Pod exit code: {exit_code}") + if exit_code != 0: + sys.exit(f"{prefix} Trial FAILED: pod exited with code {exit_code}") + + finally: + # ── Cleanup ─────────────────────────────────────────────────────────── + subprocess.run( + ["oc", "delete", "pytorchjob", job_name, "--ignore-not-found"] + ns_args, + capture_output=True, + ) + try: + os.unlink(tmp_yaml) + except OSError: + pass + + +if __name__ == "__main__": + main() diff --git a/terratorch_iterate/iterate2/_iterate2.py b/terratorch_iterate/iterate2/_iterate2.py index e34e3ed..1fea9a9 100644 --- a/terratorch_iterate/iterate2/_iterate2.py +++ b/terratorch_iterate/iterate2/_iterate2.py @@ -49,50 +49,19 @@ def parse_args(): ) parser.add_argument("--interpreter", default="python", help="Interpreter to use") parser.add_argument("--param-setter", type=str, default=None) - parser.add_argument("--wlm", choices=["lsf", "slurm", "openshift", "vela", "none"], default="none") - parser.add_argument("--gpu-count", type=int, default=1) - parser.add_argument("--cpu-count", type=int, default=4) - parser.add_argument("--mem-gb", type=int, default=128) - parser.add_argument("--lsf-gpu-config-string", type=str, default=None) - - # ------------------------ - # Vela / OpenShift options - # ------------------------ - parser.add_argument( - "--vela-job-template", - type=str, - default=None, - help="Path to the Vela job YAML template (required when --wlm vela)", - ) - parser.add_argument( - "--vela-chart-path", - type=str, - default=None, - help="Path to the helm chart directory (required when --wlm vela)", - ) parser.add_argument( - "--vela-namespace", + "--wlm-plugin", type=str, default=None, - help="OpenShift/Kubernetes namespace (uses current context if omitted)", - ) - parser.add_argument( - "--vela-cmd-placeholder", - type=str, - default="{{HPO_COMMAND}}", - help="String in the job template's setupCommands that is replaced with the HPO command (default: '{{HPO_COMMAND}}')", - ) - parser.add_argument( - "--vela-pod-ready-timeout", - type=int, - default=600, - help="Seconds to wait for the trial pod to reach Running state (default: 600)", - ) - parser.add_argument( - "--vela-job-timeout", - type=int, - default=86400, - help="Seconds to wait for the trial job to complete (default: 86400 = 24 h)", + help=( + "Path to an executable WLM plugin script that submits/runs each trial. " + "When omitted, trials run locally (equivalent to the old --wlm none). " + "The plugin receives trial information as environment variables: " + "ITERATE_TRIAL_NUMBER, ITERATE_TRIAL_CMD, ITERATE_OUT_FILE, ITERATE_ERR_FILE, " + "and ITERATE_WLM_ for every key in the 'wlm:' YAML section. " + "Exit 0 to signal success; any other exit code marks the trial as failed. " + "See examples/wlm_plugins/ for LSF and Vela reference implementations." + ), ) parser.add_argument( "--parallelism", @@ -158,25 +127,6 @@ def resolve_paths(script: str, root_dir: Optional[str]): logger.debug("Resolved root_dir '%s' → '%s'", root_dir, resolved) return script, resolved -def build_launcher_command(wlm, cmd, trial_id, out_file, err_file, gpu_count, cpu_count, mem_gb, lsf_gpu_config_string): - logger.debug("Building launcher command: wlm=%s gpu_count=%d cpu_count=%d mem_gb=%d", wlm, gpu_count, cpu_count, mem_gb) - if wlm == "lsf": - gpu_fragment = f"-gpu \"{lsf_gpu_config_string}\"" if lsf_gpu_config_string else (f"-gpu num={gpu_count}" if gpu_count > 0 else "") - launcher = f"bsub {gpu_fragment} -K -o {out_file} -e {err_file} -R \"rusage[ngpus={gpu_count}, cpu={cpu_count}, mem={mem_gb}GB]\" -J hpo_trial_{trial_id} \"{cmd}\"" - elif wlm == "slurm": - launcher = f"srun --gres=gpu:{gpu_count} --cpus-per-task={cpu_count} --mem={mem_gb}G --job-name=hpo_trial_{trial_id} --output={out_file} --error={err_file} bash -c \"{cmd}\"" - elif wlm == "none": - # No embedded redirect: run_and_stream() captures stdout/stderr via PIPE - # and writes to out_file/err_file itself. - launcher = f'bash -c "{cmd}"' - elif wlm in ("vela",): - # Vela uses a separate submission flow; this function is not called for it. - raise ValueError("build_launcher_command must not be called for wlm='vela'; use build_vela_job_yaml + run_vela_trial instead.") - else: - raise ValueError(f"Unknown WLM: {wlm}") - logger.debug("Launcher command: %s", launcher) - return launcher - def build_shell_command(interpreter, root_dir, script_path, venv, script_args, param_setter, underscore_to_hyphen=True, pre_run_commands=None): parts = [f"cd {root_dir}"] if pre_run_commands: @@ -250,264 +200,6 @@ def build_container_command(interpreter: str, script_path: str, script_args: dic return cmd -def build_vela_job_yaml( - template_path: str, - trial_id: int, - gpu_count: int, - container_cmd: str, - placeholder: str, -) -> tuple[str, str]: - """Load *template_path* as raw text, inject HPO parameters, return ``(yaml_str, job_name)``. - - All modifications are done via targeted regex/string substitutions on the raw - YAML text so that multi-line block scalars (e.g. awk pipelines), single-quoted - strings, and other constructs that PyYAML would mangle on a load→dump round-trip - are preserved exactly as written in the template. - - Changes applied: - * ``jobName`` gets a ``-trial-{trial_id}`` suffix (unique Kubernetes resource). - * ``numGpusPerPod`` is overwritten with *gpu_count*. - * The *placeholder* string inside ``setupCommands`` is replaced with - *container_cmd* in-place, preserving any surrounding wrapper (e.g. awk pipeline). - """ - with open(template_path, "r") as fh: - text = fh.read() - - # ── jobName ────────────────────────────────────────────────────────────── - job_name_match = re.search(r'^(jobName\s*:\s*["\']?)([^"\'#\n]+)(["\']?)', text, re.MULTILINE) - if not job_name_match: - raise ValueError(f"'jobName' key not found in template '{template_path}'") - raw_name = job_name_match.group(2).strip() - job_name = f"{raw_name}-trial-{trial_id}" - text = ( - text[:job_name_match.start(2)] - + job_name_match.group(2).replace(raw_name, job_name) - + text[job_name_match.end(2):] - ) - logger.debug("Vela trial %d: jobName → %s", trial_id, job_name) - - # ── numGpusPerPod ──────────────────────────────────────────────────────── - text = re.sub( - r'^(numGpusPerPod\s*:\s*)\S+', - lambda m: f"{m.group(1)}{gpu_count}", - text, - flags=re.MULTILINE, - ) - logger.debug("Vela trial %d: numGpusPerPod → %d", trial_id, gpu_count) - - # ── placeholder substitution ───────────────────────────────────────────── - if placeholder in text: - text = text.replace(placeholder, container_cmd) - logger.debug("Vela trial %d: substituted placeholder '%s'", trial_id, placeholder) - else: - logger.warning( - "Vela trial %d: placeholder '%s' not found in template '%s' – appending command", - trial_id, placeholder, template_path, - ) - text += f"\n - {container_cmd}\n" - - return text, job_name - - -def _oc(*args, namespace: Optional[str] = None, check: bool = True, capture: bool = False): - """Run an ``oc`` sub-command, optionally capturing output.""" - cmd = ["oc"] + list(args) - if namespace: - cmd += ["-n", namespace] - logger.debug("oc command: %s", " ".join(cmd)) - if capture: - return subprocess.run(cmd, check=check, capture_output=True, text=True) - return subprocess.run(cmd, check=check) - - -def run_vela_trial( - trial_id: int, - job_yaml: str, - chart_path: str, - job_name: str, - namespace: Optional[str], - out_file: str, - err_file: str, - pod_ready_timeout: int, - job_timeout: int, -) -> None: - """Submit a Vela/OpenShift PyTorchJob, stream its logs, and wait for completion. - - Steps - ----- - 1. Write *job_yaml* to a temp file. - 2. ``helm template -f | oc create [-n ] -f-`` - 3. Poll until the master pod (``-master-0``) appears. - 4. ``oc logs -f `` – streams every line to stdout **and** *out_file*. - 5. After streaming ends, check the pod's terminated exit-code. - Non-zero → raise :class:`subprocess.CalledProcessError`. - 6. Cleanup: delete the PyTorchJob resource. - """ - ns_args = ["-n", namespace] if namespace else [] - prefix = f"[trial-{trial_id}]" - - # Write temp YAML - with tempfile.NamedTemporaryFile( - mode="w", - suffix=".yaml", - prefix=f"vela_trial_{trial_id}_", - delete=False, - ) as fh: - fh.write(job_yaml) - tmp_yaml = fh.name - logger.debug("Vela trial %d: temp YAML written to %s", trial_id, tmp_yaml) - - try: - # ── 1. Submit ────────────────────────────────────────────────────────── - ns_flag = f"-n {namespace}" if namespace else "" - create_cmd = ( - f"helm template -f {tmp_yaml} {chart_path}" - f" | oc create {ns_flag} -f-" - ) - logger.info("Trial %d: submitting Vela job → %s", trial_id, create_cmd) - result = subprocess.run(create_cmd, shell=True, capture_output=True, text=True) - with _print_lock: - sys.stdout.write(f"{prefix} {result.stdout}") - sys.stdout.flush() - if result.returncode != 0: - raise RuntimeError( - f"Vela trial {trial_id}: oc create failed (rc={result.returncode}):\n" - f"{result.stderr}" - ) - logger.info("Trial %d: job '%s' created", trial_id, job_name) - - # ── 2. Wait for master pod to appear ────────────────────────────────── - master_pod = f"{job_name}-master-0" - deadline = time.monotonic() + pod_ready_timeout - logger.info("Trial %d: waiting for pod '%s' to appear (timeout %ds)…", trial_id, master_pod, pod_ready_timeout) - while time.monotonic() < deadline: - r = subprocess.run( - ["oc", "get", "pod", master_pod, "--ignore-not-found"] + ns_args, - capture_output=True, text=True, - ) - if master_pod in r.stdout: - logger.debug("Trial %d: pod '%s' found", trial_id, master_pod) - break - time.sleep(5) - else: - raise TimeoutError( - f"Vela trial {trial_id}: pod '{master_pod}' did not appear within {pod_ready_timeout}s" - ) - - # ── 3. Wait for pod to be Running/Succeeded ─────────────────────────── - logger.info("Trial %d: waiting for pod '%s' to be Running…", trial_id, master_pod) - wait_cmd = ( - ["oc", "wait", f"pod/{master_pod}", - "--for=condition=Ready", - f"--timeout={pod_ready_timeout}s"] - + ns_args - ) - wr = subprocess.run(wait_cmd, capture_output=True, text=True) - # oc wait returns non-zero if the pod is already Completed (no Ready condition); - # that's fine – the logs are still accessible. - logger.debug("Trial %d: oc wait rc=%d stderr=%s", trial_id, wr.returncode, wr.stderr.strip()) - - # ── 4. Stream logs ──────────────────────────────────────────────────── - log_cmd = ["oc", "logs", "-f", master_pod] + ns_args - logger.info("Trial %d: streaming logs from '%s'", trial_id, master_pod) - log_proc = subprocess.Popen( - log_cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - t_out = threading.Thread( - target=_stream_pipe, - args=(log_proc.stdout, out_file, trial_id, "stdout", sys.stdout), - daemon=True, - ) - t_err = threading.Thread( - target=_stream_pipe, - args=(log_proc.stderr, err_file, trial_id, "stderr", sys.stderr), - daemon=True, - ) - t_out.start() - t_err.start() - log_proc.wait(timeout=job_timeout) - t_out.join() - t_err.join() - logger.debug("Trial %d: log stream ended (rc=%d)", trial_id, log_proc.returncode) - - # ── 4b. If oc logs exited early (e.g. "unexpected EOF"), the pod may - # still be running. Re-attach the log stream and wait for it to - # finish so we capture the full output and don't delete a live job. - if log_proc.returncode != 0: - logger.warning( - "Trial %d: oc logs exited with rc=%d (possible EOF disconnect) – " - "waiting for pod to terminate before reading exit code", - trial_id, log_proc.returncode, - ) - # Wait for pod phase Succeeded or Failed (container terminated). - oc_wait_phase = subprocess.run( - ["oc", "wait", f"pod/{master_pod}", - "--for=jsonpath={.status.phase}=Succeeded", - f"--timeout={job_timeout}s"] - + ns_args, - capture_output=True, text=True, - ) - if oc_wait_phase.returncode != 0: - # Pod may have Failed; try that phase too. - subprocess.run( - ["oc", "wait", f"pod/{master_pod}", - "--for=jsonpath={.status.phase}=Failed", - f"--timeout=30s"] - + ns_args, - capture_output=True, text=True, - ) - # Re-stream any log lines written after the disconnect into the same files. - catchup = subprocess.run( - ["oc", "logs", "--tail=-1", master_pod] + ns_args, - capture_output=True, text=True, - ) - if catchup.stdout: - with open(out_file, "a", encoding="utf-8", errors="replace") as fh: - fh.write(catchup.stdout) - if catchup.stderr: - with open(err_file, "a", encoding="utf-8", errors="replace") as fh: - fh.write(catchup.stderr) - - # ── 5. Check pod exit code ──────────────────────────────────────────── - # Poll until the pod has a terminated exit code (handles the race - # between oc-logs EOF and pod termination being recorded in the API). - exit_code_str = "" - for _attempt in range(30): - ec_result = subprocess.run( - ["oc", "get", "pod", master_pod, "-o", - "jsonpath={.status.containerStatuses[0].state.terminated.exitCode}"] - + ns_args, - capture_output=True, text=True, - ) - exit_code_str = ec_result.stdout.strip() - if exit_code_str.lstrip("-").isdigit(): - break - logger.debug("Trial %d: exit code not yet available, retrying in 5 s…", trial_id) - time.sleep(5) - exit_code = int(exit_code_str) if exit_code_str.lstrip("-").isdigit() else 0 - logger.info("Trial %d: pod exit code = %s", trial_id, exit_code) - if exit_code != 0: - logger.warning("Trial %d: pod exited with code %d – marking trial as pruned", trial_id, exit_code) - raise optuna.exceptions.TrialPruned(f"pod exited with code {exit_code}") - - finally: - # ── 6. Cleanup – delete the job ─────────────────────────────────────── - logger.debug("Trial %d: deleting PyTorchJob '%s'", trial_id, job_name) - subprocess.run( - ["oc", "delete", "pytorchjob", job_name, "--ignore-not-found"] + ns_args, - capture_output=True, - ) - try: - os.unlink(tmp_yaml) - except OSError: - pass - -# ============================================================ -# PARALLEL STREAMING RUNNER -# ============================================================ - _print_lock = threading.Lock() def _stream_pipe(pipe, dest_file, trial_id: int, stream_name: str, dest_stream): @@ -522,57 +214,26 @@ def _stream_pipe(pipe, dest_file, trial_id: int, stream_name: str, dest_stream): dest_stream.write(f"{prefix} {line}") dest_stream.flush() -def run_and_stream(launcher_cmd: str, trial_id: int, out_file: str, err_file: str, wlm: str): - """ - Run *launcher_cmd* in a shell. - - For ``wlm='none'``: captures stdout and stderr via PIPE, streams every line - to the main process stdout/stderr (prefixed with ``[trial-N]``), and also - writes them to *out_file* / *err_file* for later metric extraction. - - For WLM backends (lsf, slurm, …): the WLM tool itself manages the output - files on the cluster. The local subprocess output (WLM status messages, - errors) is still streamed with the same prefix so parallel workers are - distinguishable. - """ - logger.debug("Trial %d: run_and_stream wlm=%s cmd=%s", trial_id, wlm, launcher_cmd) +def run_and_stream(launcher_cmd: str, trial_id: int, out_file: str, err_file: str): + """Run *launcher_cmd* locally, capturing stdout/stderr to files and streaming + every line to the console prefixed with ``[trial-N]``.""" + logger.debug("Trial %d: run_and_stream cmd=%s", trial_id, launcher_cmd) proc = subprocess.Popen( launcher_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) - - if wlm == "none": - # Full capture: write to files AND stream to console - t_out = threading.Thread( - target=_stream_pipe, - args=(proc.stdout, out_file, trial_id, "stdout", sys.stdout), - daemon=True, - ) - t_err = threading.Thread( - target=_stream_pipe, - args=(proc.stderr, err_file, trial_id, "stderr", sys.stderr), - daemon=True, - ) - else: - # WLM manages the cluster output files (out_file/err_file) itself. - # Stream only the local WLM tool output (bsub/srun status messages) - # to console; write it to separate local files to avoid clobbering the - # cluster-managed trial output files. - wlm_out = out_file.replace(".out", "_wlm.out") - wlm_err = err_file.replace(".err", "_wlm.err") - t_out = threading.Thread( - target=_stream_pipe, - args=(proc.stdout, wlm_out, trial_id, "wlm-stdout", sys.stdout), - daemon=True, - ) - t_err = threading.Thread( - target=_stream_pipe, - args=(proc.stderr, wlm_err, trial_id, "wlm-stderr", sys.stderr), - daemon=True, - ) - + t_out = threading.Thread( + target=_stream_pipe, + args=(proc.stdout, out_file, trial_id, "stdout", sys.stdout), + daemon=True, + ) + t_err = threading.Thread( + target=_stream_pipe, + args=(proc.stderr, err_file, trial_id, "stderr", sys.stderr), + daemon=True, + ) t_out.start() t_err.start() proc.wait() @@ -696,6 +357,26 @@ def load_static_args(args): logger.info("Static args loaded: %d key(s): %s", len(static), list(static.keys())) return static +def load_wlm_config(args) -> dict: + """Load the optional ``wlm:`` section from the HPO YAML. + + Returns a dict that is forwarded to the WLM plugin as + ``ITERATE_WLM_`` environment variables. Keys are taken verbatim + from the YAML (e.g. ``gpu-count``, ``mem-gb``) so plugin scripts can + use familiar names. + """ + data = {} + if args.hpo_yaml: + with open(args.hpo_yaml, "r") as f: + data = yaml.safe_load(f) or {} + elif args.static_args_yaml: + with open(args.static_args_yaml, "r") as f: + data = yaml.safe_load(f) or {} + wlm_cfg = data.get("wlm", {}) + logger.info("WLM config from YAML: %s", wlm_cfg) + return wlm_cfg + + def suggest_from_spec(trial, name, spec): t = spec["type"] if t == "float": @@ -728,11 +409,15 @@ def main(): logger.info("iterate2 starting") logger.info("Log level: %s", args.log_level) - logger.info("WLM: %s | interpreter: %s | script: %s", args.wlm, args.interpreter, args.script) + logger.info( + "WLM plugin: %s | interpreter: %s | script: %s", + args.wlm_plugin or "(local)", args.interpreter, args.script, + ) logger.info("Optuna study: '%s' | db: %s | n_trials: %d", args.optuna_study_name, args.optuna_db_path, args.optuna_n_trials) hpo_space = load_hpo_space(args) static_args = load_static_args(args) + wlm_config = load_wlm_config(args) yaml_metrics = load_metrics_from_yaml(args) metric_list = yaml_metrics if yaml_metrics is not None else [m.strip() for m in args.metrics.split(",")] logger.info("Optimising metrics: %s (source: %s)", metric_list, "yaml" if yaml_metrics else "cli") @@ -749,51 +434,68 @@ def objective(trial): else: script_args[name] = val - # gpu_num in hpo/static overrides the CLI --gpu-count for this trial's launcher - gpu_count = int(script_args.pop("gpu_num", args.gpu_count)) - logger.debug("Trial %d: effective gpu_count=%d", trial.number, gpu_count) + # gpu_num can appear in hpo or static space; pull it out and add to + # wlm_config so plugins can use ITERATE_WLM_GPU_NUM for resource allocation. + trial_wlm_config = dict(wlm_config) + if "gpu_num" in script_args: + trial_wlm_config.setdefault("gpu-count", script_args.pop("gpu_num")) + logger.info("Trial %d: sampled parameters: %s", trial.number, script_args) + logger.debug("Trial %d: effective WLM config: %s", trial.number, trial_wlm_config) out_file = f"trial_{trial.number}.out" err_file = f"trial_{trial.number}.err" logger.debug("Trial %d: stdout → %s | stderr → %s", trial.number, out_file, err_file) - if args.wlm == "vela": - # ── Vela / OpenShift path ────────────────────────────────────── - if not args.vela_job_template: - raise ValueError("--vela-job-template is required when --wlm vela") - if not args.vela_chart_path: - raise ValueError("--vela-chart-path is required when --wlm vela") - container_cmd = build_container_command( - args.interpreter, script_path, script_args, - args.param_setter, args.underscore_to_hyphen, - ) - logger.info("Trial %d: container command → %s", trial.number, container_cmd) - job_yaml, job_name = build_vela_job_yaml( - args.vela_job_template, - trial.number, - gpu_count, - container_cmd, - args.vela_cmd_placeholder, - ) - logger.debug("Trial %d: job YAML (first 400 chars):\n%s", trial.number, job_yaml[:400]) - run_vela_trial( - trial_id=trial.number, - job_yaml=job_yaml, - chart_path=args.vela_chart_path, - job_name=job_name, - namespace=args.vela_namespace, - out_file=out_file, - err_file=err_file, - pod_ready_timeout=args.vela_pod_ready_timeout, - job_timeout=args.vela_job_timeout, + # Build the shell command that the plugin (or local runner) shall execute. + shell_cmd = build_shell_command( + args.interpreter, root_dir, script_path, args.venv, + script_args, args.param_setter, args.underscore_to_hyphen, + pre_run_commands=args.pre_run_commands, + ) + # Container-safe command: no cd / source – for plugins running inside + # an already-configured container image (e.g. Vela/OpenShift). + container_cmd = build_container_command( + args.interpreter, script_path, script_args, + args.param_setter, args.underscore_to_hyphen, + ) + + if args.wlm_plugin: + # ── User-provided WLM plugin ─────────────────────────────────── + # The plugin is responsible for submitting the trial, waiting for + # completion, and writing stdout/stderr to out_file/err_file. + # It signals success via exit code 0; any other value fails the trial. + env = os.environ.copy() + env["ITERATE_TRIAL_NUMBER"] = str(trial.number) + env["ITERATE_TRIAL_CMD"] = shell_cmd + env["ITERATE_TRIAL_CONTAINER_CMD"] = container_cmd + env["ITERATE_OUT_FILE"] = out_file + env["ITERATE_ERR_FILE"] = err_file + for k, v in trial_wlm_config.items(): + env_key = "ITERATE_WLM_" + k.upper().replace("-", "_").replace(" ", "_") + env[env_key] = str(v) + logger.info("Trial %d: invoking WLM plugin %s", trial.number, args.wlm_plugin) + result = subprocess.run( + [args.wlm_plugin], + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, ) + # Stream plugin output to console so operators see submission status. + plugin_out = result.stdout.decode("utf-8", errors="replace") + plugin_err = result.stderr.decode("utf-8", errors="replace") + prefix = f"[trial-{trial.number}][plugin]" + for line in plugin_out.splitlines(): + sys.stdout.write(f"{prefix} {line}\n") + for line in plugin_err.splitlines(): + sys.stderr.write(f"{prefix} {line}\n") + if result.returncode != 0: + raise subprocess.CalledProcessError(result.returncode, args.wlm_plugin) else: - # ── Standard WLM path (lsf / slurm / none) ──────────────────── - shell_cmd = build_shell_command(args.interpreter, root_dir, script_path, args.venv, script_args, args.param_setter, args.underscore_to_hyphen, pre_run_commands=args.pre_run_commands) - launcher_cmd = build_launcher_command(args.wlm, shell_cmd, trial.number, out_file, err_file, gpu_count, args.cpu_count, args.mem_gb, args.lsf_gpu_config_string) - logger.info("Trial %d: submitting → %s", trial.number, launcher_cmd) - run_and_stream(launcher_cmd, trial.number, out_file, err_file, args.wlm) + # ── Local execution (no WLM plugin) ─────────────────────────── + launcher_cmd = f'bash -c "{shell_cmd}"' + logger.info("Trial %d: running locally → %s", trial.number, launcher_cmd) + run_and_stream(launcher_cmd, trial.number, out_file, err_file) logger.info("Trial %d: job finished", trial.number) From d8d71ef81a28bfccc65a2df202ea61b394d4783e Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Fri, 24 Apr 2026 11:56:01 +0200 Subject: [PATCH 02/12] Refactor iterate2 scripts and core functionality - Updated `run_setter_example.sh` to use `bumpy_function.py` instead of `bumpy_setter.py`, simplifying the example for local trials. - Modified `run_vela_example.sh` to clarify usage of the Vela/OpenShift job submission, ensuring better documentation and example clarity. - Refined `lsf_plugin.sh` to streamline job submission for IBM Spectrum LSF, enhancing clarity on environment variable usage and command construction. - Overhauled `_iterate2.py` to simplify the command-line interface, improve YAML loading, and enhance metric extraction logic. - Removed deprecated features and improved logging for better traceability during execution. - Enhanced the objective function to better handle parameter suggestions and metrics extraction. Signed-off-by: Romeo Kienzler --- configs/gridfm_graphkit_hpo.yaml | 23 - examples/bumpy_function.py | 122 ++-- examples/bumpy_hpo.yaml | 30 +- examples/run_lsf_gridfm_example_postgres.sh | 78 +-- examples/run_setter_example.sh | 36 +- examples/run_vela_example.sh | 38 +- examples/wlm_plugins/lsf_plugin.sh | 120 ++-- terratorch_iterate/iterate2/_iterate2.py | 691 ++++++-------------- 8 files changed, 381 insertions(+), 757 deletions(-) diff --git a/configs/gridfm_graphkit_hpo.yaml b/configs/gridfm_graphkit_hpo.yaml index 26047dc..d578e19 100644 --- a/configs/gridfm_graphkit_hpo.yaml +++ b/configs/gridfm_graphkit_hpo.yaml @@ -52,26 +52,3 @@ static: run_name: run1 log_dir: logs report-performance: true - -# ======================== -# WLM plugin configuration -# Keys are forwarded to the WLM plugin as ITERATE_WLM_ env vars. -# Uncomment and adapt the block that matches your plugin. -# ======================== -# -# --- LSF (examples/wlm_plugins/lsf_plugin.sh) --- -# wlm: -# gpu-count: 1 -# cpu-count: 16 -# mem-gb: 32 -# lsf-gpu-config: "num=1:mode=exclusive_process:mps=no:gmodel=NVIDIAA100_SXM4_80GB" -# # queue: normal -# -# --- Vela / OpenShift (examples/wlm_plugins/vela_plugin.py) --- -# wlm: -# job-template: examples/vela_gridfm_template.yaml -# chart-path: ~/tmp/mlbatch/tools/pytorchjob-generator/chart -# namespace: my-openshift-project -# cmd-placeholder: "{{HPO_COMMAND}}" -# pod-ready-timeout: 600 -# job-timeout: 86400 diff --git a/examples/bumpy_function.py b/examples/bumpy_function.py index e098d76..5661fdb 100644 --- a/examples/bumpy_function.py +++ b/examples/bumpy_function.py @@ -1,6 +1,23 @@ #!/usr/bin/env python3 -import argparse +""" +Bumpy 3-D multimodal function — called by iterate2 as a trial script. + +iterate2 sets the following environment variables before calling this script: + ITERATE_TRIAL_NUMBER – integer trial index + ITERATE_OUT_FILE – path where metrics must be written + ITERATE_ERR_FILE – path for error logging + ITERATE_PARAM_X – HPO parameter x + ITERATE_PARAM_Y – HPO parameter y + ITERATE_PARAM_Z – HPO parameter z + ITERATE_PARAM_GLOBAL_MU – static parameter (three space-separated floats) + +All output that iterate2 uses to extract metrics must be written to +ITERATE_OUT_FILE (not stdout), one metric per line in "name: value" format. +""" + import math +import os +import sys def bumpy_function_3d( @@ -9,11 +26,11 @@ def bumpy_function_3d( mu_rest, sigma_rest, amps_rest, ): """ - 3D smooth multimodal function with: - - one global optimum = 1 at global_mu = (mx,my,mz) - - multiple local optima < 1 + 3D smooth multimodal function. + - one global optimum = 1 at global_mu = (mx, my, mz) + - multiple local optima < 1 - f(p) = 1 - Π_k (1 - a_k * exp(-||p - mu_k||^2 / (2 sigma_k^2))) + f(p) = 1 - prod_k (1 - a_k * exp(-||p - mu_k||^2 / (2 sigma_k^2))) """ def sqdist(p, q): @@ -21,74 +38,49 @@ def sqdist(p, q): p = (x, y, z) - # Global peak (amplitude = 1) - val = 1.0 - math.exp( - -sqdist(p, global_mu) / (2.0 * global_sigma**2) - ) + val = 1.0 - math.exp(-sqdist(p, global_mu) / (2.0 * global_sigma**2)) - # Local peaks for mu_k, sig_k, a_k in zip(mu_rest, sigma_rest, amps_rest): - term = 1.0 - a_k * math.exp( - -sqdist(p, mu_k) / (2.0 * sig_k**2) - ) - val *= term + val *= 1.0 - a_k * math.exp(-sqdist(p, mu_k) / (2.0 * sig_k**2)) return 1.0 - val if __name__ == "__main__": - parser = argparse.ArgumentParser("Evaluate the 3D bumpy multimodal function.") - - parser.add_argument("--x", type=float, required=True) - parser.add_argument("--y", type=float, required=True) - parser.add_argument("--z", type=float, required=True) - parser.add_argument("--trial-number", type=int, default=0) - - parser.add_argument( - "--global-mu", - type=float, - nargs=3, - default=[0.0, 0.0, 0.0], - metavar=("MX", "MY", "MZ"), - ) - parser.add_argument("--global-sigma", type=float, default=0.7) - - parser.add_argument( - "--mu-rest", - type=float, - nargs="*", - default=[-2.0, 0.0, 0.0, 2.0, 0.0, 0.0], - help="Flat list of (x y z) triplets", - ) - parser.add_argument( - "--sigma-rest", - type=float, - nargs="*", - default=[0.6, 0.6], - ) - parser.add_argument( - "--amps-rest", - type=float, - nargs="*", - default=[0.5, 0.8], - ) - - args = parser.parse_args() - - mu_rest = [ - tuple(args.mu_rest[i:i+3]) - for i in range(0, len(args.mu_rest), 3) - ] - + # --- read parameters from environment ---------------------------------- # + try: + x = float(os.environ["ITERATE_PARAM_X"]) + y = float(os.environ["ITERATE_PARAM_Y"]) + z = float(os.environ["ITERATE_PARAM_Z"]) + global_mu = tuple(map(float, os.environ["ITERATE_PARAM_GLOBAL_MU"].split())) + out_file = os.environ["ITERATE_OUT_FILE"] + trial_num = os.environ.get("ITERATE_TRIAL_NUMBER", "?") + except KeyError as exc: + print(f"ERROR: missing required environment variable {exc}", file=sys.stderr) + sys.exit(1) + + if len(global_mu) != 3: + print("ERROR: ITERATE_PARAM_GLOBAL_MU must contain exactly three floats", file=sys.stderr) + sys.exit(1) + + # Fixed defaults for the local-optima configuration + mu_rest = [(-2.0, 0.0, 0.0), (2.0, 0.0, 0.0)] + sigma_rest = [0.6, 0.6] + amps_rest = [0.5, 0.8] + global_sigma = 0.7 + + # --- evaluate ---------------------------------------------------------- # yval = bumpy_function_3d( - x=args.x, - y=args.y, - z=args.z, - global_mu=tuple(args.global_mu), - global_sigma=args.global_sigma, + x=x, y=y, z=z, + global_mu=global_mu, + global_sigma=global_sigma, mu_rest=mu_rest, - sigma_rest=args.sigma_rest, - amps_rest=args.amps_rest, + sigma_rest=sigma_rest, + amps_rest=amps_rest, ) - print(f'yval: {yval}, trial_number: {args.trial_number}') + # --- write metrics to ITERATE_OUT_FILE --------------------------------- # + with open(out_file, "w") as fh: + fh.write(f"yval: {yval}\n") + + print(f"[trial-{trial_num}] yval={yval:.6f}") diff --git a/examples/bumpy_hpo.yaml b/examples/bumpy_hpo.yaml index 0ceee54..bd0d109 100644 --- a/examples/bumpy_hpo.yaml +++ b/examples/bumpy_hpo.yaml @@ -1,23 +1,15 @@ -# ======================= -# Static parameters - passed to the underlying training script as is -# ======================= - -static: - global-mu: 23 42 66 +# HPO search space for the bumpy 3-D multimodal function. +# +# Only three sections are recognised by iterate2: +# metrics: – names to extract from the trial script output +# static: – fixed parameters passed to every trial +# hpo: – parameters Optuna will optimise + +metrics: + - yval -# ======================= -# WLM plugin configuration -# Keys are forwarded to the WLM plugin as ITERATE_WLM_ env vars. -# Omit this section entirely when running locally (no --wlm-plugin). -# ======================= -# wlm: -# gpu-count: 1 -# cpu-count: 4 -# mem-gb: 32 - -# ======================== -# Training hyperparameters - evaluated by optuna and passed to the underlying training script -# ======================== +static: + global-mu: "23 42 66" hpo: x: diff --git a/examples/run_lsf_gridfm_example_postgres.sh b/examples/run_lsf_gridfm_example_postgres.sh index c1a1f3d..878ac05 100755 --- a/examples/run_lsf_gridfm_example_postgres.sh +++ b/examples/run_lsf_gridfm_example_postgres.sh @@ -1,31 +1,21 @@ #!/usr/bin/env bash # ============================================================================= -# Example: iterate2 with the LSF WLM plugin and PostgreSQL coordinator +# Example: iterate2 with LSF job submission and PostgreSQL coordinator # -# Each Optuna trial is submitted as an LSF job by the plugin script -# examples/wlm_plugins/lsf_plugin.sh. That script reads all LSF -# resource settings from ITERATE_WLM_* env vars which are populated from -# the ``wlm:`` section in the HPO YAML (configs/gridfm_graphkit_hpo.yaml). +# iterate2 is a pure Optuna orchestrator - it knows nothing about LSF. +# For every trial it: +# 1. Samples hyperparameters via Optuna +# 2. Calls --script with all params exposed as ITERATE_PARAM_ env vars +# 3. Reads metrics from ITERATE_OUT_FILE after the script exits # -# What changed vs the old --wlm lsf approach -# ------------------------------------------ -# * --wlm lsf, --gpu-count, --cpu-count, --mem-gb, --lsf-gpu-config-string -# are gone; all of these now live in the wlm: block of the HPO YAML. -# * --wlm-plugin points to the LSF plugin script (user-owned). -# * The plugin can be customised freely without touching iterate2 itself. +# The trial script (examples/wlm_plugins/lsf_plugin.sh) owns ALL +# cluster concerns: venv activation, CUDA setup, bsub submission, etc. # # Prerequisites -# ------------- -# * LSF bsub/bjobs available on PATH -# * gridfm-graphkit installed in the venv below -# * configs/gridfm_graphkit_hpo.yaml present (with wlm: section filled in) -# * psycopg2-binary installed: pip install 'terratorch-iterate[postgresql]' -# * POSTGRES_URL set (or hard-code it in --optuna-db-path below) -# -# PostgreSQL coordinator -# ---------------------- -# Using PostgreSQL instead of SQLite / JournalFS is the recommended backend -# for high-parallelism HPO on a cluster. +# * LSF bsub available on PATH +# * configs/gridfm_graphkit_hpo.yaml present +# * psycopg2-binary: pip install 'terratorch-iterate[postgresql]' +# * POSTGRES_URL set # # export POSTGRES_URL="postgresql://user:password@host:5432/optuna_studies" # ============================================================================= @@ -35,44 +25,12 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" -# --------------------------------------------------------------------------- -# Required: PostgreSQL connection URL -# --------------------------------------------------------------------------- : "${POSTGRES_URL:?Please set POSTGRES_URL=postgresql://user:password@host:port/dbname}" -# --------------------------------------------------------------------------- -# Customisable paths – override via environment variables -# --------------------------------------------------------------------------- -GRIDFM_ROOT="${GRIDFM_ROOT:-/dccstor/terratorch/users/rkie/gitco/gridfm-graphkit}" -GRIDFM_VENV="${GRIDFM_VENV:-/u/rkie/venvs/venv_gridfm-graphkit}" -CUDA_BASE="${CUDA_BASE:-/opt/share/cuda-12.8.1}" -DATA_PATH="${DATA_PATH:-/u/rkie/}" - -# --------------------------------------------------------------------------- -# Pre-run commands executed inside every bsub job before the training script. -# --------------------------------------------------------------------------- -PRE_RUN="\ -export PATH='${CUDA_BASE}/bin:\$PATH' && \ -export CUDA_HOME='${CUDA_BASE}' && \ -export LD_LIBRARY_PATH='${CUDA_BASE}/lib64:\$LD_LIBRARY_PATH' && \ -cd '${GRIDFM_ROOT}' && \ -source '${GRIDFM_VENV}/bin/activate'" - -# --------------------------------------------------------------------------- -# Launch iterate2 -# -# Resource settings (gpu-count, cpu-count, mem-gb, lsf-gpu-config) are -# defined in the wlm: section of gridfm_graphkit_hpo.yaml – not here. -# --------------------------------------------------------------------------- iterate2 \ - --script "gridfm_graphkit train" \ - --interpreter "" \ - --root-dir "${GRIDFM_ROOT}" \ - --wlm-plugin "${SCRIPT_DIR}/wlm_plugins/lsf_plugin.sh" \ - --pre-run-commands "${PRE_RUN}" \ - --no-underscore-to-hyphen \ - --optuna-study-name gridfm_lsf_postgres_hpo \ - --optuna-db-path "${POSTGRES_URL}" \ - --parallelism 4 \ - --optuna-n-trials 20 \ - --hpo-yaml "${REPO_ROOT}/configs/gridfm_graphkit_hpo.yaml" + --script "${SCRIPT_DIR}/wlm_plugins/lsf_plugin.sh" \ + --optuna-study-name gridfm_lsf_postgres_hpo \ + --optuna-db-path "${POSTGRES_URL}" \ + --parallelism 4 \ + --optuna-n-trials 20 \ + --hpo-yaml "${REPO_ROOT}/configs/gridfm_graphkit_hpo.yaml" diff --git a/examples/run_setter_example.sh b/examples/run_setter_example.sh index 81d56d4..253deb7 100755 --- a/examples/run_setter_example.sh +++ b/examples/run_setter_example.sh @@ -1,21 +1,15 @@ #!/usr/bin/env bash # ============================================================================= -# Example: iterate2 with --param-setter +# Example: run a local trial script (no cluster, no WLM) # -# Some scripts (e.g. those using Hydra, MMCV, or custom key-value CLIs) do not -# accept traditional named flags: +# iterate2 calls examples/bumpy_function.py directly for each trial. +# All hyperparameters are supplied via ITERATE_PARAM_ environment +# variables. The script is responsible for: +# - reading those variables +# - running the computation +# - writing "metric_name: value" lines to ITERATE_OUT_FILE # -# python script.py --learning-rate 0.001 --batch-size 32 -# -# Instead they expect a setter-style interface: -# -# python script.py --set learning_rate 0.001 --set batch_size 32 -# -# Pass --param-setter to iterate2 to switch to this style. -# Every HPO and static parameter will be forwarded as: -# -- key value -# -# This example uses examples/bumpy_setter.py which accepts --set key value. +# The metrics section in the HPO YAML tells iterate2 which names to look for. # ============================================================================= set -euo pipefail @@ -23,12 +17,8 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" iterate2 \ - --script "${SCRIPT_DIR}/bumpy_setter.py" \ - --root-dir "${SCRIPT_DIR}" \ - --venv "" \ - --param-setter set \ - --optuna-study-name bumpy_setter_study \ - --optuna-db-path "sqlite:///bumpy_setter_hpo.db" \ - --optuna-n-trials 20 \ - --hpo-yaml "${SCRIPT_DIR}/bumpy_setter_hpo.yaml" \ - --metric "yval" + --script "${SCRIPT_DIR}/bumpy_function.py" \ + --optuna-study-name bumpy_local_study \ + --optuna-db-path "sqlite:///bumpy_local_hpo.db" \ + --optuna-n-trials 20 \ + --hpo-yaml "${SCRIPT_DIR}/bumpy_hpo.yaml" diff --git a/examples/run_vela_example.sh b/examples/run_vela_example.sh index e6f3234..4a36f68 100755 --- a/examples/run_vela_example.sh +++ b/examples/run_vela_example.sh @@ -1,27 +1,22 @@ #!/usr/bin/env bash # ============================================================================= -# Example: iterate2 with the Vela WLM plugin (OpenShift / MLBatch PyTorchJob) +# Example: iterate2 with Vela/OpenShift job submission (MLBatch PyTorchJob) # -# Each Optuna trial is submitted as a PyTorchJob by the plugin script -# examples/wlm_plugins/vela_plugin.py. That script reads all Vela/oc -# settings from ITERATE_WLM_* env vars which are populated from the -# ``wlm:`` section in the HPO YAML. +# iterate2 is a pure Optuna orchestrator – it knows nothing about Vela/OpenShift. +# For every trial it: +# 1. Samples hyperparameters via Optuna +# 2. Calls --script with all params exposed as ITERATE_PARAM_ env vars +# 3. Reads metrics from ITERATE_OUT_FILE after the script exits # -# What changed vs the old --wlm vela approach -# ------------------------------------------- -# * --wlm vela, --vela-job-template, --vela-chart-path, --vela-namespace, -# --vela-cmd-placeholder, --vela-pod-ready-timeout, --vela-job-timeout -# are gone; all of these now live in the wlm: block of the HPO YAML. -# * --wlm-plugin points to the Vela plugin script (user-owned). -# * The plugin can be customised freely without touching iterate2 itself. +# The trial script (examples/wlm_plugins/vela_plugin.py) owns ALL +# cluster concerns: job template rendering, helm/oc submission, waiting, etc. # # Prerequisites # ------------- # * helm CLI installed and on PATH # * oc CLI logged in to the target cluster # * mlbatch/tools/pytorchjob-generator/chart checked out locally -# * The gridfm HPO YAML (configs/gridfm_graphkit_hpo.yaml) present -# and the wlm: section filled in (see configs/gridfm_graphkit_hpo.yaml) +# * configs/gridfm_graphkit_hpo.yaml present # ============================================================================= set -euo pipefail @@ -30,12 +25,9 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" iterate2 \ - --script "gridfm_graphkit train" \ - --interpreter "" \ - --wlm-plugin "${SCRIPT_DIR}/wlm_plugins/vela_plugin.py" \ - --no-underscore-to-hyphen \ - --optuna-study-name gridfm_vela_hpo \ - --optuna-db-path "js:///gridfm_vela_hpo.journal" \ - --parallelism 16 \ - --optuna-n-trials 20 \ - --hpo-yaml "${REPO_ROOT}/configs/gridfm_graphkit_hpo.yaml" + --script "${SCRIPT_DIR}/wlm_plugins/vela_plugin.py" \ + --optuna-study-name gridfm_vela_hpo \ + --optuna-db-path "js:///gridfm_vela_hpo.journal" \ + --parallelism 16 \ + --optuna-n-trials 20 \ + --hpo-yaml "${REPO_ROOT}/configs/gridfm_graphkit_hpo.yaml" diff --git a/examples/wlm_plugins/lsf_plugin.sh b/examples/wlm_plugins/lsf_plugin.sh index 2af092f..74a4510 100755 --- a/examples/wlm_plugins/lsf_plugin.sh +++ b/examples/wlm_plugins/lsf_plugin.sh @@ -1,88 +1,76 @@ #!/usr/bin/env bash # ============================================================================= -# iterate2 WLM plugin – IBM Spectrum LSF +# Trial script for iterate2 - IBM Spectrum LSF backend # -# Submits each HPO trial as an LSF job via ``bsub -K`` (blocking). +# iterate2 calls this script once per trial. It owns ALL cluster concerns: +# activating the venv, composing the training command from env vars, and +# submitting the job via bsub. # # Environment variables provided by iterate2 # ------------------------------------------ -# ITERATE_TRIAL_NUMBER integer trial ID -# ITERATE_TRIAL_CMD full shell command to execute (includes cd, venv activation, etc.) -# ITERATE_OUT_FILE path where trial stdout must be written -# ITERATE_ERR_FILE path where trial stderr must be written +# ITERATE_TRIAL_NUMBER integer trial ID +# ITERATE_OUT_FILE path where metric lines must be written +# ITERATE_ERR_FILE path for error output +# ITERATE_PARAM_ one variable per HPO + static parameter +# (key uppercased, hyphens -> underscores) # -# WLM configuration (from the ``wlm:`` section in the HPO YAML) -# ------------------------------------------------------------- -# All keys from the ``wlm:`` block are available as -# ``ITERATE_WLM_`` (hyphens → underscores). -# Recognised keys (with defaults): -# -# gpu-count (ITERATE_WLM_GPU_COUNT) default: 1 -# cpu-count (ITERATE_WLM_CPU_COUNT) default: 4 -# mem-gb (ITERATE_WLM_MEM_GB) default: 32 -# lsf-gpu-config (ITERATE_WLM_LSF_GPU_CONFIG) default: auto from gpu-count -# queue (ITERATE_WLM_QUEUE) optional -# -# Usage in HPO YAML -# ----------------- -# wlm: -# gpu-count: 1 -# cpu-count: 16 -# mem-gb: 32 -# lsf-gpu-config: "num=1:mode=exclusive_process:mps=no:gmodel=NVIDIAA100_SXM4_80GB" -# # queue: normal # uncomment to specify an LSF queue -# -# Usage in the launch script -# -------------------------- -# iterate2 \ -# --wlm-plugin "$(dirname "$0")/wlm_plugins/lsf_plugin.sh" \ -# --hpo-yaml my_hpo.yaml \ -# ... +# Customise the sections marked CONFIGURE below. # # Exit code # --------- -# The script exits 0 on success, non-zero on failure. -# iterate2 marks the Optuna trial as FAILED when exit code != 0. +# Exit 0 on success. iterate2 marks the Optuna trial FAILED on non-zero exit. # ============================================================================= set -euo pipefail -# ── Read iterate2 env vars ──────────────────────────────────────────────────── +# -- Read iterate2 standard vars ----------------------------------------------- TRIAL_NUMBER="${ITERATE_TRIAL_NUMBER:?ITERATE_TRIAL_NUMBER not set}" -TRIAL_CMD="${ITERATE_TRIAL_CMD:?ITERATE_TRIAL_CMD not set}" OUT_FILE="${ITERATE_OUT_FILE:?ITERATE_OUT_FILE not set}" ERR_FILE="${ITERATE_ERR_FILE:?ITERATE_ERR_FILE not set}" -# ── WLM config from YAML wlm: section ──────────────────────────────────────── -GPU_COUNT="${ITERATE_WLM_GPU_COUNT:-1}" -CPU_COUNT="${ITERATE_WLM_CPU_COUNT:-4}" -MEM_GB="${ITERATE_WLM_MEM_GB:-32}" -QUEUE="${ITERATE_WLM_QUEUE:-}" -# If a custom GPU resource string is provided use it; otherwise derive from gpu-count. -if [[ -n "${ITERATE_WLM_LSF_GPU_CONFIG:-}" ]]; then - GPU_FRAGMENT="-gpu \"${ITERATE_WLM_LSF_GPU_CONFIG}\"" -elif [[ "${GPU_COUNT}" -gt 0 ]]; then - GPU_FRAGMENT="-gpu num=${GPU_COUNT}" -else - GPU_FRAGMENT="" -fi +# -- CONFIGURE: paths ---------------------------------------------------------- +GRIDFM_ROOT="${GRIDFM_ROOT:-/path/to/gridfm-graphkit}" +GRIDFM_VENV="${GRIDFM_VENV:-/path/to/venv}" +CUDA_BASE="${CUDA_BASE:-/opt/share/cuda-12.8.1}" + +# -- CONFIGURE: LSF resources -------------------------------------------------- +GPU_COUNT=1 +CPU_COUNT=16 +MEM_GB=32 +GPU_STRING="num=${GPU_COUNT}:mode=exclusive_process:mps=no" +# QUEUE="normal" # uncomment to target a specific queue + +# -- Build the training command from ITERATE_PARAM_* vars ---------------------- +# Each HPO / static parameter is available as ITERATE_PARAM_. +# Translate them into the CLI flags your training script expects. +TRAIN_CMD="gridfm_graphkit train" +TRAIN_CMD+=" --gpu_num ${ITERATE_PARAM_GPU_NUM}" +TRAIN_CMD+=" --batch_size ${ITERATE_PARAM_BATCH_SIZE}" +TRAIN_CMD+=" --num_workers ${ITERATE_PARAM_NUM_WORKERS}" +TRAIN_CMD+=" --config ${ITERATE_PARAM_CONFIG}" +TRAIN_CMD+=" --data_path ${ITERATE_PARAM_DATA_PATH}" +# Add further params as needed: +# [[ -n "${ITERATE_PARAM_COMPILE:-}" ]] && TRAIN_CMD+=" --compile ${ITERATE_PARAM_COMPILE}" -QUEUE_FRAGMENT="" -if [[ -n "${QUEUE}" ]]; then - QUEUE_FRAGMENT="-q ${QUEUE}" -fi +# -- Compose the full job command ---------------------------------------------- +JOB_CMD="\ +export PATH='${CUDA_BASE}/bin:\$PATH' && \ +export CUDA_HOME='${CUDA_BASE}' && \ +export LD_LIBRARY_PATH='${CUDA_BASE}/lib64:\$LD_LIBRARY_PATH' && \ +cd '${GRIDFM_ROOT}' && \ +source '${GRIDFM_VENV}/bin/activate' && \ +${TRAIN_CMD}" -# ── Build bsub command ──────────────────────────────────────────────────────── -# -K : block until the job finishes (iterate2 calls this per-trial in a thread) -# -o/-e: LSF writes stdout/stderr directly to these files -BSUB_CMD="bsub ${GPU_FRAGMENT} ${QUEUE_FRAGMENT} \ +# -- Submit via bsub ----------------------------------------------------------- +# -K blocks until the job finishes (iterate2 runs each trial in a thread). +# -o/-e redirect LSF job stdout/stderr to the files iterate2 will read. +bsub \ -K \ - -o ${OUT_FILE} \ - -e ${ERR_FILE} \ - -R \"rusage[ngpus=${GPU_COUNT}, cpu=${CPU_COUNT}, mem=${MEM_GB}GB]\" \ - -J hpo_trial_${TRIAL_NUMBER} \ - \"${TRIAL_CMD}\"" + -gpu "${GPU_STRING}" \ + -R "rusage[ngpus_physical=${GPU_COUNT},cpus=${CPU_COUNT},mem=${MEM_GB}GB]" \ + -o "${OUT_FILE}" \ + -e "${ERR_FILE}" \ + -J "hpo_trial_${TRIAL_NUMBER}" \ + "${JOB_CMD}" -echo "[lsf_plugin] Trial ${TRIAL_NUMBER}: submitting → ${BSUB_CMD}" -eval "${BSUB_CMD}" -echo "[lsf_plugin] Trial ${TRIAL_NUMBER}: job finished" +echo "[lsf_plugin] trial ${TRIAL_NUMBER} finished" diff --git a/terratorch_iterate/iterate2/_iterate2.py b/terratorch_iterate/iterate2/_iterate2.py index 1fea9a9..8d2e8cf 100644 --- a/terratorch_iterate/iterate2/_iterate2.py +++ b/terratorch_iterate/iterate2/_iterate2.py @@ -1,209 +1,175 @@ #!/usr/bin/env python3 +""" +iterate2 – minimal Optuna HPO launcher. + +iterate2 does exactly three things: + 1. Load the HPO search space and static parameters from a YAML file. + 2. For every Optuna trial, sample parameters and call a user-provided + script with those parameters exposed as environment variables. + 3. After the script exits, extract one or more metrics from the log file + the script wrote and return them to Optuna. + +The user script is fully in charge of *how* the trial runs – activating a +virtualenv, submitting a bsub/sbatch job, running locally, launching a +container, etc. iterate2 has no opinion on any of that. + +Environment variables passed to the script +------------------------------------------ + ITERATE_TRIAL_NUMBER integer trial ID (0-based) + ITERATE_OUT_FILE path the script must write its stdout to + ITERATE_ERR_FILE path the script must write its stderr to + ITERATE_PARAM_ one variable per sampled + static parameter + (key uppercased, hyphens and spaces → underscores) + +HPO YAML format +--------------- + metrics: # list of metric names to extract from ITERATE_OUT_FILE + - val_loss + - accuracy + + static: # fixed parameters, forwarded as-is every trial + epochs: 50 + dataset: /data/my_dataset + + hpo: # parameters Optuna will optimise + learning_rate: + type: float + low: 1e-5 + high: 1e-2 + log: true + batch_size: + type: categorical + choices: [16, 32, 64] +""" import argparse -import json import logging import os +import re import subprocess import sys -import re -import tempfile import threading -import time from pathlib import Path -from typing import Dict, Any, Optional, Literal, List +from typing import List, Optional import optuna import yaml from terratorch_iterate.iterate2.plugin.coordinator import load_builtin_plugins, resolve_storage -# Load built-in coordinator plugins (sqlite, journalfs, postgresql) load_builtin_plugins() logger = logging.getLogger("iterate2") -# ============================================================ -# CLI -# ============================================================ +# ─── CLI ───────────────────────────────────────────────────────────────────── def parse_args(): - parser = argparse.ArgumentParser( - description="Generic Optuna HPO launcher with Multi-Metric support" + p = argparse.ArgumentParser( + description="Minimal Optuna HPO launcher – calls a user script per trial" ) + p.add_argument("--script", required=True, + help="Executable to call for each trial") + p.add_argument("--hpo-yaml", required=True, + help="YAML file with 'hpo:', 'static:', and 'metrics:' sections") + p.add_argument("--optuna-study-name", required=True) + p.add_argument("--optuna-db-path", required=True, + help="Optuna storage URL (sqlite:///hpo.db, js:///journal.log, postgresql://…)") + p.add_argument("--optuna-n-trials", type=int, default=100) + p.add_argument("--parallelism", type=int, default=1, + help="Parallel trials (threads). Use PostgreSQL/JournalStorage for >4.") + p.add_argument("--log-level", default="INFO", + choices=["DEBUG", "INFO", "WARNING", "ERROR"]) + return p.parse_args() + + +# ─── YAML LOADING ──────────────────────────────────────────────────────────── + +def load_yaml(path: str) -> dict: + with open(path) as f: + return yaml.safe_load(f) or {} + +def load_hpo_space(data: dict) -> dict: + space = data.get("hpo", {}) + logger.info("HPO space: %d param(s): %s", len(space), list(space.keys())) + return space - # ------------------------ - # Execution config - # ------------------------ - parser.add_argument("--script", required=True, help="Training script to execute") - parser.add_argument("--root-dir", default=None, help="Root dir (derived if omitted)") - parser.add_argument("--venv", default=".venv", help="Virtualenv dir (shortcut for source /bin/activate)") - parser.add_argument( - "--pre-run-commands", - default=None, - help=( - "Shell commands to run before the training script, joined with ' && '. " - "Useful for sourcing bashrc, activating conda/mamba envs, loading modules, etc. " - "Example: 'source ~/.bashrc && micromamba activate gridfm'. " - "When set, --venv is ignored." - ), - ) - parser.add_argument("--interpreter", default="python", help="Interpreter to use") - parser.add_argument("--param-setter", type=str, default=None) - parser.add_argument( - "--wlm-plugin", - type=str, - default=None, - help=( - "Path to an executable WLM plugin script that submits/runs each trial. " - "When omitted, trials run locally (equivalent to the old --wlm none). " - "The plugin receives trial information as environment variables: " - "ITERATE_TRIAL_NUMBER, ITERATE_TRIAL_CMD, ITERATE_OUT_FILE, ITERATE_ERR_FILE, " - "and ITERATE_WLM_ for every key in the 'wlm:' YAML section. " - "Exit 0 to signal success; any other exit code marks the trial as failed. " - "See examples/wlm_plugins/ for LSF and Vela reference implementations." - ), - ) - parser.add_argument( - "--parallelism", - type=int, - default=1, - help="Number of trials to run in parallel (default: 1 = sequential). " - "Each parallel trial runs in its own thread. " - "For SQLite storage, values >4 may cause locking contention; " - "consider PostgreSQL for high parallelism.", - ) - parser.add_argument( - "--no-underscore-to-hyphen", - dest="underscore_to_hyphen", - action="store_false", - default=True, - help="Do not convert underscores to hyphens in arg names (default: convert)", - ) +def load_static(data: dict) -> dict: + static = data.get("static", {}) + logger.info("Static params: %d key(s): %s", len(static), list(static.keys())) + return static - # ------------------------ - # Optuna config - # ------------------------ - parser.add_argument("--optuna-study-name", required=True) - parser.add_argument("--optuna-db-path", required=True) - parser.add_argument("--optuna-n-trials", type=int, default=100) - - # ------------------------ - # HPO space - # ------------------------ - parser.add_argument("--hpo-json", type=str, default=None) - parser.add_argument("--hpo-yaml", type=str, default=None) - parser.add_argument("--static-args-json", type=str, default=None) - parser.add_argument("--static-args-yaml", type=str, default=None) - - # ------------------------ - # Metric extraction (Supports comma-separated list) - # ------------------------ - parser.add_argument( - "--metrics", - default="score_combined", - help="Comma-separated metric names to extract (e.g. score_linear_acc,score_modality_leak,score_combined)", - ) +def load_metrics(data: dict, fallback: str = "score") -> List[str]: + raw = data.get("metrics", None) + if raw is None: + logger.warning("No 'metrics:' key in YAML – defaulting to '%s'", fallback) + return [fallback] + if isinstance(raw, list): + return [str(m).strip() for m in raw] + return [m.strip() for m in str(raw).split(",")] - # ------------------------ - # Logging - # ------------------------ - parser.add_argument( - "--log-level", - default="INFO", - choices=["DEBUG", "INFO", "WARNING", "ERROR"], - help="Logging verbosity (default: INFO)", - ) - return parser.parse_args() - - -# ============================================================ -# HELPERS & COMMAND BUILDERS -# ============================================================ - -def resolve_paths(script: str, root_dir: Optional[str]): - if root_dir is None: root_dir = '.' - resolved = Path(root_dir).resolve() - logger.debug("Resolved root_dir '%s' → '%s'", root_dir, resolved) - return script, resolved - -def build_shell_command(interpreter, root_dir, script_path, venv, script_args, param_setter, underscore_to_hyphen=True, pre_run_commands=None): - parts = [f"cd {root_dir}"] - if pre_run_commands: - parts.append(pre_run_commands) - logger.debug("Pre-run commands: %s", pre_run_commands) - elif venv: - parts.append(f"source {venv}/bin/activate") - logger.debug("Activating venv: %s", venv) - arg_list = [f"{interpreter} {script_path}"] - for key, value in script_args.items(): - arg_name = key.replace("_", "-") if underscore_to_hyphen else key - if value is None: - logger.debug("Skipping arg '%s': value is None (flag omitted)", key) - continue - if param_setter: - if isinstance(value, bool): - if value: - arg_list.append(f"--{param_setter} {key}") - logger.debug("Setter flag: --%s %s (store_true)", param_setter, key) - else: - logger.debug("Skipping flag '%s': False → omitted", key) - else: - arg_list.append(f"--{param_setter} {key} {value}") - logger.debug("Setter arg: --%s %s %s", param_setter, key, value) - else: - if isinstance(value, bool): - if value: - arg_list.append(f"--{arg_name}") - logger.debug("Flag present: --%s", arg_name) - else: - logger.debug("Skipping flag '--%s': False → omitted", arg_name) - else: - arg_list.append(f"--{arg_name} {value}") - logger.debug("Arg: --%s %s", arg_name, value) - cmd = " && ".join(parts + [" ".join(arg_list)]) - logger.debug("Shell command: %s", cmd) - return cmd - - -def build_container_command(interpreter: str, script_path: str, script_args: dict, param_setter: Optional[str], underscore_to_hyphen: bool = True) -> str: - """Build a bare CLI invocation suitable for running inside a container. - - Unlike :func:`build_shell_command` this function does **not** prepend - ``cd`` or ``source venv`` – those are not needed (or available) inside an - already-running container image. - """ - prefix = f"{interpreter} " if interpreter else "" - arg_list = [f"{prefix}{script_path}".strip()] - for key, value in script_args.items(): - arg_name = key.replace("_", "-") if underscore_to_hyphen else key - if value is None: - logger.debug("Container cmd: skipping '%s' (None)", key) - continue - if param_setter: - if isinstance(value, bool): - if value: - arg_list.append(f"--{param_setter} {key}") - else: - pass # omit +# ─── OPTUNA PARAM SAMPLING ─────────────────────────────────────────────────── + +def suggest(trial: optuna.Trial, name: str, spec: dict): + t = spec["type"] + if t == "float": + return trial.suggest_float(name, float(spec["low"]), float(spec["high"]), + log=spec.get("log", False)) + if t == "int": + return trial.suggest_int(name, int(spec["low"]), int(spec["high"]), + log=spec.get("log", False)) + if t == "categorical": + return trial.suggest_categorical(name, spec["choices"]) + if t == "flag": + return trial.suggest_categorical(name, [True, False]) + if t == "group": + return trial.suggest_categorical(name, list(spec["choices"].keys())) + raise ValueError(f"Unknown param type '{t}' for '{name}'") + + +# ─── METRIC EXTRACTION ─────────────────────────────────────────────────────── + +def extract_metrics(out_file: str, err_file: str, metric_names: List[str]) -> List[float]: + """Read both output files and extract the last occurrence of each metric.""" + text = "" + for path in (out_file, err_file): + try: + text += Path(path).read_text(encoding="utf-8", errors="ignore") + "\n" + except FileNotFoundError: + pass + + results = [] + for metric in metric_names: + # Support name#N for Nth-occurrence selection (0-based) + occurrence: Optional[int] = None + bare = metric + m = re.fullmatch(r'(.+)#(\d+)', metric) + if m: + bare, occurrence = m.group(1), int(m.group(2)) + + pattern = re.compile( + rf"(?:\[\w+\]\s*)?{re.escape(bare)}\s*[:=│]\s*([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)" + ) + matches = pattern.findall(text) + if not matches: + logger.warning("Metric '%s' not found – defaulting to 0.0", metric) + results.append(0.0) + elif occurrence is not None: + if occurrence >= len(matches): + logger.warning("Metric '%s' occurrence #%d not found – defaulting to 0.0", + metric, occurrence) + results.append(0.0) else: - arg_list.append(f"--{param_setter} {key} {value}") + results.append(float(matches[occurrence])) else: - if isinstance(value, bool): - if value: - arg_list.append(f"--{arg_name}") - # else omit - else: - arg_list.append(f"--{arg_name} {value}") - cmd = " ".join(arg_list) - logger.debug("Container command: %s", cmd) - return cmd + results.append(float(matches[-1])) + return results +# ─── SCRIPT RUNNER ─────────────────────────────────────────────────────────── + _print_lock = threading.Lock() -def _stream_pipe(pipe, dest_file, trial_id: int, stream_name: str, dest_stream): - """Read lines from *pipe*, write to *dest_file* and print prefixed to *dest_stream*.""" +def _stream(pipe, dest_file: str, trial_id: int, dest_stream): prefix = f"[trial-{trial_id}]" with open(dest_file, "w", encoding="utf-8", errors="replace") as fh: for raw in pipe: @@ -214,185 +180,24 @@ def _stream_pipe(pipe, dest_file, trial_id: int, stream_name: str, dest_stream): dest_stream.write(f"{prefix} {line}") dest_stream.flush() -def run_and_stream(launcher_cmd: str, trial_id: int, out_file: str, err_file: str): - """Run *launcher_cmd* locally, capturing stdout/stderr to files and streaming - every line to the console prefixed with ``[trial-N]``.""" - logger.debug("Trial %d: run_and_stream cmd=%s", trial_id, launcher_cmd) +def run_script(script: str, env: dict, trial_id: int, out_file: str, err_file: str): + """Run *script* with *env*, stream output, raise on non-zero exit.""" + logger.info("Trial %d: calling %s", trial_id, script) proc = subprocess.Popen( - launcher_cmd, - shell=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - t_out = threading.Thread( - target=_stream_pipe, - args=(proc.stdout, out_file, trial_id, "stdout", sys.stdout), - daemon=True, - ) - t_err = threading.Thread( - target=_stream_pipe, - args=(proc.stderr, err_file, trial_id, "stderr", sys.stderr), - daemon=True, + [script], env=env, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) - t_out.start() - t_err.start() + import threading as _t + t_out = _t.Thread(target=_stream, args=(proc.stdout, out_file, trial_id, sys.stdout), daemon=True) + t_err = _t.Thread(target=_stream, args=(proc.stderr, err_file, trial_id, sys.stderr), daemon=True) + t_out.start(); t_err.start() proc.wait() - t_out.join() - t_err.join() - + t_out.join(); t_err.join() if proc.returncode != 0: - raise subprocess.CalledProcessError(proc.returncode, launcher_cmd) - -# ============================================================ -# MULTI-METRIC EXTRACTION -# ============================================================ - -def extract_metrics_from_log(path: str, metric_names: List[str], err_path: Optional[str] = None) -> List[float]: - """Extract metric values from a log file. - - Each entry in *metric_names* is either a plain name (uses the **last** - match) or ``name#N`` to select the **N-th occurrence** (0-based). This - lets you disambiguate scripts that print the same metric key multiple - times, e.g.:: - - metrics: - - "Samples/sec#0" # DataLoader throughput (first occurrence) - - "Samples/sec#1" # Training throughput (second occurrence) - - "Samples/sec#2" # Inference throughput (third occurrence) - - GFLOPS - """ - logger.debug("Extracting metrics %s from '%s'", metric_names, path) - results = [] - with open(path, "r", encoding="utf-8", errors="ignore") as f: - text = f.read() - logger.debug("Log file '%s': %d characters read", path, len(text)) - # Also read stderr — Lightning/rich writes test result tables there - if err_path: - try: - with open(err_path, "r", encoding="utf-8", errors="ignore") as f: - err_text = f.read() - logger.debug("Err file '%s': %d characters read", err_path, len(err_text)) - text = text + "\n" + err_text - except FileNotFoundError: - logger.debug("Err file '%s' not found, skipping", err_path) - - for metric in metric_names: - # Support name#N syntax for Nth-occurrence selection (0-based) - occurrence: Optional[int] = None - bare_metric = metric - idx_match = re.fullmatch(r'(.+)#(\d+)', metric) - if idx_match: - bare_metric = idx_match.group(1) - occurrence = int(idx_match.group(2)) - - # Matches: key: value | key=value | [performance] key : value | Lightning table │ key │ value │ - pattern = re.compile( - rf"(?:\[\w+\]\s*)?{re.escape(bare_metric)}\s*(?:[:=│])\s*([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)" - ) - matches = pattern.findall(text) - if not matches: - logger.warning("Metric '%s' not found in '%s' — defaulting to 0.0", metric, path) - results.append(0.0) - elif occurrence is not None: - if occurrence >= len(matches): - logger.warning( - "Metric '%s' occurrence #%d requested but only %d match(es) found — defaulting to 0.0", - metric, occurrence, len(matches), - ) - results.append(0.0) - else: - value = float(matches[occurrence]) - logger.debug("Metric '%s': using occurrence #%d = %s", metric, occurrence, value) - results.append(value) - else: - value = float(matches[-1]) - logger.debug("Metric '%s': found %d match(es), using last value %s", metric, len(matches), value) - results.append(value) - return results + raise subprocess.CalledProcessError(proc.returncode, script) -# ============================================================ -# MAIN -# ============================================================ - -def load_hpo_space(args): - data = {} - if args.hpo_json: - logger.debug("Loading HPO space from JSON string") - data = json.loads(args.hpo_json) - elif args.hpo_yaml: - logger.debug("Loading HPO space from YAML file: %s", args.hpo_yaml) - with open(args.hpo_yaml, "r") as f: data = yaml.safe_load(f) - space = data.get("hpo", {}) - logger.info("HPO space loaded: %d parameter(s): %s", len(space), list(space.keys())) - return space - -def load_metrics_from_yaml(args): - """Return metrics list from YAML 'metrics:' key, or None if not present.""" - data = {} - if args.hpo_json: - data = json.loads(args.hpo_json) - elif args.hpo_yaml: - with open(args.hpo_yaml, "r") as f: data = yaml.safe_load(f) - elif args.static_args_yaml: - with open(args.static_args_yaml, "r") as f: data = yaml.safe_load(f) - metrics = data.get("metrics", None) - if metrics is None: - return None - if isinstance(metrics, list): - return [m.strip() for m in metrics] - return [m.strip() for m in str(metrics).split(",")] - -def load_static_args(args): - data = {} - if args.static_args_json: - logger.debug("Loading static args from JSON string") - data = json.loads(args.static_args_json) - elif args.static_args_yaml: - logger.debug("Loading static args from YAML file: %s", args.static_args_yaml) - with open(args.static_args_yaml, "r") as f: data = yaml.safe_load(f) - elif args.hpo_yaml: - logger.debug("Loading static args from HPO YAML file: %s", args.hpo_yaml) - with open(args.hpo_yaml, "r") as f: data = yaml.safe_load(f) - static = data.get("static", data if data else {}) - logger.info("Static args loaded: %d key(s): %s", len(static), list(static.keys())) - return static -def load_wlm_config(args) -> dict: - """Load the optional ``wlm:`` section from the HPO YAML. - - Returns a dict that is forwarded to the WLM plugin as - ``ITERATE_WLM_`` environment variables. Keys are taken verbatim - from the YAML (e.g. ``gpu-count``, ``mem-gb``) so plugin scripts can - use familiar names. - """ - data = {} - if args.hpo_yaml: - with open(args.hpo_yaml, "r") as f: - data = yaml.safe_load(f) or {} - elif args.static_args_yaml: - with open(args.static_args_yaml, "r") as f: - data = yaml.safe_load(f) or {} - wlm_cfg = data.get("wlm", {}) - logger.info("WLM config from YAML: %s", wlm_cfg) - return wlm_cfg - - -def suggest_from_spec(trial, name, spec): - t = spec["type"] - if t == "float": - val = trial.suggest_float(name, float(spec["low"]), float(spec["high"]), log=spec.get("log", False)) - elif t == "int": - val = trial.suggest_int(name, int(spec["low"]), int(spec["high"]), log=spec.get("log", False)) - elif t == "categorical": - val = trial.suggest_categorical(name, spec["choices"]) - elif t == "flag": - val = trial.suggest_categorical(name, [True, False]) - elif t == "group": - val = trial.suggest_categorical(name, list(spec["choices"].keys())) - else: - raise ValueError(f"Unknown param type: {t}") - logger.debug("Suggested '%s' (%s) = %r", name, t, val) - return val +# ─── MAIN ──────────────────────────────────────────────────────────────────── def main(): args = parse_args() @@ -402,161 +207,91 @@ def main(): format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) - # Suppress noisy optuna INFO logs unless user asked for DEBUG logging.getLogger("optuna").setLevel( logging.WARNING if args.log_level == "INFO" else getattr(logging, args.log_level) ) - logger.info("iterate2 starting") - logger.info("Log level: %s", args.log_level) - logger.info( - "WLM plugin: %s | interpreter: %s | script: %s", - args.wlm_plugin or "(local)", args.interpreter, args.script, - ) - logger.info("Optuna study: '%s' | db: %s | n_trials: %d", args.optuna_study_name, args.optuna_db_path, args.optuna_n_trials) - - hpo_space = load_hpo_space(args) - static_args = load_static_args(args) - wlm_config = load_wlm_config(args) - yaml_metrics = load_metrics_from_yaml(args) - metric_list = yaml_metrics if yaml_metrics is not None else [m.strip() for m in args.metrics.split(",")] - logger.info("Optimising metrics: %s (source: %s)", metric_list, "yaml" if yaml_metrics else "cli") - - script_path, root_dir = resolve_paths(args.script, args.root_dir) - - def objective(trial): - script_args = static_args.copy() - for name, spec in hpo_space.items(): - val = suggest_from_spec(trial, name, spec) - if spec["type"] == "group": - # Expand the chosen group's key→value pairs directly into script_args - script_args.update(spec["choices"][val]) - else: - script_args[name] = val - - # gpu_num can appear in hpo or static space; pull it out and add to - # wlm_config so plugins can use ITERATE_WLM_GPU_NUM for resource allocation. - trial_wlm_config = dict(wlm_config) - if "gpu_num" in script_args: - trial_wlm_config.setdefault("gpu-count", script_args.pop("gpu_num")) + logger.info("iterate2 starting script=%s yaml=%s", args.script, args.hpo_yaml) + logger.info("study=%s db=%s n_trials=%d parallelism=%d", + args.optuna_study_name, args.optuna_db_path, + args.optuna_n_trials, args.parallelism) - logger.info("Trial %d: sampled parameters: %s", trial.number, script_args) - logger.debug("Trial %d: effective WLM config: %s", trial.number, trial_wlm_config) - - out_file = f"trial_{trial.number}.out" - err_file = f"trial_{trial.number}.err" - logger.debug("Trial %d: stdout → %s | stderr → %s", trial.number, out_file, err_file) - - # Build the shell command that the plugin (or local runner) shall execute. - shell_cmd = build_shell_command( - args.interpreter, root_dir, script_path, args.venv, - script_args, args.param_setter, args.underscore_to_hyphen, - pre_run_commands=args.pre_run_commands, - ) - # Container-safe command: no cd / source – for plugins running inside - # an already-configured container image (e.g. Vela/OpenShift). - container_cmd = build_container_command( - args.interpreter, script_path, script_args, - args.param_setter, args.underscore_to_hyphen, - ) - - if args.wlm_plugin: - # ── User-provided WLM plugin ─────────────────────────────────── - # The plugin is responsible for submitting the trial, waiting for - # completion, and writing stdout/stderr to out_file/err_file. - # It signals success via exit code 0; any other value fails the trial. - env = os.environ.copy() - env["ITERATE_TRIAL_NUMBER"] = str(trial.number) - env["ITERATE_TRIAL_CMD"] = shell_cmd - env["ITERATE_TRIAL_CONTAINER_CMD"] = container_cmd - env["ITERATE_OUT_FILE"] = out_file - env["ITERATE_ERR_FILE"] = err_file - for k, v in trial_wlm_config.items(): - env_key = "ITERATE_WLM_" + k.upper().replace("-", "_").replace(" ", "_") - env[env_key] = str(v) - logger.info("Trial %d: invoking WLM plugin %s", trial.number, args.wlm_plugin) - result = subprocess.run( - [args.wlm_plugin], - env=env, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - # Stream plugin output to console so operators see submission status. - plugin_out = result.stdout.decode("utf-8", errors="replace") - plugin_err = result.stderr.decode("utf-8", errors="replace") - prefix = f"[trial-{trial.number}][plugin]" - for line in plugin_out.splitlines(): - sys.stdout.write(f"{prefix} {line}\n") - for line in plugin_err.splitlines(): - sys.stderr.write(f"{prefix} {line}\n") - if result.returncode != 0: - raise subprocess.CalledProcessError(result.returncode, args.wlm_plugin) - else: - # ── Local execution (no WLM plugin) ─────────────────────────── - launcher_cmd = f'bash -c "{shell_cmd}"' - logger.info("Trial %d: running locally → %s", trial.number, launcher_cmd) - run_and_stream(launcher_cmd, trial.number, out_file, err_file) - - logger.info("Trial %d: job finished", trial.number) - - values = extract_metrics_from_log(out_file, metric_list, err_path=err_file) - logger.info("Trial %d: results %s", trial.number, dict(zip(metric_list, values))) - - return tuple(values) - - # Multi-objective direction - directions = ["maximize"] * len(metric_list) - logger.info("Creating Optuna study (directions: %s)", directions) + data = load_yaml(args.hpo_yaml) + hpo_space = load_hpo_space(data) + static = load_static(data) + metrics = load_metrics(data) + directions = ["maximize"] * len(metrics) + logger.info("Metrics: %s", metrics) storage = resolve_storage(args.optuna_db_path) - logger.debug("Optuna storage: %s", storage) - study = optuna.create_study( study_name=args.optuna_study_name, storage=storage, directions=directions, load_if_exists=True, ) - logger.info("Study '%s' ready (existing trials: %d)", args.optuna_study_name, len(study.trials)) + logger.info("Study '%s' ready (existing trials: %d)", + args.optuna_study_name, len(study.trials)) - # ── Re-queue failed trials (25 % retry / 75 % new) ──────────────────── - failed_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.FAIL] + # ── Re-queue failed trials: 25 % retry, 75 % new ───────────────────── + failed = [t for t in study.trials if t.state == optuna.trial.TrialState.FAIL] n_total = args.optuna_n_trials - if failed_trials: - n_retry = max(1, round(0.25 * n_total)) - n_retry = min(n_retry, len(failed_trials)) # can't retry more than we have + if failed: + n_retry = min(max(1, round(0.25 * n_total)), len(failed)) n_new = n_total - n_retry - # enqueue the most-recent failed trials first - trials_to_retry = failed_trials[-n_retry:] - logger.info( - "Found %d failed trial(s). Re-queuing %d (25%%) and running %d new (75%%).", - len(failed_trials), n_retry, n_new, - ) - for ft in trials_to_retry: - if ft.params: # skip trials that had no params at all + logger.info("Found %d failed trial(s) – re-queuing %d (25%%), %d new (75%%)", + len(failed), n_retry, n_new) + for ft in failed[-n_retry:]: + if ft.params: study.enqueue_trial(ft.params) - logger.info(" Enqueued params from failed trial %d: %s", ft.number, ft.params) + logger.info(" Enqueued failed trial %d params: %s", ft.number, ft.params) + # ───────────────────────────────────────────────────────────────────── + + def objective(trial): + # ── Sample parameters ───────────────────────────────────────────── + params = dict(static) # start with static params + for name, spec in hpo_space.items(): + val = suggest(trial, name, spec) + if spec["type"] == "group": + params.update(spec["choices"][val]) # expand group → flat keys else: - logger.info(" Skipped failed trial %d (no params recorded).", ft.number) - # adjust total so we run exactly n_new *additional* new trials on top - n_total = n_new + n_retry # enqueued slots count toward n_trials - else: - logger.info("No failed trials found – running %d fresh trials.", n_total) - # ── end retry logic ─────────────────────────────────────────────────── - - logger.info("Parallelism: %d worker(s)", args.parallelism) + params[name] = val + + logger.info("Trial %d params: %s", trial.number, params) + + out_file = f"trial_{trial.number}.out" + err_file = f"trial_{trial.number}.err" + + # ── Build env for the script ────────────────────────────────────── + env = os.environ.copy() + env["ITERATE_TRIAL_NUMBER"] = str(trial.number) + env["ITERATE_OUT_FILE"] = out_file + env["ITERATE_ERR_FILE"] = err_file + for k, v in params.items(): + env_key = "ITERATE_PARAM_" + str(k).upper().replace("-", "_").replace(" ", "_") + env[env_key] = str(v) if v is not None else "" + + # ── Call the script ─────────────────────────────────────────────── + run_script(args.script, env, trial.number, out_file, err_file) + + # ── Extract metrics ─────────────────────────────────────────────── + values = extract_metrics(out_file, err_file, metrics) + logger.info("Trial %d results: %s", trial.number, dict(zip(metrics, values))) + return tuple(values) + + logger.info("Starting optimisation (%d worker(s))", args.parallelism) study.optimize( objective, n_trials=n_total, n_jobs=args.parallelism, - catch=(Exception,), # mark trial as FAILED and continue; never crash the study + catch=(Exception,), ) logger.info("=" * 60) - logger.info("OPTIMIZATION COMPLETE") - logger.info("Pareto Front Trials: %d", len(study.best_trials)) + logger.info("OPTIMISATION COMPLETE Pareto front: %d trial(s)", len(study.best_trials)) for t in study.best_trials: - logger.info(" Trial %d: Values=%s Params=%s", t.number, t.values, t.params) + logger.info(" Trial %d: values=%s params=%s", t.number, t.values, t.params) + if __name__ == "__main__": - main() \ No newline at end of file + main() From be709e2bb32d2399d0e7366872fd1c490745551c Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Fri, 24 Apr 2026 17:27:25 +0200 Subject: [PATCH 03/12] fix lsf plugin Signed-off-by: Romeo Kienzler --- examples/wlm_plugins/lsf_plugin.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/wlm_plugins/lsf_plugin.sh b/examples/wlm_plugins/lsf_plugin.sh index 74a4510..47069c6 100755 --- a/examples/wlm_plugins/lsf_plugin.sh +++ b/examples/wlm_plugins/lsf_plugin.sh @@ -64,10 +64,13 @@ ${TRAIN_CMD}" # -- Submit via bsub ----------------------------------------------------------- # -K blocks until the job finishes (iterate2 runs each trial in a thread). # -o/-e redirect LSF job stdout/stderr to the files iterate2 will read. +MEM_MB=$(( MEM_GB * 1024 )) + bsub \ -K \ -gpu "${GPU_STRING}" \ - -R "rusage[ngpus_physical=${GPU_COUNT},cpus=${CPU_COUNT},mem=${MEM_GB}GB]" \ + -n "${CPU_COUNT}" \ + -R "rusage[mem=${MEM_MB}]" \ -o "${OUT_FILE}" \ -e "${ERR_FILE}" \ -J "hpo_trial_${TRIAL_NUMBER}" \ From b444ac3315d1a441aef75911ea72048a0321c576 Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Fri, 24 Apr 2026 20:53:19 +0200 Subject: [PATCH 04/12] Add example scripts for running gridfm-graphkit HPO on CCC and ZuVela clusters Signed-off-by: Romeo Kienzler --- examples/run_ccc_gridfm_example.sh | 37 +++++++++++++ examples/run_zuvela_gridfm_example.sh | 35 +++++++++++++ examples/wlm_plugins/ccc_plugin.sh | 75 +++++++++++++++++++++++++++ examples/wlm_plugins/zuvela_plugin.sh | 73 ++++++++++++++++++++++++++ 4 files changed, 220 insertions(+) create mode 100755 examples/run_ccc_gridfm_example.sh create mode 100755 examples/run_zuvela_gridfm_example.sh create mode 100755 examples/wlm_plugins/ccc_plugin.sh create mode 100755 examples/wlm_plugins/zuvela_plugin.sh diff --git a/examples/run_ccc_gridfm_example.sh b/examples/run_ccc_gridfm_example.sh new file mode 100755 index 0000000..fa773b9 --- /dev/null +++ b/examples/run_ccc_gridfm_example.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +# ============================================================================= +# Run gridfm-graphkit HPO on CCC (IBM Spectrum LSF cluster) +# +# iterate2 orchestrates Optuna trials. For each trial it calls +# examples/wlm_plugins/ccc_plugin.sh, which owns all LSF concerns: +# venv activation, CUDA setup, and bsub submission. +# +# Prerequisites +# * bsub / bjobs available on PATH +# * gridfm-graphkit installed in GRIDFM_VENV +# * configs/gridfm_graphkit_hpo.yaml present +# * psycopg2-binary: pip install 'terratorch-iterate[postgresql]' +# * POSTGRES_URL exported +# +# export POSTGRES_URL="postgresql://user:password@host:5432/optuna_studies" +# ============================================================================= + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +: "${POSTGRES_URL:?Please export POSTGRES_URL=postgresql://user:password@host:port/dbname}" + +# Override via env vars if your paths differ from the plugin defaults +export GRIDFM_ROOT="${GRIDFM_ROOT:-/dccstor/terratorch/users/rkie/gitco/gridfm-graphkit}" +export GRIDFM_VENV="${GRIDFM_VENV:-/u/rkie/venvs/venv_gridfm-graphkit}" +export CUDA_BASE="${CUDA_BASE:-/opt/share/cuda-12.8.1}" + +iterate2 \ + --script "${SCRIPT_DIR}/wlm_plugins/ccc_plugin.sh" \ + --optuna-study-name gridfm_ccc_hpo \ + --optuna-db-path "${POSTGRES_URL}" \ + --parallelism 4 \ + --optuna-n-trials 20 \ + --hpo-yaml "${REPO_ROOT}/configs/gridfm_graphkit_hpo.yaml" diff --git a/examples/run_zuvela_gridfm_example.sh b/examples/run_zuvela_gridfm_example.sh new file mode 100755 index 0000000..09d679f --- /dev/null +++ b/examples/run_zuvela_gridfm_example.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# ============================================================================= +# Run gridfm-graphkit HPO on ZuVela (IBM Spectrum LSF cluster) +# +# iterate2 orchestrates Optuna trials. For each trial it calls +# examples/wlm_plugins/zuvela_plugin.sh, which owns all LSF concerns: +# micromamba environment activation and bsub submission. +# +# Prerequisites +# * bsub / bjobs available on PATH +# * gridfm-graphkit installed in the "gridfm" micromamba env +# * configs/gridfm_graphkit_hpo.yaml present +# * A study storage backend available (SQLite shown; switch to PostgreSQL +# for high-parallelism runs: export POSTGRES_URL=... and use it below) +# ============================================================================= + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +# Override via env vars if your paths differ from the plugin defaults +export GRIDFM_ROOT="${GRIDFM_ROOT:-${HOME}/gitco/gridfm-graphkit}" +export MICROMAMBA_ENV="${MICROMAMBA_ENV:-gridfm}" + +STUDY_DB="${STUDY_DB:-sqlite:///gridfm_zuvela_hpo.db}" +# For PostgreSQL: export STUDY_DB="postgresql://user:password@host:5432/optuna" + +iterate2 \ + --script "${SCRIPT_DIR}/wlm_plugins/zuvela_plugin.sh" \ + --optuna-study-name gridfm_zuvela_hpo \ + --optuna-db-path "${STUDY_DB}" \ + --parallelism 4 \ + --optuna-n-trials 20 \ + --hpo-yaml "${REPO_ROOT}/configs/gridfm_graphkit_hpo.yaml" diff --git a/examples/wlm_plugins/ccc_plugin.sh b/examples/wlm_plugins/ccc_plugin.sh new file mode 100755 index 0000000..7ba5910 --- /dev/null +++ b/examples/wlm_plugins/ccc_plugin.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +# ============================================================================= +# Trial script for iterate2 - CCC (IBM Spectrum LSF) backend +# +# Called once per Optuna trial. Activates the venv, builds the training +# command from ITERATE_PARAM_* env vars, and submits it via bsub -K. +# +# Environment variables provided by iterate2 +# ------------------------------------------ +# ITERATE_TRIAL_NUMBER integer trial ID +# ITERATE_OUT_FILE metric lines are read from here by iterate2 +# ITERATE_ERR_FILE path for error output +# ITERATE_PARAM_ one variable per HPO + static parameter +# (key uppercased, hyphens -> underscores) +# +# Path overrides (set in the run script or your environment) +# ---------------------------------------------------------- +# GRIDFM_ROOT repo root (default below) +# GRIDFM_VENV Python venv path (default below) +# CUDA_BASE CUDA install root (default below) +# ============================================================================= + +set -euo pipefail + +# -- iterate2 standard vars --------------------------------------------------- +TRIAL_NUMBER="${ITERATE_TRIAL_NUMBER:?ITERATE_TRIAL_NUMBER not set}" +OUT_FILE="${ITERATE_OUT_FILE:?ITERATE_OUT_FILE not set}" +ERR_FILE="${ITERATE_ERR_FILE:?ITERATE_ERR_FILE not set}" + +# -- Paths (override via env) ------------------------------------------------- +GRIDFM_ROOT="${GRIDFM_ROOT:-/dccstor/terratorch/users/rkie/gitco/gridfm-graphkit}" +GRIDFM_VENV="${GRIDFM_VENV:-/u/rkie/venvs/venv_gridfm-graphkit}" +CUDA_BASE="${CUDA_BASE:-/opt/share/cuda-12.8.1}" + +# -- LSF resources ------------------------------------------------------------ +GPU_COUNT=1 +CPU_COUNT=16 +MEM_GB=32 +MEM_MB=$(( MEM_GB * 1024 )) +GPU_STRING="num=${GPU_COUNT}:mode=exclusive_process:mps=no:gmodel=NVIDIAA100_SXM4_80GB" +# QUEUE="normal" # uncomment to target a specific queue + +# -- Build training command from ITERATE_PARAM_* vars ------------------------- +TRAIN_CMD="gridfm_graphkit train" +TRAIN_CMD+=" --gpu_num ${ITERATE_PARAM_GPU_NUM}" +TRAIN_CMD+=" --batch_size ${ITERATE_PARAM_BATCH_SIZE}" +TRAIN_CMD+=" --num_workers ${ITERATE_PARAM_NUM_WORKERS}" +TRAIN_CMD+=" --config ${ITERATE_PARAM_CONFIG}" +TRAIN_CMD+=" --data_path ${ITERATE_PARAM_DATA_PATH}" +# [[ -n "${ITERATE_PARAM_COMPILE:-}" ]] && TRAIN_CMD+=" --compile ${ITERATE_PARAM_COMPILE}" + +# -- Compose full job shell command ------------------------------------------- +JOB_CMD="\ +export PATH='${CUDA_BASE}/bin:\$PATH' && \ +export CUDA_HOME='${CUDA_BASE}' && \ +export LD_LIBRARY_PATH='${CUDA_BASE}/lib64:\$LD_LIBRARY_PATH' && \ +cd '${GRIDFM_ROOT}' && \ +source '${GRIDFM_VENV}/bin/activate' && \ +${TRAIN_CMD}" + +# -- Submit via bsub ---------------------------------------------------------- +# -K : blocks until the job completes (iterate2 runs each trial in a thread) +# -n : CPU slots +# -o/-e: redirect job stdout/stderr to paths iterate2 will read metrics from +bsub \ + -K \ + -gpu "${GPU_STRING}" \ + -n "${CPU_COUNT}" \ + -R "rusage[mem=${MEM_MB}]" \ + -o "${OUT_FILE}" \ + -e "${ERR_FILE}" \ + -J "hpo_trial_${TRIAL_NUMBER}" \ + "${JOB_CMD}" + +echo "[ccc_plugin] trial ${TRIAL_NUMBER} finished" diff --git a/examples/wlm_plugins/zuvela_plugin.sh b/examples/wlm_plugins/zuvela_plugin.sh new file mode 100755 index 0000000..31a0830 --- /dev/null +++ b/examples/wlm_plugins/zuvela_plugin.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash +# ============================================================================= +# Trial script for iterate2 - ZuVela (IBM Spectrum LSF) backend +# +# Called once per Optuna trial. Activates the micromamba environment, builds +# the training command from ITERATE_PARAM_* env vars, and submits via bsub -K. +# +# bsub pattern used (non-interactive, blocking): +# bsub -gpu num= -K \ +# -R "rusage[ngpus=, cpu=, mem=GB]" \ +# -J gridfm_ \ +# 'cd ~/gitco/gridfm-graphkit && source ~/.bashrc && micromamba activate gridfm && ' +# +# Environment variables provided by iterate2 +# ------------------------------------------ +# ITERATE_TRIAL_NUMBER integer trial ID +# ITERATE_OUT_FILE metric lines are read from here by iterate2 +# ITERATE_ERR_FILE path for error output +# ITERATE_PARAM_ one variable per HPO + static parameter +# (key uppercased, hyphens -> underscores) +# +# Path overrides (set in the run script or your environment) +# ---------------------------------------------------------- +# GRIDFM_ROOT repo root (default: ~/gitco/gridfm-graphkit) +# MICROMAMBA_ENV micromamba env name (default: gridfm) +# ============================================================================= + +set -euo pipefail + +# -- iterate2 standard vars --------------------------------------------------- +TRIAL_NUMBER="${ITERATE_TRIAL_NUMBER:?ITERATE_TRIAL_NUMBER not set}" +OUT_FILE="${ITERATE_OUT_FILE:?ITERATE_OUT_FILE not set}" +ERR_FILE="${ITERATE_ERR_FILE:?ITERATE_ERR_FILE not set}" + +# -- Paths (override via env) ------------------------------------------------- +GRIDFM_ROOT="${GRIDFM_ROOT:-${HOME}/gitco/gridfm-graphkit}" +MICROMAMBA_ENV="${MICROMAMBA_ENV:-gridfm}" + +# -- LSF resources ------------------------------------------------------------ +GPU_COUNT=1 +CPU_COUNT=16 +MEM_GB=32 + +# -- Build training command from ITERATE_PARAM_* vars ------------------------- +TRAIN_CMD="gridfm_graphkit train" +TRAIN_CMD+=" --gpu_num ${ITERATE_PARAM_GPU_NUM}" +TRAIN_CMD+=" --batch_size ${ITERATE_PARAM_BATCH_SIZE}" +TRAIN_CMD+=" --num_workers ${ITERATE_PARAM_NUM_WORKERS}" +TRAIN_CMD+=" --config ${ITERATE_PARAM_CONFIG}" +TRAIN_CMD+=" --data_path ${ITERATE_PARAM_DATA_PATH}" +# [[ -n "${ITERATE_PARAM_COMPILE:-}" ]] && TRAIN_CMD+=" --compile ${ITERATE_PARAM_COMPILE}" + +# -- Compose full job shell command ------------------------------------------- +# source ~/.bashrc to initialise micromamba shell hooks +JOB_CMD="\ +cd '${GRIDFM_ROOT}' && \ +source ~/.bashrc && \ +micromamba activate '${MICROMAMBA_ENV}' && \ +${TRAIN_CMD}" + +# -- Submit via bsub ---------------------------------------------------------- +# -K : blocks until the job completes (iterate2 runs each trial in a thread) +# -o/-e: write output to paths iterate2 will scan for metrics +bsub \ + -K \ + -gpu "num=${GPU_COUNT}" \ + -R "rusage[ngpus=${GPU_COUNT}, cpu=${CPU_COUNT}, mem=${MEM_GB}GB]" \ + -o "${OUT_FILE}" \ + -e "${ERR_FILE}" \ + -J "gridfm_trial_${TRIAL_NUMBER}" \ + "${JOB_CMD}" + +echo "[zuvela_plugin] trial ${TRIAL_NUMBER} finished" From 560087d8c188870f33f0e7ecea5127233ecfc047 Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Fri, 24 Apr 2026 21:02:30 +0200 Subject: [PATCH 05/12] Update GPU_COUNT handling in ZuVela and CCC plugins to use HPO group parameter Signed-off-by: Romeo Kienzler --- examples/run_zuvela_gridfm_example.sh | 2 +- examples/wlm_plugins/ccc_plugin.sh | 5 +++-- examples/wlm_plugins/zuvela_plugin.sh | 5 +++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/run_zuvela_gridfm_example.sh b/examples/run_zuvela_gridfm_example.sh index 09d679f..70aa771 100755 --- a/examples/run_zuvela_gridfm_example.sh +++ b/examples/run_zuvela_gridfm_example.sh @@ -26,7 +26,7 @@ export MICROMAMBA_ENV="${MICROMAMBA_ENV:-gridfm}" STUDY_DB="${STUDY_DB:-sqlite:///gridfm_zuvela_hpo.db}" # For PostgreSQL: export STUDY_DB="postgresql://user:password@host:5432/optuna" -iterate2 \ +iterate \ --script "${SCRIPT_DIR}/wlm_plugins/zuvela_plugin.sh" \ --optuna-study-name gridfm_zuvela_hpo \ --optuna-db-path "${STUDY_DB}" \ diff --git a/examples/wlm_plugins/ccc_plugin.sh b/examples/wlm_plugins/ccc_plugin.sh index 7ba5910..14f1d5f 100755 --- a/examples/wlm_plugins/ccc_plugin.sh +++ b/examples/wlm_plugins/ccc_plugin.sh @@ -33,7 +33,8 @@ GRIDFM_VENV="${GRIDFM_VENV:-/u/rkie/venvs/venv_gridfm-graphkit}" CUDA_BASE="${CUDA_BASE:-/opt/share/cuda-12.8.1}" # -- LSF resources ------------------------------------------------------------ -GPU_COUNT=1 +# GPU_COUNT comes from the HPO group param so LSF allocates the right number. +GPU_COUNT="${ITERATE_PARAM_GPU_NUM:-1}" CPU_COUNT=16 MEM_GB=32 MEM_MB=$(( MEM_GB * 1024 )) @@ -41,8 +42,8 @@ GPU_STRING="num=${GPU_COUNT}:mode=exclusive_process:mps=no:gmodel=NVIDIAA100_SXM # QUEUE="normal" # uncomment to target a specific queue # -- Build training command from ITERATE_PARAM_* vars ------------------------- +# --gpu_num is NOT a gridfm_graphkit flag; GPU count is controlled via bsub -gpu. TRAIN_CMD="gridfm_graphkit train" -TRAIN_CMD+=" --gpu_num ${ITERATE_PARAM_GPU_NUM}" TRAIN_CMD+=" --batch_size ${ITERATE_PARAM_BATCH_SIZE}" TRAIN_CMD+=" --num_workers ${ITERATE_PARAM_NUM_WORKERS}" TRAIN_CMD+=" --config ${ITERATE_PARAM_CONFIG}" diff --git a/examples/wlm_plugins/zuvela_plugin.sh b/examples/wlm_plugins/zuvela_plugin.sh index 31a0830..3b2779c 100755 --- a/examples/wlm_plugins/zuvela_plugin.sh +++ b/examples/wlm_plugins/zuvela_plugin.sh @@ -37,13 +37,14 @@ GRIDFM_ROOT="${GRIDFM_ROOT:-${HOME}/gitco/gridfm-graphkit}" MICROMAMBA_ENV="${MICROMAMBA_ENV:-gridfm}" # -- LSF resources ------------------------------------------------------------ -GPU_COUNT=1 +# GPU_COUNT comes from the HPO group param so LSF allocates the right number. +GPU_COUNT="${ITERATE_PARAM_GPU_NUM:-1}" CPU_COUNT=16 MEM_GB=32 # -- Build training command from ITERATE_PARAM_* vars ------------------------- +# --gpu_num is NOT a gridfm_graphkit flag; GPU count is controlled via bsub -gpu. TRAIN_CMD="gridfm_graphkit train" -TRAIN_CMD+=" --gpu_num ${ITERATE_PARAM_GPU_NUM}" TRAIN_CMD+=" --batch_size ${ITERATE_PARAM_BATCH_SIZE}" TRAIN_CMD+=" --num_workers ${ITERATE_PARAM_NUM_WORKERS}" TRAIN_CMD+=" --config ${ITERATE_PARAM_CONFIG}" From a0168b7e192e343f13817b0788a192a205ac4aa2 Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Sat, 25 Apr 2026 09:59:34 +0200 Subject: [PATCH 06/12] Fix data_path in gridfm_graphkit HPO configuration Signed-off-by: Romeo Kienzler --- configs/gridfm_graphkit_hpo.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/gridfm_graphkit_hpo.yaml b/configs/gridfm_graphkit_hpo.yaml index d578e19..94b9a28 100644 --- a/configs/gridfm_graphkit_hpo.yaml +++ b/configs/gridfm_graphkit_hpo.yaml @@ -46,7 +46,7 @@ hpo: choices: case118: config: ./examples/config/HGNS_PF_datakit_case118.yaml - data_path: /u/rkie/ + data_path: /u/rki/ static: run_name: run1 From 05d14f0af2da1a9ee3fd1eb5a4d65b8f2506a30b Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Tue, 28 Apr 2026 14:09:45 +0200 Subject: [PATCH 07/12] Enhance training command in zuvela_plugin.sh to include run name, log directory, and performance reporting options Signed-off-by: Romeo Kienzler --- examples/wlm_plugins/zuvela_plugin.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/wlm_plugins/zuvela_plugin.sh b/examples/wlm_plugins/zuvela_plugin.sh index 3b2779c..fd9bf4b 100755 --- a/examples/wlm_plugins/zuvela_plugin.sh +++ b/examples/wlm_plugins/zuvela_plugin.sh @@ -45,11 +45,14 @@ MEM_GB=32 # -- Build training command from ITERATE_PARAM_* vars ------------------------- # --gpu_num is NOT a gridfm_graphkit flag; GPU count is controlled via bsub -gpu. TRAIN_CMD="gridfm_graphkit train" -TRAIN_CMD+=" --batch_size ${ITERATE_PARAM_BATCH_SIZE}" +TRAIN_CMD+=" --batch_size ${ITERATE_PARAM_BATCH_SIZE}" TRAIN_CMD+=" --num_workers ${ITERATE_PARAM_NUM_WORKERS}" -TRAIN_CMD+=" --config ${ITERATE_PARAM_CONFIG}" -TRAIN_CMD+=" --data_path ${ITERATE_PARAM_DATA_PATH}" +TRAIN_CMD+=" --config ${ITERATE_PARAM_CONFIG}" +TRAIN_CMD+=" --data_path ${ITERATE_PARAM_DATA_PATH}" # [[ -n "${ITERATE_PARAM_COMPILE:-}" ]] && TRAIN_CMD+=" --compile ${ITERATE_PARAM_COMPILE}" +[[ -n "${ITERATE_PARAM_RUN_NAME:-}" ]] && TRAIN_CMD+=" --run_name ${ITERATE_PARAM_RUN_NAME}" +[[ -n "${ITERATE_PARAM_LOG_DIR:-}" ]] && TRAIN_CMD+=" --log_dir ${ITERATE_PARAM_LOG_DIR}" +[[ "${ITERATE_PARAM_REPORT_PERFORMANCE:-}" == "True" ]] && TRAIN_CMD+=" --report-performance" # -- Compose full job shell command ------------------------------------------- # source ~/.bashrc to initialise micromamba shell hooks From acf84411e688147e81209377c8cfc2b4a3ba5a27 Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Tue, 28 Apr 2026 18:27:27 +0200 Subject: [PATCH 08/12] Update database connection handling in run_zuvela_gridfm_example.sh to require PostgreSQL URL Signed-off-by: Romeo Kienzler --- examples/run_zuvela_gridfm_example.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/run_zuvela_gridfm_example.sh b/examples/run_zuvela_gridfm_example.sh index 70aa771..94337a3 100755 --- a/examples/run_zuvela_gridfm_example.sh +++ b/examples/run_zuvela_gridfm_example.sh @@ -23,8 +23,10 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" export GRIDFM_ROOT="${GRIDFM_ROOT:-${HOME}/gitco/gridfm-graphkit}" export MICROMAMBA_ENV="${MICROMAMBA_ENV:-gridfm}" -STUDY_DB="${STUDY_DB:-sqlite:///gridfm_zuvela_hpo.db}" -# For PostgreSQL: export STUDY_DB="postgresql://user:password@host:5432/optuna" +# Require a PostgreSQL URL – set via environment before calling this script: +# export POSTGRES_URL="postgresql://user:password@host:5432/optuna_studies" +: "${POSTGRES_URL:?Please set POSTGRES_URL=postgresql://user:password@host:port/dbname}" +STUDY_DB="${POSTGRES_URL}" iterate \ --script "${SCRIPT_DIR}/wlm_plugins/zuvela_plugin.sh" \ From e564e2c88ede855bad2c133ec8e6234caf5a6a22 Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Mon, 4 May 2026 11:55:08 +0200 Subject: [PATCH 09/12] Refactor scripts to replace 'iterate2' with 'iterate' for consistency across examples Signed-off-by: Romeo Kienzler --- examples/run_ccc_gridfm_example.sh | 2 +- examples/run_lsf_gridfm_example_postgres.sh | 2 +- examples/run_setter_example.sh | 2 +- examples/run_vela_example.sh | 2 +- examples/run_zuvela_gridfm_example.sh | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/run_ccc_gridfm_example.sh b/examples/run_ccc_gridfm_example.sh index fa773b9..af5a8a7 100755 --- a/examples/run_ccc_gridfm_example.sh +++ b/examples/run_ccc_gridfm_example.sh @@ -28,7 +28,7 @@ export GRIDFM_ROOT="${GRIDFM_ROOT:-/dccstor/terratorch/users/rkie/gitco/gridfm-g export GRIDFM_VENV="${GRIDFM_VENV:-/u/rkie/venvs/venv_gridfm-graphkit}" export CUDA_BASE="${CUDA_BASE:-/opt/share/cuda-12.8.1}" -iterate2 \ +iterate \ --script "${SCRIPT_DIR}/wlm_plugins/ccc_plugin.sh" \ --optuna-study-name gridfm_ccc_hpo \ --optuna-db-path "${POSTGRES_URL}" \ diff --git a/examples/run_lsf_gridfm_example_postgres.sh b/examples/run_lsf_gridfm_example_postgres.sh index 878ac05..ed28a78 100755 --- a/examples/run_lsf_gridfm_example_postgres.sh +++ b/examples/run_lsf_gridfm_example_postgres.sh @@ -27,7 +27,7 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" : "${POSTGRES_URL:?Please set POSTGRES_URL=postgresql://user:password@host:port/dbname}" -iterate2 \ +iterate \ --script "${SCRIPT_DIR}/wlm_plugins/lsf_plugin.sh" \ --optuna-study-name gridfm_lsf_postgres_hpo \ --optuna-db-path "${POSTGRES_URL}" \ diff --git a/examples/run_setter_example.sh b/examples/run_setter_example.sh index 253deb7..b728dd4 100755 --- a/examples/run_setter_example.sh +++ b/examples/run_setter_example.sh @@ -16,7 +16,7 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -iterate2 \ +iterate \ --script "${SCRIPT_DIR}/bumpy_function.py" \ --optuna-study-name bumpy_local_study \ --optuna-db-path "sqlite:///bumpy_local_hpo.db" \ diff --git a/examples/run_vela_example.sh b/examples/run_vela_example.sh index 4a36f68..aa5461e 100755 --- a/examples/run_vela_example.sh +++ b/examples/run_vela_example.sh @@ -24,7 +24,7 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" -iterate2 \ +iterate \ --script "${SCRIPT_DIR}/wlm_plugins/vela_plugin.py" \ --optuna-study-name gridfm_vela_hpo \ --optuna-db-path "js:///gridfm_vela_hpo.journal" \ diff --git a/examples/run_zuvela_gridfm_example.sh b/examples/run_zuvela_gridfm_example.sh index 94337a3..7dd2888 100755 --- a/examples/run_zuvela_gridfm_example.sh +++ b/examples/run_zuvela_gridfm_example.sh @@ -10,8 +10,8 @@ # * bsub / bjobs available on PATH # * gridfm-graphkit installed in the "gridfm" micromamba env # * configs/gridfm_graphkit_hpo.yaml present -# * A study storage backend available (SQLite shown; switch to PostgreSQL -# for high-parallelism runs: export POSTGRES_URL=... and use it below) +# * psycopg2-binary: pip install 'terratorch-iterate[postgresql]' +# * POSTGRES_URL set: export POSTGRES_URL="postgresql://user:password@host:5432/optuna_studies" # ============================================================================= set -euo pipefail From 62334b63a48c50c3b220b50ca8ab17b9831461ad Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Mon, 11 May 2026 11:30:03 +0200 Subject: [PATCH 10/12] Update venv option description in iterate2 documentation for clarity Signed-off-by: Romeo Kienzler --- docs/iterate2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/iterate2.md b/docs/iterate2.md index 74d9e44..0a84e26 100644 --- a/docs/iterate2.md +++ b/docs/iterate2.md @@ -41,7 +41,7 @@ iterate2 \ |---|---|---| | `--script` | *(required)* | Training script to execute | | `--root-dir` | `.` | Working directory; derived from `--script` if omitted | -| `--venv` | `.venv` | Virtual-environment directory to activate. Set to empty string to disable | +| `--venv` | *(none)* | Virtual-environment directory to activate. Omit to skip venv activation entirely | | `--interpreter` | `python` | Python interpreter to invoke | | `--param-setter` | `None` | Use setter-style argument passing (see [Setter-style arguments](#setter-style-arguments)) | | `--wlm-plugin` | *(local)* | Path to an executable WLM plugin script. When omitted, trials run locally in the current process | From a2143e27052bab3e1a27e90fe068de37aeba3d12 Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Mon, 11 May 2026 14:19:09 +0200 Subject: [PATCH 11/12] Bump version to 0.4rc1 in pyproject.toml Signed-off-by: Romeo Kienzler --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d53e340..115f621 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ include = ["terratorch_iterate*"] [project] name = "terratorch-iterate" -version = "0.3" +version = "0.4rc1" requires-python = ">= 3.11" description = "A terratorch's plugin for benchmarking and hyperparameter optimization" authors = [ From b32d1bcefd26c417431b6292d950c3ccbff1c37d Mon Sep 17 00:00:00 2001 From: Romeo Kienzler Date: Wed, 13 May 2026 16:23:04 +0200 Subject: [PATCH 12/12] Update version to 0.4 in pyproject.toml Signed-off-by: Romeo Kienzler --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 115f621..3237f3e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ include = ["terratorch_iterate*"] [project] name = "terratorch-iterate" -version = "0.4rc1" +version = "0.4" requires-python = ">= 3.11" description = "A terratorch's plugin for benchmarking and hyperparameter optimization" authors = [