From f26ccd401a7c3dcd908a264fb9dc2d2335dcf9b1 Mon Sep 17 00:00:00 2001 From: Swagatam Haldar <38006539+swag2198@users.noreply.github.com> Date: Tue, 14 Apr 2026 09:45:38 +0200 Subject: [PATCH 1/2] `--local` not working on compute nodes without internet access (#58) * add partition, timelimit overrides, fix collect results metrics * add a single --slurm_opt arg to capture partition,account,time etc. * feat: add comma separated slurm overrides #44 * slurm args format to json, add json tests * add .strip() to values of slurm json * uppercase env vars for slurm, special handling for only time * fix: --local works now on interactive Leo offline compute nodes * fix ruff --- README.md | 10 ++++++++++ oellm/main.py | 18 +++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4e461ab..a92f932 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,16 @@ oellm schedule-eval \ Results are written to `./oellm-output//results/`. +**Air-gapped cluster nodes (no internet):** batch jobs set `HF_HUB_OFFLINE=1` and get `HF_HOME` from your cluster env. With `--local`, the CLI defaults `HF_HOME` to `~/.cache/huggingface` if unset and would otherwise allow Hub access—so on a compute node without network, export your real cache and offline flag before running, for example: + +```bash +export HF_HOME=/leonardo_work/OELLM_prod2026/users/shaldar0/oellm-evals/hf_data +export HF_HUB_OFFLINE=1 +oellm schedule-eval ... --venv_path .venv --local true +``` + +The `HF_HUB_OFFLINE` value is read when you invoke `oellm` and baked into the generated script. + ## SLURM Overrides Override cluster defaults (partition, account, time limit, etc.) with `--slurm_template_var` (JSON object): diff --git a/oellm/main.py b/oellm/main.py index 3165209..c32baec 100644 --- a/oellm/main.py +++ b/oellm/main.py @@ -31,6 +31,22 @@ ) +def _resolve_hf_hub_offline(local: bool) -> int: + """Value embedded in the generated eval script as HF_HUB_OFFLINE. + + If ``HF_HUB_OFFLINE`` is set in the environment when ``oellm`` runs, that + value wins. Otherwise defaults to online Hub access for ``--local`` + (typical laptop dev) and offline for SLURM jobs (air-gapped workers). + """ + raw = os.environ.get("HF_HUB_OFFLINE") + if raw is not None and str(raw).strip() != "": + try: + return int(str(raw).strip()) + except ValueError: + logging.warning("Invalid HF_HUB_OFFLINE=%r; using default", raw) + return 0 if local else 1 + + @dataclass class EvaluationJob: model_path: Path | str @@ -369,7 +385,7 @@ def schedule_evals( venv_path=venv_path or "", lm_eval_include_path=lm_eval_include_path or str(files("oellm.resources") / "custom_lm_eval_tasks"), - hf_hub_offline=0 if local else 1, + hf_hub_offline=_resolve_hf_hub_offline(local), lighteval_model_args="trust_remote_code=True,batch_size=1" if local else "trust_remote_code=True", From f73d9dd6a3d0407ee38edc5b7cb65ced06eb8f14 Mon Sep 17 00:00:00 2001 From: Ivan Slobozhan Date: Tue, 14 Apr 2026 13:12:05 +0200 Subject: [PATCH 2/2] fix lints --- oellm/scheduler.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/oellm/scheduler.py b/oellm/scheduler.py index 98c288f..7314a3b 100644 --- a/oellm/scheduler.py +++ b/oellm/scheduler.py @@ -13,22 +13,6 @@ from oellm.constants import EvaluationJob from oellm.runner import EvalRunner - - -def _resolve_hf_hub_offline(local: bool) -> int: - """Value embedded in the generated eval script as HF_HUB_OFFLINE. - - If ``HF_HUB_OFFLINE`` is set in the environment when ``oellm`` runs, that - value wins. Otherwise defaults to online Hub access for ``--local`` - (typical laptop dev) and offline for SLURM jobs (air-gapped workers). - """ - raw = os.environ.get("HF_HUB_OFFLINE") - if raw is not None and str(raw).strip() != "": - try: - return int(str(raw).strip()) - except ValueError: - logging.warning("Invalid HF_HUB_OFFLINE=%r; using default", raw) - return 0 if local else 1 from oellm.task_groups import ( _collect_dataset_specs, _collect_hf_dataset_files, @@ -50,6 +34,22 @@ def _resolve_hf_hub_offline(local: bool) -> int: ) +def _resolve_hf_hub_offline(local: bool) -> int: + """Value embedded in the generated eval script as HF_HUB_OFFLINE. + + If ``HF_HUB_OFFLINE`` is set in the environment when ``oellm`` runs, that + value wins. Otherwise defaults to online Hub access for ``--local`` + (typical laptop dev) and offline for SLURM jobs (air-gapped workers). + """ + raw = os.environ.get("HF_HUB_OFFLINE") + if raw is not None and str(raw).strip() != "": + try: + return int(str(raw).strip()) + except ValueError: + logging.warning("Invalid HF_HUB_OFFLINE=%r; using default", raw) + return 0 if local else 1 + + @capture_third_party_output_from_kwarg("verbose") def schedule_evals( models: str | None = None,