def _resolve_hf_hub_offline(local: bool) -> int:
    """Return the ``HF_HUB_OFFLINE`` value to embed in the generated eval script.

    An explicit, non-blank ``HF_HUB_OFFLINE`` in the environment at the time
    ``oellm`` is invoked takes precedence over the mode-based default.

    Accepted spellings (case-insensitive, surrounding whitespace ignored):

    * ``1`` / ``true`` / ``yes`` / ``on``  -> ``1`` (offline)
    * ``0`` / ``false`` / ``no`` / ``off`` -> ``0`` (online)
    * any other integer literal is passed through unchanged

    Args:
        local: Whether the evaluation runs with ``--local`` rather than as a
            SLURM batch job.

    Returns:
        The resolved integer flag. When the variable is unset, blank, or
        unparseable, this is ``0`` for ``local`` runs (typical laptop dev
        with Hub access) and ``1`` otherwise (air-gapped SLURM workers).
    """
    default = 0 if local else 1

    raw = os.environ.get("HF_HUB_OFFLINE")
    if raw is None:
        return default

    value = raw.strip()
    if not value:
        # A blank value is treated the same as "not set".
        return default

    # Boolean words are checked before int(): they are common spellings for
    # this flag, and int("true") would raise and silently drop the user's
    # explicit offline request in favour of the default.
    lowered = value.lower()
    if lowered in ("1", "true", "yes", "on"):
        return 1
    if lowered in ("0", "false", "no", "off"):
        return 0

    try:
        # Keep passing through other integer literals as before.
        return int(value)
    except ValueError:
        logging.warning("Invalid HF_HUB_OFFLINE=%r; using default", raw)
        return default