diff --git a/README.md b/README.md
index 1a44218..2083798 100644
--- a/README.md
+++ b/README.md
@@ -172,19 +172,43 @@ The `HF_HUB_OFFLINE` value is read when you invoke `oellm` and baked into the ge
 ## SLURM Overrides
 
-Override cluster defaults (partition, account, time limit, etc.) with `--slurm-template-var` (JSON object):
+Override cluster defaults (partition, account, time limit, memory, etc.) with `--slurm-template-var` (JSON object). Provide `SLURM_MEM` to request an exact host memory amount; otherwise the job falls back to the default of `96G`.
 
 ```bash
 # Use a different partition (e.g. dev-g on LUMI when small-g is crowded)
 oellm schedule-eval --models "model-name" --task-groups "open-sci-0.01" \
   --slurm-template-var '{"PARTITION":"dev-g"}'
 
-# Multiple overrides: partition, account, time limit, GPUs
+# Multiple overrides: partition, account, time limit, GPUs, exact RAM
 oellm schedule-eval --models "model-name" --task-groups "open-sci-0.01" \
-  --slurm-template-var '{"PARTITION":"dev-g","ACCOUNT":"myproject","TIME":"02:00:00","GPUS_PER_NODE":2}'
+  --slurm-template-var '{"PARTITION":"dev-g","ACCOUNT":"myproject","TIME":"02:00:00","GPUS_PER_NODE":2,"SLURM_MEM":"96G"}'
 ```
 
-Use exact env var names: `PARTITION`, `ACCOUNT`, `GPUS_PER_NODE`. `TIME` (HH:MM:SS) overrides the time limit.
+Use exact env var names: `PARTITION`, `ACCOUNT`, `GPUS_PER_NODE`, `SLURM_MEM`. `TIME` (HH:MM:SS) overrides the time limit.
+
+## Lighteval Batch Size
+
+For lighteval runs, generated jobs default to `batch_size=1` for local runs and
+`batch_size=32` for non-local (SLURM/cluster) runs. This reduces the risk of
+out-of-memory failures where lighteval's auto batch-size detection can be
+overly optimistic for multiple-choice loglikelihood tasks. You can still
+override these defaults:
+
+```bash
+# Set an explicit batch size (overrides the local/cluster default)
+BATCH_SIZE=8 oellm schedule-eval \
+  --models "model-name" \
+  --task-groups "belebele-eu-cf" \
+  --venv-path .venv
+```
+
+If you need full manual control over all model args, set `MODEL_ARGS`,
+for example:
+
+```bash
+MODEL_ARGS='batch_size=8' oellm schedule-eval \
+  --models "model-name" --task-groups "belebele-eu-cf" --venv-path .venv
+```
 
 ## ⚠️ Dataset Pre-Download Warning
 
diff --git a/oellm/constants.py b/oellm/constants.py
index bb52e2a..e1bf391 100644
--- a/oellm/constants.py
+++ b/oellm/constants.py
@@ -48,6 +48,7 @@ class EvaluationJob:
     "acc,none",
     "acc",
     "accuracy",
+    "acc_norm",
     "f1",
     "exact_match",
 ]
diff --git a/oellm/main.py b/oellm/main.py
index d19eb97..c1c1469 100644
--- a/oellm/main.py
+++ b/oellm/main.py
@@ -86,8 +86,8 @@ def schedule_evals(
             submitting to SLURM. Requires --venv_path. Skips cluster environment detection
             and runs all evaluations sequentially in a single process.
         slurm_template_var: JSON object of template variable overrides. Use exact env var names
-            (PARTITION, ACCOUNT, GPUS_PER_NODE). "TIME" overrides the time limit.
-            Example: '{"PARTITION":"dev-g","ACCOUNT":"FOO","TIME":"02:00:00","GPUS_PER_NODE":2}'
+            (PARTITION, ACCOUNT, GPUS_PER_NODE, SLURM_MEM). "TIME" overrides the time limit.
+            Example: '{"PARTITION":"dev-g","ACCOUNT":"FOO","TIME":"02:00:00","GPUS_PER_NODE":2,"SLURM_MEM":"96G"}'
     """
 
     from oellm.scheduler import schedule_evals as _sched
diff --git a/oellm/resources/task-groups.yaml b/oellm/resources/task-groups.yaml
index ce96454..ec848b5 100644
--- a/oellm/resources/task-groups.yaml
+++ b/oellm/resources/task-groups.yaml
@@ -1,5 +1,31 @@
 task_metrics:
   mmlu: acc
+  belebele_bul_Cyrl_cf: acc_norm
+  belebele_hrv_Latn_cf: acc_norm
+  belebele_ces_Latn_cf: acc_norm
+  belebele_dan_Latn_cf: acc_norm
+  belebele_nld_Latn_cf: acc_norm
+  belebele_eng_Latn_cf: acc_norm
+  belebele_est_Latn_cf: acc_norm
+  belebele_fin_Latn_cf: acc_norm
+  belebele_fra_Latn_cf: acc_norm
+  belebele_deu_Latn_cf: acc_norm
+  belebele_ell_Grek_cf: acc_norm
+  belebele_hun_Latn_cf: acc_norm
+  belebele_ita_Latn_cf: acc_norm
+  belebele_lvs_Latn_cf: acc_norm
+  belebele_lit_Latn_cf: acc_norm
+  belebele_mlt_Latn_cf: acc_norm
+  belebele_pol_Latn_cf: acc_norm
+  belebele_por_Latn_cf: acc_norm
+  belebele_ron_Latn_cf: acc_norm
+  belebele_slk_Latn_cf: acc_norm
+  belebele_slv_Latn_cf: acc_norm
+  belebele_spa_Latn_cf: acc_norm
+  belebele_swe_Latn_cf: acc_norm
+  belebele_nob_Latn_cf: acc_norm
+  belebele_eus_Latn_cf: acc_norm
+  belebele_cat_Latn_cf: acc_norm
   copa: acc
   lambada_openai: acc
   openbookqa: acc_norm
@@ -192,6 +218,65 @@ task_groups:
         subset: swe_Latn
       - task: belebele_nob_Latn
         subset: nob_Latn
+  belebele-eu-cf:
+    description: "Belebele European language tasks (cloze formulation, lighteval)"
+    suite: lighteval
+    n_shots: [0]
+    dataset: facebook/belebele
+    tasks:
+      - task: belebele_bul_Cyrl_cf
+        subset: bul_Cyrl
+      - task: belebele_hrv_Latn_cf
+        subset: hrv_Latn
+      - task: belebele_ces_Latn_cf
+        subset: ces_Latn
+      - task: belebele_dan_Latn_cf
+        subset: dan_Latn
+      - task: belebele_nld_Latn_cf
+        subset: nld_Latn
+      - task: belebele_eng_Latn_cf
+        subset: eng_Latn
+      - task: belebele_est_Latn_cf
+        subset: est_Latn
+      - task: belebele_fin_Latn_cf
+        subset: fin_Latn
+      - task: belebele_fra_Latn_cf
+        subset: fra_Latn
+      - task: belebele_deu_Latn_cf
+        subset: deu_Latn
+      - task: belebele_ell_Grek_cf
+        subset: ell_Grek
+      - task: belebele_hun_Latn_cf
+        subset: hun_Latn
+      - task: belebele_ita_Latn_cf
+        subset: ita_Latn
+      - task: belebele_lvs_Latn_cf
+        subset: lvs_Latn
+      - task: belebele_lit_Latn_cf
+        subset: lit_Latn
+      - task: belebele_mlt_Latn_cf
+        subset: mlt_Latn
+      - task: belebele_pol_Latn_cf
+        subset: pol_Latn
+      - task: belebele_por_Latn_cf
+        subset: por_Latn
+      - task: belebele_ron_Latn_cf
+        subset: ron_Latn
+      - task: belebele_slk_Latn_cf
+        subset: slk_Latn
+      - task: belebele_slv_Latn_cf
+        subset: slv_Latn
+      - task: belebele_spa_Latn_cf
+        subset: spa_Latn
+      - task: belebele_swe_Latn_cf
+        subset: swe_Latn
+      - task: belebele_nob_Latn_cf
+        subset: nob_Latn
+      - task: belebele_eus_Latn_cf
+        subset: eus_Latn
+      - task: belebele_cat_Latn_cf
+        subset: cat_Latn
+
   flores-200-eu-to-eng:
     description: "Flores 200 EU to English translation"
     suite: lighteval
diff --git a/oellm/resources/template.sbatch b/oellm/resources/template.sbatch
index d67ab94..42900f9 100644
--- a/oellm/resources/template.sbatch
+++ b/oellm/resources/template.sbatch
@@ -2,7 +2,7 @@
 #SBATCH --job-name=oellm-eval
 #SBATCH --time={time_limit}
 #SBATCH --gres=gpu:$GPUS_PER_NODE
-#SBATCH --mem=0
+#SBATCH --mem={slurm_mem}
 #SBATCH --output={log_dir}/%x-%A-%a.out
 #SBATCH --partition=$PARTITION
 #SBATCH --account=$ACCOUNT
@@ -131,6 +131,9 @@ do
     case "$suite_normalized" in
         lm_eval|lm-eval|lm-eval-harness)
+            echo
+            echo "----------------------------------------------------"
+            echo "lm_eval Execution"
             run_python -m lm_eval --model hf \
                 --model_args pretrained="$model_path",trust_remote_code=True \
                 --tasks "$task_path" \
@@ -139,6 +142,7 @@
                 --trust_remote_code \
                 ${{LM_EVAL_INCLUDE_PATH:+--include_path $LM_EVAL_INCLUDE_PATH}} \
                 ${{LIMIT:+--limit $LIMIT}}
+            echo "----------------------------------------------------"
             ;;
         lighteval|light-eval)
             LIGHT_TASK="$task_path"
@@ -164,7 +168,7 @@ do
         if [ -n "$VENV_PATH" ]; then
             source "$VENV_PATH/bin/activate"
             lighteval accelerate \
-                "model_name=$model_path,{lighteval_model_args}" \
+                "model_name=$model_path,trust_remote_code=True,{additional_model_args}" \
                 "$LIGHT_TASK_ARG" \
                 --load-tasks-multilingual \
                 --output-dir "$RESULTS_SUBDIR" \
@@ -177,7 +181,7 @@ do
             $EVAL_SIF_PATH \
             env CUDA_VISIBLE_DEVICES=$GPU_DEVICES \
             lighteval accelerate \
-                "model_name=$model_path,{lighteval_model_args}" \
+                "model_name=$model_path,trust_remote_code=True,{additional_model_args}" \
                 "$LIGHT_TASK_ARG" \
                 --load-tasks-multilingual \
                 --output-dir "$RESULTS_SUBDIR" \
@@ -266,6 +270,7 @@ do
         ;;
     esac
 
+    echo "----------------------------------------------------"
     echo "Evaluation finished for model: $model_path"
 done
diff --git a/oellm/results.py b/oellm/results.py
index 3fe0456..2ed66a7 100644
--- a/oellm/results.py
+++ b/oellm/results.py
@@ -71,6 +71,21 @@ def _first_matching_prefix(d: dict, prefix: str) -> tuple[float | None, str | None]:
         return None, None
 
 
+def _split_task_and_nshot(name: str) -> tuple[str, int | None]:
+    """Split ``'task|N'`` task names used by some harnesses.
+
+    Returns ``(task, N)`` when the suffix is numeric and ``(task, None)``
+    otherwise. A non-string input is returned as-is, paired with ``None``.
+    """
+    if not isinstance(name, str):
+        return name, None
+    if "|" in name:
+        base, after = name.rsplit("|", 1)
+        if after.isdigit():
+            return base, int(after)
+    return name, None
+
+
 def _infer_global_n_shot(n_shot_data: dict) -> int | None:
     """Infer a global n_shot if exactly one unique value exists."""
     try:
@@ -202,9 +217,21 @@ def collect_results(
             with open(json_file) as f:
                 data = json.load(f)
 
-            # lmms-eval sets model_name to the adapter type (e.g. "llava_hf"),
-            # not the checkpoint path; the actual path is in model_name_or_path.
-            model_name = data.get("model_name_or_path") or data.get("model_name", "unknown")
+            # Model name lives in different keys depending on the harness:
+            # - lmms-eval: model_name_or_path is the checkpoint, model_name is the
+            #   adapter class (e.g. "llava_hf")
"llava_hf") + # - lighteval: config_general.{model_name,model,model_path} + # - legacy: summary_general.model or top-level model + model_name = ( + data.get("model_name_or_path") + or data.get("model_name") + or data.get("config_general", {}).get("model_name") + or data.get("config_general", {}).get("model") + or data.get("config_general", {}).get("model_path") + or data.get("summary_general", {}).get("model") + or data.get("model") + or "unknown" + ) results = data.get("results", {}) n_shot_data = data.get("n-shot", {}) @@ -231,14 +258,20 @@ def collect_results( # Prefer only the first aggregate metric from groups (simplified) if groups_map: group_name, group_results = next(iter(groups_map.items())) - n_shot = n_shot_data.get(group_name, "unknown") + orig_group_name = group_name + n_shot = n_shot_data.get(orig_group_name, "unknown") if n_shot == "unknown": - for subtask_name in group_subtasks_map.get(group_name, []): + for subtask_name in group_subtasks_map.get(orig_group_name, []): if subtask_name in n_shot_data: n_shot = n_shot_data[subtask_name] break if n_shot == "unknown" and global_n_shot is not None: n_shot = global_n_shot + # Strip ``'|N'`` n-shot suffix from the group name, falling back + # to the parsed N when n_shot is still unknown. + group_name, parsed_n = _split_task_and_nshot(orig_group_name) + if n_shot == "unknown" and parsed_n is not None: + n_shot = parsed_n performance, metric_name = _resolve_metric( group_name, group_results, task_metrics ) @@ -273,30 +306,35 @@ def collect_results( if task_name.startswith("global_mmlu_") and task_name.count("_") >= 4: continue + # Strip ``'|N'`` n-shot suffix from the task name; use parsed N + # as a last-resort fallback when n_shot isn't otherwise resolvable. + task_name_clean, parsed_n = _split_task_and_nshot(task_name) n_shot = _resolve_n_shot( - task_name, + task_name_clean, n_shot_data, group_subtasks_map, group_aggregate_names, global_n_shot, ) + if n_shot == "unknown" and parsed_n is not None: + n_shot = parsed_n # Skip lmms-eval parent task placeholders (no numeric metrics, just alias) if set(task_results.keys()) <= {"alias", " ", ""}: continue performance, metric_name = _resolve_metric( - task_name, task_results, task_metrics + task_name_clean, task_results, task_metrics ) if performance is not None: if check: - completed_jobs.add((model_name, task_name, n_shot)) + completed_jobs.add((model_name, task_name_clean, n_shot)) rows.append( { "model_name": model_name, - "task": task_name, + "task": task_name_clean, "n_shot": n_shot, "performance": performance, "metric_name": metric_name if metric_name is not None else "", diff --git a/oellm/scheduler.py b/oellm/scheduler.py index 7314a3b..af203c9 100644 --- a/oellm/scheduler.py +++ b/oellm/scheduler.py @@ -14,6 +14,7 @@ from oellm.constants import EvaluationJob from oellm.runner import EvalRunner from oellm.task_groups import ( + _build_task_suite_map, _collect_dataset_specs, _collect_hf_dataset_files, _collect_hf_model_repos, @@ -50,6 +51,53 @@ def _resolve_hf_hub_offline(local: bool) -> int: return 0 if local else 1 +def _resolve_slurm_mem() -> str: + """Return the host-memory request for the generated SLURM job. + + Reads ``SLURM_MEM`` from the environment; falls back to ``96G``. Can also + be overridden per-invocation via ``--slurm-template-var``. 
+ """ + explicit_mem = os.environ.get("SLURM_MEM") + if explicit_mem is not None and str(explicit_mem).strip() != "": + return str(explicit_mem).strip() + + logging.warning("SLURM_MEM not set; falling back to default memory request '96G'.") + return "96G" + + +def _resolve_additional_model_args(local: bool = False) -> str: + """Return model args for lighteval, defaulting to an explicit batch size. + + - if ``local`` is True: ``batch_size=1`` + - otherwise: ``batch_size=32`` + + Override the entire string via ``MODEL_ARGS`` or just the batch size via + ``BATCH_SIZE``. Applied to the lighteval suite only. + """ + explicit_model_args = os.environ.get("MODEL_ARGS") + if explicit_model_args is not None and str(explicit_model_args).strip() != "": + return str(explicit_model_args).strip() + + batch_size = os.environ.get("BATCH_SIZE") + if batch_size is not None and str(batch_size).strip() != "": + batch_size_value = str(batch_size).strip() + try: + if int(batch_size_value) < 1: + raise ValueError + except ValueError: + fallback = "1" if local else "32" + logging.warning( + "Invalid BATCH_SIZE=%r; falling back to batch_size=%s", + batch_size, + fallback, + ) + batch_size_value = fallback + else: + batch_size_value = "1" if local else "32" + + return f"batch_size={batch_size_value}" + + @capture_third_party_output_from_kwarg("verbose") def schedule_evals( models: str | None = None, @@ -106,8 +154,8 @@ def schedule_evals( local: If True, run evaluations directly on the local machine using bash instead of submitting to SLURM. Requires --venv_path. slurm_template_var: JSON object of template variable overrides. Use exact env var names - (PARTITION, ACCOUNT, GPUS_PER_NODE). "TIME" overrides the time limit. - Example: '{"PARTITION":"dev-g","ACCOUNT":"FOO","TIME":"02:00:00","GPUS_PER_NODE":2}' + (PARTITION, ACCOUNT, GPUS_PER_NODE, SLURM_MEM). "TIME" overrides the time limit. + Example: '{"PARTITION":"dev-g","ACCOUNT":"FOO","TIME":"02:00:00","GPUS_PER_NODE":2,"SLURM_MEM":"96G"}' """ _setup_logging(verbose) @@ -186,13 +234,18 @@ def schedule_evals( elif models: if group_names is None: + # Look up each bare task name in the registered groups so + # ``--tasks belebele_eng_Latn_cf`` (lighteval) or ``--tasks + # regiondial_refcocog_all`` (contrib) get routed correctly. + # Tasks not in any group default to lm_eval. 
+            task_suite_map = _build_task_suite_map()
             eval_jobs.extend(
                 [
                     EvaluationJob(
                         model_path=model,
                         task_path=task,
                         n_shot=shot,
-                        eval_suite="lm_eval",
+                        eval_suite=task_suite_map.get(task, "lm_eval"),
                     )
                     for model in models
                     for task in tasks
@@ -354,6 +407,9 @@ def schedule_evals(
             os.environ[key] = str(value)
             logging.info(f"Using slurm_template_var override: {key}={value}")
 
+    slurm_mem = _resolve_slurm_mem()
+    additional_model_args = _resolve_additional_model_args(local)
+
     logging.info("Evaluation planning:")
     logging.info(f"  Total evaluations: {total_evals}")
     logging.info(
@@ -361,6 +417,7 @@ def schedule_evals(
     )
     logging.info(f"  Evaluations per job: {evals_per_job}")
     logging.info(f"  Time limit: {time_limit}")
+    logging.info(f"  Requested host memory: {slurm_mem}")
 
     sbatch_script = sbatch_template.format(
         csv_path=csv_path,
@@ -371,14 +428,13 @@ def schedule_evals(
         log_dir=evals_dir / "slurm_logs",
         evals_dir=str(evals_dir / "results"),
         time_limit=time_limit,  # Dynamic time limit
+        slurm_mem=slurm_mem,
         limit=limit if limit else "",  # Sample limit for quick testing
         venv_path=venv_path or "",
         lm_eval_include_path=lm_eval_include_path
         or str(files("oellm.resources") / "custom_lm_eval_tasks"),
         hf_hub_offline=_resolve_hf_hub_offline(local),
-        lighteval_model_args="trust_remote_code=True,batch_size=1"
-        if local
-        else "trust_remote_code=True",
+        additional_model_args=additional_model_args,
         evalchemy_dir=os.environ.get("EVALCHEMY_DIR", "/opt/evalchemy"),
     )
 
diff --git a/oellm/task_groups.py b/oellm/task_groups.py
index eba49e6..d930532 100644
--- a/oellm/task_groups.py
+++ b/oellm/task_groups.py
@@ -323,6 +323,24 @@ def _lookup_dataset_specs_for_tasks(task_names: Iterable[str]) -> list[DatasetSpec]:
     return specs
 
 
+def _build_task_suite_map() -> dict[str, str]:
+    """Return ``{task_name: eval_suite}`` across core YAML and contrib plugins.
+
+    Uses :func:`_parse_task_groups` + :func:`_iter_all_tasks` so contrib
+    registries (e.g. ``regiondial_bench``) are included, not just the core
+    ``task-groups.yaml``. Task-level ``suite`` overrides group-level. First
+    occurrence wins when a task name appears in multiple groups.
+
+    Consumers should still ``.get(task, "lm_eval")`` — tasks not registered
+    in any group simply aren't in the map.
+    """
+    parsed = _parse_task_groups(get_all_task_group_names())
+    task_suite_map: dict[str, str] = {}
+    for t, suite, _group in _iter_all_tasks(parsed):
+        task_suite_map.setdefault(t.name, suite)
+    return task_suite_map
+
+
 def get_all_task_group_names() -> list[str]:
     """Return all available task group names (core + all contrib suites)."""
     data = (
diff --git a/oellm/utils.py b/oellm/utils.py
index 976ce5a..16a53b9 100644
--- a/oellm/utils.py
+++ b/oellm/utils.py
@@ -438,6 +438,18 @@ def _pre_download_datasets_from_specs(
                     trust_remote_code=trust_remote_code,
                 )
                 continue
+            if "Feature type" in str(e) and "not found" in str(e):
+                hf_datasets_cache = os.environ.get(
+                    "HF_DATASETS_CACHE",
+                    str(Path.home() / ".cache" / "huggingface" / "datasets"),
+                )
+                safe_name = spec.repo_id.replace("/", "___")
+                cache_dir = os.path.join(hf_datasets_cache, safe_name)
+                raise RuntimeError(
+                    f"Cached metadata for '{label}' is incompatible with the installed "
+                    f"datasets version ('{e}'). Delete the stale cache and re-run:\n\n"
+                    f"  rm -rf {cache_dir}\n"
+                ) from None
             raise
 
         logging.debug(f"Finished downloading dataset '{label}'.")
diff --git a/tests/test_task_suite_map.py b/tests/test_task_suite_map.py
new file mode 100644
index 0000000..c8c6be1
--- /dev/null
+++ b/tests/test_task_suite_map.py
@@ -0,0 +1,88 @@
+"""Tests for :func:`oellm.task_groups._build_task_suite_map`.
+
+The helper powers the ``--tasks`` (bare-task-name) path in the scheduler.
+It must cover every suite we actually support — core YAML-registered suites
+(lm-eval-harness, lighteval, lmms_eval, evalchemy) AND contrib-registered
+suites (e.g. regiondial_bench).
+"""
+
+from __future__ import annotations
+
+from oellm.task_groups import _build_task_suite_map
+
+
+def test_map_is_non_empty():
+    m = _build_task_suite_map()
+    assert len(m) > 0, "suite map must contain at least core YAML tasks"
+
+
+def test_map_includes_lm_eval_harness_task():
+    m = _build_task_suite_map()
+    # copa is a classic lm-eval-harness task in task-groups.yaml
+    assert m.get("copa") == "lm-eval-harness"
+
+
+def test_map_includes_lighteval_task():
+    m = _build_task_suite_map()
+    # belebele_*_cf tasks are lighteval
+    assert m.get("belebele_eng_Latn_cf") == "lighteval"
+
+
+def test_map_includes_lmms_eval_task():
+    """lmms_eval tasks come from image/video task groups — must be routable."""
+    m = _build_task_suite_map()
+    # vqav2_val is the base VQA v2 task (image modality)
+    assert m.get("vqav2_val") == "lmms_eval"
+
+
+def test_map_includes_contrib_task():
+    """Contrib plugins (e.g. regiondial_bench) register their own TASK_GROUPS.
+
+    These are the regression target: the original upstream helper only read
+    YAML and missed contrib entirely.
+    """
+    m = _build_task_suite_map()
+    assert m.get("regiondial_refcocog") == "regiondial_bench"
+
+
+def test_map_honours_task_level_suite_override():
+    """Evalchemy tasks set ``suite: evalchemy`` at the task level, not the
+    group level — the helper must prefer the task-level value.
+    """
+    m = _build_task_suite_map()
+    assert m.get("GPQADiamond") == "evalchemy"
+
+
+def test_map_covers_all_actually_registered_suites():
+    """Sanity: every distinct suite we see should be one we actually route.
+
+    Guards against a new suite slipping into YAML or contrib without us
+    adding a case branch in template.sbatch (the ``*)`` catch-all routes
+    everything unknown to the contrib dispatcher, but we still want this
+    assertion as documentation).
+    """
+    m = _build_task_suite_map()
+    distinct_suites = set(m.values())
+    expected_subset = {
+        "lm-eval-harness",
+        "lighteval",
+        "lmms_eval",
+        "evalchemy",
+        "regiondial_bench",
+    }
+    # All expected suites must be present. Extra contrib suites are fine.
+    assert expected_subset.issubset(distinct_suites), (
+        f"missing suites: {expected_subset - distinct_suites}"
+    )
+
+
+def test_first_occurrence_wins_when_task_in_multiple_groups():
+    """If a task name appears in multiple groups, first occurrence wins.
+
+    This is documented behavior of ``setdefault`` in the helper. We don't
+    assert a specific pair here because the YAML contents shift; we only
+    assert the determinism property.
+    """
+    m1 = _build_task_suite_map()
+    m2 = _build_task_suite_map()
+    assert m1 == m2
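+
+
+def test_unregistered_task_falls_through_to_default():
+    """Hypothetical guard sketch for the ``.get(task, "lm_eval")`` fallback.
+
+    The helper is built with ``setdefault`` over registered tasks only, so a
+    name registered in no core or contrib group must be absent from the map
+    (``None``), never mis-mapped to some other suite. The task name below is
+    made up and assumed to be registered nowhere.
+    """
+    m = _build_task_suite_map()
+    # Arbitrary name, assumed absent from every group registry.
+    assert m.get("task_name_assumed_unregistered") is None
+    # Consumers resolve unknown tasks to the lm_eval default via .get().
+    assert m.get("task_name_assumed_unregistered", "lm_eval") == "lm_eval"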