README.md: 28 additions & 4 deletions
@@ -172,19 +172,43 @@ The `HF_HUB_OFFLINE` value is read when you invoke `oellm` and baked into the ge

## SLURM Overrides

-Override cluster defaults (partition, account, time limit, etc.) with `--slurm-template-var` (JSON object):
+Override cluster defaults (partition, account, time limit, memory, etc.) with `--slurm-template-var` (JSON object). Provide `SLURM_MEM` to request an exact host memory amount; if unset, it falls back to a default of `96G`.

```bash
# Use a different partition (e.g. dev-g on LUMI when small-g is crowded)
oellm schedule-eval --models "model-name" --task-groups "open-sci-0.01" \
--slurm-template-var '{"PARTITION":"dev-g"}'

-# Multiple overrides: partition, account, time limit, GPUs
+# Multiple overrides: partition, account, time limit, GPUs, exact RAM
oellm schedule-eval --models "model-name" --task-groups "open-sci-0.01" \
--slurm-template-var '{"PARTITION":"dev-g","ACCOUNT":"myproject","TIME":"02:00:00","GPUS_PER_NODE":2}'
--slurm-template-var '{"PARTITION":"dev-g","ACCOUNT":"myproject","TIME":"02:00:00","GPUS_PER_NODE":2,"SLURM_MEM":"96G"}'
```

-Use exact env var names: `PARTITION`, `ACCOUNT`, `GPUS_PER_NODE`. `TIME` (HH:MM:SS) overrides the time limit.
+Use exact env var names: `PARTITION`, `ACCOUNT`, `GPUS_PER_NODE`, `SLURM_MEM`. `TIME` (HH:MM:SS) overrides the time limit.

## Lighteval Batch Size

For lighteval runs, generated jobs default to `batch_size=1` for local runs and
`batch_size=32` for non-local (SLURM/cluster) runs. This reduces the risk of
out-of-memory failures, since lighteval's automatic batch-size detection can be
overly optimistic for multiple-choice loglikelihood tasks. You can still
override these defaults:

```bash
# Set an explicit batch size (overrides the local/cluster default)
BATCH_SIZE=8 oellm schedule-eval \
--models "model-name" \
--task-groups "belebele-eu-cf" \
--venv-path .venv
```

If you need full manual control over all model args, set `MODEL_ARGS`,
for example:

```bash
MODEL_ARGS='batch_size=8' oellm schedule-eval \
--models "model-name" --task-groups "belebele-eu-cf" --venv-path .venv
```

## ⚠️ Dataset Pre-Download Warning

oellm/constants.py: 1 addition & 0 deletions
@@ -48,6 +48,7 @@ class EvaluationJob:
"acc,none",
"acc",
"accuracy",
"acc_norm",
"f1",
"exact_match",
]
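Presumably this priority list is scanned in order against a task's result dict, with the first key present winning; adding `acc_norm` lets results that only report normalized accuracy resolve through this fallback. A minimal sketch under that assumption (`pick_metric` and `METRIC_PRIORITY` are illustrative names, not the repo's):

```python
METRIC_PRIORITY = ["acc,none", "acc", "accuracy", "acc_norm", "f1", "exact_match"]

def pick_metric(task_results: dict) -> tuple[str, float] | None:
    # Return the first priority metric present in a task's result dict.
    for key in METRIC_PRIORITY:
        if key in task_results:
            return key, task_results[key]
    return None

print(pick_metric({"acc_norm": 0.71, "acc_norm_stderr": 0.02}))  # ('acc_norm', 0.71)
```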
oellm/main.py: 2 additions & 2 deletions
@@ -86,8 +86,8 @@ def schedule_evals(
submitting to SLURM. Requires --venv_path. Skips cluster environment detection and
runs all evaluations sequentially in a single process.
slurm_template_var: JSON object of template variable overrides. Use exact env var names
-(PARTITION, ACCOUNT, GPUS_PER_NODE). "TIME" overrides the time limit.
-Example: '{"PARTITION":"dev-g","ACCOUNT":"FOO","TIME":"02:00:00","GPUS_PER_NODE":2}'
+(PARTITION, ACCOUNT, GPUS_PER_NODE, SLURM_MEM). "TIME" overrides the time limit.
+Example: '{"PARTITION":"dev-g","ACCOUNT":"FOO","TIME":"02:00:00","GPUS_PER_NODE":2,"SLURM_MEM":"96G"}'
"""
from oellm.scheduler import schedule_evals as _sched

oellm/resources/task-groups.yaml: 85 additions & 0 deletions
@@ -1,5 +1,31 @@
task_metrics:
mmlu: acc
belebele_bul_Cyrl_cf: acc_norm
belebele_hrv_Latn_cf: acc_norm
belebele_ces_Latn_cf: acc_norm
belebele_dan_Latn_cf: acc_norm
belebele_nld_Latn_cf: acc_norm
belebele_eng_Latn_cf: acc_norm
belebele_est_Latn_cf: acc_norm
belebele_fin_Latn_cf: acc_norm
belebele_fra_Latn_cf: acc_norm
belebele_deu_Latn_cf: acc_norm
belebele_ell_Grek_cf: acc_norm
belebele_hun_Latn_cf: acc_norm
belebele_ita_Latn_cf: acc_norm
belebele_lvs_Latn_cf: acc_norm
belebele_lit_Latn_cf: acc_norm
belebele_mlt_Latn_cf: acc_norm
belebele_pol_Latn_cf: acc_norm
belebele_por_Latn_cf: acc_norm
belebele_ron_Latn_cf: acc_norm
belebele_slk_Latn_cf: acc_norm
belebele_slv_Latn_cf: acc_norm
belebele_spa_Latn_cf: acc_norm
belebele_swe_Latn_cf: acc_norm
belebele_nob_Latn_cf: acc_norm
belebele_eus_Latn_cf: acc_norm
belebele_cat_Latn_cf: acc_norm
copa: acc
lambada_openai: acc
openbookqa: acc_norm
@@ -192,6 +218,65 @@ task_groups:
subset: swe_Latn
- task: belebele_nob_Latn
subset: nob_Latn
belebele-eu-cf:
description: "Belebele European language tasks (cloze formulation, lighteval)"
suite: lighteval
n_shots: [0]
dataset: facebook/belebele
tasks:
- task: belebele_bul_Cyrl_cf
subset: bul_Cyrl
- task: belebele_hrv_Latn_cf
subset: hrv_Latn
- task: belebele_ces_Latn_cf
subset: ces_Latn
- task: belebele_dan_Latn_cf
subset: dan_Latn
- task: belebele_nld_Latn_cf
subset: nld_Latn
- task: belebele_eng_Latn_cf
subset: eng_Latn
- task: belebele_est_Latn_cf
subset: est_Latn
- task: belebele_fin_Latn_cf
subset: fin_Latn
- task: belebele_fra_Latn_cf
subset: fra_Latn
- task: belebele_deu_Latn_cf
subset: deu_Latn
- task: belebele_ell_Grek_cf
subset: ell_Grek
- task: belebele_hun_Latn_cf
subset: hun_Latn
- task: belebele_ita_Latn_cf
subset: ita_Latn
- task: belebele_lvs_Latn_cf
subset: lvs_Latn
- task: belebele_lit_Latn_cf
subset: lit_Latn
- task: belebele_mlt_Latn_cf
subset: mlt_Latn
- task: belebele_pol_Latn_cf
subset: pol_Latn
- task: belebele_por_Latn_cf
subset: por_Latn
- task: belebele_ron_Latn_cf
subset: ron_Latn
- task: belebele_slk_Latn_cf
subset: slk_Latn
- task: belebele_slv_Latn_cf
subset: slv_Latn
- task: belebele_spa_Latn_cf
subset: spa_Latn
- task: belebele_swe_Latn_cf
subset: swe_Latn
- task: belebele_nob_Latn_cf
subset: nob_Latn
- task: belebele_eus_Latn_cf
subset: eus_Latn
- task: belebele_cat_Latn_cf
subset: cat_Latn

flores-200-eu-to-eng:
description: "Flores 200 EU to English translation"
suite: lighteval
oellm/resources/template.sbatch: 8 additions & 3 deletions
@@ -2,7 +2,7 @@
#SBATCH --job-name=oellm-eval
#SBATCH --time={time_limit}
#SBATCH --gres=gpu:$GPUS_PER_NODE
-#SBATCH --mem=0
+#SBATCH --mem={slurm_mem}
#SBATCH --output={log_dir}/%x-%A-%a.out
#SBATCH --partition=$PARTITION
#SBATCH --account=$ACCOUNT
@@ -131,6 +131,9 @@ do

case "$suite_normalized" in
lm_eval|lm-eval|lm-eval-harness)
echo
echo "----------------------------------------------------"
echo "lm_eval Execution"
run_python -m lm_eval --model hf \
--model_args pretrained="$model_path",trust_remote_code=True \
--tasks "$task_path" \
--trust_remote_code \
${{LM_EVAL_INCLUDE_PATH:+--include_path $LM_EVAL_INCLUDE_PATH}} \
${{LIMIT:+--limit $LIMIT}}
echo "----------------------------------------------------"
;;
lighteval|light-eval)
LIGHT_TASK="$task_path"
@@ -164,7 +168,7 @@
if [ -n "$VENV_PATH" ]; then
source "$VENV_PATH/bin/activate"
lighteval accelerate \
"model_name=$model_path,{lighteval_model_args}" \
"model_name=$model_path,trust_remote_code=True,{additional_model_args}" \
"$LIGHT_TASK_ARG" \
--load-tasks-multilingual \
--output-dir "$RESULTS_SUBDIR" \
@@ -177,7 +181,7 @@
$EVAL_SIF_PATH \
env CUDA_VISIBLE_DEVICES=$GPU_DEVICES \
lighteval accelerate \
"model_name=$model_path,{lighteval_model_args}" \
"model_name=$model_path,{additional_model_args}" \
"$LIGHT_TASK_ARG" \
--load-tasks-multilingual \
--output-dir "$RESULTS_SUBDIR" \
@@ -266,6 +270,7 @@ do
;;
esac

echo "----------------------------------------------------"
echo "Evaluation finished for model: $model_path"

done
oellm/results.py: 47 additions & 9 deletions
@@ -71,6 +71,21 @@ def _first_matching_prefix(d: dict, prefix: str) -> tuple[float | None, str | None]:
return None, None


def _split_task_and_nshot(name: str) -> tuple[str, int | None]:
"""Split ``'task|N'`` task names used by some harnesses.

Returns ``(task, N)`` when the suffix is numeric, ``(task, None)``
otherwise. Non-string inputs pass through unchanged.
"""
if not isinstance(name, str):
return name, None
if "|" in name:
base, after = name.rsplit("|", 1)
if after.isdigit():
return base, int(after)
return name, None

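A quick sanity check of the helper above, sketched with illustrative task names:

```python
# 'task|N' keys carry the n-shot count after the rightmost pipe.
assert _split_task_and_nshot("belebele_eng_Latn_cf|0") == ("belebele_eng_Latn_cf", 0)
# Names without a suffix come back with n_shot=None.
assert _split_task_and_nshot("lambada_openai") == ("lambada_openai", None)
# Only the rightmost '|' is split, and only when the tail is numeric.
assert _split_task_and_nshot("suite|task|5") == ("suite|task", 5)
assert _split_task_and_nshot("task|dev") == ("task|dev", None)
```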

def _infer_global_n_shot(n_shot_data: dict) -> int | None:
"""Infer a global n_shot if exactly one unique value exists."""
try:
@@ -202,9 +217,21 @@ def collect_results(
with open(json_file) as f:
data = json.load(f)

-# lmms-eval sets model_name to the adapter type (e.g. "llava_hf"),
-# not the checkpoint path; the actual path is in model_name_or_path.
-model_name = data.get("model_name_or_path") or data.get("model_name", "unknown")
+# Model name lives in different keys depending on the harness:
+#   - lmms-eval: model_name_or_path is the checkpoint, model_name is the
+#     adapter class (e.g. "llava_hf")
+#   - lighteval: config_general.{model_name,model,model_path}
+#   - legacy: summary_general.model or top-level model
+model_name = (
+    data.get("model_name_or_path")
+    or data.get("model_name")
+    or data.get("config_general", {}).get("model_name")
+    or data.get("config_general", {}).get("model")
+    or data.get("config_general", {}).get("model_path")
+    or data.get("summary_general", {}).get("model")
+    or data.get("model")
+    or "unknown"
+)

results = data.get("results", {})
n_shot_data = data.get("n-shot", {})
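The same fallback chain, extracted into a standalone runnable sketch for sanity checking (`resolve_model_name` is a hypothetical name, not part of the module):

```python
def resolve_model_name(data: dict) -> str:
    # Mirror the fallback order above: checkpoint path first, then
    # harness-specific locations, then legacy keys, finally "unknown".
    return (
        data.get("model_name_or_path")
        or data.get("model_name")
        or data.get("config_general", {}).get("model_name")
        or data.get("config_general", {}).get("model")
        or data.get("config_general", {}).get("model_path")
        or data.get("summary_general", {}).get("model")
        or data.get("model")
        or "unknown"
    )

# lmms-eval style: the checkpoint path wins over the adapter class name.
assert resolve_model_name({"model_name_or_path": "org/ckpt-1B", "model_name": "llava_hf"}) == "org/ckpt-1B"
# lighteval style: falls through to config_general.model_name.
assert resolve_model_name({"config_general": {"model_name": "org/model-7B"}}) == "org/model-7B"
assert resolve_model_name({}) == "unknown"
```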
@@ -231,14 +258,20 @@
# Prefer only the first aggregate metric from groups (simplified)
if groups_map:
group_name, group_results = next(iter(groups_map.items()))
-n_shot = n_shot_data.get(group_name, "unknown")
+orig_group_name = group_name
+n_shot = n_shot_data.get(orig_group_name, "unknown")
if n_shot == "unknown":
-for subtask_name in group_subtasks_map.get(group_name, []):
+for subtask_name in group_subtasks_map.get(orig_group_name, []):
if subtask_name in n_shot_data:
n_shot = n_shot_data[subtask_name]
break
if n_shot == "unknown" and global_n_shot is not None:
n_shot = global_n_shot
# Strip ``'|N'`` n-shot suffix from the group name, falling back
# to the parsed N when n_shot is still unknown.
group_name, parsed_n = _split_task_and_nshot(orig_group_name)
if n_shot == "unknown" and parsed_n is not None:
n_shot = parsed_n
performance, metric_name = _resolve_metric(
group_name, group_results, task_metrics
)
@@ -273,30 +306,35 @@
if task_name.startswith("global_mmlu_") and task_name.count("_") >= 4:
continue

# Strip ``'|N'`` n-shot suffix from the task name; use parsed N
# as a last-resort fallback when n_shot isn't otherwise resolvable.
task_name_clean, parsed_n = _split_task_and_nshot(task_name)
n_shot = _resolve_n_shot(
-task_name,
+task_name_clean,
n_shot_data,
group_subtasks_map,
group_aggregate_names,
global_n_shot,
)
if n_shot == "unknown" and parsed_n is not None:
n_shot = parsed_n

# Skip lmms-eval parent task placeholders (no numeric metrics, just alias)
if set(task_results.keys()) <= {"alias", " ", ""}:
continue

performance, metric_name = _resolve_metric(
-task_name, task_results, task_metrics
+task_name_clean, task_results, task_metrics
)

if performance is not None:
if check:
-completed_jobs.add((model_name, task_name, n_shot))
+completed_jobs.add((model_name, task_name_clean, n_shot))

rows.append(
{
"model_name": model_name,
"task": task_name,
"task": task_name_clean,
"n_shot": n_shot,
"performance": performance,
"metric_name": metric_name if metric_name is not None else "",