README.md: 28 additions & 4 deletions
@@ -172,19 +172,43 @@ The `HF_HUB_OFFLINE` value is read when you invoke `oellm` and baked into the ge

## SLURM Overrides

-Override cluster defaults (partition, account, time limit, etc.) with `--slurm-template-var` (JSON object):
+Override cluster defaults (partition, account, time limit, memory, etc.) with `--slurm-template-var` (JSON object). Provide `SLURM_MEM` to request an exact host memory amount; if unset, it falls back to a default of `96G`.

```bash
# Use a different partition (e.g. dev-g on LUMI when small-g is crowded)
oellm schedule-eval --models "model-name" --task-groups "open-sci-0.01" \
--slurm-template-var '{"PARTITION":"dev-g"}'

-# Multiple overrides: partition, account, time limit, GPUs
+# Multiple overrides: partition, account, time limit, GPUs, exact RAM
oellm schedule-eval --models "model-name" --task-groups "open-sci-0.01" \
--slurm-template-var '{"PARTITION":"dev-g","ACCOUNT":"myproject","TIME":"02:00:00","GPUS_PER_NODE":2}'
--slurm-template-var '{"PARTITION":"dev-g","ACCOUNT":"myproject","TIME":"02:00:00","GPUS_PER_NODE":2,"SLURM_MEM":"96G"}'
```

-Use exact env var names: `PARTITION`, `ACCOUNT`, `GPUS_PER_NODE`. `TIME` (HH:MM:SS) overrides the time limit.
+Use exact env var names: `PARTITION`, `ACCOUNT`, `GPUS_PER_NODE`, `SLURM_MEM`. `TIME` (HH:MM:SS) overrides the time limit.

## Lighteval Batch Size

For lighteval runs, generated jobs default to `batch_size=1` for local runs and
`batch_size=32` for non-local (SLURM/cluster) runs. This reduces the risk of
out-of-memory failures, since lighteval's automatic batch-size detection can be
overly optimistic for multiple-choice loglikelihood tasks. You can still
override these defaults:

```bash
# Set an explicit batch size (overrides the local/cluster default)
BATCH_SIZE=8 oellm schedule-eval \
--models "model-name" \
--task-groups "belebele-eu-cf" \
--venv-path .venv
```

If you need full manual control over all model args, set `MODEL_ARGS`,
for example:

```bash
MODEL_ARGS='batch_size=8' oellm schedule-eval \
--models "model-name" --task-groups "belebele-eu-cf" --venv-path .venv
```

## ⚠️ Dataset Pre-Download Warning

oellm/constants.py: 1 addition & 0 deletions
@@ -48,6 +48,7 @@ class EvaluationJob:
"acc,none",
"acc",
"accuracy",
"acc_norm",
"f1",
"exact_match",
]
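Presumably this priority list is scanned in order against a task's result dict, with the first key present winning; adding `acc_norm` lets results that only report normalized accuracy resolve through this fallback. A minimal sketch under that assumption (`pick_metric` and `METRIC_PRIORITY` are illustrative names, not the repo's):

```python
METRIC_PRIORITY = ["acc,none", "acc", "accuracy", "acc_norm", "f1", "exact_match"]

def pick_metric(task_results: dict) -> tuple[str, float] | None:
    # Return the first priority metric present in a task's result dict.
    for key in METRIC_PRIORITY:
        if key in task_results:
            return key, task_results[key]
    return None

print(pick_metric({"acc_norm": 0.71, "acc_norm_stderr": 0.02}))  # ('acc_norm', 0.71)
```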
oellm/main.py: 2 additions & 2 deletions
@@ -86,8 +86,8 @@ def schedule_evals(
submitting to SLURM. Requires --venv_path. Skips cluster environment detection and
runs all evaluations sequentially in a single process.
slurm_template_var: JSON object of template variable overrides. Use exact env var names
-(PARTITION, ACCOUNT, GPUS_PER_NODE). "TIME" overrides the time limit.
-Example: '{"PARTITION":"dev-g","ACCOUNT":"FOO","TIME":"02:00:00","GPUS_PER_NODE":2}'
+(PARTITION, ACCOUNT, GPUS_PER_NODE, SLURM_MEM). "TIME" overrides the time limit.
+Example: '{"PARTITION":"dev-g","ACCOUNT":"FOO","TIME":"02:00:00","GPUS_PER_NODE":2,"SLURM_MEM":"96G"}'
"""
from oellm.scheduler import schedule_evals as _sched

oellm/resources/task-groups.yaml: 85 additions & 0 deletions
@@ -1,5 +1,31 @@
task_metrics:
mmlu: acc
belebele_bul_Cyrl_cf: acc_norm
belebele_hrv_Latn_cf: acc_norm
belebele_ces_Latn_cf: acc_norm
belebele_dan_Latn_cf: acc_norm
belebele_nld_Latn_cf: acc_norm
belebele_eng_Latn_cf: acc_norm
belebele_est_Latn_cf: acc_norm
belebele_fin_Latn_cf: acc_norm
belebele_fra_Latn_cf: acc_norm
belebele_deu_Latn_cf: acc_norm
belebele_ell_Grek_cf: acc_norm
belebele_hun_Latn_cf: acc_norm
belebele_ita_Latn_cf: acc_norm
belebele_lvs_Latn_cf: acc_norm
belebele_lit_Latn_cf: acc_norm
belebele_mlt_Latn_cf: acc_norm
belebele_pol_Latn_cf: acc_norm
belebele_por_Latn_cf: acc_norm
belebele_ron_Latn_cf: acc_norm
belebele_slk_Latn_cf: acc_norm
belebele_slv_Latn_cf: acc_norm
belebele_spa_Latn_cf: acc_norm
belebele_swe_Latn_cf: acc_norm
belebele_nob_Latn_cf: acc_norm
belebele_eus_Latn_cf: acc_norm
belebele_cat_Latn_cf: acc_norm
copa: acc
lambada_openai: acc
openbookqa: acc_norm
@@ -192,6 +218,65 @@ task_groups:
subset: swe_Latn
- task: belebele_nob_Latn
subset: nob_Latn
belebele-eu-cf:
description: "Belebele European language tasks (cloze formulation, lighteval)"
suite: lighteval
n_shots: [0]
dataset: facebook/belebele
tasks:
- task: belebele_bul_Cyrl_cf
subset: bul_Cyrl
- task: belebele_hrv_Latn_cf
subset: hrv_Latn
- task: belebele_ces_Latn_cf
subset: ces_Latn
- task: belebele_dan_Latn_cf
subset: dan_Latn
- task: belebele_nld_Latn_cf
subset: nld_Latn
- task: belebele_eng_Latn_cf
subset: eng_Latn
- task: belebele_est_Latn_cf
subset: est_Latn
- task: belebele_fin_Latn_cf
subset: fin_Latn
- task: belebele_fra_Latn_cf
subset: fra_Latn
- task: belebele_deu_Latn_cf
subset: deu_Latn
- task: belebele_ell_Grek_cf
subset: ell_Grek
- task: belebele_hun_Latn_cf
subset: hun_Latn
- task: belebele_ita_Latn_cf
subset: ita_Latn
- task: belebele_lvs_Latn_cf
subset: lvs_Latn
- task: belebele_lit_Latn_cf
subset: lit_Latn
- task: belebele_mlt_Latn_cf
subset: mlt_Latn
- task: belebele_pol_Latn_cf
subset: pol_Latn
- task: belebele_por_Latn_cf
subset: por_Latn
- task: belebele_ron_Latn_cf
subset: ron_Latn
- task: belebele_slk_Latn_cf
subset: slk_Latn
- task: belebele_slv_Latn_cf
subset: slv_Latn
- task: belebele_spa_Latn_cf
subset: spa_Latn
- task: belebele_swe_Latn_cf
subset: swe_Latn
- task: belebele_nob_Latn_cf
subset: nob_Latn
- task: belebele_eus_Latn_cf
subset: eus_Latn
- task: belebele_cat_Latn_cf
subset: cat_Latn

flores-200-eu-to-eng:
description: "Flores 200 EU to English translation"
suite: lighteval
oellm/resources/template.sbatch: 8 additions & 3 deletions
@@ -2,7 +2,7 @@
#SBATCH --job-name=oellm-eval
#SBATCH --time={time_limit}
#SBATCH --gres=gpu:$GPUS_PER_NODE
-#SBATCH --mem=0
+#SBATCH --mem={slurm_mem}
#SBATCH --output={log_dir}/%x-%A-%a.out
#SBATCH --partition=$PARTITION
#SBATCH --account=$ACCOUNT
@@ -131,6 +131,9 @@ do

case "$suite_normalized" in
lm_eval|lm-eval|lm-eval-harness)
echo
echo "----------------------------------------------------"
echo "lm_eval Execution"
run_python -m lm_eval --model hf \
--model_args pretrained="$model_path",trust_remote_code=True \
--tasks "$task_path" \
--trust_remote_code \
${{LM_EVAL_INCLUDE_PATH:+--include_path $LM_EVAL_INCLUDE_PATH}} \
${{LIMIT:+--limit $LIMIT}}
echo "----------------------------------------------------"
;;
lighteval|light-eval)
LIGHT_TASK="$task_path"
@@ -164,7 +168,7 @@
if [ -n "$VENV_PATH" ]; then
source "$VENV_PATH/bin/activate"
lighteval accelerate \
"model_name=$model_path,{lighteval_model_args}" \
"model_name=$model_path,trust_remote_code=True,{additional_model_args}" \
"$LIGHT_TASK_ARG" \
--load-tasks-multilingual \
--output-dir "$RESULTS_SUBDIR" \
@@ -177,7 +181,7 @@
$EVAL_SIF_PATH \
env CUDA_VISIBLE_DEVICES=$GPU_DEVICES \
lighteval accelerate \
"model_name=$model_path,{lighteval_model_args}" \
"model_name=$model_path,{additional_model_args}" \
"$LIGHT_TASK_ARG" \
--load-tasks-multilingual \
--output-dir "$RESULTS_SUBDIR" \
@@ -266,6 +270,7 @@ do
;;
esac

echo "----------------------------------------------------"
echo "Evaluation finished for model: $model_path"

done
oellm/results.py: 47 additions & 9 deletions
@@ -71,6 +71,21 @@ def _first_matching_prefix(d: dict, prefix: str) -> tuple[float | None, str | None]:
return None, None


def _split_task_and_nshot(name: str) -> tuple[str, int | None]:
"""Split ``'task|N'`` task names used by some harnesses.

Returns ``(task, N)`` when the suffix is numeric, ``(task, None)``
otherwise. Non-string inputs pass through unchanged.
"""
if not isinstance(name, str):
return name, None
if "|" in name:
base, after = name.rsplit("|", 1)
if after.isdigit():
return base, int(after)
return name, None

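A quick sanity check of the helper above, sketched with illustrative task names:

```python
# 'task|N' keys carry the n-shot count after the rightmost pipe.
assert _split_task_and_nshot("belebele_eng_Latn_cf|0") == ("belebele_eng_Latn_cf", 0)
# Names without a suffix come back with n_shot=None.
assert _split_task_and_nshot("lambada_openai") == ("lambada_openai", None)
# Only the rightmost '|' is split, and only when the tail is numeric.
assert _split_task_and_nshot("suite|task|5") == ("suite|task", 5)
assert _split_task_and_nshot("task|dev") == ("task|dev", None)
```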

def _infer_global_n_shot(n_shot_data: dict) -> int | None:
"""Infer a global n_shot if exactly one unique value exists."""
try:
@@ -202,9 +217,21 @@ def collect_results(
with open(json_file) as f:
data = json.load(f)

-# lmms-eval sets model_name to the adapter type (e.g. "llava_hf"),
-# not the checkpoint path; the actual path is in model_name_or_path.
-model_name = data.get("model_name_or_path") or data.get("model_name", "unknown")
+# Model name lives in different keys depending on the harness:
+#   - lmms-eval: model_name_or_path is the checkpoint, model_name is the
+#     adapter class (e.g. "llava_hf")
+#   - lighteval: config_general.{model_name,model,model_path}
+#   - legacy: summary_general.model or top-level model
+model_name = (
+    data.get("model_name_or_path")
+    or data.get("model_name")
+    or data.get("config_general", {}).get("model_name")
+    or data.get("config_general", {}).get("model")
+    or data.get("config_general", {}).get("model_path")
+    or data.get("summary_general", {}).get("model")
+    or data.get("model")
+    or "unknown"
+)

results = data.get("results", {})
n_shot_data = data.get("n-shot", {})
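The same fallback chain, extracted into a standalone runnable sketch for sanity checking (`resolve_model_name` is a hypothetical name, not part of the module):

```python
def resolve_model_name(data: dict) -> str:
    # Mirror the fallback order above: checkpoint path first, then
    # harness-specific locations, then legacy keys, finally "unknown".
    return (
        data.get("model_name_or_path")
        or data.get("model_name")
        or data.get("config_general", {}).get("model_name")
        or data.get("config_general", {}).get("model")
        or data.get("config_general", {}).get("model_path")
        or data.get("summary_general", {}).get("model")
        or data.get("model")
        or "unknown"
    )

# lmms-eval style: the checkpoint path wins over the adapter class name.
assert resolve_model_name({"model_name_or_path": "org/ckpt-1B", "model_name": "llava_hf"}) == "org/ckpt-1B"
# lighteval style: falls through to config_general.model_name.
assert resolve_model_name({"config_general": {"model_name": "org/model-7B"}}) == "org/model-7B"
assert resolve_model_name({}) == "unknown"
```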
@@ -231,14 +258,20 @@
# Prefer only the first aggregate metric from groups (simplified)
if groups_map:
group_name, group_results = next(iter(groups_map.items()))
-n_shot = n_shot_data.get(group_name, "unknown")
+orig_group_name = group_name
+n_shot = n_shot_data.get(orig_group_name, "unknown")
if n_shot == "unknown":
-for subtask_name in group_subtasks_map.get(group_name, []):
+for subtask_name in group_subtasks_map.get(orig_group_name, []):
if subtask_name in n_shot_data:
n_shot = n_shot_data[subtask_name]
break
if n_shot == "unknown" and global_n_shot is not None:
n_shot = global_n_shot
# Strip ``'|N'`` n-shot suffix from the group name, falling back
# to the parsed N when n_shot is still unknown.
group_name, parsed_n = _split_task_and_nshot(orig_group_name)
if n_shot == "unknown" and parsed_n is not None:
n_shot = parsed_n
performance, metric_name = _resolve_metric(
group_name, group_results, task_metrics
)
@@ -273,30 +306,35 @@
if task_name.startswith("global_mmlu_") and task_name.count("_") >= 4:
continue

# Strip ``'|N'`` n-shot suffix from the task name; use parsed N
# as a last-resort fallback when n_shot isn't otherwise resolvable.
task_name_clean, parsed_n = _split_task_and_nshot(task_name)
n_shot = _resolve_n_shot(
-task_name,
+task_name_clean,
n_shot_data,
group_subtasks_map,
group_aggregate_names,
global_n_shot,
)
if n_shot == "unknown" and parsed_n is not None:
n_shot = parsed_n

# Skip lmms-eval parent task placeholders (no numeric metrics, just alias)
if set(task_results.keys()) <= {"alias", " ", ""}:
continue

performance, metric_name = _resolve_metric(
-task_name, task_results, task_metrics
+task_name_clean, task_results, task_metrics
)

if performance is not None:
if check:
-completed_jobs.add((model_name, task_name, n_shot))
+completed_jobs.add((model_name, task_name_clean, n_shot))

rows.append(
{
"model_name": model_name,
"task": task_name,
"task": task_name_clean,
"n_shot": n_shot,
"performance": performance,
"metric_name": metric_name if metric_name is not None else "",