diff --git a/README.md b/README.md
index 1a44218..2083798 100644
--- a/README.md
+++ b/README.md
@@ -172,19 +172,43 @@ The `HF_HUB_OFFLINE` value is read when you invoke `oellm` and baked into the ge
 ## SLURM Overrides
 
-Override cluster defaults (partition, account, time limit, etc.) with `--slurm-template-var` (JSON object):
+Override cluster defaults (partition, account, time limit, memory, etc.) with `--slurm-template-var` (JSON object). Provide `SLURM_MEM` to request an exact host memory amount; otherwise the job falls back to the default of `96G`.
 
 ```bash
 # Use a different partition (e.g. dev-g on LUMI when small-g is crowded)
 oellm schedule-eval --models "model-name" --task-groups "open-sci-0.01" \
   --slurm-template-var '{"PARTITION":"dev-g"}'
 
-# Multiple overrides: partition, account, time limit, GPUs
+# Multiple overrides: partition, account, time limit, GPUs, exact RAM
 oellm schedule-eval --models "model-name" --task-groups "open-sci-0.01" \
-  --slurm-template-var '{"PARTITION":"dev-g","ACCOUNT":"myproject","TIME":"02:00:00","GPUS_PER_NODE":2}'
+  --slurm-template-var '{"PARTITION":"dev-g","ACCOUNT":"myproject","TIME":"02:00:00","GPUS_PER_NODE":2,"SLURM_MEM":"96G"}'
 ```
 
-Use exact env var names: `PARTITION`, `ACCOUNT`, `GPUS_PER_NODE`. `TIME` (HH:MM:SS) overrides the time limit.
+Use exact env var names: `PARTITION`, `ACCOUNT`, `GPUS_PER_NODE`, `SLURM_MEM`. `TIME` (HH:MM:SS) overrides the time limit.
+
+## Lighteval Batch Size
+
+For lighteval runs, generated jobs default to `batch_size=1` for local runs and
+`batch_size=32` for non-local (SLURM/cluster) runs. This reduces the risk of
+out-of-memory failures where lighteval's auto batch-size detection can be
+overly optimistic for multiple-choice loglikelihood tasks. You can still
+override these defaults:
+
+```bash
+# Set an explicit batch size (overrides the local/cluster default)
+BATCH_SIZE=8 oellm schedule-eval \
+  --models "model-name" \
+  --task-groups "belebele-eu-cf" \
+  --venv-path .venv
+```
+
+If you need full manual control over all model args, set `MODEL_ARGS`,
+for example:
+
+```bash
+MODEL_ARGS='batch_size=8' oellm schedule-eval \
+  --models "model-name" --task-groups "belebele-eu-cf" --venv-path .venv
+```
 
 ## ⚠️ Dataset Pre-Download Warning
 
diff --git a/oellm/constants.py b/oellm/constants.py
index bb52e2a..e1bf391 100644
--- a/oellm/constants.py
+++ b/oellm/constants.py
@@ -48,6 +48,7 @@ class EvaluationJob:
     "acc,none",
     "acc",
     "accuracy",
+    "acc_norm",
     "f1",
     "exact_match",
 ]
diff --git a/oellm/main.py b/oellm/main.py
index d19eb97..c1c1469 100644
--- a/oellm/main.py
+++ b/oellm/main.py
@@ -86,8 +86,8 @@ def schedule_evals(
             submitting to SLURM. Requires --venv_path. Skips cluster environment detection
             and runs all evaluations sequentially in a single process.
         slurm_template_var: JSON object of template variable overrides. Use exact env var names
-            (PARTITION, ACCOUNT, GPUS_PER_NODE). "TIME" overrides the time limit.
-            Example: '{"PARTITION":"dev-g","ACCOUNT":"FOO","TIME":"02:00:00","GPUS_PER_NODE":2}'
+            (PARTITION, ACCOUNT, GPUS_PER_NODE, SLURM_MEM). "TIME" overrides the time limit.
+            Example: '{"PARTITION":"dev-g","ACCOUNT":"FOO","TIME":"02:00:00","GPUS_PER_NODE":2,"SLURM_MEM":"96G"}'
     """
 
     from oellm.scheduler import schedule_evals as _sched
diff --git a/oellm/resources/task-groups.yaml b/oellm/resources/task-groups.yaml
index ce96454..ec848b5 100644
--- a/oellm/resources/task-groups.yaml
+++ b/oellm/resources/task-groups.yaml
@@ -1,5 +1,31 @@
 task_metrics:
   mmlu: acc
+  belebele_bul_Cyrl_cf: acc_norm
+  belebele_hrv_Latn_cf: acc_norm
+  belebele_ces_Latn_cf: acc_norm
+  belebele_dan_Latn_cf: acc_norm
+  belebele_nld_Latn_cf: acc_norm
+  belebele_eng_Latn_cf: acc_norm
+  belebele_est_Latn_cf: acc_norm
+  belebele_fin_Latn_cf: acc_norm
+  belebele_fra_Latn_cf: acc_norm
+  belebele_deu_Latn_cf: acc_norm
+  belebele_ell_Grek_cf: acc_norm
+  belebele_hun_Latn_cf: acc_norm
+  belebele_ita_Latn_cf: acc_norm
+  belebele_lvs_Latn_cf: acc_norm
+  belebele_lit_Latn_cf: acc_norm
+  belebele_mlt_Latn_cf: acc_norm
+  belebele_pol_Latn_cf: acc_norm
+  belebele_por_Latn_cf: acc_norm
+  belebele_ron_Latn_cf: acc_norm
+  belebele_slk_Latn_cf: acc_norm
+  belebele_slv_Latn_cf: acc_norm
+  belebele_spa_Latn_cf: acc_norm
+  belebele_swe_Latn_cf: acc_norm
+  belebele_nob_Latn_cf: acc_norm
+  belebele_eus_Latn_cf: acc_norm
+  belebele_cat_Latn_cf: acc_norm
   copa: acc
   lambada_openai: acc
   openbookqa: acc_norm
@@ -192,6 +218,65 @@ task_groups:
         subset: swe_Latn
       - task: belebele_nob_Latn
         subset: nob_Latn
+  belebele-eu-cf:
+    description: "Belebele European language tasks (cloze formulation, lighteval)"
+    suite: lighteval
+    n_shots: [0]
+    dataset: facebook/belebele
+    tasks:
+      - task: belebele_bul_Cyrl_cf
+        subset: bul_Cyrl
+      - task: belebele_hrv_Latn_cf
+        subset: hrv_Latn
+      - task: belebele_ces_Latn_cf
+        subset: ces_Latn
+      - task: belebele_dan_Latn_cf
+        subset: dan_Latn
+      - task: belebele_nld_Latn_cf
+        subset: nld_Latn
+      - task: belebele_eng_Latn_cf
+        subset: eng_Latn
+      - task: belebele_est_Latn_cf
+        subset: est_Latn
+      - task: belebele_fin_Latn_cf
+        subset: fin_Latn
+      - task: belebele_fra_Latn_cf
+        subset: fra_Latn
+      - task: belebele_deu_Latn_cf
+        subset: deu_Latn
+      - task: belebele_ell_Grek_cf
+        subset: ell_Grek
+      - task: belebele_hun_Latn_cf
+        subset: hun_Latn
+      - task: belebele_ita_Latn_cf
+        subset: ita_Latn
+      - task: belebele_lvs_Latn_cf
+        subset: lvs_Latn
+      - task: belebele_lit_Latn_cf
+        subset: lit_Latn
+      - task: belebele_mlt_Latn_cf
+        subset: mlt_Latn
+      - task: belebele_pol_Latn_cf
+        subset: pol_Latn
+      - task: belebele_por_Latn_cf
+        subset: por_Latn
+      - task: belebele_ron_Latn_cf
+        subset: ron_Latn
+      - task: belebele_slk_Latn_cf
+        subset: slk_Latn
+      - task: belebele_slv_Latn_cf
+        subset: slv_Latn
+      - task: belebele_spa_Latn_cf
+        subset: spa_Latn
+      - task: belebele_swe_Latn_cf
+        subset: swe_Latn
+      - task: belebele_nob_Latn_cf
+        subset: nob_Latn
+      - task: belebele_eus_Latn_cf
+        subset: eus_Latn
+      - task: belebele_cat_Latn_cf
+        subset: cat_Latn
+
   flores-200-eu-to-eng:
     description: "Flores 200 EU to English translation"
     suite: lighteval
diff --git a/oellm/resources/template.sbatch b/oellm/resources/template.sbatch
index d67ab94..42900f9 100644
--- a/oellm/resources/template.sbatch
+++ b/oellm/resources/template.sbatch
@@ -2,7 +2,7 @@
 #SBATCH --job-name=oellm-eval
 #SBATCH --time={time_limit}
 #SBATCH --gres=gpu:$GPUS_PER_NODE
-#SBATCH --mem=0
+#SBATCH --mem={slurm_mem}
 #SBATCH --output={log_dir}/%x-%A-%a.out
 #SBATCH --partition=$PARTITION
 #SBATCH --account=$ACCOUNT
@@ -131,6 +131,9 @@ do
     case "$suite_normalized" in
         lm_eval|lm-eval|lm-eval-harness)
+            echo
+            echo "----------------------------------------------------"
+            echo "lm_eval Execution"
             run_python -m lm_eval --model hf \
                 --model_args pretrained="$model_path",trust_remote_code=True \
                 --tasks "$task_path" \
@@ -139,6 +142,7 @@
                 --trust_remote_code \
                 ${{LM_EVAL_INCLUDE_PATH:+--include_path $LM_EVAL_INCLUDE_PATH}} \
                 ${{LIMIT:+--limit $LIMIT}}
+            echo "----------------------------------------------------"
             ;;
         lighteval|light-eval)
             LIGHT_TASK="$task_path"
@@ -164,7 +168,7 @@ do
         if [ -n "$VENV_PATH" ]; then
             source "$VENV_PATH/bin/activate"
             lighteval accelerate \
-                "model_name=$model_path,{lighteval_model_args}" \
+                "model_name=$model_path,trust_remote_code=True,{additional_model_args}" \
                 "$LIGHT_TASK_ARG" \
                 --load-tasks-multilingual \
                 --output-dir "$RESULTS_SUBDIR" \
@@ -177,7 +181,7 @@ do
             $EVAL_SIF_PATH \
             env CUDA_VISIBLE_DEVICES=$GPU_DEVICES \
             lighteval accelerate \
-                "model_name=$model_path,{lighteval_model_args}" \
+                "model_name=$model_path,trust_remote_code=True,{additional_model_args}" \
                 "$LIGHT_TASK_ARG" \
                 --load-tasks-multilingual \
                 --output-dir "$RESULTS_SUBDIR" \
@@ -266,6 +270,7 @@ do
         ;;
     esac
 
+    echo "----------------------------------------------------"
     echo "Evaluation finished for model: $model_path"
 done
diff --git a/oellm/results.py b/oellm/results.py
index 3fe0456..2ed66a7 100644
--- a/oellm/results.py
+++ b/oellm/results.py
@@ -71,6 +71,21 @@ def _first_matching_prefix(d: dict, prefix: str) -> tuple[float | None, str | None]:
         return None, None
 
 
+def _split_task_and_nshot(name: str) -> tuple[str, int | None]:
+    """Split ``'task|N'`` task names used by some harnesses.
+
+    Returns ``(task, N)`` when the suffix is numeric and ``(task, None)``
+    otherwise. A non-string input is returned as-is, paired with ``None``.
+    """
+    if not isinstance(name, str):
+        return name, None
+    if "|" in name:
+        base, after = name.rsplit("|", 1)
+        if after.isdigit():
+            return base, int(after)
+    return name, None
+
+
 def _infer_global_n_shot(n_shot_data: dict) -> int | None:
     """Infer a global n_shot if exactly one unique value exists."""
     try:
@@ -202,9 +217,21 @@ def collect_results(
             with open(json_file) as f:
                 data = json.load(f)
 
-            # lmms-eval sets model_name to the adapter type (e.g. "llava_hf"),
-            # not the checkpoint path; the actual path is in model_name_or_path.
-            model_name = data.get("model_name_or_path") or data.get("model_name", "unknown")
+            # Model name lives in different keys depending on the harness:
+            # - lmms-eval: model_name_or_path is the checkpoint, model_name is the
+            #   adapter class (e.g. "llava_hf")
"llava_hf") + # - lighteval: config_general.{model_name,model,model_path} + # - legacy: summary_general.model or top-level model + model_name = ( + data.get("model_name_or_path") + or data.get("model_name") + or data.get("config_general", {}).get("model_name") + or data.get("config_general", {}).get("model") + or data.get("config_general", {}).get("model_path") + or data.get("summary_general", {}).get("model") + or data.get("model") + or "unknown" + ) results = data.get("results", {}) n_shot_data = data.get("n-shot", {}) @@ -231,14 +258,20 @@ def collect_results( # Prefer only the first aggregate metric from groups (simplified) if groups_map: group_name, group_results = next(iter(groups_map.items())) - n_shot = n_shot_data.get(group_name, "unknown") + orig_group_name = group_name + n_shot = n_shot_data.get(orig_group_name, "unknown") if n_shot == "unknown": - for subtask_name in group_subtasks_map.get(group_name, []): + for subtask_name in group_subtasks_map.get(orig_group_name, []): if subtask_name in n_shot_data: n_shot = n_shot_data[subtask_name] break if n_shot == "unknown" and global_n_shot is not None: n_shot = global_n_shot + # Strip ``'|N'`` n-shot suffix from the group name, falling back + # to the parsed N when n_shot is still unknown. + group_name, parsed_n = _split_task_and_nshot(orig_group_name) + if n_shot == "unknown" and parsed_n is not None: + n_shot = parsed_n performance, metric_name = _resolve_metric( group_name, group_results, task_metrics ) @@ -273,30 +306,35 @@ def collect_results( if task_name.startswith("global_mmlu_") and task_name.count("_") >= 4: continue + # Strip ``'|N'`` n-shot suffix from the task name; use parsed N + # as a last-resort fallback when n_shot isn't otherwise resolvable. + task_name_clean, parsed_n = _split_task_and_nshot(task_name) n_shot = _resolve_n_shot( - task_name, + task_name_clean, n_shot_data, group_subtasks_map, group_aggregate_names, global_n_shot, ) + if n_shot == "unknown" and parsed_n is not None: + n_shot = parsed_n # Skip lmms-eval parent task placeholders (no numeric metrics, just alias) if set(task_results.keys()) <= {"alias", " ", ""}: continue performance, metric_name = _resolve_metric( - task_name, task_results, task_metrics + task_name_clean, task_results, task_metrics ) if performance is not None: if check: - completed_jobs.add((model_name, task_name, n_shot)) + completed_jobs.add((model_name, task_name_clean, n_shot)) rows.append( { "model_name": model_name, - "task": task_name, + "task": task_name_clean, "n_shot": n_shot, "performance": performance, "metric_name": metric_name if metric_name is not None else "", diff --git a/oellm/scheduler.py b/oellm/scheduler.py index 7314a3b..af203c9 100644 --- a/oellm/scheduler.py +++ b/oellm/scheduler.py @@ -14,6 +14,7 @@ from oellm.constants import EvaluationJob from oellm.runner import EvalRunner from oellm.task_groups import ( + _build_task_suite_map, _collect_dataset_specs, _collect_hf_dataset_files, _collect_hf_model_repos, @@ -50,6 +51,53 @@ def _resolve_hf_hub_offline(local: bool) -> int: return 0 if local else 1 +def _resolve_slurm_mem() -> str: + """Return the host-memory request for the generated SLURM job. + + Reads ``SLURM_MEM`` from the environment; falls back to ``96G``. Can also + be overridden per-invocation via ``--slurm-template-var``. 
+ """ + explicit_mem = os.environ.get("SLURM_MEM") + if explicit_mem is not None and str(explicit_mem).strip() != "": + return str(explicit_mem).strip() + + logging.warning("SLURM_MEM not set; falling back to default memory request '96G'.") + return "96G" + + +def _resolve_additional_model_args(local: bool = False) -> str: + """Return model args for lighteval, defaulting to an explicit batch size. + + - if ``local`` is True: ``batch_size=1`` + - otherwise: ``batch_size=32`` + + Override the entire string via ``MODEL_ARGS`` or just the batch size via + ``BATCH_SIZE``. Applied to the lighteval suite only. + """ + explicit_model_args = os.environ.get("MODEL_ARGS") + if explicit_model_args is not None and str(explicit_model_args).strip() != "": + return str(explicit_model_args).strip() + + batch_size = os.environ.get("BATCH_SIZE") + if batch_size is not None and str(batch_size).strip() != "": + batch_size_value = str(batch_size).strip() + try: + if int(batch_size_value) < 1: + raise ValueError + except ValueError: + fallback = "1" if local else "32" + logging.warning( + "Invalid BATCH_SIZE=%r; falling back to batch_size=%s", + batch_size, + fallback, + ) + batch_size_value = fallback + else: + batch_size_value = "1" if local else "32" + + return f"batch_size={batch_size_value}" + + @capture_third_party_output_from_kwarg("verbose") def schedule_evals( models: str | None = None, @@ -106,8 +154,8 @@ def schedule_evals( local: If True, run evaluations directly on the local machine using bash instead of submitting to SLURM. Requires --venv_path. slurm_template_var: JSON object of template variable overrides. Use exact env var names - (PARTITION, ACCOUNT, GPUS_PER_NODE). "TIME" overrides the time limit. - Example: '{"PARTITION":"dev-g","ACCOUNT":"FOO","TIME":"02:00:00","GPUS_PER_NODE":2}' + (PARTITION, ACCOUNT, GPUS_PER_NODE, SLURM_MEM). "TIME" overrides the time limit. + Example: '{"PARTITION":"dev-g","ACCOUNT":"FOO","TIME":"02:00:00","GPUS_PER_NODE":2,"SLURM_MEM":"96G"}' """ _setup_logging(verbose) @@ -186,13 +234,18 @@ def schedule_evals( elif models: if group_names is None: + # Look up each bare task name in the registered groups so + # ``--tasks belebele_eng_Latn_cf`` (lighteval) or ``--tasks + # regiondial_refcocog_all`` (contrib) get routed correctly. + # Tasks not in any group default to lm_eval. 
+            task_suite_map = _build_task_suite_map()
             eval_jobs.extend(
                 [
                     EvaluationJob(
                         model_path=model,
                         task_path=task,
                         n_shot=shot,
-                        eval_suite="lm_eval",
+                        eval_suite=task_suite_map.get(task, "lm_eval"),
                     )
                     for model in models
                     for task in tasks
@@ -354,6 +407,9 @@ def schedule_evals(
             os.environ[key] = str(value)
             logging.info(f"Using slurm_template_var override: {key}={value}")
 
+    slurm_mem = _resolve_slurm_mem()
+    additional_model_args = _resolve_additional_model_args(local)
+
     logging.info("Evaluation planning:")
     logging.info(f"  Total evaluations: {total_evals}")
     logging.info(
@@ -361,6 +417,7 @@ def schedule_evals(
     )
     logging.info(f"  Evaluations per job: {evals_per_job}")
     logging.info(f"  Time limit: {time_limit}")
+    logging.info(f"  Requested host memory: {slurm_mem}")
 
     sbatch_script = sbatch_template.format(
         csv_path=csv_path,
@@ -371,14 +428,13 @@ def schedule_evals(
         log_dir=evals_dir / "slurm_logs",
         evals_dir=str(evals_dir / "results"),
         time_limit=time_limit,  # Dynamic time limit
+        slurm_mem=slurm_mem,
         limit=limit if limit else "",  # Sample limit for quick testing
         venv_path=venv_path or "",
         lm_eval_include_path=lm_eval_include_path
         or str(files("oellm.resources") / "custom_lm_eval_tasks"),
         hf_hub_offline=_resolve_hf_hub_offline(local),
-        lighteval_model_args="trust_remote_code=True,batch_size=1"
-        if local
-        else "trust_remote_code=True",
+        additional_model_args=additional_model_args,
         evalchemy_dir=os.environ.get("EVALCHEMY_DIR", "/opt/evalchemy"),
     )
 
diff --git a/oellm/task_groups.py b/oellm/task_groups.py
index eba49e6..d930532 100644
--- a/oellm/task_groups.py
+++ b/oellm/task_groups.py
@@ -323,6 +323,24 @@ def _lookup_dataset_specs_for_tasks(task_names: Iterable[str]) -> list[DatasetSpec]:
     return specs
 
 
+def _build_task_suite_map() -> dict[str, str]:
+    """Return ``{task_name: eval_suite}`` across core YAML and contrib plugins.
+
+    Uses :func:`_parse_task_groups` + :func:`_iter_all_tasks` so contrib
+    registries (e.g. ``regiondial_bench``) are included, not just the core
+    ``task-groups.yaml``. Task-level ``suite`` overrides group-level. First
+    occurrence wins when a task name appears in multiple groups.
+
+    Consumers should still ``.get(task, "lm_eval")`` — tasks not registered
+    in any group simply aren't in the map.
+    """
+    parsed = _parse_task_groups(get_all_task_group_names())
+    task_suite_map: dict[str, str] = {}
+    for t, suite, _group in _iter_all_tasks(parsed):
+        task_suite_map.setdefault(t.name, suite)
+    return task_suite_map
+
+
 def get_all_task_group_names() -> list[str]:
     """Return all available task group names (core + all contrib suites)."""
     data = (
diff --git a/oellm/utils.py b/oellm/utils.py
index 976ce5a..16a53b9 100644
--- a/oellm/utils.py
+++ b/oellm/utils.py
@@ -438,6 +438,18 @@ def _pre_download_datasets_from_specs(
                     trust_remote_code=trust_remote_code,
                 )
                 continue
+            if "Feature type" in str(e) and "not found" in str(e):
+                hf_datasets_cache = os.environ.get(
+                    "HF_DATASETS_CACHE",
+                    str(Path.home() / ".cache" / "huggingface" / "datasets"),
+                )
+                safe_name = spec.repo_id.replace("/", "___")
+                cache_dir = os.path.join(hf_datasets_cache, safe_name)
+                raise RuntimeError(
+                    f"Cached metadata for '{label}' is incompatible with the installed "
+                    f"datasets version ('{e}'). Delete the stale cache and re-run:\n\n"
+                    f"  rm -rf {cache_dir}\n"
+                ) from None
             raise
 
         logging.debug(f"Finished downloading dataset '{label}'.")
diff --git a/tests/test_task_suite_map.py b/tests/test_task_suite_map.py
new file mode 100644
index 0000000..c8c6be1
--- /dev/null
+++ b/tests/test_task_suite_map.py
@@ -0,0 +1,88 @@
+"""Tests for :func:`oellm.task_groups._build_task_suite_map`.
+
+The helper powers the ``--tasks`` (bare-task-name) path in the scheduler.
+It must cover every suite we actually support — core YAML-registered suites
+(lm-eval-harness, lighteval, lmms_eval, evalchemy) AND contrib-registered
+suites (e.g. regiondial_bench).
+"""
+
+from __future__ import annotations
+
+from oellm.task_groups import _build_task_suite_map
+
+
+def test_map_is_non_empty():
+    m = _build_task_suite_map()
+    assert len(m) > 0, "suite map must contain at least core YAML tasks"
+
+
+def test_map_includes_lm_eval_harness_task():
+    m = _build_task_suite_map()
+    # copa is a classic lm-eval-harness task in task-groups.yaml
+    assert m.get("copa") == "lm-eval-harness"
+
+
+def test_map_includes_lighteval_task():
+    m = _build_task_suite_map()
+    # belebele_*_cf tasks are lighteval
+    assert m.get("belebele_eng_Latn_cf") == "lighteval"
+
+
+def test_map_includes_lmms_eval_task():
+    """lmms_eval tasks come from image/video task groups — must be routable."""
+    m = _build_task_suite_map()
+    # vqav2_val is the base VQA v2 task (image modality)
+    assert m.get("vqav2_val") == "lmms_eval"
+
+
+def test_map_includes_contrib_task():
+    """Contrib plugins (e.g. regiondial_bench) register their own TASK_GROUPS.
+
+    These are the regression target: the original upstream helper only read
+    YAML and missed contrib entirely.
+    """
+    m = _build_task_suite_map()
+    assert m.get("regiondial_refcocog") == "regiondial_bench"
+
+
+def test_map_honours_task_level_suite_override():
+    """Evalchemy tasks set ``suite: evalchemy`` at the task level, not the
+    group level — the helper must prefer the task-level value.
+    """
+    m = _build_task_suite_map()
+    assert m.get("GPQADiamond") == "evalchemy"
+
+
+def test_map_covers_all_actually_registered_suites():
+    """Sanity: every distinct suite we see should be one we actually route.
+
+    Guards against a new suite slipping into YAML or contrib without us
+    adding a case branch in template.sbatch (the ``*)`` catch-all routes
+    everything unknown to the contrib dispatcher, but we still want this
+    assertion as documentation).
+    """
+    m = _build_task_suite_map()
+    distinct_suites = set(m.values())
+    expected_subset = {
+        "lm-eval-harness",
+        "lighteval",
+        "lmms_eval",
+        "evalchemy",
+        "regiondial_bench",
+    }
+    # All expected suites must be present. Extra contrib suites are fine.
+    assert expected_subset.issubset(distinct_suites), (
+        f"missing suites: {expected_subset - distinct_suites}"
+    )
+
+
+def test_first_occurrence_wins_when_task_in_multiple_groups():
+    """If a task name appears in multiple groups, first occurrence wins.
+
+    This is documented behavior of ``setdefault`` in the helper. We don't
+    assert a specific pair here because the YAML contents shift; we only
+    assert the determinism property.
+    """
+    m1 = _build_task_suite_map()
+    m2 = _build_task_suite_map()
+    assert m1 == m2
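+
+
+def test_unregistered_task_falls_through_to_default():
+    """Hypothetical guard sketch for the ``.get(task, "lm_eval")`` fallback.
+
+    The helper is built with ``setdefault`` over registered tasks only, so a
+    name registered in no core or contrib group must be absent from the map
+    (``None``), never mis-mapped to some other suite. The task name below is
+    made up and assumed to be registered nowhere.
+    """
+    m = _build_task_suite_map()
+    # Arbitrary name, assumed absent from every group registry.
+    assert m.get("task_name_assumed_unregistered") is None
+    # Consumers resolve unknown tasks to the lm_eval default via .get().
+    assert m.get("task_name_assumed_unregistered", "lm_eval") == "lm_eval"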