Aleph-Alpha-Research · prabhuteja12 · Apr 1, 2026
diff --git a/docs/tasks/ARCBPB.md b/docs/tasks/ARCBPB.md
@@ -0,0 +1,20 @@
+# ARCBPB
+
+````
+NAME = ARCBPB
+DATASET_PATH = allenai/ai2_arc
+SAMPLE_SPLIT = test
+FEWSHOT_SPLIT = train
+RESPONSE_TYPE = LOGLIKELIHOODS
+METRICS = [BitsPerByteLoglikelihood]
+SUBJECTS = ['ARC-Easy', 'ARC-Challenge']
+LANGUAGE = <Language.ENG: 'English'>
+````
+
+- Module: `eval_framework.tasks.benchmarks.arc`
+
+- File: [src/eval_framework/tasks/benchmarks/arc.py](../../src/eval_framework/tasks/benchmarks/arc.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/arc.py)
+
+- Link to dataset: [https://huggingface.co/datasets/allenai/ai2_arc](https://huggingface.co/datasets/allenai/ai2_arc)
+
+More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "ARCBPB"`.
diff --git a/docs/tasks/ARCCloze.md b/docs/tasks/ARCCloze.md
@@ -0,0 +1,20 @@
+# ARCCloze
+
+````
+NAME = ARCCloze
+DATASET_PATH = allenai/ai2_arc
+SAMPLE_SPLIT = test
+FEWSHOT_SPLIT = train
+RESPONSE_TYPE = LOGLIKELIHOODS
+METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood]
+SUBJECTS = ['ARC-Easy', 'ARC-Challenge']
+LANGUAGE = <Language.ENG: 'English'>
+````
+
+- Module: `eval_framework.tasks.benchmarks.arc`
+
+- File: [src/eval_framework/tasks/benchmarks/arc.py](../../src/eval_framework/tasks/benchmarks/arc.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/arc.py)
+
+- Link to dataset: [https://huggingface.co/datasets/allenai/ai2_arc](https://huggingface.co/datasets/allenai/ai2_arc)
+
+More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "ARCCloze"`.
diff --git a/docs/tasks/ARCMC.md b/docs/tasks/ARCMC.md
@@ -0,0 +1,20 @@
+# ARCMC
+
+````
+NAME = ARCMC
+DATASET_PATH = allenai/ai2_arc
+SAMPLE_SPLIT = test
+FEWSHOT_SPLIT = train
+RESPONSE_TYPE = LOGLIKELIHOODS
+METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood]
+SUBJECTS = ['ARC-Easy', 'ARC-Challenge']
+LANGUAGE = <Language.ENG: 'English'>
+````
+
+- Module: `eval_framework.tasks.benchmarks.arc`
+
+- File: [src/eval_framework/tasks/benchmarks/arc.py](../../src/eval_framework/tasks/benchmarks/arc.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/arc.py)
+
+- Link to dataset: [https://huggingface.co/datasets/allenai/ai2_arc](https://huggingface.co/datasets/allenai/ai2_arc)
+
+More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "ARCMC"`.
diff --git a/docs/tasks/CodexHumanEval_BPB.md b/docs/tasks/CodexHumanEval_BPB.md
@@ -0,0 +1,20 @@
+# CodexHumanEval_BPB
+
+````
+NAME = CodexHumanEval_BPB
+DATASET_PATH = openai/openai_humaneval
+SAMPLE_SPLIT = test
+FEWSHOT_SPLIT = test
+RESPONSE_TYPE = LOGLIKELIHOODS
+METRICS = [BitsPerByteLoglikelihood]
+SUBJECTS = ['no_subject']
+LANGUAGE = <Language.ENG: 'English'>
+````
+
+- Module: `eval_framework.tasks.benchmarks.humaneval`
+
+- File: [src/eval_framework/tasks/benchmarks/humaneval.py](../../src/eval_framework/tasks/benchmarks/humaneval.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/humaneval.py)
+
+- Link to dataset: [https://huggingface.co/datasets/openai/openai_humaneval](https://huggingface.co/datasets/openai/openai_humaneval)
+
+More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "CodexHumanEval_BPB"`.
diff --git a/docs/tasks/CodexMBPP_BPB.md b/docs/tasks/CodexMBPP_BPB.md
@@ -0,0 +1,20 @@
+# CodexMBPP_BPB
+
+````
+NAME = CodexMBPP_BPB
+DATASET_PATH = google-research-datasets/mbpp
+SAMPLE_SPLIT = test
+FEWSHOT_SPLIT = test
+RESPONSE_TYPE = LOGLIKELIHOODS
+METRICS = [BitsPerByteLoglikelihood]
+SUBJECTS = ['full']
+LANGUAGE = <Language.ENG: 'English'>
+````
+
+- Module: `eval_framework.tasks.benchmarks.mbpp`
+
+- File: [src/eval_framework/tasks/benchmarks/mbpp.py](../../src/eval_framework/tasks/benchmarks/mbpp.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/mbpp.py)
+
+- Link to dataset: [https://huggingface.co/datasets/google-research-datasets/mbpp](https://huggingface.co/datasets/google-research-datasets/mbpp)
+
+More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "CodexMBPP_BPB"`.
diff --git a/docs/tasks/HELLASWAGBPB.md b/docs/tasks/HELLASWAGBPB.md
@@ -0,0 +1,20 @@
+# HELLASWAGBPB
+
+````
+NAME = HELLASWAGBPB
+DATASET_PATH = Rowan/hellaswag
+SAMPLE_SPLIT = validation
+FEWSHOT_SPLIT = train
+RESPONSE_TYPE = LOGLIKELIHOODS
+METRICS = [BitsPerByteLoglikelihood]
+SUBJECTS = ['no_subject']
+LANGUAGE = <Language.ENG: 'English'>
+````
+
+- Module: `eval_framework.tasks.benchmarks.hellaswag`
+
+- File: [src/eval_framework/tasks/benchmarks/hellaswag.py](../../src/eval_framework/tasks/benchmarks/hellaswag.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/hellaswag.py)
+
+- Link to dataset: [https://huggingface.co/datasets/Rowan/hellaswag](https://huggingface.co/datasets/Rowan/hellaswag)
+
+More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "HELLASWAGBPB"`.
diff --git a/docs/tasks/HELLASWAGCloze.md b/docs/tasks/HELLASWAGCloze.md
@@ -0,0 +1,20 @@
+# HELLASWAGCloze
+
+````
+NAME = HELLASWAGCloze
+DATASET_PATH = Rowan/hellaswag
+SAMPLE_SPLIT = validation
+FEWSHOT_SPLIT = train
+RESPONSE_TYPE = LOGLIKELIHOODS
+METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood]
+SUBJECTS = ['no_subject']
+LANGUAGE = <Language.ENG: 'English'>
+````
+
+- Module: `eval_framework.tasks.benchmarks.hellaswag`
+
+- File: [src/eval_framework/tasks/benchmarks/hellaswag.py](../../src/eval_framework/tasks/benchmarks/hellaswag.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/hellaswag.py)
+
+- Link to dataset: [https://huggingface.co/datasets/Rowan/hellaswag](https://huggingface.co/datasets/Rowan/hellaswag)
+
+More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "HELLASWAGCloze"`.
diff --git a/docs/tasks/HELLASWAGMC.md b/docs/tasks/HELLASWAGMC.md
@@ -0,0 +1,20 @@
+# HELLASWAGMC
+
+````
+NAME = HELLASWAGMC
+DATASET_PATH = Rowan/hellaswag
+SAMPLE_SPLIT = validation
+FEWSHOT_SPLIT = train
+RESPONSE_TYPE = LOGLIKELIHOODS
+METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood]
+SUBJECTS = ['no_subject']
+LANGUAGE = <Language.ENG: 'English'>
+````
+
+- Module: `eval_framework.tasks.benchmarks.hellaswag`
+
+- File: [src/eval_framework/tasks/benchmarks/hellaswag.py](../../src/eval_framework/tasks/benchmarks/hellaswag.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/hellaswag.py)
+
+- Link to dataset: [https://huggingface.co/datasets/Rowan/hellaswag](https://huggingface.co/datasets/Rowan/hellaswag)
+
+More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "HELLASWAGMC"`.
diff --git a/docs/tasks/MATH500Minerva_BPB.md b/docs/tasks/MATH500Minerva_BPB.md
@@ -0,0 +1,20 @@
+# MATH500Minerva_BPB
+
+````
+NAME = MATH500Minerva_BPB
+DATASET_PATH = HuggingFaceH4/MATH-500
+SAMPLE_SPLIT = test
+FEWSHOT_SPLIT = test
+RESPONSE_TYPE = LOGLIKELIHOODS
+METRICS = [BitsPerByteLoglikelihood]
+SUBJECTS = ['no_subject']
+LANGUAGE = <Language.ENG: 'English'>
+````
+
+- Module: `eval_framework.tasks.benchmarks.math_reasoning`
+
+- File: [src/eval_framework/tasks/benchmarks/math_reasoning.py](../../src/eval_framework/tasks/benchmarks/math_reasoning.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/math_reasoning.py)
+
+- Link to dataset: [https://huggingface.co/datasets/HuggingFaceH4/MATH-500](https://huggingface.co/datasets/HuggingFaceH4/MATH-500)
+
+More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "MATH500Minerva_BPB"`.
diff --git a/docs/tasks/MMLUBPB.md b/docs/tasks/MMLUBPB.md
@@ -0,0 +1,20 @@
+# MMLUBPB
+
+````
+NAME = MMLUBPB
+DATASET_PATH = cais/mmlu
+SAMPLE_SPLIT = test
+FEWSHOT_SPLIT = dev
+RESPONSE_TYPE = LOGLIKELIHOODS
+METRICS = [BitsPerByteLoglikelihood]
+SUBJECTS = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']
+LANGUAGE = <Language.ENG: 'English'>
+````
+
+- Module: `eval_framework.tasks.benchmarks.mmlu`
+
+- File: [src/eval_framework/tasks/benchmarks/mmlu.py](../../src/eval_framework/tasks/benchmarks/mmlu.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/mmlu.py)
+
+- Link to dataset: [https://huggingface.co/datasets/cais/mmlu](https://huggingface.co/datasets/cais/mmlu)
+
+More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "MMLUBPB"`.
diff --git a/docs/tasks/MMLUCloze.md b/docs/tasks/MMLUCloze.md
@@ -0,0 +1,20 @@
+# MMLUCloze
+
+````
+NAME = MMLUCloze
+DATASET_PATH = cais/mmlu
+SAMPLE_SPLIT = test
+FEWSHOT_SPLIT = dev
+RESPONSE_TYPE = LOGLIKELIHOODS
+METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood]
+SUBJECTS = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']
+LANGUAGE = <Language.ENG: 'English'>
+````
+
+- Module: `eval_framework.tasks.benchmarks.mmlu`
+
+- File: [src/eval_framework/tasks/benchmarks/mmlu.py](../../src/eval_framework/tasks/benchmarks/mmlu.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/mmlu.py)
+
+- Link to dataset: [https://huggingface.co/datasets/cais/mmlu](https://huggingface.co/datasets/cais/mmlu)
+
+More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "MMLUCloze"`.
diff --git a/docs/tasks/MMLUHumanitiesBPB.md b/docs/tasks/MMLUHumanitiesBPB.md
@@ -0,0 +1,20 @@
+# MMLUHumanitiesBPB
+
+````
+NAME = MMLUHumanitiesBPB
+DATASET_PATH = cais/mmlu
+SAMPLE_SPLIT = test
+FEWSHOT_SPLIT = dev
+RESPONSE_TYPE = LOGLIKELIHOODS
+METRICS = [BitsPerByteLoglikelihood]
+SUBJECTS = ['formal_logic', 'high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'international_law', 'jurisprudence', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'prehistory', 'professional_law', 'world_religions']
+LANGUAGE = <Language.ENG: 'English'>
+````
+
+- Module: `eval_framework.tasks.benchmarks.mmlu`
+
+- File: [src/eval_framework/tasks/benchmarks/mmlu.py](../../src/eval_framework/tasks/benchmarks/mmlu.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/mmlu.py)
+
+- Link to dataset: [https://huggingface.co/datasets/cais/mmlu](https://huggingface.co/datasets/cais/mmlu)
+
+More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "MMLUHumanitiesBPB"`.
diff --git a/docs/tasks/MMLUMC.md b/docs/tasks/MMLUMC.md
@@ -0,0 +1,20 @@
+# MMLUMC
+
+````
+NAME = MMLUMC
+DATASET_PATH = cais/mmlu
+SAMPLE_SPLIT = test
+FEWSHOT_SPLIT = dev
+RESPONSE_TYPE = LOGLIKELIHOODS
+METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood]
+SUBJECTS = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']
+LANGUAGE = <Language.ENG: 'English'>
+````
+
+- Module: `eval_framework.tasks.benchmarks.mmlu`
+
+- File: [src/eval_framework/tasks/benchmarks/mmlu.py](../../src/eval_framework/tasks/benchmarks/mmlu.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/mmlu.py)
+
+- Link to dataset: [https://huggingface.co/datasets/cais/mmlu](https://huggingface.co/datasets/cais/mmlu)
+
+More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "MMLUMC"`.
diff --git a/docs/tasks/MMLUOtherBPB.md b/docs/tasks/MMLUOtherBPB.md
@@ -0,0 +1,20 @@
+# MMLUOtherBPB
+
+````
+NAME = MMLUOtherBPB
+DATASET_PATH = cais/mmlu
+SAMPLE_SPLIT = test
+FEWSHOT_SPLIT = dev
+RESPONSE_TYPE = LOGLIKELIHOODS
+METRICS = [BitsPerByteLoglikelihood]
+SUBJECTS = ['anatomy', 'business_ethics', 'clinical_knowledge', 'college_medicine', 'global_facts', 'human_aging', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting', 'professional_medicine', 'virology']
+LANGUAGE = <Language.ENG: 'English'>
+````
+
+- Module: `eval_framework.tasks.benchmarks.mmlu`
+
+- File: [src/eval_framework/tasks/benchmarks/mmlu.py](../../src/eval_framework/tasks/benchmarks/mmlu.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/mmlu.py)
+
+- Link to dataset: [https://huggingface.co/datasets/cais/mmlu](https://huggingface.co/datasets/cais/mmlu)
+
+More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "MMLUOtherBPB"`.
diff --git a/docs/tasks/MMLUSocialSciencesBPB.md b/docs/tasks/MMLUSocialSciencesBPB.md
@@ -0,0 +1,20 @@
+# MMLUSocialSciencesBPB
+
+````
+NAME = MMLUSocialSciencesBPB
+DATASET_PATH = cais/mmlu
+SAMPLE_SPLIT = test
+FEWSHOT_SPLIT = dev
+RESPONSE_TYPE = LOGLIKELIHOODS
+METRICS = [BitsPerByteLoglikelihood]
+SUBJECTS = ['econometrics', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_microeconomics', 'high_school_psychology', 'human_sexuality', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy']
+LANGUAGE = <Language.ENG: 'English'>
+````
+
+- Module: `eval_framework.tasks.benchmarks.mmlu`
+
+- File: [src/eval_framework/tasks/benchmarks/mmlu.py](../../src/eval_framework/tasks/benchmarks/mmlu.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/mmlu.py)
+
+- Link to dataset: [https://huggingface.co/datasets/cais/mmlu](https://huggingface.co/datasets/cais/mmlu)
+
+More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "MMLUSocialSciencesBPB"`.
diff --git a/docs/tasks/MMLUStemBPB.md b/docs/tasks/MMLUStemBPB.md
@@ -0,0 +1,20 @@
+# MMLUStemBPB
+
+````
+NAME = MMLUStemBPB
+DATASET_PATH = cais/mmlu
+SAMPLE_SPLIT = test
+FEWSHOT_SPLIT = dev
+RESPONSE_TYPE = LOGLIKELIHOODS
+METRICS = [BitsPerByteLoglikelihood]
+SUBJECTS = ['abstract_algebra', 'astronomy', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_physics', 'computer_security', 'conceptual_physics', 'electrical_engineering', 'elementary_mathematics', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_mathematics', 'high_school_physics', 'high_school_statistics', 'machine_learning']
+LANGUAGE = <Language.ENG: 'English'>
+````
+
+- Module: `eval_framework.tasks.benchmarks.mmlu`
+
+- File: [src/eval_framework/tasks/benchmarks/mmlu.py](../../src/eval_framework/tasks/benchmarks/mmlu.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/mmlu.py)
+
+- Link to dataset: [https://huggingface.co/datasets/cais/mmlu](https://huggingface.co/datasets/cais/mmlu)
+
+More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "MMLUStemBPB"`.