diff --git a/docs/tasks/ARCBPB.md b/docs/tasks/ARCBPB.md new file mode 100644 index 00000000..787ac868 --- /dev/null +++ b/docs/tasks/ARCBPB.md @@ -0,0 +1,20 @@ +# ARCBPB + +```` +NAME = ARCBPB +DATASET_PATH = allenai/ai2_arc +SAMPLE_SPLIT = test +FEWSHOT_SPLIT = train +RESPONSE_TYPE = LOGLIKELIHOODS +METRICS = [BitsPerByteLoglikelihood] +SUBJECTS = ['ARC-Easy', 'ARC-Challenge'] +LANGUAGE = +```` + +- Module: `eval_framework.tasks.benchmarks.arc` + +- File: [src/eval_framework/tasks/benchmarks/arc.py](../../src/eval_framework/tasks/benchmarks/arc.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/arc.py) + +- Link to dataset: [https://huggingface.co/datasets/allenai/ai2_arc](https://huggingface.co/datasets/allenai/ai2_arc) + +More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "ARCBPB"`. diff --git a/docs/tasks/ARCCloze.md b/docs/tasks/ARCCloze.md new file mode 100644 index 00000000..351296cd --- /dev/null +++ b/docs/tasks/ARCCloze.md @@ -0,0 +1,20 @@ +# ARCCloze + +```` +NAME = ARCCloze +DATASET_PATH = allenai/ai2_arc +SAMPLE_SPLIT = test +FEWSHOT_SPLIT = train +RESPONSE_TYPE = LOGLIKELIHOODS +METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] +SUBJECTS = ['ARC-Easy', 'ARC-Challenge'] +LANGUAGE = +```` + +- Module: `eval_framework.tasks.benchmarks.arc` + +- File: [src/eval_framework/tasks/benchmarks/arc.py](../../src/eval_framework/tasks/benchmarks/arc.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/arc.py) + +- Link to dataset: [https://huggingface.co/datasets/allenai/ai2_arc](https://huggingface.co/datasets/allenai/ai2_arc) + +More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "ARCCloze"`. diff --git a/docs/tasks/ARCMC.md b/docs/tasks/ARCMC.md new file mode 100644 index 00000000..7e93372b --- /dev/null +++ b/docs/tasks/ARCMC.md @@ -0,0 +1,20 @@ +# ARCMC + +```` +NAME = ARCMC +DATASET_PATH = allenai/ai2_arc +SAMPLE_SPLIT = test +FEWSHOT_SPLIT = train +RESPONSE_TYPE = LOGLIKELIHOODS +METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] +SUBJECTS = ['ARC-Easy', 'ARC-Challenge'] +LANGUAGE = +```` + +- Module: `eval_framework.tasks.benchmarks.arc` + +- File: [src/eval_framework/tasks/benchmarks/arc.py](../../src/eval_framework/tasks/benchmarks/arc.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/arc.py) + +- Link to dataset: [https://huggingface.co/datasets/allenai/ai2_arc](https://huggingface.co/datasets/allenai/ai2_arc) + +More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "ARCMC"`. 
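For reviewers unfamiliar with the source data: all three ARC variants above read the same `allenai/ai2_arc` items. A minimal, framework-independent sketch of what those items look like (assumes only the `datasets` library; field names per the dataset card):

```python
# Sketch: inspect the raw ai2_arc items that ARCCloze / ARCMC / ARCBPB consume.
from datasets import load_dataset

ds = load_dataset("allenai/ai2_arc", "ARC-Challenge", split="test")
item = ds[0]
print(item["question"])         # question text used by _get_raw_question
print(item["choices"]["text"])  # candidate answer strings used by _get_choices
print(item["answerKey"])        # letter/digit key mapped to an index by answer_key_to_index
```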
diff --git a/docs/tasks/CodexHumanEval_BPB.md b/docs/tasks/CodexHumanEval_BPB.md new file mode 100644 index 00000000..8275d619 --- /dev/null +++ b/docs/tasks/CodexHumanEval_BPB.md @@ -0,0 +1,20 @@ +# CodexHumanEval_BPB + +```` +NAME = CodexHumanEval_BPB +DATASET_PATH = openai/openai_humaneval +SAMPLE_SPLIT = test +FEWSHOT_SPLIT = test +RESPONSE_TYPE = LOGLIKELIHOODS +METRICS = [BitsPerByteLoglikelihood] +SUBJECTS = ['no_subject'] +LANGUAGE = +```` + +- Module: `eval_framework.tasks.benchmarks.humaneval` + +- File: [src/eval_framework/tasks/benchmarks/humaneval.py](../../src/eval_framework/tasks/benchmarks/humaneval.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/humaneval.py) + +- Link to dataset: [https://huggingface.co/datasets/openai/openai_humaneval](https://huggingface.co/datasets/openai/openai_humaneval) + +More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "CodexHumanEval_BPB"`. diff --git a/docs/tasks/CodexMBPP_BPB.md b/docs/tasks/CodexMBPP_BPB.md new file mode 100644 index 00000000..74e22efa --- /dev/null +++ b/docs/tasks/CodexMBPP_BPB.md @@ -0,0 +1,20 @@ +# CodexMBPP_BPB + +```` +NAME = CodexMBPP_BPB +DATASET_PATH = google-research-datasets/mbpp +SAMPLE_SPLIT = test +FEWSHOT_SPLIT = test +RESPONSE_TYPE = LOGLIKELIHOODS +METRICS = [BitsPerByteLoglikelihood] +SUBJECTS = ['full'] +LANGUAGE = +```` + +- Module: `eval_framework.tasks.benchmarks.mbpp` + +- File: [src/eval_framework/tasks/benchmarks/mbpp.py](../../src/eval_framework/tasks/benchmarks/mbpp.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/mbpp.py) + +- Link to dataset: [https://huggingface.co/datasets/google-research-datasets/mbpp](https://huggingface.co/datasets/google-research-datasets/mbpp) + +More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "CodexMBPP_BPB"`. diff --git a/docs/tasks/HELLASWAGBPB.md b/docs/tasks/HELLASWAGBPB.md new file mode 100644 index 00000000..9538d33a --- /dev/null +++ b/docs/tasks/HELLASWAGBPB.md @@ -0,0 +1,20 @@ +# HELLASWAGBPB + +```` +NAME = HELLASWAGBPB +DATASET_PATH = Rowan/hellaswag +SAMPLE_SPLIT = validation +FEWSHOT_SPLIT = train +RESPONSE_TYPE = LOGLIKELIHOODS +METRICS = [BitsPerByteLoglikelihood] +SUBJECTS = ['no_subject'] +LANGUAGE = +```` + +- Module: `eval_framework.tasks.benchmarks.hellaswag` + +- File: [src/eval_framework/tasks/benchmarks/hellaswag.py](../../src/eval_framework/tasks/benchmarks/hellaswag.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/hellaswag.py) + +- Link to dataset: [https://huggingface.co/datasets/Rowan/hellaswag](https://huggingface.co/datasets/Rowan/hellaswag) + +More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "HELLASWAGBPB"`. 
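The `*_BPB` tasks above report only `BitsPerByteLoglikelihood`. As a reference point, here is a sketch of the conventional bits-per-byte conversion from a summed log-likelihood; this is the standard definition and may differ in detail from the framework's metric implementation:

```python
# Sketch of the conventional bits-per-byte computation (assumed definition,
# not the framework's BitsPerByteLoglikelihood code).
import math

def bits_per_byte(total_logprob_nats: float, completion: str) -> float:
    """Convert a summed natural-log likelihood of the gold completion into bits per UTF-8 byte."""
    num_bytes = len(completion.encode("utf-8"))
    return -total_logprob_nats / (num_bytes * math.log(2))

# Example: a 40-byte gold completion scored at -55.2 nats is roughly 1.99 bits/byte.
print(bits_per_byte(-55.2, "x" * 40))
```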
diff --git a/docs/tasks/HELLASWAGCloze.md b/docs/tasks/HELLASWAGCloze.md new file mode 100644 index 00000000..ea81b51c --- /dev/null +++ b/docs/tasks/HELLASWAGCloze.md @@ -0,0 +1,20 @@ +# HELLASWAGCloze + +```` +NAME = HELLASWAGCloze +DATASET_PATH = Rowan/hellaswag +SAMPLE_SPLIT = validation +FEWSHOT_SPLIT = train +RESPONSE_TYPE = LOGLIKELIHOODS +METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] +SUBJECTS = ['no_subject'] +LANGUAGE = +```` + +- Module: `eval_framework.tasks.benchmarks.hellaswag` + +- File: [src/eval_framework/tasks/benchmarks/hellaswag.py](../../src/eval_framework/tasks/benchmarks/hellaswag.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/hellaswag.py) + +- Link to dataset: [https://huggingface.co/datasets/Rowan/hellaswag](https://huggingface.co/datasets/Rowan/hellaswag) + +More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "HELLASWAGCloze"`. diff --git a/docs/tasks/HELLASWAGMC.md b/docs/tasks/HELLASWAGMC.md new file mode 100644 index 00000000..7a37ba99 --- /dev/null +++ b/docs/tasks/HELLASWAGMC.md @@ -0,0 +1,20 @@ +# HELLASWAGMC + +```` +NAME = HELLASWAGMC +DATASET_PATH = Rowan/hellaswag +SAMPLE_SPLIT = validation +FEWSHOT_SPLIT = train +RESPONSE_TYPE = LOGLIKELIHOODS +METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] +SUBJECTS = ['no_subject'] +LANGUAGE = +```` + +- Module: `eval_framework.tasks.benchmarks.hellaswag` + +- File: [src/eval_framework/tasks/benchmarks/hellaswag.py](../../src/eval_framework/tasks/benchmarks/hellaswag.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/hellaswag.py) + +- Link to dataset: [https://huggingface.co/datasets/Rowan/hellaswag](https://huggingface.co/datasets/Rowan/hellaswag) + +More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "HELLASWAGMC"`. diff --git a/docs/tasks/MATH500Minerva_BPB.md b/docs/tasks/MATH500Minerva_BPB.md new file mode 100644 index 00000000..8980f810 --- /dev/null +++ b/docs/tasks/MATH500Minerva_BPB.md @@ -0,0 +1,20 @@ +# MATH500Minerva_BPB + +```` +NAME = MATH500Minerva_BPB +DATASET_PATH = HuggingFaceH4/MATH-500 +SAMPLE_SPLIT = test +FEWSHOT_SPLIT = test +RESPONSE_TYPE = LOGLIKELIHOODS +METRICS = [BitsPerByteLoglikelihood] +SUBJECTS = ['no_subject'] +LANGUAGE = +```` + +- Module: `eval_framework.tasks.benchmarks.math_reasoning` + +- File: [src/eval_framework/tasks/benchmarks/math_reasoning.py](../../src/eval_framework/tasks/benchmarks/math_reasoning.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/math_reasoning.py) + +- Link to dataset: [https://huggingface.co/datasets/HuggingFaceH4/MATH-500](https://huggingface.co/datasets/HuggingFaceH4/MATH-500) + +More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "MATH500Minerva_BPB"`. 
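MATH500Minerva_BPB has no answer choices; it scores the gold solution text directly. A small sketch of the underlying MATH-500 items (assumes only the `datasets` library; field names per the dataset card):

```python
# Sketch: peek at the HuggingFaceH4/MATH-500 items scored by MATH500Minerva_BPB.
from datasets import load_dataset

ds = load_dataset("HuggingFaceH4/MATH-500", split="test")
item = ds[0]
print(item["problem"])   # wrapped as "Problem:\n{problem}\n\nSolution:" by the task
print(item["solution"])  # full worked solution; the loglikelihood/BPB target
print(item["answer"])    # final answer only (not the BPB target here)
```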
diff --git a/docs/tasks/MMLUBPB.md b/docs/tasks/MMLUBPB.md new file mode 100644 index 00000000..ed99b33f --- /dev/null +++ b/docs/tasks/MMLUBPB.md @@ -0,0 +1,20 @@ +# MMLUBPB + +```` +NAME = MMLUBPB +DATASET_PATH = cais/mmlu +SAMPLE_SPLIT = test +FEWSHOT_SPLIT = dev +RESPONSE_TYPE = LOGLIKELIHOODS +METRICS = [BitsPerByteLoglikelihood] +SUBJECTS = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions'] +LANGUAGE = +```` + +- Module: `eval_framework.tasks.benchmarks.mmlu` + +- File: [src/eval_framework/tasks/benchmarks/mmlu.py](../../src/eval_framework/tasks/benchmarks/mmlu.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/mmlu.py) + +- Link to dataset: [https://huggingface.co/datasets/cais/mmlu](https://huggingface.co/datasets/cais/mmlu) + +More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "MMLUBPB"`. 
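Each entry in `SUBJECTS` corresponds to one `cais/mmlu` config. A framework-independent sketch of the per-subject items the MMLU variants build prompts from (assumes only the `datasets` library):

```python
# Sketch: the per-subject cais/mmlu items behind MMLUBPB / MMLUCloze / MMLUMC.
from datasets import load_dataset

ds = load_dataset("cais/mmlu", "abstract_algebra", split="test")
item = ds[0]
print(item["question"])  # question text
print(item["choices"])   # four answer options
print(item["answer"])    # 0-based index of the correct option
```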
diff --git a/docs/tasks/MMLUCloze.md b/docs/tasks/MMLUCloze.md new file mode 100644 index 00000000..fd7944bc --- /dev/null +++ b/docs/tasks/MMLUCloze.md @@ -0,0 +1,20 @@ +# MMLUCloze + +```` +NAME = MMLUCloze +DATASET_PATH = cais/mmlu +SAMPLE_SPLIT = test +FEWSHOT_SPLIT = dev +RESPONSE_TYPE = LOGLIKELIHOODS +METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] +SUBJECTS = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions'] +LANGUAGE = +```` + +- Module: `eval_framework.tasks.benchmarks.mmlu` + +- File: [src/eval_framework/tasks/benchmarks/mmlu.py](../../src/eval_framework/tasks/benchmarks/mmlu.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/mmlu.py) + +- Link to dataset: [https://huggingface.co/datasets/cais/mmlu](https://huggingface.co/datasets/cais/mmlu) + +More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "MMLUCloze"`. diff --git a/docs/tasks/MMLUHumanitiesBPB.md b/docs/tasks/MMLUHumanitiesBPB.md new file mode 100644 index 00000000..3d0132eb --- /dev/null +++ b/docs/tasks/MMLUHumanitiesBPB.md @@ -0,0 +1,20 @@ +# MMLUHumanitiesBPB + +```` +NAME = MMLUHumanitiesBPB +DATASET_PATH = cais/mmlu +SAMPLE_SPLIT = test +FEWSHOT_SPLIT = dev +RESPONSE_TYPE = LOGLIKELIHOODS +METRICS = [BitsPerByteLoglikelihood] +SUBJECTS = ['formal_logic', 'high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'international_law', 'jurisprudence', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'prehistory', 'professional_law', 'world_religions'] +LANGUAGE = +```` + +- Module: `eval_framework.tasks.benchmarks.mmlu` + +- File: [src/eval_framework/tasks/benchmarks/mmlu.py](../../src/eval_framework/tasks/benchmarks/mmlu.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/mmlu.py) + +- Link to dataset: [https://huggingface.co/datasets/cais/mmlu](https://huggingface.co/datasets/cais/mmlu) + +More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "MMLUHumanitiesBPB"`. 
diff --git a/docs/tasks/MMLUMC.md b/docs/tasks/MMLUMC.md new file mode 100644 index 00000000..b007edd4 --- /dev/null +++ b/docs/tasks/MMLUMC.md @@ -0,0 +1,20 @@ +# MMLUMC + +```` +NAME = MMLUMC +DATASET_PATH = cais/mmlu +SAMPLE_SPLIT = test +FEWSHOT_SPLIT = dev +RESPONSE_TYPE = LOGLIKELIHOODS +METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] +SUBJECTS = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions'] +LANGUAGE = +```` + +- Module: `eval_framework.tasks.benchmarks.mmlu` + +- File: [src/eval_framework/tasks/benchmarks/mmlu.py](../../src/eval_framework/tasks/benchmarks/mmlu.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/mmlu.py) + +- Link to dataset: [https://huggingface.co/datasets/cais/mmlu](https://huggingface.co/datasets/cais/mmlu) + +More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "MMLUMC"`. diff --git a/docs/tasks/MMLUOtherBPB.md b/docs/tasks/MMLUOtherBPB.md new file mode 100644 index 00000000..a9c80a0a --- /dev/null +++ b/docs/tasks/MMLUOtherBPB.md @@ -0,0 +1,20 @@ +# MMLUOtherBPB + +```` +NAME = MMLUOtherBPB +DATASET_PATH = cais/mmlu +SAMPLE_SPLIT = test +FEWSHOT_SPLIT = dev +RESPONSE_TYPE = LOGLIKELIHOODS +METRICS = [BitsPerByteLoglikelihood] +SUBJECTS = ['anatomy', 'business_ethics', 'clinical_knowledge', 'college_medicine', 'global_facts', 'human_aging', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting', 'professional_medicine', 'virology'] +LANGUAGE = +```` + +- Module: `eval_framework.tasks.benchmarks.mmlu` + +- File: [src/eval_framework/tasks/benchmarks/mmlu.py](../../src/eval_framework/tasks/benchmarks/mmlu.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/mmlu.py) + +- Link to dataset: [https://huggingface.co/datasets/cais/mmlu](https://huggingface.co/datasets/cais/mmlu) + +More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "MMLUOtherBPB"`. 
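The four subject-subset tasks (`MMLUStemBPB`, `MMLUHumanitiesBPB`, `MMLUSocialSciencesBPB`, `MMLUOtherBPB`) are intended to partition the full 57-subject list. A quick sanity-check sketch against the lists this PR defines in `mmlu.py` (see the code changes further down):

```python
# Sanity check: the four MMLU subject groups should partition MMLU_SUBJECTS exactly.
from eval_framework.tasks.benchmarks.mmlu import (
    MMLU_HUMANITIES,
    MMLU_OTHER,
    MMLU_SOCIAL_SCIENCES,
    MMLU_STEM,
    MMLU_SUBJECTS,
)

groups = [MMLU_STEM, MMLU_HUMANITIES, MMLU_SOCIAL_SCIENCES, MMLU_OTHER]
assert sum(len(g) for g in groups) == len(MMLU_SUBJECTS) == 57
assert sorted(s for g in groups for s in g) == MMLU_SUBJECTS
```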
diff --git a/docs/tasks/MMLUSocialSciencesBPB.md b/docs/tasks/MMLUSocialSciencesBPB.md new file mode 100644 index 00000000..8bc2a4c4 --- /dev/null +++ b/docs/tasks/MMLUSocialSciencesBPB.md @@ -0,0 +1,20 @@ +# MMLUSocialSciencesBPB + +```` +NAME = MMLUSocialSciencesBPB +DATASET_PATH = cais/mmlu +SAMPLE_SPLIT = test +FEWSHOT_SPLIT = dev +RESPONSE_TYPE = LOGLIKELIHOODS +METRICS = [BitsPerByteLoglikelihood] +SUBJECTS = ['econometrics', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_microeconomics', 'high_school_psychology', 'human_sexuality', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy'] +LANGUAGE = +```` + +- Module: `eval_framework.tasks.benchmarks.mmlu` + +- File: [src/eval_framework/tasks/benchmarks/mmlu.py](../../src/eval_framework/tasks/benchmarks/mmlu.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/mmlu.py) + +- Link to dataset: [https://huggingface.co/datasets/cais/mmlu](https://huggingface.co/datasets/cais/mmlu) + +More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "MMLUSocialSciencesBPB"`. diff --git a/docs/tasks/MMLUStemBPB.md b/docs/tasks/MMLUStemBPB.md new file mode 100644 index 00000000..812367a6 --- /dev/null +++ b/docs/tasks/MMLUStemBPB.md @@ -0,0 +1,20 @@ +# MMLUStemBPB + +```` +NAME = MMLUStemBPB +DATASET_PATH = cais/mmlu +SAMPLE_SPLIT = test +FEWSHOT_SPLIT = dev +RESPONSE_TYPE = LOGLIKELIHOODS +METRICS = [BitsPerByteLoglikelihood] +SUBJECTS = ['abstract_algebra', 'astronomy', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_physics', 'computer_security', 'conceptual_physics', 'electrical_engineering', 'elementary_mathematics', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_mathematics', 'high_school_physics', 'high_school_statistics', 'machine_learning'] +LANGUAGE = +```` + +- Module: `eval_framework.tasks.benchmarks.mmlu` + +- File: [src/eval_framework/tasks/benchmarks/mmlu.py](../../src/eval_framework/tasks/benchmarks/mmlu.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/mmlu.py) + +- Link to dataset: [https://huggingface.co/datasets/cais/mmlu](https://huggingface.co/datasets/cais/mmlu) + +More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "MMLUStemBPB"`. diff --git a/docs/tasks/README.md b/docs/tasks/README.md index d94ecd25..ec36b9ff 100644 --- a/docs/tasks/README.md +++ b/docs/tasks/README.md @@ -2,7 +2,7 @@ This directory contains the generated documentation for all benchmark tasks available in the package. -**Total number of tasks: 182** +**Total number of tasks: 198** The documentation can be generated or updated with `uv run -m eval_framework.utils.generate_task_docs`. @@ -14,6 +14,9 @@ NOTE: This is an automatically generated file. 
Any manual modifications will not - [AIME2025](AIME2025.md) - [AIME2026](AIME2026.md) - [ARC](ARC.md) +- [ARCBPB](ARCBPB.md) +- [ARCCloze](ARCCloze.md) +- [ARCMC](ARCMC.md) - [ARC_DE](ARC_DE.md) - [ARC_EU20_DE](ARC_EU20_DE.md) - [ARC_EU20_FR](ARC_EU20_FR.md) @@ -36,6 +39,8 @@ NOTE: This is an automatically generated file. Any manual modifications will not - [COPA_IDKEvalHarness](COPA_IDKEvalHarness.md) - [COPA_OLMES](COPA_OLMES.md) - [ChemBench](ChemBench.md) +- [CodexHumanEval_BPB](CodexHumanEval_BPB.md) +- [CodexMBPP_BPB](CodexMBPP_BPB.md) - [CommonsenseQACloze](CommonsenseQACloze.md) - [CommonsenseQAFullTextCloze](CommonsenseQAFullTextCloze.md) - [CommonsenseQAMC](CommonsenseQAMC.md) @@ -65,6 +70,9 @@ NOTE: This is an automatically generated file. Any manual modifications will not - [GlobalMMLU](GlobalMMLU.md) - [GlobalMMLU_German](GlobalMMLU_German.md) - [HELLASWAG](HELLASWAG.md) +- [HELLASWAGBPB](HELLASWAGBPB.md) +- [HELLASWAGCloze](HELLASWAGCloze.md) +- [HELLASWAGMC](HELLASWAGMC.md) - [HELLASWAG_DE](HELLASWAG_DE.md) - [HELLASWAG_EU20_DE](HELLASWAG_EU20_DE.md) - [HELLASWAG_EU20_FR](HELLASWAG_EU20_FR.md) @@ -93,6 +101,7 @@ NOTE: This is an automatically generated file. Any manual modifications will not - [MATH](MATH.md) - [MATH500](MATH500.md) - [MATH500Minerva](MATH500Minerva.md) +- [MATH500Minerva_BPB](MATH500Minerva_BPB.md) - [MATHLvl5](MATHLvl5.md) - [MATHMinerva](MATHMinerva.md) - [MATHMinervaBPB](MATHMinervaBPB.md) @@ -105,6 +114,13 @@ NOTE: This is an automatically generated file. Any manual modifications will not - [MBPP_PROMPT_WITHOUT_TESTS_SANITIZED](MBPP_PROMPT_WITHOUT_TESTS_SANITIZED.md) - [MBPP_SANITIZED](MBPP_SANITIZED.md) - [MMLU](MMLU.md) +- [MMLUBPB](MMLUBPB.md) +- [MMLUCloze](MMLUCloze.md) +- [MMLUHumanitiesBPB](MMLUHumanitiesBPB.md) +- [MMLUMC](MMLUMC.md) +- [MMLUOtherBPB](MMLUOtherBPB.md) +- [MMLUSocialSciencesBPB](MMLUSocialSciencesBPB.md) +- [MMLUStemBPB](MMLUStemBPB.md) - [MMLU_COT](MMLU_COT.md) - [MMLU_DE](MMLU_DE.md) - [MMLU_EU20_DE](MMLU_EU20_DE.md) diff --git a/src/eval_framework/tasks/benchmarks/arc.py b/src/eval_framework/tasks/benchmarks/arc.py index c8db9060..b047c7c7 100644 --- a/src/eval_framework/tasks/benchmarks/arc.py +++ b/src/eval_framework/tasks/benchmarks/arc.py @@ -9,6 +9,7 @@ from eval_framework.metrics.loglikelihood.dcs import DistributionalCorrectnessScore from eval_framework.metrics.loglikelihood.ternary import TernaryScore from eval_framework.tasks.base import BaseTask, Language, ResponseType +from eval_framework.tasks.task_style import BPBStyle, ClozeStyle, MCStyle, answer_key_to_index from eval_framework.tasks.utils import get_n_letters @@ -94,3 +95,45 @@ def _get_initial_prompt_text(self, item: dict[str, Any]) -> str: def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None: completions = super()._get_possible_completions(item) return (completions or []) + [" I do not know."] + + +class _ARCChoice_Base(BaseTask[str]): + """Shared base for choice-based ARC variants (Cloze, MC, BPB). + + Subclasses set ``NAME`` and ``TASK_STYLER``; everything else is inherited. 
+ """ + + DATASET_PATH = "allenai/ai2_arc" + SAMPLE_SPLIT = "test" + FEWSHOT_SPLIT = "train" + SUBJECTS = ["ARC-Easy", "ARC-Challenge"] + PERTURBATION_UNMODIFIABLE_WORDS = ["Question"] + get_n_letters(5) + LANGUAGE = Language.ENG + + def _get_raw_question(self, item: dict[str, Any]) -> str: + return item["question"] + + def _get_choices(self, item: dict[str, Any]) -> list[str]: + return item["choices"]["text"] + + def _get_correct_index(self, item: dict[str, Any]) -> int: + return answer_key_to_index(item["answerKey"]) + + +class ARCCloze(_ARCChoice_Base): + NAME = "ARCCloze" + TASK_STYLER = ClozeStyle() + + +class ARCMC(_ARCChoice_Base): + """ARC with OLMES-style MC prompt: options listed as ' A. ...', scored over ' A'/' B'/....""" + + NAME = "ARCMC" + TASK_STYLER = MCStyle(space_prefixed_labels=True) + + +class ARCBPB(_ARCChoice_Base): + """BPB-only variant: scores loglikelihood over the ground-truth answer text only.""" + + NAME = "ARCBPB" + TASK_STYLER = BPBStyle() diff --git a/src/eval_framework/tasks/benchmarks/hellaswag.py b/src/eval_framework/tasks/benchmarks/hellaswag.py index 690c6a1b..3d69945c 100644 --- a/src/eval_framework/tasks/benchmarks/hellaswag.py +++ b/src/eval_framework/tasks/benchmarks/hellaswag.py @@ -10,6 +10,7 @@ from eval_framework.metrics.loglikelihood.dcs import DistributionalCorrectnessScore from eval_framework.metrics.loglikelihood.ternary import TernaryScore from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType +from eval_framework.tasks.task_style import BPBStyle, ClozeStyle, MCStyle class HELLASWAG(BaseTask[str]): @@ -73,3 +74,54 @@ def _get_initial_prompt_text(self, item: dict[str, Any]) -> str: def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None: completions = super()._get_possible_completions(item) return (completions or []) + [" I do not know."] + + +class _HELLASWAG_Base(BaseTask[str]): + """Shared base for HELLASWAG variants (Cloze, MC, BPB). + + Subclasses set ``NAME`` and ``TASK_STYLER``; everything else is inherited. + """ + + DATASET_PATH = "Rowan/hellaswag" + SAMPLE_SPLIT = "validation" + FEWSHOT_SPLIT = "train" + SUBJECTS = [NO_SUBJECT] + LANGUAGE = Language.ENG + + @staticmethod + def _preprocess(prompt: str) -> str: + # remove bracketed text + prompt = prompt.strip() + prompt = prompt.replace(" [title]", ". ") + prompt = re.sub("\\[.*?\\]", "", prompt) + prompt = prompt.replace(" ", " ") + prompt = re.sub(r"\.\. ", ". 
", prompt) + return prompt + + def _get_choices(self, item: dict[str, Any]) -> list[str]: + return [self._preprocess(ending) for ending in item["endings"]] + + def _get_raw_question(self, item: dict[str, Any]) -> str: + # Include activity_label as prefix to match the OLMES prompt format: + # "ActivityLabel: preprocessed_context" + subject = self._preprocess(item["activity_label"]) + context = self._preprocess(item["ctx_a"] + " " + item["ctx_b"].capitalize()).strip() + return f"{subject}: {context}" + + def _get_correct_index(self, item: dict[str, Any]) -> int: + return int(item["label"] if item["label"] != "" else 0) + + +class HELLASWAGCloze(_HELLASWAG_Base): + NAME = "HELLASWAGCloze" + TASK_STYLER = ClozeStyle() + + +class HELLASWAGMC(_HELLASWAG_Base): + NAME = "HELLASWAGMC" + TASK_STYLER = MCStyle(space_prefixed_labels=True) + + +class HELLASWAGBPB(_HELLASWAG_Base): + NAME = "HellaSwagBPB" + TASK_STYLER = BPBStyle(question_prefix="", cue_text="", trailing_newline=False) diff --git a/src/eval_framework/tasks/benchmarks/humaneval.py b/src/eval_framework/tasks/benchmarks/humaneval.py index 163d6a8d..ba5ccc0c 100644 --- a/src/eval_framework/tasks/benchmarks/humaneval.py +++ b/src/eval_framework/tasks/benchmarks/humaneval.py @@ -4,6 +4,7 @@ from eval_framework.metrics.loglikelihood.bits_per_byte import BitsPerByteLoglikelihood from eval_framework.shared.types import BaseMetricContext from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType, Sample +from eval_framework.tasks.task_style import BPBStyle CODE_TO_EXECUTE = """ {start_of_code} @@ -139,3 +140,33 @@ def _get_instruction_text(self, item: dict[str, Any]) -> str: def _get_cue_text(self, item: dict[str, Any]) -> str: return self.CUE_PREFIX + item["prompt"].lstrip() + + +class _CodexHumanEval_Base(BaseTask[str]): + """Shared base for codex_humaneval_gold_bpb_3shot-compatible HumanEval variants.""" + + DATASET_PATH = "openai/openai_humaneval" + SAMPLE_SPLIT = "test" + FEWSHOT_SPLIT = "test" + SUBJECTS = [NO_SUBJECT] + LANGUAGE = Language.ENG + + def _get_raw_question(self, item: dict[str, Any]) -> str: + return item["prompt"] + + def _get_choices(self, item: dict[str, Any]) -> list[str]: + return [item["canonical_solution"]] + + def _get_correct_index(self, item: dict[str, Any]) -> int: + return 0 + + +class CodexHumanEval_BPB(_CodexHumanEval_Base): + """BPB-only HumanEval that matches codex_humaneval_gold_bpb_3shot. 
+ + Prompt: ``{prompt}`` (function signature + docstring, verbatim) + Scored completion: ``{canonical_solution}`` + """ + + NAME = "CodexHumanEval_BPB" + TASK_STYLER = BPBStyle(question_prefix="", cue_text="", trailing_newline=False, leading_space_continuations=False) diff --git a/src/eval_framework/tasks/benchmarks/math_reasoning.py b/src/eval_framework/tasks/benchmarks/math_reasoning.py index ff47cf53..ef1958f2 100644 --- a/src/eval_framework/tasks/benchmarks/math_reasoning.py +++ b/src/eval_framework/tasks/benchmarks/math_reasoning.py @@ -16,6 +16,7 @@ ) from eval_framework.metrics.loglikelihood.bits_per_byte import BitsPerByteLoglikelihood from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType, Sample, SubjectType +from eval_framework.tasks.task_style import BPBStyle # Hendrycks MATH subject splits (shared by MATH, MATHMinervaEvalHarness, MATHMinervaBPB) MATH_SUBJECTS = [ @@ -790,3 +791,57 @@ def __init__(self, num_fewshot: int = 4) -> None: def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]: return _OLMES_FEWSHOTS[: self.num_fewshot] + + +class _MATH500Minerva_Base(BaseTask[str]): + """Shared base for TASK_STYLER-based MATH500Minerva variants. + + MATH-500 has no discrete answer choices, so MCStyle and ClozeStyle do not + apply. Only BPBStyle (bits-per-byte of the normalized gold answer) is + supported. Uses the MATH-500 dataset with the 4 hardcoded OLMES fewshot + examples from ``_OLMES_FEWSHOTS`` (same as MATHMinerva_OLMES). + + Subclasses set ``NAME`` and ``TASK_STYLER``; everything else is inherited. + """ + + DATASET_PATH = "HuggingFaceH4/MATH-500" + SAMPLE_SPLIT = "test" + FEWSHOT_SPLIT = "test" + SUBJECTS = [NO_SUBJECT] + LANGUAGE = Language.ENG + + def __init__(self, num_fewshot: int = 4) -> None: + if num_fewshot != 4: + logger.warning("MATH500Minerva TASK_STYLER variants support a fixed num_fewshot of 4.") + super().__init__(num_fewshot=4) + + def _get_raw_question(self, item: dict[str, Any]) -> str: + # Embed "Solution:" so BPBStyle's empty cue produces the same prompt as + # MATHMinervaEvalHarness: "Problem:\n{problem}\n\nSolution:". + return "Problem:\n" + item["problem"] + "\n\nSolution:" + + def _get_choices(self, item: dict[str, Any]) -> list[str]: + # BPB is scored over the full gold solution (matching minerva_math_500_gold_bpb_0shot). + return [item["solution"]] + + def _get_correct_index(self, item: dict[str, Any]) -> int: + return 0 + + def _get_fewshot_target_text(self, item: dict[str, Any]) -> str: + return " " + item["solution"] + + def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]: + return _OLMES_FEWSHOTS[: self.num_fewshot] + + +class MATH500Minerva_BPB(_MATH500Minerva_Base): + """BPB-only variant of MATH500Minerva with OLMES 4-shot prompt. + + Scores bits-per-byte of the normalized gold answer conditioned on the + Minerva-style prompt with 4 hardcoded OLMES fewshot examples. + """ + + NAME = "MATH500Minerva_BPB" + # trailing_newline=False keeps the prompt as "Problem:\n...\n\nSolution:" + # without an extra newline; question_prefix="" suppresses "Question: ". 
+ TASK_STYLER = BPBStyle(question_prefix="", cue_text="", trailing_newline=False) diff --git a/src/eval_framework/tasks/benchmarks/mbpp.py b/src/eval_framework/tasks/benchmarks/mbpp.py index 4f0205de..3e3666e4 100644 --- a/src/eval_framework/tasks/benchmarks/mbpp.py +++ b/src/eval_framework/tasks/benchmarks/mbpp.py @@ -9,6 +9,7 @@ from eval_framework.metrics.loglikelihood.bits_per_byte import BitsPerByteLoglikelihood from eval_framework.shared.types import BaseMetricContext from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample +from eval_framework.tasks.task_style import BPBStyle logger = logging.getLogger(__name__) @@ -275,8 +276,9 @@ class MBPP_OLMES(MBPP): FEWSHOT_SPLIT = "test" def __init__(self, num_fewshot: int = 3) -> None: - super().__init__(num_fewshot) - assert num_fewshot == 3, "MBPP_OLMES requires exactly 3 fewshot examples" + if num_fewshot != 3: + logger.warning(f"MBPP_OLMES supports only 3-shot, got {num_fewshot}") + super().__init__(num_fewshot=3) self.stop_sequences = ["```", '\n"""', "\nassert", "\n#"] def _get_instruction_text(self, item: dict[str, Any]) -> str: @@ -307,3 +309,96 @@ def post_process_generated_completion(self, completion_text: str, sample: Sample mbpp_ground_truth = str(sample.ground_truth) code = self._code_expander(extracted_code, mbpp_ground_truth) return code + + +class _MBPP_Base(BaseTask[str]): + """Shared base for TASK_STYLER-based MBPP variants. + + MBPP has no discrete answer choices, so MCStyle and ClozeStyle do not apply. + Only BPBStyle (bits-per-byte of the reference solution) is supported. + + Subclasses set ``NAME`` and ``TASK_STYLER``; everything else is inherited. + """ + + DATASET_PATH = "google-research-datasets/mbpp" + SAMPLE_SPLIT = "test" + FEWSHOT_SPLIT = "train" + SUBJECTS = ["full"] + LANGUAGE = Language.ENG + + def _get_raw_question(self, item: dict[str, Any]) -> str: + tests = "\n".join(item["test_list"]) + text = item["text"] if "text" in item else item["prompt"] + return f"You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n{tests}" # noqa + + def _get_choices(self, item: dict[str, Any]) -> list[str]: + return [item["code"]] + + def _get_correct_index(self, item: dict[str, Any]) -> int: + return 0 + + def _get_fewshot_target_text(self, item: dict[str, Any]) -> str: + return f"{BEGIN}\n{item['code']}\n{END}" + + def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]: + return self.rnd.sample(self.dataset[self.FEWSHOT_SPLIT], self.num_fewshot) + + +class MBPP_BPB(_MBPP_Base): + """BPB-only variant: scores bits-per-byte of the reference code solution.""" + + NAME = "MBPP_BPB" + TASK_STYLER = BPBStyle(question_prefix="", cue_text=BEGIN) + + +class _CodexMBPP_Base(BaseTask[str]): + """Shared base for the codex_mbpp_gold_bpb_3shot-compatible MBPP variants. + + Prompt format (per item):: + + Write a python function to {description}. + ```python + {code} + ``` + + The task description is used verbatim as the question; no test-assertions + are included in the prompt. BPB is scored over the full reference code + (including the closing ``` fence). Line endings are normalised to LF and + trailing whitespace is stripped from the code string. 
+ """ + + DATASET_PATH = "google-research-datasets/mbpp" + SAMPLE_SPLIT = "test" + FEWSHOT_SPLIT = "test" + SUBJECTS = ["full"] + LANGUAGE = Language.ENG + + @staticmethod + def _normalize_code(code: str) -> str: + return code.replace("\r\n", "\n").replace("\r", "").rstrip() + + def _get_raw_question(self, item: dict[str, Any]) -> str: + return item["text"] + + def _get_choices(self, item: dict[str, Any]) -> list[str]: + return [self._normalize_code(item["code"])] + + def _get_correct_index(self, item: dict[str, Any]) -> int: + return 0 + + def _get_fewshot_target_text(self, item: dict[str, Any]) -> str: + code = self._normalize_code(item["code"]) + return f"```python\n{code}\n```" + + +class CodexMBPP_BPB(_CodexMBPP_Base): + """BPB-only MBPP variant that matches the codex_mbpp_gold_bpb_3shot reference. + + Prompt: ``"{description}\\n```python\\n"`` + Completion: ``"{code}\\n```"`` + """ + + NAME = "CodexMBPP_BPB" + TASK_STYLER = BPBStyle( + question_prefix="", cue_text="```python\n", trailing_newline=True, leading_space_continuations=False + ) diff --git a/src/eval_framework/tasks/benchmarks/mmlu.py b/src/eval_framework/tasks/benchmarks/mmlu.py index f1410dfc..8bca317b 100644 --- a/src/eval_framework/tasks/benchmarks/mmlu.py +++ b/src/eval_framework/tasks/benchmarks/mmlu.py @@ -11,68 +11,80 @@ from eval_framework.metrics.loglikelihood.dcs import DistributionalCorrectnessScore from eval_framework.metrics.loglikelihood.ternary import TernaryScore from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample +from eval_framework.tasks.task_style import BPBStyle, ClozeStyle, MCStyle from eval_framework.tasks.utils import get_n_letters -MMLU_SUBJECTS = [ +MMLU_STEM = [ "abstract_algebra", - "anatomy", "astronomy", - "business_ethics", - "clinical_knowledge", "college_biology", "college_chemistry", "college_computer_science", "college_mathematics", - "college_medicine", "college_physics", "computer_security", "conceptual_physics", - "econometrics", "electrical_engineering", "elementary_mathematics", - "formal_logic", - "global_facts", "high_school_biology", "high_school_chemistry", "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", "high_school_mathematics", - "high_school_microeconomics", "high_school_physics", - "high_school_psychology", "high_school_statistics", + "machine_learning", +] + +MMLU_HUMANITIES = [ + "formal_logic", + "high_school_european_history", "high_school_us_history", "high_school_world_history", - "human_aging", - "human_sexuality", "international_law", "jurisprudence", "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", "moral_disputes", "moral_scenarios", - "nutrition", "philosophy", "prehistory", - "professional_accounting", "professional_law", - "professional_medicine", + "world_religions", +] + +MMLU_SOCIAL_SCIENCES = [ + "econometrics", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_microeconomics", + "high_school_psychology", + "human_sexuality", "professional_psychology", "public_relations", "security_studies", "sociology", "us_foreign_policy", +] + +MMLU_OTHER = [ + "anatomy", + "business_ethics", + "clinical_knowledge", + "college_medicine", + "global_facts", + "human_aging", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "nutrition", + "professional_accounting", + 
"professional_medicine", "virology", - "world_religions", ] +MMLU_SUBJECTS = sorted(MMLU_STEM + MMLU_HUMANITIES + MMLU_SOCIAL_SCIENCES + MMLU_OTHER) + class MMLU(BaseTask[str]): """MMLU dataset: https://huggingface.co/datasets/cais/mmlu""" @@ -228,3 +240,64 @@ def _get_initial_prompt_text(self, item: dict[str, Any]) -> str: 'Summarize your reasoning concisely, then conclude with "Therefore, the answer is: X", where X is ' "one of A, B, C, or D." ) + + +class _MMLU_Base(BaseTask[str]): + """Shared base for TASK_STYLER-based MMLU variants (Cloze, MC, BPB).""" + + DATASET_PATH = "cais/mmlu" + SAMPLE_SPLIT = "test" + FEWSHOT_SPLIT = "dev" + SUBJECTS = MMLU_SUBJECTS + PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"] + get_n_letters(4) + LANGUAGE = Language.ENG + + def _get_subject_name(self, item: dict[str, Any]) -> str: + return " ".join(item["subject"].split("_")) + + def _get_initial_prompt_text(self, item: dict[str, Any]) -> str: + return f"The following are multiple choice questions (with answers) about {self._get_subject_name(item)}:" + + def _get_raw_question(self, item: dict[str, Any]) -> str: + return item["question"].strip() + + def _get_choices(self, item: dict[str, Any]) -> list[str]: + return item["choices"] + + def _get_correct_index(self, item: dict[str, Any]) -> int: + return item["answer"] + + +class MMLUCloze(_MMLU_Base): + NAME = "MMLUCloze" + TASK_STYLER = ClozeStyle() + + +class MMLUMC(_MMLU_Base): + NAME = "MMLUMC" + TASK_STYLER = MCStyle(space_prefixed_labels=True) + + +class MMLUBPB(_MMLU_Base): + NAME = "MMLUBPB" + TASK_STYLER = BPBStyle() + + +class MMLUOtherBPB(MMLUBPB): + NAME = "MMLUOtherBPB" + SUBJECTS = MMLU_OTHER + + +class MMLUStemBPB(MMLUBPB): + NAME = "MMLUStemBPB" + SUBJECTS = MMLU_STEM + + +class MMLUHumanitiesBPB(MMLUBPB): + NAME = "MMLUHumanitiesBPB" + SUBJECTS = MMLU_HUMANITIES + + +class MMLUSocialSciencesBPB(MMLUBPB): + NAME = "MMLUSocialSciencesBPB" + SUBJECTS = MMLU_SOCIAL_SCIENCES diff --git a/src/eval_framework/tasks/task_names.py b/src/eval_framework/tasks/task_names.py index 18161ff3..71e64132 100644 --- a/src/eval_framework/tasks/task_names.py +++ b/src/eval_framework/tasks/task_names.py @@ -21,6 +21,9 @@ def register_all_tasks() -> None: register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.AIME2025") register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.AIME2026") register_lazy_task("eval_framework.tasks.benchmarks.arc.ARC") + register_lazy_task("eval_framework.tasks.benchmarks.arc.ARCCloze") + register_lazy_task("eval_framework.tasks.benchmarks.arc.ARCMC") + register_lazy_task("eval_framework.tasks.benchmarks.arc.ARCBPB") register_lazy_task("eval_framework.tasks.benchmarks.arc.ARC_IDK") register_lazy_task("eval_framework.tasks.benchmarks.arc.ARC_OLMES") register_lazy_task("eval_framework.tasks.benchmarks.arc_de.ARC_DE") @@ -61,12 +64,16 @@ def register_all_tasks() -> None: register_lazy_task("eval_framework.tasks.benchmarks.hellaswag.HELLASWAG_OLMES") register_lazy_task("eval_framework.tasks.benchmarks.hellaswag.HELLASWAG_IDK") register_lazy_task("eval_framework.tasks.benchmarks.hellaswag_de.HELLASWAG_DE") + register_lazy_task("eval_framework.tasks.benchmarks.hellaswag.HELLASWAGCloze") + register_lazy_task("eval_framework.tasks.benchmarks.hellaswag.HELLASWAGMC") + register_lazy_task("eval_framework.tasks.benchmarks.hellaswag.HELLASWAGBPB") register_lazy_task("eval_framework.tasks.benchmarks.opengptx_eu20.HELLASWAG_EU20_DE") register_lazy_task("eval_framework.tasks.benchmarks.opengptx_eu20.HELLASWAG_EU20_FR") 
register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEval") register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEvalBPB") register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEval_OLMES") register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEvalInstruct") + register_lazy_task("eval_framework.tasks.benchmarks.humaneval.CodexHumanEval_BPB") register_lazy_task("eval_framework.tasks.benchmarks.ifeval.IFEval") register_lazy_task("eval_framework.tasks.benchmarks.ifeval.IFEvalDe") register_lazy_task("eval_framework.tasks.benchmarks.ifeval.IFEvalFiSv") @@ -88,6 +95,7 @@ def register_all_tasks() -> None: register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.MATHMinervaBPB") register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.MATHMinerva_OLMES") register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.MATH500Minerva") + register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.MATH500Minerva_BPB") register_lazy_task("eval_framework.tasks.benchmarks.multipl_e.MultiPLEHumanEvalCpp") register_lazy_task("eval_framework.tasks.benchmarks.multipl_e.MultiPLEHumanEvalJava") register_lazy_task("eval_framework.tasks.benchmarks.multipl_e.MultiPLEHumanEvalJs") @@ -106,10 +114,18 @@ def register_all_tasks() -> None: register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_PROMPT_WITHOUT_TESTS") register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_PROMPT_WITHOUT_TESTS_SANITIZED") register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_OLMES") + register_lazy_task("eval_framework.tasks.benchmarks.mbpp.CodexMBPP_BPB") register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU") register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU_IDK") register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU_OLMES") register_lazy_task("eval_framework.tasks.benchmarks.mmlu.FullTextMMLU") + register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLUCloze") + register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLUMC") + register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLUBPB") + register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLUOtherBPB") + register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLUStemBPB") + register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLUHumanitiesBPB") + register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLUSocialSciencesBPB") register_lazy_task("eval_framework.tasks.benchmarks.opengptx_eu20.MMLU_EU20_DE") register_lazy_task("eval_framework.tasks.benchmarks.opengptx_eu20.MMLU_EU20_FR") register_lazy_task("eval_framework.tasks.benchmarks.mmlu_de.MMLU_DE") diff --git a/tests/tests_eval_framework/tasks/task-prompts-hashes.json b/tests/tests_eval_framework/tasks/task-prompts-hashes.json index f823e21e..a56525ae 100644 --- a/tests/tests_eval_framework/tasks/task-prompts-hashes.json +++ b/tests/tests_eval_framework/tasks/task-prompts-hashes.json @@ -7,6 +7,12 @@ "AIME2026.Llama3Formatter": "3ff7afee4d41646990b5cc24272db494", "ARC.ConcatFormatter": "bd30651bf7141f65afcfa89cb449fe80", "ARC.Llama3Formatter": "0027b1a525b202c4bd3809d6be54f8fa", + "ARCBPB.ConcatFormatter": "1f52205996ca708a0a0a54309e2ccf44", + "ARCBPB.Llama3Formatter": "b22269e9e72763d8d5fa248577de4cab", + "ARCCloze.ConcatFormatter": "bd30651bf7141f65afcfa89cb449fe80", + "ARCCloze.Llama3Formatter": "0027b1a525b202c4bd3809d6be54f8fa", + "ARCMC.ConcatFormatter": "1ed0a2e8375df1d6856932346e8e40b2", + "ARCMC.Llama3Formatter": "0777ec98b33498d916ed22fbfe68ee94", 
"ARC_DE.ConcatFormatter": "bb58420112447c9e7f6b3b92cd3adc9f", "ARC_DE.Llama3Formatter": "71ed6989a95477588fbd80b6ae3bff68", "ARC_EU20_DE.ConcatFormatter": "8b931a0504d5e41c0ebdf2c799268e58", @@ -45,6 +51,10 @@ "COPA_OLMES.Llama3Formatter": "65456e820526b80949bec9dc00a8e33f", "ChemBench.ConcatFormatter": "4a5c849a20cba792c46ac4af8ed88e8d", "ChemBench.Llama3Formatter": "f36935a9d4c8900a5a74731b46e5b1b3", + "CodexHumanEval_BPB.ConcatFormatter": "6f3202cc06e81ed52eaf95c012642f3e", + "CodexHumanEval_BPB.Llama3Formatter": "134128d0706d25c4ecbc7e624754c843", + "CodexMBPP_BPB.ConcatFormatter": "ac5d042d10fc5486ceaae1e145bf20f0", + "CodexMBPP_BPB.Llama3Formatter": "bd2fc979902ee9c80d5d4cedac9aa5ce", "CommonsenseQACloze.ConcatFormatter": "c644c3c2a3395d83fca2edc3fc31844b", "CommonsenseQACloze.Llama3Formatter": "393a85d1b86304b22b57038f430302b0", "CommonsenseQAFullTextCloze.ConcatFormatter": "9f01052649ab337b9b7181c9302a4fc3", @@ -102,6 +112,12 @@ "GridDifference.EXTRACTED_GRID": "c3463fc837dbf31c6d20e4c6a135f14f", "HELLASWAG.ConcatFormatter": "c8f069fff818335c99bb92288a237d92", "HELLASWAG.Llama3Formatter": "75fc2b5e4e6161a1bb8b8050cbb716d2", + "HELLASWAGBPB.ConcatFormatter": "a6335b585e97fc2c60e35c9c01d09443", + "HELLASWAGBPB.Llama3Formatter": "c336c651ed15a445a910e1a5c3f1ca66", + "HELLASWAGCloze.ConcatFormatter": "c2cb6e4097b2a03a5c9397dbd3b82b0e", + "HELLASWAGCloze.Llama3Formatter": "b6b1b011514e12f4e52c43b18386ca1f", + "HELLASWAGMC.ConcatFormatter": "27935199d13dc7fa98ae3f5f79437685", + "HELLASWAGMC.Llama3Formatter": "24cb1d7d083f43df3098a7957ff353ec", "HELLASWAG_DE.ConcatFormatter": "9fb70ef960b7dc1401ef81b3991d6fea", "HELLASWAG_DE.Llama3Formatter": "69da70417ee40c99529b991eaa04a776", "HELLASWAG_EU20_DE.ConcatFormatter": "fcea590f6d03494e6da65bf3c274d0d0", @@ -158,6 +174,8 @@ "MATH500.Llama3Formatter": "c8624982f58f346c68622e2687a46965", "MATH500Minerva.ConcatFormatter": "4822e1d31c2a8b3b129d08c5974f3fe9", "MATH500Minerva.Llama3Formatter": "794e344523118fe325de091e455bec00", + "MATH500Minerva_BPB.ConcatFormatter": "a9bc27d9cfa622bb2d8ed28ab69c2819", + "MATH500Minerva_BPB.Llama3Formatter": "822d483aad1d2fe85cfb82627c2d375a", "MATHLvl5.ConcatFormatter": "82feee2e24f2f96f668d22c0d4554c4a", "MATHLvl5.Llama3Formatter": "6fdf0835ce969239a843c088c9104fe4", "MATHMinerva.ConcatFormatter": "817591afbe9426c45cddf82be7e11e07", @@ -182,6 +200,20 @@ "MBPP_SANITIZED.Llama3Formatter": "c3fa6d5b9126c9e320b95a0c504c2ef1", "MMLU.ConcatFormatter": "d8b543f6e31659e1e0bf9f90f51a3ce7", "MMLU.Llama3Formatter": "61546963de15da149c4a7ec0e321bc48", + "MMLUBPB.ConcatFormatter": "27a2b31b1250ca88f6242b30a1a3d26c", + "MMLUBPB.Llama3Formatter": "fbc252c7f431bbea1a0a968c5c2573ae", + "MMLUCloze.ConcatFormatter": "3f759a116bd57c5032c34bf150d5b81c", + "MMLUCloze.Llama3Formatter": "e583009d7b038f3a1de05f06d0b6c3ee", + "MMLUHumanitiesBPB.ConcatFormatter": "0f6a5e9955f1b8978ae818287ee5ddea", + "MMLUHumanitiesBPB.Llama3Formatter": "2864c96d60f466eb36ae0c8738e0df04", + "MMLUMC.ConcatFormatter": "faf81b862db8eca4f6cc98076fc8ac80", + "MMLUMC.Llama3Formatter": "b6e534972d97620e5a5d4c94bf217fc9", + "MMLUOtherBPB.ConcatFormatter": "9735fe03eb7c412a05138fc8ad3f62bd", + "MMLUOtherBPB.Llama3Formatter": "7a29e34f4576c958fe164bfdc9d2c871", + "MMLUSocialSciencesBPB.ConcatFormatter": "685731111e72dea393f6255734791fdd", + "MMLUSocialSciencesBPB.Llama3Formatter": "c2f374a9190aa2428ce9be8aac70d402", + "MMLUStemBPB.ConcatFormatter": "27a2b31b1250ca88f6242b30a1a3d26c", + "MMLUStemBPB.Llama3Formatter": "fbc252c7f431bbea1a0a968c5c2573ae", 
"MMLU_COT.ConcatFormatter": "158044a1336658a19faf45d116ea66e6", "MMLU_COT.Llama3Formatter": "a21859e78e8eae37a66a388708fe3a18", "MMLU_DE.ConcatFormatter": "b448e01092dd94cb83f788590b28b08b", diff --git a/tests/tests_eval_framework/tasks/test_mbpp_olmes.py b/tests/tests_eval_framework/tasks/test_mbpp_olmes.py index bb31a3eb..7e82eaa7 100644 --- a/tests/tests_eval_framework/tasks/test_mbpp_olmes.py +++ b/tests/tests_eval_framework/tasks/test_mbpp_olmes.py @@ -12,10 +12,6 @@ def task(self) -> MBPP_OLMES: with DatasetPatcher(MBPP_OLMES, num_fewshot=3, num_samples=10) as patched_task: return patched_task - def test_num_fewshot_must_be_3(self) -> None: - with pytest.raises(AssertionError, match="MBPP_OLMES requires exactly 3 fewshot examples"): - MBPP_OLMES(num_fewshot=1) - def test_stop_sequences(self) -> None: task = MBPP_OLMES(num_fewshot=3) assert task.stop_sequences == ["```", '\n"""', "\nassert", "\n#"]