From bbdebcbbd4bac3c4f508ea5a69368700190f2faa Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Wed, 1 Apr 2026 12:33:39 +0000 Subject: [PATCH 01/13] feat: first draft BPBStyle --- src/eval_framework/tasks/base.py | 3 +- src/eval_framework/tasks/task_style.py | 43 +++++++++++++++++++++++--- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/src/eval_framework/tasks/base.py b/src/eval_framework/tasks/base.py index 0e031762..8247107f 100644 --- a/src/eval_framework/tasks/base.py +++ b/src/eval_framework/tasks/base.py @@ -34,6 +34,7 @@ class ResponseType(Enum): class TaskStyle(Enum): MULTIPLE_CHOICE = "multiple_choice" CLOZE = "cloze" + BPB = "bpb" class Language(Enum): @@ -311,7 +312,7 @@ def _get_cue_text(self, item: dict[str, Any]) -> str: def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None: if hasattr(self, "TASK_STYLER"): - return self.TASK_STYLER.get_possible_completions(self._get_choices(item)) + return self.TASK_STYLER.get_possible_completions(self._get_choices(item), self._get_correct_index(item)) return None def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]: diff --git a/src/eval_framework/tasks/task_style.py b/src/eval_framework/tasks/task_style.py index a9ddf54b..4615cc8c 100644 --- a/src/eval_framework/tasks/task_style.py +++ b/src/eval_framework/tasks/task_style.py @@ -52,6 +52,10 @@ class ARC(_ARC_Base): class ARC_MC(_ARC_Base): NAME = "ARC_MC" TASK_STYLER = MCStyle(space_prefixed_labels=True) + + class ARC_BPB(_ARC_Base): + NAME = "ARC_BPB" + TASK_STYLER = BPBStyle() """ import hashlib @@ -111,8 +115,13 @@ def get_ground_truth(self, choices: list[str], correct_index: int) -> str: """Return the ground-truth string for scoring.""" @abstractmethod - def get_possible_completions(self, choices: list[str]) -> list[str]: - """Return the list of scored completion strings.""" + def get_possible_completions(self, choices: list[str], correct_index: int | None = None) -> list[str]: + """Return the list 
of scored completion strings. + + ``correct_index`` is only required by ``BPBStyle``, which scores solely the + ground-truth completion. ``MCStyle`` and ``ClozeStyle`` score all choices and + ignore it; callers may omit it when using those stylers. + """ @abstractmethod def get_cue_text(self) -> str: @@ -196,7 +205,7 @@ def get_ground_truth(self, choices: list[str], correct_index: int) -> str: labels = get_n_letters(len(choices)) return f" {labels[correct_index]}" - def get_possible_completions(self, choices: list[str]) -> list[str]: + def get_possible_completions(self, choices: list[str], correct_index: int | None = None) -> list[str]: return [f" {label}" for label in get_n_letters(len(choices))] @@ -256,10 +265,36 @@ def get_instruction_text(self, raw_question: str, choices: list[str]) -> str: def get_ground_truth(self, choices: list[str], correct_index: int) -> str: return f" {choices[correct_index]}" - def get_possible_completions(self, choices: list[str]) -> list[str]: + def get_possible_completions(self, choices: list[str], correct_index: int | None = None) -> list[str]: return [f" {c}" for c in choices] +class BPBStyle(ClozeStyle): + """BPB-only styler: prompt identical to ClozeStyle, but scores only the ground-truth completion. + + One LLM forward pass per sample instead of N (one per choice), making evaluation + significantly faster when accuracy metrics are not needed. + + Args: + question_prefix: Prepended to the raw question (default ``"Question: "``). + cue_text: Assistant cue after the prompt (default ``"Answer:"``). + trailing_newline: When ``True`` (default), the instruction ends with ``"\\n"``. 
+ + Assembled prompt example (3 choices):: + + "Question: What is the capital of France?\\n" + + Scored completions: [" Paris"] ← ground truth only, one forward pass + Ground truth: " Paris" + """ + + metrics: list[type["BaseMetric"]] = [BitsPerByteLoglikelihood] + task_style = TaskStyle.BPB + + def get_possible_completions(self, choices: list[str], correct_index: int | None = None) -> list[str]: + return [f" {choices[correct_index]}"] + + # --------------------------------------------------------------------------- # Helper functions # --------------------------------------------------------------------------- From f87d4dbc0f98aaff473bf786753ead560d6d9c68 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Wed, 1 Apr 2026 12:34:03 +0000 Subject: [PATCH 02/13] feat: add naturalqsopenbpb as an example --- src/eval_framework/tasks/benchmarks/naturalqs_open.py | 8 ++++++++ src/eval_framework/tasks/task_names.py | 1 + 2 files changed, 9 insertions(+) diff --git a/src/eval_framework/tasks/benchmarks/naturalqs_open.py b/src/eval_framework/tasks/benchmarks/naturalqs_open.py index 9bcdc707..df322f56 100644 --- a/src/eval_framework/tasks/benchmarks/naturalqs_open.py +++ b/src/eval_framework/tasks/benchmarks/naturalqs_open.py @@ -3,6 +3,7 @@ from eval_framework.metrics.completion.drop_completion import DropF1ExactMatch, DropMetricContext from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType from eval_framework.tasks.task_style import ( + BPBStyle, ClozeStyle, MCStyle, answer_key_to_index, @@ -93,3 +94,10 @@ class NaturalQsOpenMC_OLMES(_NaturalQsOpenChoice_Base): NAME = "NaturalQsOpenMC_OLMES" TASK_STYLER = MCStyle(space_prefixed_labels=True) + + +class NaturalQsOpenBPB(_NaturalQsOpenChoice_Base): + """BPB-only variant.""" + + NAME = "NaturalQsOpenBPB" + TASK_STYLER = BPBStyle() diff --git a/src/eval_framework/tasks/task_names.py b/src/eval_framework/tasks/task_names.py index d8558654..c5887191 100644 --- a/src/eval_framework/tasks/task_names.py 
+++ b/src/eval_framework/tasks/task_names.py @@ -190,6 +190,7 @@ def register_all_tasks() -> None: register_lazy_task("eval_framework.tasks.benchmarks.naturalqs_open.NaturalQsOpenCloze") register_lazy_task("eval_framework.tasks.benchmarks.naturalqs_open.NaturalQsOpenMC") register_lazy_task("eval_framework.tasks.benchmarks.naturalqs_open.NaturalQsOpenMC_OLMES") + register_lazy_task("eval_framework.tasks.benchmarks.naturalqs_open.NaturalQsOpenBPB") register_lazy_task("eval_framework.tasks.benchmarks.social_iqa.SocialIQACloze") register_lazy_task("eval_framework.tasks.benchmarks.social_iqa.SocialIQAMC_OLMES") register_lazy_task("eval_framework.tasks.benchmarks.social_iqa.SocialIQAMC") From 47b545073077c8ca159ecf926105ec5a7219a350 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Wed, 1 Apr 2026 12:34:20 +0000 Subject: [PATCH 03/13] test: add tests for bpbstyle --- .../tasks/test_task_style.py | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/tests/tests_eval_framework/tasks/test_task_style.py b/tests/tests_eval_framework/tasks/test_task_style.py index 05ff62cb..5951fbdb 100644 --- a/tests/tests_eval_framework/tasks/test_task_style.py +++ b/tests/tests_eval_framework/tasks/test_task_style.py @@ -8,6 +8,7 @@ from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType, TaskStyle from eval_framework.tasks.task_style import ( + BPBStyle, ClozeStyle, MCStyle, answer_key_to_index, @@ -408,3 +409,100 @@ def test_metadata_response_type_from_styler(self) -> None: task = _ConcreteMCTask() meta = task.get_metadata() assert meta["response_type"] == ResponseType.LOGLIKELIHOODS.value + + +# --------------------------------------------------------------------------- +# BPBStyle tests +# --------------------------------------------------------------------------- + + +class TestBPBStyle: + def setup_method(self) -> None: + self.styler = BPBStyle() + + def test_get_instruction_text(self) -> None: + """Prompt is identical to ClozeStyle — no 
choices shown.""" + text = self.styler.get_instruction_text(_TEST_QUESTION, _TEST_CHOICES) + assert text == "Question: Capital of France?\n" + assert "Berlin" not in text + + def test_get_ground_truth(self) -> None: + assert self.styler.get_ground_truth(_TEST_CHOICES, _TEST_CORRECT_INDEX) == " Paris" + + def test_get_possible_completions_returns_only_ground_truth(self) -> None: + completions = self.styler.get_possible_completions(_TEST_CHOICES, _TEST_CORRECT_INDEX) + assert completions == [" Paris"] + assert len(completions) == 1 + + def test_get_cue_text(self) -> None: + assert self.styler.get_cue_text() == "Answer:" + + def test_get_fewshot_target_text(self) -> None: + assert self.styler.get_fewshot_target_text(_TEST_CHOICES, _TEST_CORRECT_INDEX) == "Answer: Paris" + + def test_extra_metadata(self) -> None: + meta = self.styler.get_extra_metadata() + assert meta["task_style"] == TaskStyle.BPB.value + + def test_response_type(self) -> None: + assert self.styler.response_type == ResponseType.LOGLIKELIHOODS + + def test_metrics_bpb_only(self) -> None: + from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import AccuracyLoglikelihood + from eval_framework.metrics.loglikelihood.bits_per_byte import BitsPerByteLoglikelihood + + assert self.styler.metrics == [BitsPerByteLoglikelihood] + assert AccuracyLoglikelihood not in self.styler.metrics + + def test_for_language_german(self) -> None: + styler = BPBStyle.for_language(Language.DEU) + text = styler.get_instruction_text(_TEST_QUESTION, _TEST_CHOICES) + assert text.startswith("Frage: ") + assert styler.get_cue_text() == "Antwort:" + + +class _ConcreteBPBTask(BaseTask[str]): + """Minimal concrete task for testing BaseTask with BPBStyle.""" + + NAME = "TestBPBTask" + DATASET_PATH = "test/dataset" + SAMPLE_SPLIT = "test" + FEWSHOT_SPLIT = "train" + SUBJECTS = [NO_SUBJECT] + PERTURBATION_UNMODIFIABLE_WORDS = ["Question"] + LANGUAGE = Language.ENG + TASK_STYLER = BPBStyle() + + def _get_raw_question(self, item: 
dict) -> str: + return item["question"] + + def _get_choices(self, item: dict) -> list[str]: + return item["choices"] + + def _get_correct_index(self, item: dict) -> int: + return item["answer"] + + +class TestBaseTaskBPBStyle: + def setup_method(self) -> None: + self.task = _ConcreteBPBTask() + + def test_instruction_text(self) -> None: + text = self.task._get_instruction_text(_TEST_ITEM) + assert text == "Question: Capital of France?\n" + assert "Berlin" not in text + + def test_ground_truth(self) -> None: + assert self.task._get_ground_truth(_TEST_ITEM) == " Paris" + + def test_possible_completions_single_entry(self) -> None: + completions = self.task._get_possible_completions(_TEST_ITEM) + assert completions == [" Paris"] + + def test_metadata_task_style(self) -> None: + meta = self.task.get_metadata() + assert meta["task_style"] == TaskStyle.BPB.value + + def test_metadata_metrics_bpb_only(self) -> None: + meta = self.task.get_metadata() + assert meta["metrics"] == ["BitsPerByte"] From 875b78e1149e763acbea2d2e34ebbe68d6a890d5 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Wed, 1 Apr 2026 12:46:06 +0000 Subject: [PATCH 04/13] test: add hash for added variant --- tests/tests_eval_framework/tasks/task-prompts-hashes.json | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/tests_eval_framework/tasks/task-prompts-hashes.json b/tests/tests_eval_framework/tasks/task-prompts-hashes.json index a2827d88..4219231e 100644 --- a/tests/tests_eval_framework/tasks/task-prompts-hashes.json +++ b/tests/tests_eval_framework/tasks/task-prompts-hashes.json @@ -236,6 +236,8 @@ "MultiPLEMBPPSh.Llama3Formatter": "d842dfd22a7fe4c8b4ea6eac54c42cb6", "NaturalQsOpen.ConcatFormatter": "6545c9dd54015c1e1227b322dcd82598", "NaturalQsOpen.Llama3Formatter": "3bfec3e5d48974262309c56a179fde60", + "NaturalQsOpenBPB.ConcatFormatter": "e02c13a7c739e5a95de5ef7b11635bff", + "NaturalQsOpenBPB.Llama3Formatter": "6ac2ba06cbd2c5dfaf0ee0cad406d047", "NaturalQsOpenCloze.ConcatFormatter": 
"681f3cb09c2d4f36a65f4d0d3fec9d1a", "NaturalQsOpenCloze.Llama3Formatter": "b350c62360e9b1ae5ffa780a98cbebdb", "NaturalQsOpenMC.ConcatFormatter": "600d03c8758b3131d4ff55f54423b5f6", From 72909ac37ca108cf191ddf7cfe0d93da489d1329 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Wed, 1 Apr 2026 12:46:29 +0000 Subject: [PATCH 05/13] docs: update with BPB task variant --- docs/tasks/NaturalQsOpenBPB.md | 20 ++++++++++++++++++++ docs/tasks/README.md | 3 ++- 2 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 docs/tasks/NaturalQsOpenBPB.md diff --git a/docs/tasks/NaturalQsOpenBPB.md b/docs/tasks/NaturalQsOpenBPB.md new file mode 100644 index 00000000..671021f3 --- /dev/null +++ b/docs/tasks/NaturalQsOpenBPB.md @@ -0,0 +1,20 @@ +# NaturalQsOpenBPB + +```` +NAME = NaturalQsOpenBPB +DATASET_PATH = allenai/nq-gen2mc +SAMPLE_SPLIT = validation +FEWSHOT_SPLIT = validation +RESPONSE_TYPE = LOGLIKELIHOODS +METRICS = [BitsPerByteLoglikelihood] +SUBJECTS = ['no_subject'] +LANGUAGE = +```` + +- Module: `eval_framework.tasks.benchmarks.naturalqs_open` + +- File: [src/eval_framework/tasks/benchmarks/naturalqs_open.py](../../src/eval_framework/tasks/benchmarks/naturalqs_open.py) | [View on GitHub](https://github.com/Aleph-Alpha-Research/eval-framework/blob/main/src/eval_framework/tasks/benchmarks/naturalqs_open.py) + +- Link to dataset: [https://huggingface.co/datasets/allenai/nq-gen2mc](https://huggingface.co/datasets/allenai/nq-gen2mc) + +More detailed documentation, with prompt examples and ground truth completions, can be generated with `uv run -m eval_framework.utils.generate_task_docs --add-prompt-examples --only-tasks "NaturalQsOpenBPB"`. diff --git a/docs/tasks/README.md b/docs/tasks/README.md index a6696253..846b7e96 100644 --- a/docs/tasks/README.md +++ b/docs/tasks/README.md @@ -2,7 +2,7 @@ This directory contains the generated documentation for all benchmark tasks available in the package. 
-**Total number of tasks: 179** +**Total number of tasks: 180** The documentation can be generated or updated with `uv run -m eval_framework.utils.generate_task_docs`. @@ -132,6 +132,7 @@ NOTE: This is an automatically generated file. Any manual modifications will not - [MultiPLEMBPPRs](MultiPLEMBPPRs.md) - [MultiPLEMBPPSh](MultiPLEMBPPSh.md) - [NaturalQsOpen](NaturalQsOpen.md) +- [NaturalQsOpenBPB](NaturalQsOpenBPB.md) - [NaturalQsOpenCloze](NaturalQsOpenCloze.md) - [NaturalQsOpenMC](NaturalQsOpenMC.md) - [NaturalQsOpenMC_OLMES](NaturalQsOpenMC_OLMES.md) From 4f7d19afe71888f08fdddf4239eaad313c8a4f0e Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Wed, 1 Apr 2026 14:27:18 +0000 Subject: [PATCH 06/13] refactor: assert correct_index for BPB and add clarifying comments --- src/eval_framework/tasks/task_style.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/eval_framework/tasks/task_style.py b/src/eval_framework/tasks/task_style.py index 4615cc8c..61c464ef 100644 --- a/src/eval_framework/tasks/task_style.py +++ b/src/eval_framework/tasks/task_style.py @@ -116,10 +116,10 @@ def get_ground_truth(self, choices: list[str], correct_index: int) -> str: @abstractmethod def get_possible_completions(self, choices: list[str], correct_index: int | None = None) -> list[str]: - """Return the list of scored completion strings. + """Return the list of completion strings to be evaluated. ``correct_index`` is only required by ``BPBStyle``, which scores solely the - ground-truth completion. ``MCStyle`` and ``ClozeStyle`` score all choices and + ground-truth completion. ``MCStyle`` and ``ClozeStyle`` score all choices and ignore it; callers may omit it when using those stylers. 
""" @@ -206,6 +206,7 @@ def get_ground_truth(self, choices: list[str], correct_index: int) -> str: return f" {labels[correct_index]}" def get_possible_completions(self, choices: list[str], correct_index: int | None = None) -> list[str]: + """Note: `correct_index` is ignored for `MCStyle` and only used for `BPBStyle`.""" return [f" {label}" for label in get_n_letters(len(choices))] @@ -292,6 +293,9 @@ class BPBStyle(ClozeStyle): task_style = TaskStyle.BPB def get_possible_completions(self, choices: list[str], correct_index: int | None = None) -> list[str]: + assert correct_index is not None, ( + "BPBStyle only evaluates the loglikelhood of the ground truth answer and thus requires the correct index." + ) return [f" {choices[correct_index]}"] From 0d0690e4adb0d94e5aadc15052d44766a40788ce Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Thu, 2 Apr 2026 07:27:25 +0000 Subject: [PATCH 07/13] fix: read out response type and metrics correctly --- src/eval_framework/response_generator.py | 32 ++++++++++++++++++------ src/eval_framework/tasks/base.py | 12 ++++----- src/eval_framework/tasks/eval_config.py | 3 ++- src/eval_framework/tasks/task_style.py | 8 +++--- 4 files changed, 37 insertions(+), 18 deletions(-) diff --git a/src/eval_framework/response_generator.py b/src/eval_framework/response_generator.py index 2ffb0866..c8168eed 100644 --- a/src/eval_framework/response_generator.py +++ b/src/eval_framework/response_generator.py @@ -67,14 +67,18 @@ def __init__(self, llm: BaseLLM, config: EvalConfig, result_processor: ResultsFi if config.perturbation_config is not None: perturbation_task_class = create_perturbation_class(task_class, config.perturbation_config) self.task = perturbation_task_class.with_overwrite( - self.few_shot, custom_subjects=self.config.task_subjects, custom_hf_revision=self.config.hf_revision + self.few_shot, + custom_subjects=self.config.task_subjects, + custom_hf_revision=self.config.hf_revision, ) else: self.task = task_class.with_overwrite( - 
self.few_shot, custom_subjects=self.config.task_subjects, custom_hf_revision=self.config.hf_revision + self.few_shot, + custom_subjects=self.config.task_subjects, + custom_hf_revision=self.config.hf_revision, ) - self.response_type = task_class.RESPONSE_TYPE + self.response_type, _ = self.task._get_type_and_metrics() def _llm_task_param_precedence(self) -> tuple[list[str] | None, int | None]: """ @@ -89,7 +93,10 @@ def _llm_task_param_precedence(self) -> tuple[list[str] | None, int | None]: task_stop_sequences = getattr(self.task, "stop_sequences", None) task_max_tokens = self.config.max_tokens or getattr(self.task, "max_tokens", None) # if both task and model define a max_token, the smaller value is used - max_tokens = min([x for x in [llm_max_tokens, task_max_tokens] if x is not None], default=None) + max_tokens = min( + [x for x in [llm_max_tokens, task_max_tokens] if x is not None], + default=None, + ) logger.info(f"Set max_tokens to {max_tokens}") # if both task and model define stop sequences, those are merged into one list stop_sequences_merged = (llm_stop_sequences or []) + (task_stop_sequences or []) @@ -117,7 +124,9 @@ def _generate_loglikelihoods(self, samples: list[Sample]) -> list[Loglikelihood] loglikelihoods={}, loglikelihoods_sequence_positions={}, raw_loglikelihood_error=Error( - error_class=e.__class__.__name__, message=str(e), traceback=traceback.format_exc() + error_class=e.__class__.__name__, + message=str(e), + traceback=traceback.format_exc(), ), ) for _ in range(len(samples)) @@ -142,7 +151,9 @@ def _generate_loglikelihoods(self, samples: list[Sample]) -> list[Loglikelihood] ) return loglikelihood_list - def _generative_output_type_selector(self) -> Callable[[list[Sample]], list[Completion] | list[Loglikelihood]]: + def _generative_output_type_selector( + self, + ) -> Callable[[list[Sample]], list[Completion] | list[Loglikelihood]]: """ Selects the generative output type based on the response type. 
:return: function to generate responses @@ -151,7 +162,10 @@ def _generative_output_type_selector(self) -> Callable[[list[Sample]], list[Comp case ResponseType.COMPLETION: stop_sequences, max_tokens = self._llm_task_param_precedence() return partial( - self.task.generate_completions, self.llm, stop_sequences=stop_sequences, max_tokens=max_tokens + self.task.generate_completions, + self.llm, + stop_sequences=stop_sequences, + max_tokens=max_tokens, ) # type: ignore[call-arg] case ResponseType.LOGLIKELIHOODS: return self._generate_loglikelihoods @@ -245,7 +259,9 @@ def _process_batch(samples_batch: list[Sample]) -> None: samples_batch: list[Sample] = [] with tqdm( - total=total_num_samples, desc=f"Processing {self.response_type.value}", disable=get_disable_bar_flag() + total=total_num_samples, + desc=f"Processing {self.response_type.value}", + disable=get_disable_bar_flag(), ) as pbar: samples = self.task.iterate_samples(self.num_samples) for i, sample in enumerate(repeat_samples(samples, repeats)): diff --git a/src/eval_framework/tasks/base.py b/src/eval_framework/tasks/base.py index 8247107f..16fb94b8 100644 --- a/src/eval_framework/tasks/base.py +++ b/src/eval_framework/tasks/base.py @@ -332,12 +332,7 @@ def _get_context(self, item: dict[str, Any]) -> BaseMetricContext | list[BaseMet return None def get_metadata(self) -> dict[str, str | list[str]]: - if hasattr(self, "TASK_STYLER"): - response_type = self.TASK_STYLER.response_type - metrics = self.TASK_STYLER.metrics - else: - response_type = self.RESPONSE_TYPE - metrics = self.METRICS + response_type, metrics = self._get_type_and_metrics() meta: dict[str, str | list[str]] = { "dataset_path": self.DATASET_PATH, @@ -424,3 +419,8 @@ def generate_completions( ) ) return completion_list + + def _get_type_and_metrics(self) -> tuple[ResponseType, list[type["BaseMetric"]]]: + if hasattr(self, "TASK_STYLER"): + return self.TASK_STYLER.response_type, self.TASK_STYLER.metrics + return self.RESPONSE_TYPE, self.METRICS diff 
--git a/src/eval_framework/tasks/eval_config.py index d189572a..58bc2b4d 100644 --- a/src/eval_framework/tasks/eval_config.py +++ b/src/eval_framework/tasks/eval_config.py @@ -112,7 +112,8 @@ def validate_judge_model_args(cls, value: dict[str, Any]) -> dict[str, Any]: @model_validator(mode="after") def validate_llm_judge_defined(self) -> "EvalConfig": task = get_task(self.task_name) - for metric_class in task.METRICS: + _, task_metrics = task(num_fewshot=0)._get_type_and_metrics() + for metric_class in task_metrics: if issubclass(metric_class, BaseLLMJudgeMetric): assert self.llm_judge_class is not None, "The LLM Judge must be defined for this evaluation task." return self diff --git a/src/eval_framework/tasks/task_style.py b/src/eval_framework/tasks/task_style.py index 61c464ef..53a824dd 100644 --- a/src/eval_framework/tasks/task_style.py +++ b/src/eval_framework/tasks/task_style.py @@ -293,9 +293,11 @@ class BPBStyle(ClozeStyle): task_style = TaskStyle.BPB def get_possible_completions(self, choices: list[str], correct_index: int | None = None) -> list[str]: - assert correct_index is not None, ( - "BPBStyle only evaluates the loglikelhood of the ground truth answer and thus requires the correct index." - ) + if correct_index is None: + raise ValueError( + "BPBStyle evaluates the loglikelihood of the ground truth answer only, " + "and thus requires the correct index." 
+ ) return [f" {choices[correct_index]}"] From 9adb70125fe382714b9a21f59542c5ffec6ca871 Mon Sep 17 00:00:00 2001 From: Prabhu Sivaprasad Date: Thu, 2 Apr 2026 21:30:32 +0000 Subject: [PATCH 08/13] storing native tasks --- src/eval_framework/tasks/benchmarks/arc.py | 282 ++++++++++++++++++ .../tasks/benchmarks/hellaswag.py | 73 +++++ .../tasks/benchmarks/humaneval.py | 103 +++++++ .../tasks/benchmarks/math_reasoning.py | 55 ++++ src/eval_framework/tasks/benchmarks/mbpp.py | 122 ++++++++ src/eval_framework/tasks/benchmarks/mmlu.py | 129 ++++++-- 6 files changed, 739 insertions(+), 25 deletions(-) diff --git a/src/eval_framework/tasks/benchmarks/arc.py b/src/eval_framework/tasks/benchmarks/arc.py index c8db9060..c8b52a1e 100644 --- a/src/eval_framework/tasks/benchmarks/arc.py +++ b/src/eval_framework/tasks/benchmarks/arc.py @@ -9,8 +9,244 @@ from eval_framework.metrics.loglikelihood.dcs import DistributionalCorrectnessScore from eval_framework.metrics.loglikelihood.ternary import TernaryScore from eval_framework.tasks.base import BaseTask, Language, ResponseType +from eval_framework.tasks.task_style import BPBStyle, ClozeStyle, MCStyle, answer_key_to_index from eval_framework.tasks.utils import get_n_letters +# OLMES fixed fewshot sources, keyed by HF subject name. +# Source: https://github.com/allenai/olmes (FEWSHOT_SOURCES["OLMES:ARC-*"]) +_ARC_FEWSHOT_SOURCES: dict[str, list[dict[str, Any]]] = { + "ARC-Easy": [ + { + "id": "MCAS_2007_8_5189", + "question": "Lichens are symbiotic organisms made of green algae and fungi. 
What do the green algae supply " + "to the fungi in this symbiotic relationship?", + "choices": {"text": ["carbon dioxide", "food", "protection", "water"], "label": ["A", "B", "C", "D"]}, + "answerKey": "B", + }, + { + "id": "Mercury_SC_401169", + "question": "When a switch is used in an electrical circuit, the switch can", + "choices": { + "text": [ + "cause the charge to build.", + "increase and decrease the voltage.", + "cause the current to change direction.", + "stop and start the flow of current.", + ], + "label": ["A", "B", "C", "D"], + }, + "answerKey": "D", + }, + { + "id": "MCAS_2004_8_27", + "question": "Which of the following is an example of an assistive device?", + "choices": { + "text": ["contact lens", "motorcycle", "raincoat", "coffee pot"], + "label": ["A", "B", "C", "D"], + }, + "answerKey": "A", + }, + { + "id": "NYSEDREGENTS_2006_8_10", + "question": "Rocks are classified as igneous, metamorphic, or sedimentary according to", + "choices": { + "text": ["their color", "their shape", "how they formed", "the minerals they contain"], + "label": ["1", "2", "3", "4"], + }, + "answerKey": "3", + }, + { + "id": "Mercury_7013388", + "question": "A chewable calcium carbonate tablet is a common treatment for stomach discomfort. 
Calcium " + "carbonate is most likely used as this type of medicine because calcium carbonate", + "choices": { + "text": [ + "has a pleasant flavor.", + "is inexpensive to produce.", + "neutralizes digestive acid.", + "occurs naturally in the body.", + ], + "label": ["A", "B", "C", "D"], + }, + "answerKey": "C", + }, + { + "id": "Mercury_7179953", + "question": "Which two body systems are directly involved in movement?", + "choices": { + "text": [ + "muscular and skeletal", + "digestive and muscular", + "skeletal and respiratory", + "respiratory and digestive", + ], + "label": ["A", "B", "C", "D"], + }, + "answerKey": "A", + }, + { + "id": "Mercury_7205118", + "question": "Which change in the state of water particles causes the particles to become arranged in a" + " fixed position?", + "choices": {"text": ["boiling", "melting", "freezing", "evaporating"], "label": ["A", "B", "C", "D"]}, + "answerKey": "C", + }, + { + "id": "MCAS_2016_8_13", + "question": "Earth's core is primarily composed of which of the following materials?", + "choices": {"text": ["basalt", "iron", "magma", "quartz"], "label": ["A", "B", "C", "D"]}, + "answerKey": "B", + }, + ], + "ARC-Challenge": [ + { + "id": "Mercury_SC_415702", + "question": "George wants to warm his hands quickly by rubbing them. 
Which skin surface will produce the " + "most heat?", + "choices": { + "text": ["dry palms", "wet palms", "palms covered with oil", "palms covered with lotion"], + "label": ["A", "B", "C", "D"], + }, + "answerKey": "A", + }, + { + "id": "MCAS_2009_5_6516", + "question": "Which of the following statements best explains why magnets usually stick to a refrigerator " + "door?", + "choices": { + "text": [ + "The refrigerator door is smooth.", + "The refrigerator door contains iron.", + "The refrigerator door is a good conductor.", + "The refrigerator door has electric wires in it.", + ], + "label": ["A", "B", "C", "D"], + }, + "answerKey": "B", + }, + { + "id": "Mercury_7233695", + "question": "A fold observed in layers of sedimentary rock most likely resulted from the", + "choices": { + "text": [ + "cooling of flowing magma.", + "converging of crustal plates.", + "deposition of river sediments.", + "solution of carbonate minerals.", + ], + "label": ["A", "B", "C", "D"], + }, + "answerKey": "B", + }, + { + "id": "Mercury_7041615", + "question": "Which of these do scientists offer as the most recent explanation as to why many plants and " + "animals died out at the end of the Mesozoic era?", + "choices": { + "text": [ + "worldwide disease", + "global mountain building", + "rise of mammals that preyed upon plants and animals", + "impact of an asteroid created dust that blocked the sunlight", + ], + "label": ["A", "B", "C", "D"], + }, + "answerKey": "D", + }, + { + "id": "MCAS_1998_4_3", + "question": "Which of the following is a trait that a dog does NOT inherit from its parents?", + "choices": { + "text": [ + "the length of its fur", + "the shape of its nose", + "the size of its appetite", + "the color of its fur", + ], + "label": ["A", "B", "C", "D"], + }, + "answerKey": "C", + }, + { + "id": "Mercury_7041860", + "question": "A boat is acted on by a river current flowing north and by wind blowing on its sails. The boat" + " travels northeast. 
In which direction is the wind most likely applying force to the sails of the boat?", + "choices": {"text": ["west", "east", "north", "south"], "label": ["A", "B", "C", "D"]}, + "answerKey": "B", + }, + { + "id": "ACTAAP_2013_5_11", + "question": "As part of an experiment, an astronaut takes a scale to the Moon and weighs himself. The scale" + " reads 31 pounds. If the astronaut has a mass of about 84 kilograms, which are the approximate weight " + "and mass of the astronaut when standing on the Earth?", + "choices": { + "text": [ + "31 pounds and 14 kilograms", + "31 pounds and 84 kilograms", + "186 pounds and 14 kilograms", + "186 pounds and 84 kilograms", + ], + "label": ["A", "B", "C", "D"], + }, + "answerKey": "D", + }, + { + "id": "MDSA_2008_5_30", + "question": "On Earth, water can be a solid, a liquid, or a gas. Which energy source has the greatest " + "influence on the state of matter of water?", + "choices": { + "text": ["the sun", "the wind", "ocean currents", "the metal core"], + "label": ["A", "B", "C", "D"], + }, + "answerKey": "A", + }, + { + "id": "MEA_2016_8_14", + "question": "Which statement best compares single-celled and multi-celled organisms?", + "choices": { + "text": [ + "Tissues in a single-celled organism are like the cells in a multi-celled organism.", + "The nucleus in a single-celled organism is like the skin of a multi-celled organism.", + "Organelles in a single-celled organism are like the organs in a multi-celled organism.", + "The cytoplasm in a single-celled organism is like the nervous system in a multi-celled organism.", + ], + "label": ["A", "B", "C", "D"], + }, + "answerKey": "C", + }, + { + "id": "Mercury_SC_401653", + "question": "Which land form is the result of the constructive force of a glacier?", + "choices": { + "text": [ + "valleys carved by a moving glacier", + "piles of rocks deposited by a melting glacier", + "grooves created in a granite surface by a glacier", + "bedrock hills roughened by the passing of a 
glacier", + ], + "label": ["A", "B", "C", "D"], + }, + "answerKey": "B", + }, + { + "id": "Mercury_7106908", + "question": "Hatchling sea turtles are typically dark in color. Occasionally, a sea turtle hatches that " + "is almost white in color. When crawling from the nest on the beach to the ocean, this light-colored sea " + "turtle could be at risk for sunburn. The light color of the turtles would most likely", + "choices": { + "text": [ + "help the turtles have better chances at reproducing.", + "cause the shell of the sea turtles to become stronger.", + "reduce the chances of turtles surviving to reproduce.", + "help in the development of a new species of sea turtles.", + ], + "label": ["A", "B", "C", "D"], + }, + "answerKey": "C", + }, + ], +} # noqa: E501 + class ARC(BaseTask[str]): """ARC dataset: https://huggingface.co/datasets/allenai/ai2_arc""" @@ -94,3 +330,49 @@ def _get_initial_prompt_text(self, item: dict[str, Any]) -> str: def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None: completions = super()._get_possible_completions(item) return (completions or []) + [" I do not know."] + + +class _ARCChoice_Base(BaseTask[str]): + """Shared base for choice-based ARC variants (Cloze, MC, BPB). + + Subclasses set ``NAME`` and ``TASK_STYLER``; everything else is inherited. 
+ """ + + DATASET_PATH = "allenai/ai2_arc" + SAMPLE_SPLIT = "test" + FEWSHOT_SPLIT = "train" + SUBJECTS = ["ARC-Easy", "ARC-Challenge"] + PERTURBATION_UNMODIFIABLE_WORDS = ["Question"] + get_n_letters(5) + LANGUAGE = Language.ENG + + def _get_raw_question(self, item: dict[str, Any]) -> str: + return item["question"] + + def _get_choices(self, item: dict[str, Any]) -> list[str]: + return item["choices"]["text"] + + def _get_correct_index(self, item: dict[str, Any]) -> int: + return answer_key_to_index(item["answerKey"]) + + def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict[str, Any]]: + subject = item.get("subject", "") + return _ARC_FEWSHOT_SOURCES.get(subject, [])[: self.num_fewshot] + + +class ARCCloze(_ARCChoice_Base): + NAME = "ARCCloze" + TASK_STYLER = ClozeStyle() + + +class ARCMC(_ARCChoice_Base): + """ARC with OLMES-style MC prompt: options listed as ' A. ...', scored over ' A'/' B'/....""" + + NAME = "ARCMC" + TASK_STYLER = MCStyle(space_prefixed_labels=True) + + +class ARCBPB(_ARCChoice_Base): + """BPB-only variant: scores loglikelihood over the ground-truth answer text only.""" + + NAME = "ARCBPB" + TASK_STYLER = BPBStyle() diff --git a/src/eval_framework/tasks/benchmarks/hellaswag.py b/src/eval_framework/tasks/benchmarks/hellaswag.py index 690c6a1b..4b54955b 100644 --- a/src/eval_framework/tasks/benchmarks/hellaswag.py +++ b/src/eval_framework/tasks/benchmarks/hellaswag.py @@ -10,6 +10,25 @@ from eval_framework.metrics.loglikelihood.dcs import DistributionalCorrectnessScore from eval_framework.metrics.loglikelihood.ternary import TernaryScore from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType +from eval_framework.tasks.task_style import BPBStyle, ClozeStyle, MCStyle + +# fmt: off +# OLMES fixed fewshot sources for HellaSwag. 
+# Source: https://github.com/allenai/olmes (FEWSHOT_SOURCES["OLMES:HellaSwag"]) +_HELLASWAG_FEWSHOTS: list[dict[str, Any]] = [ + {"ind": 12, "activity_label": "Health", "ctx_a": "[header] How to cope with suicidal thoughts [title] Put off any plans. [step] Promise yourself that you'll wait 48 hours before doing anything. Remember, thoughts don't have the power to force you to act.", "ctx_b": "", "endings": ["Even when you do, there may be a small image of the future still lurking around your brain. [substeps] For instance, don't tell yourself that you can't make it.", "You're doing something, and no one can force you to act. It's completely natural to feel negative thoughts before you act.", "Do not panic if people talk to you (even if it's about quitting smoking). Have a plan for how you're going to react to a group of people who bring on suicidal thoughts.", "Sometimes extreme pain can distort our perception. Waiting before taking action will give your mind time to clear."], "label": "3"},#noqa + {"ind": 39, "activity_label": "Education and Communications", "ctx_a": "[header] How to make a liquid into a solid [title] Place a small open container of water in the freezer compartment of a class or home refrigerator. [title] Leave the water there for several hours or overnight. [title] Remove from the freezer and note what has occurred.", "ctx_b": "", "endings": ["[step] Water changes state from liquid to solid when it reaches a temperature of 0 degrees celsius, or 32 degrees fahrenheit. This is a simple example of changing from liquid to solid, or freezing.", "[substeps] Check that the container is completely dry, but no ice has formed. You should get a sample before disposing of it.", "[step] Don't drink and continue making liquid. [title] Separate the ice water if you're not used to using water.", "[title] Set a timer to check on the reaction. 
[step] The liquid should be safe to use again once the water has frozen completely and the food appears firm."], "label": "0"}, #noqa + {"ind": 9, "activity_label": "Baking cookies", "ctx_a": "A female chef in white uniform shows a stack of baking pans in a large kitchen presenting them. The pans are filled with pastries and loaded into the oven.", "ctx_b": "a knife", "endings": ["is seen moving on a board and cutting out its contents.", "hits the peeled cheesecake, followed by sliced custard and still cooked ice cream.", "etches a shape into the inside of the baked pans.", "is used to cut cylinder shaped dough into rounds."], "label": "3"},#noqa + {"ind": 47, "activity_label": "Starting a campfire", "ctx_a": "He takes his lighter and lights the newspaper in several places to start the fire. The bonfire starts burning and continues to burn.", "ctx_b": "he", "endings": ["plays with the dog and makes two cookies.", "adds a few more twigs to keep the flames burning.", "gets up and attempts to put a flag on it, fails and makes a complete ass out of himself.", "puts on equipment and stools."], "label": "1"},#noqa + {"ind": 38, "activity_label": "Finance and Business", "ctx_a": "[header] How to write a method statement [title] Prepare to write by conducting a risk assessment--an in-depth examination of the task or process. [substeps] Identify the work hazards (those that could potentially cause poor health or personal harm) that are inherent in the task. Analyze what has been done about these hazards and if these measures are enough to reduce the harm potential to an acceptable level.", "ctx_b": "", "endings": ["Determine if there are further steps you would like to take. For example, if you want to write about looking as though you've truly experienced the problem in practice, doing a risk assessment may help you so further in mental illness.", "Review the information presented to the project and get an understanding of the hazards. 
[title] Organize and plan a rest period that will help the sanitation industry and forest service team manage the task more effectively.", "Decide what additional measures need to be taken to reduce harm if an acceptable level has not been met. [title] Begin to write your method statement, starting at the header.", "[title] Write the search code (cnet) heading. [step] To write an article or report, simply write the following code (cnet: alternative sources and outcomes."], "label": "2"},#noqa + {"ind": 38, "activity_label": "Arm wrestling", "ctx_a": "Two bodybuilder women are seated at a table. They are arm wrestling, vieing to win.", "ctx_b": "when there", "endings": ["'s another wrestler, they finish wrestling him.", "is a winner they go cheer each other on.", "is a victor, the two women shake hands.", "is not a winner, they get a huge kick in the face and continue wrestling as the crowd cheers on."], "label": "2"},#noqa + {"ind": 51, "activity_label": "Painting", "ctx_a": "A lady named linda, creator of paint along is demonstrating how to do an acrylic painting.", "ctx_b": "she", "endings": ["extensively paints from fabric and paint horse tails on a painting screen.", "starts with a one inch flat brush and yellow and white acrylic paint.", "shows off her paint thinner and begins to tell her story about the underground bottle of magenta paints.", "demonstrates how to bring a window down from the wall."], "label": "1"},#noqa + {"ind": 63, "activity_label": "Fixing the roof", "ctx_a": "A woman with long, black, curly hair is wearing casual wear, talking, and squatting on a roof.", "ctx_b": "the woman", "endings": ["then stands up and walks to a part of the roof where she lifts up a black shingle on the roof.", "turns on a machine attached to a hand cart with multiple metal rails and drives it underneath a large roof.", "raise her left leg to the graffiti, move it partially along, and just gets herself started climbing the tiles.", "holds her back while she works on 
the roof, she holds her legs behind her legs."], "label": "0"},#noqa + {"ind": 4, "activity_label": "Removing ice from car", "ctx_a": "Then, the man writes over the snow covering the window of a car, and a woman wearing winter clothes smiles.", "ctx_b": "then", "endings": [", the man adds wax to the windshield and cuts it.", ", a person board a ski lift, while two men supporting the head of the person wearing winter clothes snow as the we girls sled.", ", the man puts on a christmas coat, knitted with netting.", ", the man continues removing the snow on his car."], "label": "3"},#noqa + {"ind": 30, "activity_label": "Getting a haircut", "ctx_a": "The man in the blue shirt sits on the chair next to the sink. The other man begins washing his hair. He scrubs in the shampoo and then washes it off.", "ctx_b": "he", "endings": ["then combs it and blow dries his hair after styling it with gel.", "shows the razor that he has for shaving his hair.", "hair is now dry, he is on his way to the barber.", "moves the bucket to the other side of the sink and continues washing his hair."], "label": "0"},#noqa + {"ind": 61, "activity_label": "Brushing teeth", "ctx_a": "A little boy walk toward the sink.", "ctx_b": "the boy", "endings": ["falling shits his pants from the bottom out.", "stands water to rinse his mouth.", "stands on front the sink and puts toothpaste on the brush, and then brush the teeth.", "rinses his cup in the pot, then put glasses on it."], "label": "2"},#noqa +] +# fmt: on class HELLASWAG(BaseTask[str]): @@ -73,3 +92,57 @@ def _get_initial_prompt_text(self, item: dict[str, Any]) -> str: def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None: completions = super()._get_possible_completions(item) return (completions or []) + [" I do not know."] + + +class _HELLASWAG_Base(BaseTask[str]): + """Shared base for HELLASWAG variants (Cloze, MC, BPB). + + Subclasses set ``NAME`` and ``TASK_STYLER``; everything else is inherited. 
+ """ + + DATASET_PATH = "Rowan/hellaswag" + SAMPLE_SPLIT = "validation" + FEWSHOT_SPLIT = "train" + SUBJECTS = [NO_SUBJECT] + LANGUAGE = Language.ENG + + @staticmethod + def _preprocess(prompt: str) -> str: + # remove bracketed text + prompt = prompt.strip() + prompt = prompt.replace(" [title]", ". ") + prompt = re.sub("\\[.*?\\]", "", prompt) + prompt = prompt.replace(" ", " ") + prompt = re.sub(r"\.\. ", ". ", prompt) + return prompt + + def _get_choices(self, item: dict[str, Any]) -> list[str]: + return [self._preprocess(ending) for ending in item["endings"]] + + def _get_raw_question(self, item: dict[str, Any]) -> str: + # Include activity_label as prefix to match the OLMES prompt format: + # "ActivityLabel: preprocessed_context" + subject = self._preprocess(item["activity_label"]) + context = self._preprocess(item["ctx_a"] + " " + item["ctx_b"].capitalize()).strip() + return f"{subject}: {context}" + + def _get_correct_index(self, item: dict[str, Any]) -> int: + return int(item["label"] if item["label"] != "" else 0) + + def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict[str, Any]]: + return _HELLASWAG_FEWSHOTS[: self.num_fewshot] + + +class HELLASWAGCloze(_HELLASWAG_Base): + NAME = "HELLASWAGCloze" + TASK_STYLER = ClozeStyle() + + +class HELLASWAGMC(_HELLASWAG_Base): + NAME = "HELLASWAGMC" + TASK_STYLER = MCStyle(space_prefixed_labels=True) + + +class HELLASWAGBPB(_HELLASWAG_Base): + NAME = "HellaSwagBPB" + TASK_STYLER = BPBStyle(question_prefix="", cue_text="", trailing_newline=False) diff --git a/src/eval_framework/tasks/benchmarks/humaneval.py b/src/eval_framework/tasks/benchmarks/humaneval.py index 163d6a8d..983d795f 100644 --- a/src/eval_framework/tasks/benchmarks/humaneval.py +++ b/src/eval_framework/tasks/benchmarks/humaneval.py @@ -4,6 +4,7 @@ from eval_framework.metrics.loglikelihood.bits_per_byte import BitsPerByteLoglikelihood from eval_framework.shared.types import BaseMetricContext from eval_framework.tasks.base import 
NO_SUBJECT, BaseTask, Language, ResponseType, Sample +from eval_framework.tasks.task_style import BPBStyle CODE_TO_EXECUTE = """ {start_of_code} @@ -139,3 +140,105 @@ def _get_instruction_text(self, item: dict[str, Any]) -> str: def _get_cue_text(self, item: dict[str, Any]) -> str: return self.CUE_PREFIX + item["prompt"].lstrip() + + +# fmt: off +# Fixed 3-shot fewshot examples for codex_humaneval_gold_bpb_3shot. +# Source: HumanEval test split, task_ids HumanEval/112, HumanEval/29, HumanEval/1 (in that order). +_CODEX_HUMANEVAL_FEWSHOTS: list[dict[str, Any]] = [ + { + "task_id": "HumanEval/112", + "entry_point": "reverse_delete", + # The HumanEval/112 prompt starts with "\n" in the dataset. In the + # reference (olmo_eval) this becomes the very first character of the + # pre-baked ctx string, and ConcatFormatter strips it when formatting a + # single-message context. We strip it here so that our multi-message + # context produces the same output. + "prompt": 'def reverse_delete(s,c):\n """Task\n We are given two strings s and c, you have to deleted all the characters in s that are equal to any character in c\n then check if the result string is palindrome.\n A string is called palindrome if it reads the same backward as forward.\n You should return a tuple containing the result string and True/False for the check.\n Example\n For s = "abcde", c = "ae", the result should be (\'bcd\',False)\n For s = "abcdef", c = "b" the result should be (\'acdef\',False)\n For s = "abcdedcba", c = "ab", the result should be (\'cdedc\',True)\n """\n',#noqa + "canonical_solution": " s = ''.join([char for char in s if char not in c])\n return (s,s[::-1] == s)\n", + }, + { + "task_id": "HumanEval/29", + "entry_point": "filter_by_prefix", + "prompt": "from typing import List\n\n\ndef filter_by_prefix(strings: List[str], prefix: str) -> List[str]:\n \"\"\" Filter an input list of strings only for ones that start with a given prefix.\n >>> filter_by_prefix([], 'a')\n []\n >>> 
filter_by_prefix(['abc', 'bcd', 'cde', 'array'], 'a')\n ['abc', 'array']\n \"\"\"\n",#noqa + "canonical_solution": " return [x for x in strings if x.startswith(prefix)]\n", + }, + { + "task_id": "HumanEval/1", + "entry_point": "separate_paren_groups", + "prompt": "from typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n",#noqa + "canonical_solution": " result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == '(':\n current_depth += 1\n current_string.append(c)\n elif c == ')':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(''.join(current_string))\n current_string.clear()\n\n return result\n",#noqa + }, +] + +# Replacement fewshot used when the test item coincides with one of the standard +# three (to avoid self-referential fewshots, mirroring the olmo_eval behaviour). +_STRLEN_FEWSHOT: dict[str, Any] = { + "task_id": "HumanEval/23", + "entry_point": "strlen", + "prompt": '\n\ndef strlen(string: str) -> int:\n """ Return length of given string\n >>> strlen(\'\')\n 0\n >>> strlen(\'abc\')\n 3\n """\n',#noqa + "canonical_solution": " return len(string)\n", +} +# fmt: on + +_STANDARD_FEWSHOT_IDS: frozenset[str] = frozenset(d["task_id"] for d in _CODEX_HUMANEVAL_FEWSHOTS) + + +class _CodexHumanEval_Base(BaseTask[str]): + """Shared base for codex_humaneval_gold_bpb_3shot-compatible HumanEval variants. 
+ + Follows the TASK_STYLER pattern (like ARC): + - ``_get_raw_question`` → ``item["prompt"]`` (function signature + docstring) + - ``_get_choices`` → ``[item["canonical_solution"]]`` + - ``_get_correct_index`` → ``0`` + + ``RESPONSE_TYPE`` and ``METRICS`` are provided by the ``TASK_STYLER``. + + BPBStyle normally prepends ``" "`` to the scored completion, but HumanEval + prompts already end with ``"\\n"`` which ConcatFormatter strips from the last + USER message. ``_get_possible_completions`` is therefore overridden to omit + that space so the completion starts directly with the four-space indent of + the function body, matching the olmo_eval reference. The fewshot *target* + retains the leading space via ``BPBStyle.get_fewshot_target_text`` because + those messages are not the final USER turn (no stripping). + """ + + DATASET_PATH = "openai/openai_humaneval" + SAMPLE_SPLIT = "test" + FEWSHOT_SPLIT = "test" + SUBJECTS = [NO_SUBJECT] + LANGUAGE = Language.ENG + + def _get_raw_question(self, item: dict[str, Any]) -> str: + return item["prompt"] + + def _get_choices(self, item: dict[str, Any]) -> list[str]: + return [item["canonical_solution"]] + + def _get_correct_index(self, item: dict[str, Any]) -> int: + return 0 + + def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None: + # Skip BPBStyle's default " " prefix — the prompt's trailing "\n" is + # stripped by ConcatFormatter, so no extra space is needed. + return [item["canonical_solution"]] + + def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict[str, Any]]: + test_id = item.get("task_id", "") + if test_id in _STANDARD_FEWSHOT_IDS: + # Avoid self-referential fewshots: drop the test item's own example + # and substitute strlen (HumanEval/23), mirroring olmo_eval. 
+ base = [d for d in _CODEX_HUMANEVAL_FEWSHOTS if d["task_id"] != test_id] + return (base + [_STRLEN_FEWSHOT])[: self.num_fewshot] + return _CODEX_HUMANEVAL_FEWSHOTS[: self.num_fewshot] + + +class CodexHumanEval_BPB(_CodexHumanEval_Base): + """BPB-only HumanEval that matches codex_humaneval_gold_bpb_3shot. + + Prompt: ``{prompt}`` (function signature + docstring, verbatim) + Scored completion: ``{canonical_solution}`` + """ + + NAME = "CodexHumanEval_BPB" + TASK_STYLER = BPBStyle(question_prefix="", cue_text="", trailing_newline=False) diff --git a/src/eval_framework/tasks/benchmarks/math_reasoning.py b/src/eval_framework/tasks/benchmarks/math_reasoning.py index ff47cf53..ef1958f2 100644 --- a/src/eval_framework/tasks/benchmarks/math_reasoning.py +++ b/src/eval_framework/tasks/benchmarks/math_reasoning.py @@ -16,6 +16,7 @@ ) from eval_framework.metrics.loglikelihood.bits_per_byte import BitsPerByteLoglikelihood from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType, Sample, SubjectType +from eval_framework.tasks.task_style import BPBStyle # Hendrycks MATH subject splits (shared by MATH, MATHMinervaEvalHarness, MATHMinervaBPB) MATH_SUBJECTS = [ @@ -790,3 +791,57 @@ def __init__(self, num_fewshot: int = 4) -> None: def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]: return _OLMES_FEWSHOTS[: self.num_fewshot] + + +class _MATH500Minerva_Base(BaseTask[str]): + """Shared base for TASK_STYLER-based MATH500Minerva variants. + + MATH-500 has no discrete answer choices, so MCStyle and ClozeStyle do not + apply. Only BPBStyle (bits-per-byte of the normalized gold answer) is + supported. Uses the MATH-500 dataset with the 4 hardcoded OLMES fewshot + examples from ``_OLMES_FEWSHOTS`` (same as MATHMinerva_OLMES). + + Subclasses set ``NAME`` and ``TASK_STYLER``; everything else is inherited. 
+ """ + + DATASET_PATH = "HuggingFaceH4/MATH-500" + SAMPLE_SPLIT = "test" + FEWSHOT_SPLIT = "test" + SUBJECTS = [NO_SUBJECT] + LANGUAGE = Language.ENG + + def __init__(self, num_fewshot: int = 4) -> None: + if num_fewshot != 4: + logger.warning("MATH500Minerva TASK_STYLER variants support a fixed num_fewshot of 4.") + super().__init__(num_fewshot=4) + + def _get_raw_question(self, item: dict[str, Any]) -> str: + # Embed "Solution:" so BPBStyle's empty cue produces the same prompt as + # MATHMinervaEvalHarness: "Problem:\n{problem}\n\nSolution:". + return "Problem:\n" + item["problem"] + "\n\nSolution:" + + def _get_choices(self, item: dict[str, Any]) -> list[str]: + # BPB is scored over the full gold solution (matching minerva_math_500_gold_bpb_0shot). + return [item["solution"]] + + def _get_correct_index(self, item: dict[str, Any]) -> int: + return 0 + + def _get_fewshot_target_text(self, item: dict[str, Any]) -> str: + return " " + item["solution"] + + def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]: + return _OLMES_FEWSHOTS[: self.num_fewshot] + + +class MATH500Minerva_BPB(_MATH500Minerva_Base): + """BPB-only variant of MATH500Minerva with OLMES 4-shot prompt. + + Scores bits-per-byte of the normalized gold answer conditioned on the + Minerva-style prompt with 4 hardcoded OLMES fewshot examples. + """ + + NAME = "MATH500Minerva_BPB" + # trailing_newline=False keeps the prompt as "Problem:\n...\n\nSolution:" + # without an extra newline; question_prefix="" suppresses "Question: ". 
+ TASK_STYLER = BPBStyle(question_prefix="", cue_text="", trailing_newline=False) diff --git a/src/eval_framework/tasks/benchmarks/mbpp.py b/src/eval_framework/tasks/benchmarks/mbpp.py index 4f0205de..12a0c7fd 100644 --- a/src/eval_framework/tasks/benchmarks/mbpp.py +++ b/src/eval_framework/tasks/benchmarks/mbpp.py @@ -9,6 +9,7 @@ from eval_framework.metrics.loglikelihood.bits_per_byte import BitsPerByteLoglikelihood from eval_framework.shared.types import BaseMetricContext from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample +from eval_framework.tasks.task_style import BPBStyle logger = logging.getLogger(__name__) @@ -307,3 +308,124 @@ def post_process_generated_completion(self, completion_text: str, sample: Sample mbpp_ground_truth = str(sample.ground_truth) code = self._code_expander(extracted_code, mbpp_ground_truth) return code + + +class _MBPP_Base(BaseTask[str]): + """Shared base for TASK_STYLER-based MBPP variants. + + MBPP has no discrete answer choices, so MCStyle and ClozeStyle do not apply. + Only BPBStyle (bits-per-byte of the reference solution) is supported. + + Subclasses set ``NAME`` and ``TASK_STYLER``; everything else is inherited. 
+ """ + + DATASET_PATH = "google-research-datasets/mbpp" + SAMPLE_SPLIT = "test" + FEWSHOT_SPLIT = "train" + SUBJECTS = ["full"] + LANGUAGE = Language.ENG + + def _get_raw_question(self, item: dict[str, Any]) -> str: + tests = "\n".join(item["test_list"]) + text = item["text"] if "text" in item else item["prompt"] + return f"You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n{tests}" # noqa + + def _get_choices(self, item: dict[str, Any]) -> list[str]: + return [item["code"]] + + def _get_correct_index(self, item: dict[str, Any]) -> int: + return 0 + + def _get_fewshot_target_text(self, item: dict[str, Any]) -> str: + return f"{BEGIN}\n{item['code']}\n{END}" + + def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]: + return self.rnd.sample(self.dataset[self.FEWSHOT_SPLIT], self.num_fewshot) + + +class MBPP_BPB(_MBPP_Base): + """BPB-only variant: scores bits-per-byte of the reference code solution.""" + + NAME = "MBPP_BPB" + TASK_STYLER = BPBStyle(question_prefix="", cue_text=BEGIN) + + +# fmt: off +# Fixed 3-shot fewshot examples matching codex_mbpp_gold_bpb_3shot. +# Source: MBPP "full" prompt split, task_ids 3, 9, 4 (in that order). 
+_CODEX_MBPP_FEWSHOTS: list[dict[str, Any]] = [ + { + "text": "Write a python function to identify non-prime numbers.", + "code": "import math\ndef is_not_prime(n):\n result = False\n for i in range(2,int(math.sqrt(n)) + 1):\n if n % i == 0:\n result = True\n return result",#noqa + }, + { + "text": "Write a python function to find the minimum number of rotations required to get the same string.", + "code": "def find_Rotations(str): \n tmp = str + str\n n = len(str) \n for i in range(1,n + 1): \n substring = tmp[i: i+n] \n if (str == substring): \n return i \n return n",#noqa + }, + { + "text": "Write a function to find the largest integers from a given list of numbers using heap queue algorithm.",#noqa + "code": "import heapq as hq\ndef heap_queue_largest(nums,n):\n largest_nums = hq.nlargest(n, nums)\n return largest_nums",#noqa + }, +] +# fmt: on + + +class _CodexMBPP_Base(BaseTask[str]): + """Shared base for the codex_mbpp_gold_bpb_3shot-compatible MBPP variants. + + Prompt format (per item):: + + Write a python function to {description}. + ```python + {code} + ``` + + The task description is used verbatim as the question; no test-assertions + are included in the prompt. BPB is scored over the full reference code + (including the closing ``` fence). Line endings are normalised to LF and + trailing whitespace is stripped from the code string. 
+ """ + + DATASET_PATH = "google-research-datasets/mbpp" + SAMPLE_SPLIT = "test" + FEWSHOT_SPLIT = "test" + SUBJECTS = ["full"] + LANGUAGE = Language.ENG + + @staticmethod + def _normalize_code(code: str) -> str: + return code.replace("\r\n", "\n").replace("\r", "").rstrip() + + def _get_raw_question(self, item: dict[str, Any]) -> str: + return item["text"] + + def _get_choices(self, item: dict[str, Any]) -> list[str]: + return [self._normalize_code(item["code"])] + + def _get_correct_index(self, item: dict[str, Any]) -> int: + return 0 + + def _get_fewshot_target_text(self, item: dict[str, Any]) -> str: + code = self._normalize_code(item["code"]) + return f"```python\n{code}\n```" + + def _get_possible_completions(self, item: dict[str, Any]) -> list[str]: + # No leading space: the cue already ends with "\n" so the code starts + # directly on the next line (unlike the default ClozeStyle which + # prepends a space to each completion). + code = self._normalize_code(item["code"]) + return [f"{code}\n```"] + + def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict[str, Any]]: + return _CODEX_MBPP_FEWSHOTS[: self.num_fewshot] + + +class CodexMBPP_BPB(_CodexMBPP_Base): + """BPB-only MBPP variant that matches the codex_mbpp_gold_bpb_3shot reference. 
+ + Prompt: ``"{description}\\n```python\\n"`` + Completion: ``"{code}\\n```"`` + """ + + NAME = "CodexMBPP_BPB" + TASK_STYLER = BPBStyle(question_prefix="", cue_text="```python\n", trailing_newline=True) diff --git a/src/eval_framework/tasks/benchmarks/mmlu.py b/src/eval_framework/tasks/benchmarks/mmlu.py index f1410dfc..c668f9e7 100644 --- a/src/eval_framework/tasks/benchmarks/mmlu.py +++ b/src/eval_framework/tasks/benchmarks/mmlu.py @@ -11,68 +11,80 @@ from eval_framework.metrics.loglikelihood.dcs import DistributionalCorrectnessScore from eval_framework.metrics.loglikelihood.ternary import TernaryScore from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample +from eval_framework.tasks.task_style import BPBStyle, ClozeStyle, MCStyle from eval_framework.tasks.utils import get_n_letters -MMLU_SUBJECTS = [ +MMLU_STEM = [ "abstract_algebra", - "anatomy", "astronomy", - "business_ethics", - "clinical_knowledge", "college_biology", "college_chemistry", "college_computer_science", "college_mathematics", - "college_medicine", "college_physics", "computer_security", "conceptual_physics", - "econometrics", "electrical_engineering", "elementary_mathematics", - "formal_logic", - "global_facts", "high_school_biology", "high_school_chemistry", "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", "high_school_mathematics", - "high_school_microeconomics", "high_school_physics", - "high_school_psychology", "high_school_statistics", + "machine_learning", +] + +MMLU_HUMANITIES = [ + "formal_logic", + "high_school_european_history", "high_school_us_history", "high_school_world_history", - "human_aging", - "human_sexuality", "international_law", "jurisprudence", "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", "moral_disputes", "moral_scenarios", - "nutrition", "philosophy", 
"prehistory", - "professional_accounting", "professional_law", - "professional_medicine", + "world_religions", +] + +MMLU_SOCIAL_SCIENCES = [ + "econometrics", + "high_school_geography", + "high_school_government_and_politics", + "high_school_macroeconomics", + "high_school_microeconomics", + "high_school_psychology", + "human_sexuality", "professional_psychology", "public_relations", "security_studies", "sociology", "us_foreign_policy", +] + +MMLU_OTHER = [ + "anatomy", + "business_ethics", + "clinical_knowledge", + "college_medicine", + "global_facts", + "human_aging", + "management", + "marketing", + "medical_genetics", + "miscellaneous", + "nutrition", + "professional_accounting", + "professional_medicine", "virology", - "world_religions", ] +MMLU_SUBJECTS = MMLU_STEM + MMLU_HUMANITIES + MMLU_SOCIAL_SCIENCES + MMLU_OTHER + class MMLU(BaseTask[str]): """MMLU dataset: https://huggingface.co/datasets/cais/mmlu""" @@ -228,3 +240,70 @@ def _get_initial_prompt_text(self, item: dict[str, Any]) -> str: 'Summarize your reasoning concisely, then conclude with "Therefore, the answer is: X", where X is ' "one of A, B, C, or D." ) + + +class _MMLU_Base(BaseTask[str]): + """Shared base for TASK_STYLER-based MMLU variants (Cloze, MC, BPB). + + Subclasses set ``NAME`` and ``TASK_STYLER``; everything else is inherited. 
+ """ + + DATASET_PATH = "cais/mmlu" + SAMPLE_SPLIT = "test" + FEWSHOT_SPLIT = "dev" + SUBJECTS = MMLU_SUBJECTS + PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"] + get_n_letters(4) + LANGUAGE = Language.ENG + + def _get_subject_name(self, item: dict[str, Any]) -> str: + return " ".join(item["subject"].split("_")) + + def _get_initial_prompt_text(self, item: dict[str, Any]) -> str: + return f"The following are multiple choice questions (with answers) about {self._get_subject_name(item)}:" + + def _get_raw_question(self, item: dict[str, Any]) -> str: + return item["question"].strip() + + def _get_choices(self, item: dict[str, Any]) -> list[str]: + return item["choices"] + + def _get_correct_index(self, item: dict[str, Any]) -> int: + return item["answer"] + + def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict[str, Any]]: + return list(self.dataset[self.FEWSHOT_SPLIT][: self.num_fewshot]) + + +class MMLUCloze(_MMLU_Base): + NAME = "MMLUCloze" + TASK_STYLER = ClozeStyle() + + +class MMLUMC(_MMLU_Base): + NAME = "MMLUMC" + TASK_STYLER = MCStyle(space_prefixed_labels=True) + + +class MMLUBPB(_MMLU_Base): + NAME = "MMLUBPB" + TASK_STYLER = BPBStyle() + + +class MMLUOtherBPB(MMLUBPB): + NAME = "MMLUOtherBPB" + SUBJECTS = MMLU_OTHER + + +class MMLUStemBPB(MMLUBPB): + NAME = "MMLUStemBPB" + SUBJECTS = MMLU_STEM + + +class MMLUHumanitiesBPB(MMLUBPB): + NAME = "MMLUHumanitiesBPB" + SUBJECTS = MMLU_HUMANITIES + + +class MMLUSocialSciencesBPB(MMLUBPB): + NAME = "MMLUSocialSciencesBPB" + SUBJECTS = MMLU_SOCIAL_SCIENCES From 01495c4e8860d1c65ee1ceeb2edb6b1feaa60c68 Mon Sep 17 00:00:00 2001 From: Prabhu Sivaprasad Date: Thu, 9 Apr 2026 11:07:41 +0000 Subject: [PATCH 09/13] storing --- .../tasks/benchmarks/humaneval.py | 57 +------------------ src/eval_framework/tasks/benchmarks/mbpp.py | 8 +-- src/eval_framework/tasks/benchmarks/mmlu.py | 8 +-- src/eval_framework/tasks/task_names.py | 16 ++++++ 4 files changed, 19 insertions(+), 70 deletions(-) diff 
--git a/src/eval_framework/tasks/benchmarks/humaneval.py b/src/eval_framework/tasks/benchmarks/humaneval.py index 983d795f..4f218c47 100644 --- a/src/eval_framework/tasks/benchmarks/humaneval.py +++ b/src/eval_framework/tasks/benchmarks/humaneval.py @@ -142,48 +142,6 @@ def _get_cue_text(self, item: dict[str, Any]) -> str: return self.CUE_PREFIX + item["prompt"].lstrip() -# fmt: off -# Fixed 3-shot fewshot examples for codex_humaneval_gold_bpb_3shot. -# Source: HumanEval test split, task_ids HumanEval/112, HumanEval/29, HumanEval/1 (in that order). -_CODEX_HUMANEVAL_FEWSHOTS: list[dict[str, Any]] = [ - { - "task_id": "HumanEval/112", - "entry_point": "reverse_delete", - # The HumanEval/112 prompt starts with "\n" in the dataset. In the - # reference (olmo_eval) this becomes the very first character of the - # pre-baked ctx string, and ConcatFormatter strips it when formatting a - # single-message context. We strip it here so that our multi-message - # context produces the same output. - "prompt": 'def reverse_delete(s,c):\n """Task\n We are given two strings s and c, you have to deleted all the characters in s that are equal to any character in c\n then check if the result string is palindrome.\n A string is called palindrome if it reads the same backward as forward.\n You should return a tuple containing the result string and True/False for the check.\n Example\n For s = "abcde", c = "ae", the result should be (\'bcd\',False)\n For s = "abcdef", c = "b" the result should be (\'acdef\',False)\n For s = "abcdedcba", c = "ab", the result should be (\'cdedc\',True)\n """\n',#noqa - "canonical_solution": " s = ''.join([char for char in s if char not in c])\n return (s,s[::-1] == s)\n", - }, - { - "task_id": "HumanEval/29", - "entry_point": "filter_by_prefix", - "prompt": "from typing import List\n\n\ndef filter_by_prefix(strings: List[str], prefix: str) -> List[str]:\n \"\"\" Filter an input list of strings only for ones that start with a given prefix.\n >>> 
filter_by_prefix([], 'a')\n []\n >>> filter_by_prefix(['abc', 'bcd', 'cde', 'array'], 'a')\n ['abc', 'array']\n \"\"\"\n",#noqa - "canonical_solution": " return [x for x in strings if x.startswith(prefix)]\n", - }, - { - "task_id": "HumanEval/1", - "entry_point": "separate_paren_groups", - "prompt": "from typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\"\n",#noqa - "canonical_solution": " result = []\n current_string = []\n current_depth = 0\n\n for c in paren_string:\n if c == '(':\n current_depth += 1\n current_string.append(c)\n elif c == ')':\n current_depth -= 1\n current_string.append(c)\n\n if current_depth == 0:\n result.append(''.join(current_string))\n current_string.clear()\n\n return result\n",#noqa - }, -] - -# Replacement fewshot used when the test item coincides with one of the standard -# three (to avoid self-referential fewshots, mirroring the olmo_eval behaviour). -_STRLEN_FEWSHOT: dict[str, Any] = { - "task_id": "HumanEval/23", - "entry_point": "strlen", - "prompt": '\n\ndef strlen(string: str) -> int:\n """ Return length of given string\n >>> strlen(\'\')\n 0\n >>> strlen(\'abc\')\n 3\n """\n',#noqa - "canonical_solution": " return len(string)\n", -} -# fmt: on - -_STANDARD_FEWSHOT_IDS: frozenset[str] = frozenset(d["task_id"] for d in _CODEX_HUMANEVAL_FEWSHOTS) - - class _CodexHumanEval_Base(BaseTask[str]): """Shared base for codex_humaneval_gold_bpb_3shot-compatible HumanEval variants. 
@@ -218,20 +176,6 @@ def _get_choices(self, item: dict[str, Any]) -> list[str]: def _get_correct_index(self, item: dict[str, Any]) -> int: return 0 - def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None: - # Skip BPBStyle's default " " prefix — the prompt's trailing "\n" is - # stripped by ConcatFormatter, so no extra space is needed. - return [item["canonical_solution"]] - - def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict[str, Any]]: - test_id = item.get("task_id", "") - if test_id in _STANDARD_FEWSHOT_IDS: - # Avoid self-referential fewshots: drop the test item's own example - # and substitute strlen (HumanEval/23), mirroring olmo_eval. - base = [d for d in _CODEX_HUMANEVAL_FEWSHOTS if d["task_id"] != test_id] - return (base + [_STRLEN_FEWSHOT])[: self.num_fewshot] - return _CODEX_HUMANEVAL_FEWSHOTS[: self.num_fewshot] - class CodexHumanEval_BPB(_CodexHumanEval_Base): """BPB-only HumanEval that matches codex_humaneval_gold_bpb_3shot. @@ -242,3 +186,4 @@ class CodexHumanEval_BPB(_CodexHumanEval_Base): NAME = "CodexHumanEval_BPB" TASK_STYLER = BPBStyle(question_prefix="", cue_text="", trailing_newline=False) + TASK_STYLER.get_possible_completions = lambda self, item: [item["canonical_solution"]] diff --git a/src/eval_framework/tasks/benchmarks/mbpp.py b/src/eval_framework/tasks/benchmarks/mbpp.py index 12a0c7fd..fa3f2884 100644 --- a/src/eval_framework/tasks/benchmarks/mbpp.py +++ b/src/eval_framework/tasks/benchmarks/mbpp.py @@ -409,13 +409,6 @@ def _get_fewshot_target_text(self, item: dict[str, Any]) -> str: code = self._normalize_code(item["code"]) return f"```python\n{code}\n```" - def _get_possible_completions(self, item: dict[str, Any]) -> list[str]: - # No leading space: the cue already ends with "\n" so the code starts - # directly on the next line (unlike the default ClozeStyle which - # prepends a space to each completion). 
- code = self._normalize_code(item["code"]) - return [f"{code}\n```"] - def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict[str, Any]]: return _CODEX_MBPP_FEWSHOTS[: self.num_fewshot] @@ -429,3 +422,4 @@ class CodexMBPP_BPB(_CodexMBPP_Base): NAME = "CodexMBPP_BPB" TASK_STYLER = BPBStyle(question_prefix="", cue_text="```python\n", trailing_newline=True) + TASK_STYLER.get_possible_completions = lambda self, item: [f"```python\n{self._normalize_code(item['code'])}\n```"] diff --git a/src/eval_framework/tasks/benchmarks/mmlu.py b/src/eval_framework/tasks/benchmarks/mmlu.py index c668f9e7..97b2b791 100644 --- a/src/eval_framework/tasks/benchmarks/mmlu.py +++ b/src/eval_framework/tasks/benchmarks/mmlu.py @@ -243,10 +243,7 @@ def _get_initial_prompt_text(self, item: dict[str, Any]) -> str: class _MMLU_Base(BaseTask[str]): - """Shared base for TASK_STYLER-based MMLU variants (Cloze, MC, BPB). - - Subclasses set ``NAME`` and ``TASK_STYLER``; everything else is inherited. - """ + """Shared base for TASK_STYLER-based MMLU variants (Cloze, MC, BPB).""" DATASET_PATH = "cais/mmlu" SAMPLE_SPLIT = "test" @@ -270,9 +267,6 @@ def _get_choices(self, item: dict[str, Any]) -> list[str]: def _get_correct_index(self, item: dict[str, Any]) -> int: return item["answer"] - def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict[str, Any]]: - return list(self.dataset[self.FEWSHOT_SPLIT][: self.num_fewshot]) - class MMLUCloze(_MMLU_Base): NAME = "MMLUCloze" diff --git a/src/eval_framework/tasks/task_names.py b/src/eval_framework/tasks/task_names.py index c5887191..5b38c753 100644 --- a/src/eval_framework/tasks/task_names.py +++ b/src/eval_framework/tasks/task_names.py @@ -21,6 +21,9 @@ def register_all_tasks() -> None: register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.AIME2025") register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.AIME2026") register_lazy_task("eval_framework.tasks.benchmarks.arc.ARC") + 
register_lazy_task("eval_framework.tasks.benchmarks.arc.ARCCloze") + register_lazy_task("eval_framework.tasks.benchmarks.arc.ARCMC") + register_lazy_task("eval_framework.tasks.benchmarks.arc.ARCBPB") register_lazy_task("eval_framework.tasks.benchmarks.arc.ARC_IDK") register_lazy_task("eval_framework.tasks.benchmarks.arc.ARC_OLMES") register_lazy_task("eval_framework.tasks.benchmarks.arc_de.ARC_DE") @@ -61,12 +64,16 @@ def register_all_tasks() -> None: register_lazy_task("eval_framework.tasks.benchmarks.hellaswag.HELLASWAG_OLMES") register_lazy_task("eval_framework.tasks.benchmarks.hellaswag.HELLASWAG_IDK") register_lazy_task("eval_framework.tasks.benchmarks.hellaswag_de.HELLASWAG_DE") + register_lazy_task("eval_framework.tasks.benchmarks.hellaswag.HELLASWAGCloze") + register_lazy_task("eval_framework.tasks.benchmarks.hellaswag.HELLASWAGMC") + register_lazy_task("eval_framework.tasks.benchmarks.hellaswag.HELLASWAGBPB") register_lazy_task("eval_framework.tasks.benchmarks.opengptx_eu20.HELLASWAG_EU20_DE") register_lazy_task("eval_framework.tasks.benchmarks.opengptx_eu20.HELLASWAG_EU20_FR") register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEval") register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEvalBPB") register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEval_OLMES") register_lazy_task("eval_framework.tasks.benchmarks.humaneval.HumanEvalInstruct") + register_lazy_task("eval_framework.tasks.benchmarks.humaneval.CodexHumanEval_BPB") register_lazy_task("eval_framework.tasks.benchmarks.ifeval.IFEval") register_lazy_task("eval_framework.tasks.benchmarks.ifeval.IFEvalDe") register_lazy_task("eval_framework.tasks.benchmarks.ifeval.IFEvalFiSv") @@ -88,6 +95,7 @@ def register_all_tasks() -> None: register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.MATHMinervaBPB") register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.MATHMinerva_OLMES") 
register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.MATH500Minerva") + register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.MATH500Minerva_BPB") register_lazy_task("eval_framework.tasks.benchmarks.multipl_e.MultiPLEHumanEvalCpp") register_lazy_task("eval_framework.tasks.benchmarks.multipl_e.MultiPLEHumanEvalJava") register_lazy_task("eval_framework.tasks.benchmarks.multipl_e.MultiPLEHumanEvalJs") @@ -106,10 +114,18 @@ def register_all_tasks() -> None: register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_PROMPT_WITHOUT_TESTS") register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_PROMPT_WITHOUT_TESTS_SANITIZED") register_lazy_task("eval_framework.tasks.benchmarks.mbpp.MBPP_OLMES") + register_lazy_task("eval_framework.tasks.benchmarks.mbpp.CodexMBPP_BPB") register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU") register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU_IDK") register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLU_OLMES") register_lazy_task("eval_framework.tasks.benchmarks.mmlu.FullTextMMLU") + register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLUCloze") + register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLUMC") + register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLUBPB") + register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLUOtherBPB") + register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLUStemBPB") + register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLUHumanitiesBPB") + register_lazy_task("eval_framework.tasks.benchmarks.mmlu.MMLUSocialSciencesBPB") register_lazy_task("eval_framework.tasks.benchmarks.opengptx_eu20.MMLU_EU20_DE") register_lazy_task("eval_framework.tasks.benchmarks.opengptx_eu20.MMLU_EU20_FR") register_lazy_task("eval_framework.tasks.benchmarks.mmlu_de.MMLU_DE") From e12a809b3eab456b5201effe31d7a29783a94296 Mon Sep 17 00:00:00 2001 From: Prabhu Sivaprasad Date: Thu, 9 Apr 2026 12:42:06 +0000 Subject: [PATCH 10/13] formatter 
changes to humaneval, mbpp --- src/eval_framework/tasks/benchmarks/humaneval.py | 3 +-- src/eval_framework/tasks/benchmarks/mbpp.py | 10 ++++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/eval_framework/tasks/benchmarks/humaneval.py b/src/eval_framework/tasks/benchmarks/humaneval.py index 4f218c47..0b469266 100644 --- a/src/eval_framework/tasks/benchmarks/humaneval.py +++ b/src/eval_framework/tasks/benchmarks/humaneval.py @@ -185,5 +185,4 @@ class CodexHumanEval_BPB(_CodexHumanEval_Base): """ NAME = "CodexHumanEval_BPB" - TASK_STYLER = BPBStyle(question_prefix="", cue_text="", trailing_newline=False) - TASK_STYLER.get_possible_completions = lambda self, item: [item["canonical_solution"]] + TASK_STYLER = BPBStyle(question_prefix="", cue_text="", trailing_newline=False, leading_space_continuations=False) diff --git a/src/eval_framework/tasks/benchmarks/mbpp.py b/src/eval_framework/tasks/benchmarks/mbpp.py index fa3f2884..0e480f08 100644 --- a/src/eval_framework/tasks/benchmarks/mbpp.py +++ b/src/eval_framework/tasks/benchmarks/mbpp.py @@ -276,8 +276,9 @@ class MBPP_OLMES(MBPP): FEWSHOT_SPLIT = "test" def __init__(self, num_fewshot: int = 3) -> None: - super().__init__(num_fewshot) - assert num_fewshot == 3, "MBPP_OLMES requires exactly 3 fewshot examples" + if num_fewshot != 3: + logger.warning(f"MBPP_OLMES supports only 3-shot, got {num_fewshot}") + super().__init__(num_fewshot=3) self.stop_sequences = ["```", '\n"""', "\nassert", "\n#"] def _get_instruction_text(self, item: dict[str, Any]) -> str: @@ -421,5 +422,6 @@ class CodexMBPP_BPB(_CodexMBPP_Base): """ NAME = "CodexMBPP_BPB" - TASK_STYLER = BPBStyle(question_prefix="", cue_text="```python\n", trailing_newline=True) - TASK_STYLER.get_possible_completions = lambda self, item: [f"```python\n{self._normalize_code(item['code'])}\n```"] + TASK_STYLER = BPBStyle( + question_prefix="", cue_text="```python\n", trailing_newline=True, leading_space_continuations=False + ) From 
5bed0635ce6b6a83e0af4f7d9769c3329a6f96e6 Mon Sep 17 00:00:00 2001 From: Prabhu Sivaprasad Date: Thu, 9 Apr 2026 12:51:50 +0000 Subject: [PATCH 11/13] mbpp test fix --- tests/tests_eval_framework/tasks/test_mbpp_olmes.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/tests_eval_framework/tasks/test_mbpp_olmes.py b/tests/tests_eval_framework/tasks/test_mbpp_olmes.py index bb31a3eb..7e82eaa7 100644 --- a/tests/tests_eval_framework/tasks/test_mbpp_olmes.py +++ b/tests/tests_eval_framework/tasks/test_mbpp_olmes.py @@ -12,10 +12,6 @@ def task(self) -> MBPP_OLMES: with DatasetPatcher(MBPP_OLMES, num_fewshot=3, num_samples=10) as patched_task: return patched_task - def test_num_fewshot_must_be_3(self) -> None: - with pytest.raises(AssertionError, match="MBPP_OLMES requires exactly 3 fewshot examples"): - MBPP_OLMES(num_fewshot=1) - def test_stop_sequences(self) -> None: task = MBPP_OLMES(num_fewshot=3) assert task.stop_sequences == ["```", '\n"""', "\nassert", "\n#"] From 6b811bb69d960ae14ed8ef804165888d93b32861 Mon Sep 17 00:00:00 2001 From: Prabhu Sivaprasad Date: Thu, 9 Apr 2026 13:01:25 +0000 Subject: [PATCH 12/13] cleanups --- docs/tasks/FullTextMMLU.md | 2 +- docs/tasks/GlobalMMLU.md | 2 +- docs/tasks/GlobalMMLU_German.md | 2 +- docs/tasks/MMLU.md | 2 +- docs/tasks/MMLUBPB.md | 2 +- docs/tasks/MMLUCloze.md | 2 +- docs/tasks/MMLUMC.md | 2 +- docs/tasks/MMLU_COT.md | 2 +- docs/tasks/MMLU_EU20_DE.md | 2 +- docs/tasks/MMLU_EU20_FR.md | 2 +- docs/tasks/MMLU_IDK.md | 2 +- docs/tasks/MMLU_OLMES.md | 2 +- docs/tasks/MMMLU.md | 2 +- docs/tasks/MMMLU_GERMAN_COT.md | 4 +- docs/tasks/MMMLU_German.md | 2 +- src/eval_framework/tasks/benchmarks/arc.py | 239 ------------------ .../tasks/benchmarks/hellaswag.py | 21 -- .../tasks/benchmarks/humaneval.py | 18 +- src/eval_framework/tasks/benchmarks/mbpp.py | 23 -- src/eval_framework/tasks/benchmarks/mmlu.py | 2 +- 20 files changed, 18 insertions(+), 317 deletions(-) diff --git a/docs/tasks/FullTextMMLU.md 
b/docs/tasks/FullTextMMLU.md index 26761311..6edbb074 100644 --- a/docs/tasks/FullTextMMLU.md +++ b/docs/tasks/FullTextMMLU.md @@ -7,7 +7,7 @@ SAMPLE_SPLIT = test FEWSHOT_SPLIT = dev RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] -SUBJECTS = ['abstract_algebra', 'astronomy', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_physics', 'computer_security', 'conceptual_physics', 'electrical_engineering', 'elementary_mathematics', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_mathematics', 'high_school_physics', 'high_school_statistics', 'machine_learning', 'formal_logic', 'high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'international_law', 'jurisprudence', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'prehistory', 'professional_law', 'world_religions', 'econometrics', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_microeconomics', 'high_school_psychology', 'human_sexuality', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'anatomy', 'business_ethics', 'clinical_knowledge', 'college_medicine', 'global_facts', 'human_aging', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting', 'professional_medicine', 'virology'] +SUBJECTS = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 
'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions'] LANGUAGE = ```` diff --git a/docs/tasks/GlobalMMLU.md b/docs/tasks/GlobalMMLU.md index 49de70b8..60f8489e 100644 --- a/docs/tasks/GlobalMMLU.md +++ b/docs/tasks/GlobalMMLU.md @@ -7,7 +7,7 @@ SAMPLE_SPLIT = test FEWSHOT_SPLIT = dev RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] -SUBJECTS = [('fr', 'abstract_algebra'), ('fr', 'astronomy'), ('fr', 'college_biology'), ('fr', 'college_chemistry'), ('fr', 'college_computer_science'), ('fr', 'college_mathematics'), ('fr', 'college_physics'), ('fr', 'computer_security'), ('fr', 'conceptual_physics'), ('fr', 'electrical_engineering'), ('fr', 'elementary_mathematics'), ('fr', 'high_school_biology'), ('fr', 'high_school_chemistry'), ('fr', 'high_school_computer_science'), ('fr', 'high_school_mathematics'), ('fr', 'high_school_physics'), ('fr', 'high_school_statistics'), ('fr', 'machine_learning'), ('fr', 'formal_logic'), ('fr', 'high_school_european_history'), ('fr', 'high_school_us_history'), ('fr', 'high_school_world_history'), ('fr', 'international_law'), ('fr', 'jurisprudence'), ('fr', 'logical_fallacies'), ('fr', 'moral_disputes'), ('fr', 'moral_scenarios'), ('fr', 'philosophy'), ('fr', 
'prehistory'), ('fr', 'professional_law'), ('fr', 'world_religions'), ('fr', 'econometrics'), ('fr', 'high_school_geography'), ('fr', 'high_school_government_and_politics'), ('fr', 'high_school_macroeconomics'), ('fr', 'high_school_microeconomics'), ('fr', 'high_school_psychology'), ('fr', 'human_sexuality'), ('fr', 'professional_psychology'), ('fr', 'public_relations'), ('fr', 'security_studies'), ('fr', 'sociology'), ('fr', 'us_foreign_policy'), ('fr', 'anatomy'), ('fr', 'business_ethics'), ('fr', 'clinical_knowledge'), ('fr', 'college_medicine'), ('fr', 'global_facts'), ('fr', 'human_aging'), ('fr', 'management'), ('fr', 'marketing'), ('fr', 'medical_genetics'), ('fr', 'miscellaneous'), ('fr', 'nutrition'), ('fr', 'professional_accounting'), ('fr', 'professional_medicine'), ('fr', 'virology'), ('de', 'abstract_algebra'), ('de', 'astronomy'), ('de', 'college_biology'), ('de', 'college_chemistry'), ('de', 'college_computer_science'), ('de', 'college_mathematics'), ('de', 'college_physics'), ('de', 'computer_security'), ('de', 'conceptual_physics'), ('de', 'electrical_engineering'), ('de', 'elementary_mathematics'), ('de', 'high_school_biology'), ('de', 'high_school_chemistry'), ('de', 'high_school_computer_science'), ('de', 'high_school_mathematics'), ('de', 'high_school_physics'), ('de', 'high_school_statistics'), ('de', 'machine_learning'), ('de', 'formal_logic'), ('de', 'high_school_european_history'), ('de', 'high_school_us_history'), ('de', 'high_school_world_history'), ('de', 'international_law'), ('de', 'jurisprudence'), ('de', 'logical_fallacies'), ('de', 'moral_disputes'), ('de', 'moral_scenarios'), ('de', 'philosophy'), ('de', 'prehistory'), ('de', 'professional_law'), ('de', 'world_religions'), ('de', 'econometrics'), ('de', 'high_school_geography'), ('de', 'high_school_government_and_politics'), ('de', 'high_school_macroeconomics'), ('de', 'high_school_microeconomics'), ('de', 'high_school_psychology'), ('de', 'human_sexuality'), ('de', 
'professional_psychology'), ('de', 'public_relations'), ('de', 'security_studies'), ('de', 'sociology'), ('de', 'us_foreign_policy'), ('de', 'anatomy'), ('de', 'business_ethics'), ('de', 'clinical_knowledge'), ('de', 'college_medicine'), ('de', 'global_facts'), ('de', 'human_aging'), ('de', 'management'), ('de', 'marketing'), ('de', 'medical_genetics'), ('de', 'miscellaneous'), ('de', 'nutrition'), ('de', 'professional_accounting'), ('de', 'professional_medicine'), ('de', 'virology'), ('es', 'abstract_algebra'), ('es', 'astronomy'), ('es', 'college_biology'), ('es', 'college_chemistry'), ('es', 'college_computer_science'), ('es', 'college_mathematics'), ('es', 'college_physics'), ('es', 'computer_security'), ('es', 'conceptual_physics'), ('es', 'electrical_engineering'), ('es', 'elementary_mathematics'), ('es', 'high_school_biology'), ('es', 'high_school_chemistry'), ('es', 'high_school_computer_science'), ('es', 'high_school_mathematics'), ('es', 'high_school_physics'), ('es', 'high_school_statistics'), ('es', 'machine_learning'), ('es', 'formal_logic'), ('es', 'high_school_european_history'), ('es', 'high_school_us_history'), ('es', 'high_school_world_history'), ('es', 'international_law'), ('es', 'jurisprudence'), ('es', 'logical_fallacies'), ('es', 'moral_disputes'), ('es', 'moral_scenarios'), ('es', 'philosophy'), ('es', 'prehistory'), ('es', 'professional_law'), ('es', 'world_religions'), ('es', 'econometrics'), ('es', 'high_school_geography'), ('es', 'high_school_government_and_politics'), ('es', 'high_school_macroeconomics'), ('es', 'high_school_microeconomics'), ('es', 'high_school_psychology'), ('es', 'human_sexuality'), ('es', 'professional_psychology'), ('es', 'public_relations'), ('es', 'security_studies'), ('es', 'sociology'), ('es', 'us_foreign_policy'), ('es', 'anatomy'), ('es', 'business_ethics'), ('es', 'clinical_knowledge'), ('es', 'college_medicine'), ('es', 'global_facts'), ('es', 'human_aging'), ('es', 'management'), ('es', 'marketing'), 
('es', 'medical_genetics'), ('es', 'miscellaneous'), ('es', 'nutrition'), ('es', 'professional_accounting'), ('es', 'professional_medicine'), ('es', 'virology'), ('it', 'abstract_algebra'), ('it', 'astronomy'), ('it', 'college_biology'), ('it', 'college_chemistry'), ('it', 'college_computer_science'), ('it', 'college_mathematics'), ('it', 'college_physics'), ('it', 'computer_security'), ('it', 'conceptual_physics'), ('it', 'electrical_engineering'), ('it', 'elementary_mathematics'), ('it', 'high_school_biology'), ('it', 'high_school_chemistry'), ('it', 'high_school_computer_science'), ('it', 'high_school_mathematics'), ('it', 'high_school_physics'), ('it', 'high_school_statistics'), ('it', 'machine_learning'), ('it', 'formal_logic'), ('it', 'high_school_european_history'), ('it', 'high_school_us_history'), ('it', 'high_school_world_history'), ('it', 'international_law'), ('it', 'jurisprudence'), ('it', 'logical_fallacies'), ('it', 'moral_disputes'), ('it', 'moral_scenarios'), ('it', 'philosophy'), ('it', 'prehistory'), ('it', 'professional_law'), ('it', 'world_religions'), ('it', 'econometrics'), ('it', 'high_school_geography'), ('it', 'high_school_government_and_politics'), ('it', 'high_school_macroeconomics'), ('it', 'high_school_microeconomics'), ('it', 'high_school_psychology'), ('it', 'human_sexuality'), ('it', 'professional_psychology'), ('it', 'public_relations'), ('it', 'security_studies'), ('it', 'sociology'), ('it', 'us_foreign_policy'), ('it', 'anatomy'), ('it', 'business_ethics'), ('it', 'clinical_knowledge'), ('it', 'college_medicine'), ('it', 'global_facts'), ('it', 'human_aging'), ('it', 'management'), ('it', 'marketing'), ('it', 'medical_genetics'), ('it', 'miscellaneous'), ('it', 'nutrition'), ('it', 'professional_accounting'), ('it', 'professional_medicine'), ('it', 'virology'), ('pt', 'abstract_algebra'), ('pt', 'astronomy'), ('pt', 'college_biology'), ('pt', 'college_chemistry'), ('pt', 'college_computer_science'), ('pt', 'college_mathematics'), 
('pt', 'college_physics'), ('pt', 'computer_security'), ('pt', 'conceptual_physics'), ('pt', 'electrical_engineering'), ('pt', 'elementary_mathematics'), ('pt', 'high_school_biology'), ('pt', 'high_school_chemistry'), ('pt', 'high_school_computer_science'), ('pt', 'high_school_mathematics'), ('pt', 'high_school_physics'), ('pt', 'high_school_statistics'), ('pt', 'machine_learning'), ('pt', 'formal_logic'), ('pt', 'high_school_european_history'), ('pt', 'high_school_us_history'), ('pt', 'high_school_world_history'), ('pt', 'international_law'), ('pt', 'jurisprudence'), ('pt', 'logical_fallacies'), ('pt', 'moral_disputes'), ('pt', 'moral_scenarios'), ('pt', 'philosophy'), ('pt', 'prehistory'), ('pt', 'professional_law'), ('pt', 'world_religions'), ('pt', 'econometrics'), ('pt', 'high_school_geography'), ('pt', 'high_school_government_and_politics'), ('pt', 'high_school_macroeconomics'), ('pt', 'high_school_microeconomics'), ('pt', 'high_school_psychology'), ('pt', 'human_sexuality'), ('pt', 'professional_psychology'), ('pt', 'public_relations'), ('pt', 'security_studies'), ('pt', 'sociology'), ('pt', 'us_foreign_policy'), ('pt', 'anatomy'), ('pt', 'business_ethics'), ('pt', 'clinical_knowledge'), ('pt', 'college_medicine'), ('pt', 'global_facts'), ('pt', 'human_aging'), ('pt', 'management'), ('pt', 'marketing'), ('pt', 'medical_genetics'), ('pt', 'miscellaneous'), ('pt', 'nutrition'), ('pt', 'professional_accounting'), ('pt', 'professional_medicine'), ('pt', 'virology'), ('ar', 'abstract_algebra'), ('ar', 'astronomy'), ('ar', 'college_biology'), ('ar', 'college_chemistry'), ('ar', 'college_computer_science'), ('ar', 'college_mathematics'), ('ar', 'college_physics'), ('ar', 'computer_security'), ('ar', 'conceptual_physics'), ('ar', 'electrical_engineering'), ('ar', 'elementary_mathematics'), ('ar', 'high_school_biology'), ('ar', 'high_school_chemistry'), ('ar', 'high_school_computer_science'), ('ar', 'high_school_mathematics'), ('ar', 'high_school_physics'), ('ar', 
'high_school_statistics'), ('ar', 'machine_learning'), ('ar', 'formal_logic'), ('ar', 'high_school_european_history'), ('ar', 'high_school_us_history'), ('ar', 'high_school_world_history'), ('ar', 'international_law'), ('ar', 'jurisprudence'), ('ar', 'logical_fallacies'), ('ar', 'moral_disputes'), ('ar', 'moral_scenarios'), ('ar', 'philosophy'), ('ar', 'prehistory'), ('ar', 'professional_law'), ('ar', 'world_religions'), ('ar', 'econometrics'), ('ar', 'high_school_geography'), ('ar', 'high_school_government_and_politics'), ('ar', 'high_school_macroeconomics'), ('ar', 'high_school_microeconomics'), ('ar', 'high_school_psychology'), ('ar', 'human_sexuality'), ('ar', 'professional_psychology'), ('ar', 'public_relations'), ('ar', 'security_studies'), ('ar', 'sociology'), ('ar', 'us_foreign_policy'), ('ar', 'anatomy'), ('ar', 'business_ethics'), ('ar', 'clinical_knowledge'), ('ar', 'college_medicine'), ('ar', 'global_facts'), ('ar', 'human_aging'), ('ar', 'management'), ('ar', 'marketing'), ('ar', 'medical_genetics'), ('ar', 'miscellaneous'), ('ar', 'nutrition'), ('ar', 'professional_accounting'), ('ar', 'professional_medicine'), ('ar', 'virology')] +SUBJECTS = [('fr', 'abstract_algebra'), ('fr', 'anatomy'), ('fr', 'astronomy'), ('fr', 'business_ethics'), ('fr', 'clinical_knowledge'), ('fr', 'college_biology'), ('fr', 'college_chemistry'), ('fr', 'college_computer_science'), ('fr', 'college_mathematics'), ('fr', 'college_medicine'), ('fr', 'college_physics'), ('fr', 'computer_security'), ('fr', 'conceptual_physics'), ('fr', 'econometrics'), ('fr', 'electrical_engineering'), ('fr', 'elementary_mathematics'), ('fr', 'formal_logic'), ('fr', 'global_facts'), ('fr', 'high_school_biology'), ('fr', 'high_school_chemistry'), ('fr', 'high_school_computer_science'), ('fr', 'high_school_european_history'), ('fr', 'high_school_geography'), ('fr', 'high_school_government_and_politics'), ('fr', 'high_school_macroeconomics'), ('fr', 'high_school_mathematics'), ('fr', 
'high_school_microeconomics'), ('fr', 'high_school_physics'), ('fr', 'high_school_psychology'), ('fr', 'high_school_statistics'), ('fr', 'high_school_us_history'), ('fr', 'high_school_world_history'), ('fr', 'human_aging'), ('fr', 'human_sexuality'), ('fr', 'international_law'), ('fr', 'jurisprudence'), ('fr', 'logical_fallacies'), ('fr', 'machine_learning'), ('fr', 'management'), ('fr', 'marketing'), ('fr', 'medical_genetics'), ('fr', 'miscellaneous'), ('fr', 'moral_disputes'), ('fr', 'moral_scenarios'), ('fr', 'nutrition'), ('fr', 'philosophy'), ('fr', 'prehistory'), ('fr', 'professional_accounting'), ('fr', 'professional_law'), ('fr', 'professional_medicine'), ('fr', 'professional_psychology'), ('fr', 'public_relations'), ('fr', 'security_studies'), ('fr', 'sociology'), ('fr', 'us_foreign_policy'), ('fr', 'virology'), ('fr', 'world_religions'), ('de', 'abstract_algebra'), ('de', 'anatomy'), ('de', 'astronomy'), ('de', 'business_ethics'), ('de', 'clinical_knowledge'), ('de', 'college_biology'), ('de', 'college_chemistry'), ('de', 'college_computer_science'), ('de', 'college_mathematics'), ('de', 'college_medicine'), ('de', 'college_physics'), ('de', 'computer_security'), ('de', 'conceptual_physics'), ('de', 'econometrics'), ('de', 'electrical_engineering'), ('de', 'elementary_mathematics'), ('de', 'formal_logic'), ('de', 'global_facts'), ('de', 'high_school_biology'), ('de', 'high_school_chemistry'), ('de', 'high_school_computer_science'), ('de', 'high_school_european_history'), ('de', 'high_school_geography'), ('de', 'high_school_government_and_politics'), ('de', 'high_school_macroeconomics'), ('de', 'high_school_mathematics'), ('de', 'high_school_microeconomics'), ('de', 'high_school_physics'), ('de', 'high_school_psychology'), ('de', 'high_school_statistics'), ('de', 'high_school_us_history'), ('de', 'high_school_world_history'), ('de', 'human_aging'), ('de', 'human_sexuality'), ('de', 'international_law'), ('de', 'jurisprudence'), ('de', 'logical_fallacies'), 
('de', 'machine_learning'), ('de', 'management'), ('de', 'marketing'), ('de', 'medical_genetics'), ('de', 'miscellaneous'), ('de', 'moral_disputes'), ('de', 'moral_scenarios'), ('de', 'nutrition'), ('de', 'philosophy'), ('de', 'prehistory'), ('de', 'professional_accounting'), ('de', 'professional_law'), ('de', 'professional_medicine'), ('de', 'professional_psychology'), ('de', 'public_relations'), ('de', 'security_studies'), ('de', 'sociology'), ('de', 'us_foreign_policy'), ('de', 'virology'), ('de', 'world_religions'), ('es', 'abstract_algebra'), ('es', 'anatomy'), ('es', 'astronomy'), ('es', 'business_ethics'), ('es', 'clinical_knowledge'), ('es', 'college_biology'), ('es', 'college_chemistry'), ('es', 'college_computer_science'), ('es', 'college_mathematics'), ('es', 'college_medicine'), ('es', 'college_physics'), ('es', 'computer_security'), ('es', 'conceptual_physics'), ('es', 'econometrics'), ('es', 'electrical_engineering'), ('es', 'elementary_mathematics'), ('es', 'formal_logic'), ('es', 'global_facts'), ('es', 'high_school_biology'), ('es', 'high_school_chemistry'), ('es', 'high_school_computer_science'), ('es', 'high_school_european_history'), ('es', 'high_school_geography'), ('es', 'high_school_government_and_politics'), ('es', 'high_school_macroeconomics'), ('es', 'high_school_mathematics'), ('es', 'high_school_microeconomics'), ('es', 'high_school_physics'), ('es', 'high_school_psychology'), ('es', 'high_school_statistics'), ('es', 'high_school_us_history'), ('es', 'high_school_world_history'), ('es', 'human_aging'), ('es', 'human_sexuality'), ('es', 'international_law'), ('es', 'jurisprudence'), ('es', 'logical_fallacies'), ('es', 'machine_learning'), ('es', 'management'), ('es', 'marketing'), ('es', 'medical_genetics'), ('es', 'miscellaneous'), ('es', 'moral_disputes'), ('es', 'moral_scenarios'), ('es', 'nutrition'), ('es', 'philosophy'), ('es', 'prehistory'), ('es', 'professional_accounting'), ('es', 'professional_law'), ('es', 
'professional_medicine'), ('es', 'professional_psychology'), ('es', 'public_relations'), ('es', 'security_studies'), ('es', 'sociology'), ('es', 'us_foreign_policy'), ('es', 'virology'), ('es', 'world_religions'), ('it', 'abstract_algebra'), ('it', 'anatomy'), ('it', 'astronomy'), ('it', 'business_ethics'), ('it', 'clinical_knowledge'), ('it', 'college_biology'), ('it', 'college_chemistry'), ('it', 'college_computer_science'), ('it', 'college_mathematics'), ('it', 'college_medicine'), ('it', 'college_physics'), ('it', 'computer_security'), ('it', 'conceptual_physics'), ('it', 'econometrics'), ('it', 'electrical_engineering'), ('it', 'elementary_mathematics'), ('it', 'formal_logic'), ('it', 'global_facts'), ('it', 'high_school_biology'), ('it', 'high_school_chemistry'), ('it', 'high_school_computer_science'), ('it', 'high_school_european_history'), ('it', 'high_school_geography'), ('it', 'high_school_government_and_politics'), ('it', 'high_school_macroeconomics'), ('it', 'high_school_mathematics'), ('it', 'high_school_microeconomics'), ('it', 'high_school_physics'), ('it', 'high_school_psychology'), ('it', 'high_school_statistics'), ('it', 'high_school_us_history'), ('it', 'high_school_world_history'), ('it', 'human_aging'), ('it', 'human_sexuality'), ('it', 'international_law'), ('it', 'jurisprudence'), ('it', 'logical_fallacies'), ('it', 'machine_learning'), ('it', 'management'), ('it', 'marketing'), ('it', 'medical_genetics'), ('it', 'miscellaneous'), ('it', 'moral_disputes'), ('it', 'moral_scenarios'), ('it', 'nutrition'), ('it', 'philosophy'), ('it', 'prehistory'), ('it', 'professional_accounting'), ('it', 'professional_law'), ('it', 'professional_medicine'), ('it', 'professional_psychology'), ('it', 'public_relations'), ('it', 'security_studies'), ('it', 'sociology'), ('it', 'us_foreign_policy'), ('it', 'virology'), ('it', 'world_religions'), ('pt', 'abstract_algebra'), ('pt', 'anatomy'), ('pt', 'astronomy'), ('pt', 'business_ethics'), ('pt', 
'clinical_knowledge'), ('pt', 'college_biology'), ('pt', 'college_chemistry'), ('pt', 'college_computer_science'), ('pt', 'college_mathematics'), ('pt', 'college_medicine'), ('pt', 'college_physics'), ('pt', 'computer_security'), ('pt', 'conceptual_physics'), ('pt', 'econometrics'), ('pt', 'electrical_engineering'), ('pt', 'elementary_mathematics'), ('pt', 'formal_logic'), ('pt', 'global_facts'), ('pt', 'high_school_biology'), ('pt', 'high_school_chemistry'), ('pt', 'high_school_computer_science'), ('pt', 'high_school_european_history'), ('pt', 'high_school_geography'), ('pt', 'high_school_government_and_politics'), ('pt', 'high_school_macroeconomics'), ('pt', 'high_school_mathematics'), ('pt', 'high_school_microeconomics'), ('pt', 'high_school_physics'), ('pt', 'high_school_psychology'), ('pt', 'high_school_statistics'), ('pt', 'high_school_us_history'), ('pt', 'high_school_world_history'), ('pt', 'human_aging'), ('pt', 'human_sexuality'), ('pt', 'international_law'), ('pt', 'jurisprudence'), ('pt', 'logical_fallacies'), ('pt', 'machine_learning'), ('pt', 'management'), ('pt', 'marketing'), ('pt', 'medical_genetics'), ('pt', 'miscellaneous'), ('pt', 'moral_disputes'), ('pt', 'moral_scenarios'), ('pt', 'nutrition'), ('pt', 'philosophy'), ('pt', 'prehistory'), ('pt', 'professional_accounting'), ('pt', 'professional_law'), ('pt', 'professional_medicine'), ('pt', 'professional_psychology'), ('pt', 'public_relations'), ('pt', 'security_studies'), ('pt', 'sociology'), ('pt', 'us_foreign_policy'), ('pt', 'virology'), ('pt', 'world_religions'), ('ar', 'abstract_algebra'), ('ar', 'anatomy'), ('ar', 'astronomy'), ('ar', 'business_ethics'), ('ar', 'clinical_knowledge'), ('ar', 'college_biology'), ('ar', 'college_chemistry'), ('ar', 'college_computer_science'), ('ar', 'college_mathematics'), ('ar', 'college_medicine'), ('ar', 'college_physics'), ('ar', 'computer_security'), ('ar', 'conceptual_physics'), ('ar', 'econometrics'), ('ar', 'electrical_engineering'), ('ar', 
'elementary_mathematics'), ('ar', 'formal_logic'), ('ar', 'global_facts'), ('ar', 'high_school_biology'), ('ar', 'high_school_chemistry'), ('ar', 'high_school_computer_science'), ('ar', 'high_school_european_history'), ('ar', 'high_school_geography'), ('ar', 'high_school_government_and_politics'), ('ar', 'high_school_macroeconomics'), ('ar', 'high_school_mathematics'), ('ar', 'high_school_microeconomics'), ('ar', 'high_school_physics'), ('ar', 'high_school_psychology'), ('ar', 'high_school_statistics'), ('ar', 'high_school_us_history'), ('ar', 'high_school_world_history'), ('ar', 'human_aging'), ('ar', 'human_sexuality'), ('ar', 'international_law'), ('ar', 'jurisprudence'), ('ar', 'logical_fallacies'), ('ar', 'machine_learning'), ('ar', 'management'), ('ar', 'marketing'), ('ar', 'medical_genetics'), ('ar', 'miscellaneous'), ('ar', 'moral_disputes'), ('ar', 'moral_scenarios'), ('ar', 'nutrition'), ('ar', 'philosophy'), ('ar', 'prehistory'), ('ar', 'professional_accounting'), ('ar', 'professional_law'), ('ar', 'professional_medicine'), ('ar', 'professional_psychology'), ('ar', 'public_relations'), ('ar', 'security_studies'), ('ar', 'sociology'), ('ar', 'us_foreign_policy'), ('ar', 'virology'), ('ar', 'world_religions')] LANGUAGE = {"('fr', 'abstract_algebra')": , "('fr', 'anatomy')": , "('fr', 'astronomy')": , "('fr', 'business_ethics')": , "('fr', 'clinical_knowledge')": , "('fr', 'college_biology')": , "('fr', 'college_chemistry')": , "('fr', 'college_computer_science')": , "('fr', 'college_mathematics')": , "('fr', 'college_medicine')": , "('fr', 'college_physics')": , "('fr', 'computer_security')": , "('fr', 'conceptual_physics')": , "('fr', 'econometrics')": , "('fr', 'electrical_engineering')": , "('fr', 'elementary_mathematics')": , "('fr', 'formal_logic')": , "('fr', 'global_facts')": , "('fr', 'high_school_biology')": , "('fr', 'high_school_chemistry')": , "('fr', 'high_school_computer_science')": , "('fr', 'high_school_european_history')": , "('fr', 
'high_school_geography')": , "('fr', 'high_school_government_and_politics')": , "('fr', 'high_school_macroeconomics')": , "('fr', 'high_school_mathematics')": , "('fr', 'high_school_microeconomics')": , "('fr', 'high_school_physics')": , "('fr', 'high_school_psychology')": , "('fr', 'high_school_statistics')": , "('fr', 'high_school_us_history')": , "('fr', 'high_school_world_history')": , "('fr', 'human_aging')": , "('fr', 'human_sexuality')": , "('fr', 'international_law')": , "('fr', 'jurisprudence')": , "('fr', 'logical_fallacies')": , "('fr', 'machine_learning')": , "('fr', 'management')": , "('fr', 'marketing')": , "('fr', 'medical_genetics')": , "('fr', 'miscellaneous')": , "('fr', 'moral_disputes')": , "('fr', 'moral_scenarios')": , "('fr', 'nutrition')": , "('fr', 'philosophy')": , "('fr', 'prehistory')": , "('fr', 'professional_accounting')": , "('fr', 'professional_law')": , "('fr', 'professional_medicine')": , "('fr', 'professional_psychology')": , "('fr', 'public_relations')": , "('fr', 'security_studies')": , "('fr', 'sociology')": , "('fr', 'us_foreign_policy')": , "('fr', 'virology')": , "('fr', 'world_religions')": , "('de', 'abstract_algebra')": , "('de', 'anatomy')": , "('de', 'astronomy')": , "('de', 'business_ethics')": , "('de', 'clinical_knowledge')": , "('de', 'college_biology')": , "('de', 'college_chemistry')": , "('de', 'college_computer_science')": , "('de', 'college_mathematics')": , "('de', 'college_medicine')": , "('de', 'college_physics')": , "('de', 'computer_security')": , "('de', 'conceptual_physics')": , "('de', 'econometrics')": , "('de', 'electrical_engineering')": , "('de', 'elementary_mathematics')": , "('de', 'formal_logic')": , "('de', 'global_facts')": , "('de', 'high_school_biology')": , "('de', 'high_school_chemistry')": , "('de', 'high_school_computer_science')": , "('de', 'high_school_european_history')": , "('de', 'high_school_geography')": , "('de', 'high_school_government_and_politics')": , "('de', 
'high_school_macroeconomics')": , "('de', 'high_school_mathematics')": , "('de', 'high_school_microeconomics')": , "('de', 'high_school_physics')": , "('de', 'high_school_psychology')": , "('de', 'high_school_statistics')": , "('de', 'high_school_us_history')": , "('de', 'high_school_world_history')": , "('de', 'human_aging')": , "('de', 'human_sexuality')": , "('de', 'international_law')": , "('de', 'jurisprudence')": , "('de', 'logical_fallacies')": , "('de', 'machine_learning')": , "('de', 'management')": , "('de', 'marketing')": , "('de', 'medical_genetics')": , "('de', 'miscellaneous')": , "('de', 'moral_disputes')": , "('de', 'moral_scenarios')": , "('de', 'nutrition')": , "('de', 'philosophy')": , "('de', 'prehistory')": , "('de', 'professional_accounting')": , "('de', 'professional_law')": , "('de', 'professional_medicine')": , "('de', 'professional_psychology')": , "('de', 'public_relations')": , "('de', 'security_studies')": , "('de', 'sociology')": , "('de', 'us_foreign_policy')": , "('de', 'virology')": , "('de', 'world_religions')": , "('es', 'abstract_algebra')": , "('es', 'anatomy')": , "('es', 'astronomy')": , "('es', 'business_ethics')": , "('es', 'clinical_knowledge')": , "('es', 'college_biology')": , "('es', 'college_chemistry')": , "('es', 'college_computer_science')": , "('es', 'college_mathematics')": , "('es', 'college_medicine')": , "('es', 'college_physics')": , "('es', 'computer_security')": , "('es', 'conceptual_physics')": , "('es', 'econometrics')": , "('es', 'electrical_engineering')": , "('es', 'elementary_mathematics')": , "('es', 'formal_logic')": , "('es', 'global_facts')": , "('es', 'high_school_biology')": , "('es', 'high_school_chemistry')": , "('es', 'high_school_computer_science')": , "('es', 'high_school_european_history')": , "('es', 'high_school_geography')": , "('es', 'high_school_government_and_politics')": , "('es', 'high_school_macroeconomics')": , "('es', 'high_school_mathematics')": , "('es', 
'high_school_microeconomics')": , "('es', 'high_school_physics')": , "('es', 'high_school_psychology')": , "('es', 'high_school_statistics')": , "('es', 'high_school_us_history')": , "('es', 'high_school_world_history')": , "('es', 'human_aging')": , "('es', 'human_sexuality')": , "('es', 'international_law')": , "('es', 'jurisprudence')": , "('es', 'logical_fallacies')": , "('es', 'machine_learning')": , "('es', 'management')": , "('es', 'marketing')": , "('es', 'medical_genetics')": , "('es', 'miscellaneous')": , "('es', 'moral_disputes')": , "('es', 'moral_scenarios')": , "('es', 'nutrition')": , "('es', 'philosophy')": , "('es', 'prehistory')": , "('es', 'professional_accounting')": , "('es', 'professional_law')": , "('es', 'professional_medicine')": , "('es', 'professional_psychology')": , "('es', 'public_relations')": , "('es', 'security_studies')": , "('es', 'sociology')": , "('es', 'us_foreign_policy')": , "('es', 'virology')": , "('es', 'world_religions')": , "('it', 'abstract_algebra')": , "('it', 'anatomy')": , "('it', 'astronomy')": , "('it', 'business_ethics')": , "('it', 'clinical_knowledge')": , "('it', 'college_biology')": , "('it', 'college_chemistry')": , "('it', 'college_computer_science')": , "('it', 'college_mathematics')": , "('it', 'college_medicine')": , "('it', 'college_physics')": , "('it', 'computer_security')": , "('it', 'conceptual_physics')": , "('it', 'econometrics')": , "('it', 'electrical_engineering')": , "('it', 'elementary_mathematics')": , "('it', 'formal_logic')": , "('it', 'global_facts')": , "('it', 'high_school_biology')": , "('it', 'high_school_chemistry')": , "('it', 'high_school_computer_science')": , "('it', 'high_school_european_history')": , "('it', 'high_school_geography')": , "('it', 'high_school_government_and_politics')": , "('it', 'high_school_macroeconomics')": , "('it', 'high_school_mathematics')": , "('it', 'high_school_microeconomics')": , "('it', 'high_school_physics')": , "('it', 'high_school_psychology')": 
, "('it', 'high_school_statistics')": , "('it', 'high_school_us_history')": , "('it', 'high_school_world_history')": , "('it', 'human_aging')": , "('it', 'human_sexuality')": , "('it', 'international_law')": , "('it', 'jurisprudence')": , "('it', 'logical_fallacies')": , "('it', 'machine_learning')": , "('it', 'management')": , "('it', 'marketing')": , "('it', 'medical_genetics')": , "('it', 'miscellaneous')": , "('it', 'moral_disputes')": , "('it', 'moral_scenarios')": , "('it', 'nutrition')": , "('it', 'philosophy')": , "('it', 'prehistory')": , "('it', 'professional_accounting')": , "('it', 'professional_law')": , "('it', 'professional_medicine')": , "('it', 'professional_psychology')": , "('it', 'public_relations')": , "('it', 'security_studies')": , "('it', 'sociology')": , "('it', 'us_foreign_policy')": , "('it', 'virology')": , "('it', 'world_religions')": , "('pt', 'abstract_algebra')": , "('pt', 'anatomy')": , "('pt', 'astronomy')": , "('pt', 'business_ethics')": , "('pt', 'clinical_knowledge')": , "('pt', 'college_biology')": , "('pt', 'college_chemistry')": , "('pt', 'college_computer_science')": , "('pt', 'college_mathematics')": , "('pt', 'college_medicine')": , "('pt', 'college_physics')": , "('pt', 'computer_security')": , "('pt', 'conceptual_physics')": , "('pt', 'econometrics')": , "('pt', 'electrical_engineering')": , "('pt', 'elementary_mathematics')": , "('pt', 'formal_logic')": , "('pt', 'global_facts')": , "('pt', 'high_school_biology')": , "('pt', 'high_school_chemistry')": , "('pt', 'high_school_computer_science')": , "('pt', 'high_school_european_history')": , "('pt', 'high_school_geography')": , "('pt', 'high_school_government_and_politics')": , "('pt', 'high_school_macroeconomics')": , "('pt', 'high_school_mathematics')": , "('pt', 'high_school_microeconomics')": , "('pt', 'high_school_physics')": , "('pt', 'high_school_psychology')": , "('pt', 'high_school_statistics')": , "('pt', 'high_school_us_history')": , "('pt', 
'high_school_world_history')": , "('pt', 'human_aging')": , "('pt', 'human_sexuality')": , "('pt', 'international_law')": , "('pt', 'jurisprudence')": , "('pt', 'logical_fallacies')": , "('pt', 'machine_learning')": , "('pt', 'management')": , "('pt', 'marketing')": , "('pt', 'medical_genetics')": , "('pt', 'miscellaneous')": , "('pt', 'moral_disputes')": , "('pt', 'moral_scenarios')": , "('pt', 'nutrition')": , "('pt', 'philosophy')": , "('pt', 'prehistory')": , "('pt', 'professional_accounting')": , "('pt', 'professional_law')": , "('pt', 'professional_medicine')": , "('pt', 'professional_psychology')": , "('pt', 'public_relations')": , "('pt', 'security_studies')": , "('pt', 'sociology')": , "('pt', 'us_foreign_policy')": , "('pt', 'virology')": , "('pt', 'world_religions')": , "('ar', 'abstract_algebra')": , "('ar', 'anatomy')": , "('ar', 'astronomy')": , "('ar', 'business_ethics')": , "('ar', 'clinical_knowledge')": , "('ar', 'college_biology')": , "('ar', 'college_chemistry')": , "('ar', 'college_computer_science')": , "('ar', 'college_mathematics')": , "('ar', 'college_medicine')": , "('ar', 'college_physics')": , "('ar', 'computer_security')": , "('ar', 'conceptual_physics')": , "('ar', 'econometrics')": , "('ar', 'electrical_engineering')": , "('ar', 'elementary_mathematics')": , "('ar', 'formal_logic')": , "('ar', 'global_facts')": , "('ar', 'high_school_biology')": , "('ar', 'high_school_chemistry')": , "('ar', 'high_school_computer_science')": , "('ar', 'high_school_european_history')": , "('ar', 'high_school_geography')": , "('ar', 'high_school_government_and_politics')": , "('ar', 'high_school_macroeconomics')": , "('ar', 'high_school_mathematics')": , "('ar', 'high_school_microeconomics')": , "('ar', 'high_school_physics')": , "('ar', 'high_school_psychology')": , "('ar', 'high_school_statistics')": , "('ar', 'high_school_us_history')": , "('ar', 'high_school_world_history')": , "('ar', 'human_aging')": , "('ar', 'human_sexuality')": , "('ar', 
'international_law')": , "('ar', 'jurisprudence')": , "('ar', 'logical_fallacies')": , "('ar', 'machine_learning')": , "('ar', 'management')": , "('ar', 'marketing')": , "('ar', 'medical_genetics')": , "('ar', 'miscellaneous')": , "('ar', 'moral_disputes')": , "('ar', 'moral_scenarios')": , "('ar', 'nutrition')": , "('ar', 'philosophy')": , "('ar', 'prehistory')": , "('ar', 'professional_accounting')": , "('ar', 'professional_law')": , "('ar', 'professional_medicine')": , "('ar', 'professional_psychology')": , "('ar', 'public_relations')": , "('ar', 'security_studies')": , "('ar', 'sociology')": , "('ar', 'us_foreign_policy')": , "('ar', 'virology')": , "('ar', 'world_religions')": } ```` diff --git a/docs/tasks/GlobalMMLU_German.md b/docs/tasks/GlobalMMLU_German.md index e315504e..3c7d5807 100644 --- a/docs/tasks/GlobalMMLU_German.md +++ b/docs/tasks/GlobalMMLU_German.md @@ -7,7 +7,7 @@ SAMPLE_SPLIT = test FEWSHOT_SPLIT = dev RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] -SUBJECTS = [('de', 'abstract_algebra'), ('de', 'astronomy'), ('de', 'college_biology'), ('de', 'college_chemistry'), ('de', 'college_computer_science'), ('de', 'college_mathematics'), ('de', 'college_physics'), ('de', 'computer_security'), ('de', 'conceptual_physics'), ('de', 'electrical_engineering'), ('de', 'elementary_mathematics'), ('de', 'high_school_biology'), ('de', 'high_school_chemistry'), ('de', 'high_school_computer_science'), ('de', 'high_school_mathematics'), ('de', 'high_school_physics'), ('de', 'high_school_statistics'), ('de', 'machine_learning'), ('de', 'formal_logic'), ('de', 'high_school_european_history'), ('de', 'high_school_us_history'), ('de', 'high_school_world_history'), ('de', 'international_law'), ('de', 'jurisprudence'), ('de', 'logical_fallacies'), ('de', 'moral_disputes'), ('de', 'moral_scenarios'), ('de', 'philosophy'), ('de', 'prehistory'), ('de', 'professional_law'), ('de', 'world_religions'), 
('de', 'econometrics'), ('de', 'high_school_geography'), ('de', 'high_school_government_and_politics'), ('de', 'high_school_macroeconomics'), ('de', 'high_school_microeconomics'), ('de', 'high_school_psychology'), ('de', 'human_sexuality'), ('de', 'professional_psychology'), ('de', 'public_relations'), ('de', 'security_studies'), ('de', 'sociology'), ('de', 'us_foreign_policy'), ('de', 'anatomy'), ('de', 'business_ethics'), ('de', 'clinical_knowledge'), ('de', 'college_medicine'), ('de', 'global_facts'), ('de', 'human_aging'), ('de', 'management'), ('de', 'marketing'), ('de', 'medical_genetics'), ('de', 'miscellaneous'), ('de', 'nutrition'), ('de', 'professional_accounting'), ('de', 'professional_medicine'), ('de', 'virology')] +SUBJECTS = [('de', 'abstract_algebra'), ('de', 'anatomy'), ('de', 'astronomy'), ('de', 'business_ethics'), ('de', 'clinical_knowledge'), ('de', 'college_biology'), ('de', 'college_chemistry'), ('de', 'college_computer_science'), ('de', 'college_mathematics'), ('de', 'college_medicine'), ('de', 'college_physics'), ('de', 'computer_security'), ('de', 'conceptual_physics'), ('de', 'econometrics'), ('de', 'electrical_engineering'), ('de', 'elementary_mathematics'), ('de', 'formal_logic'), ('de', 'global_facts'), ('de', 'high_school_biology'), ('de', 'high_school_chemistry'), ('de', 'high_school_computer_science'), ('de', 'high_school_european_history'), ('de', 'high_school_geography'), ('de', 'high_school_government_and_politics'), ('de', 'high_school_macroeconomics'), ('de', 'high_school_mathematics'), ('de', 'high_school_microeconomics'), ('de', 'high_school_physics'), ('de', 'high_school_psychology'), ('de', 'high_school_statistics'), ('de', 'high_school_us_history'), ('de', 'high_school_world_history'), ('de', 'human_aging'), ('de', 'human_sexuality'), ('de', 'international_law'), ('de', 'jurisprudence'), ('de', 'logical_fallacies'), ('de', 'machine_learning'), ('de', 'management'), ('de', 'marketing'), ('de', 'medical_genetics'), ('de', 
'miscellaneous'), ('de', 'moral_disputes'), ('de', 'moral_scenarios'), ('de', 'nutrition'), ('de', 'philosophy'), ('de', 'prehistory'), ('de', 'professional_accounting'), ('de', 'professional_law'), ('de', 'professional_medicine'), ('de', 'professional_psychology'), ('de', 'public_relations'), ('de', 'security_studies'), ('de', 'sociology'), ('de', 'us_foreign_policy'), ('de', 'virology'), ('de', 'world_religions')] LANGUAGE = ```` diff --git a/docs/tasks/MMLU.md b/docs/tasks/MMLU.md index 30cbe7f4..ce1f6a11 100644 --- a/docs/tasks/MMLU.md +++ b/docs/tasks/MMLU.md @@ -7,7 +7,7 @@ SAMPLE_SPLIT = test FEWSHOT_SPLIT = dev RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] -SUBJECTS = ['abstract_algebra', 'astronomy', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_physics', 'computer_security', 'conceptual_physics', 'electrical_engineering', 'elementary_mathematics', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_mathematics', 'high_school_physics', 'high_school_statistics', 'machine_learning', 'formal_logic', 'high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'international_law', 'jurisprudence', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'prehistory', 'professional_law', 'world_religions', 'econometrics', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_microeconomics', 'high_school_psychology', 'human_sexuality', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'anatomy', 'business_ethics', 'clinical_knowledge', 'college_medicine', 'global_facts', 'human_aging', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting', 'professional_medicine', 'virology'] +SUBJECTS = 
['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions'] LANGUAGE = ```` diff --git a/docs/tasks/MMLUBPB.md b/docs/tasks/MMLUBPB.md index d9d798f0..ed99b33f 100644 --- a/docs/tasks/MMLUBPB.md +++ b/docs/tasks/MMLUBPB.md @@ -7,7 +7,7 @@ SAMPLE_SPLIT = test FEWSHOT_SPLIT = dev RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [BitsPerByteLoglikelihood] -SUBJECTS = ['abstract_algebra', 'astronomy', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_physics', 'computer_security', 'conceptual_physics', 'electrical_engineering', 'elementary_mathematics', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_mathematics', 'high_school_physics', 'high_school_statistics', 'machine_learning', 'formal_logic', 'high_school_european_history', 'high_school_us_history', 
'high_school_world_history', 'international_law', 'jurisprudence', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'prehistory', 'professional_law', 'world_religions', 'econometrics', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_microeconomics', 'high_school_psychology', 'human_sexuality', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'anatomy', 'business_ethics', 'clinical_knowledge', 'college_medicine', 'global_facts', 'human_aging', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting', 'professional_medicine', 'virology'] +SUBJECTS = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions'] LANGUAGE = ```` diff --git a/docs/tasks/MMLUCloze.md 
b/docs/tasks/MMLUCloze.md index 0b74f721..fd7944bc 100644 --- a/docs/tasks/MMLUCloze.md +++ b/docs/tasks/MMLUCloze.md @@ -7,7 +7,7 @@ SAMPLE_SPLIT = test FEWSHOT_SPLIT = dev RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] -SUBJECTS = ['abstract_algebra', 'astronomy', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_physics', 'computer_security', 'conceptual_physics', 'electrical_engineering', 'elementary_mathematics', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_mathematics', 'high_school_physics', 'high_school_statistics', 'machine_learning', 'formal_logic', 'high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'international_law', 'jurisprudence', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'prehistory', 'professional_law', 'world_religions', 'econometrics', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_microeconomics', 'high_school_psychology', 'human_sexuality', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'anatomy', 'business_ethics', 'clinical_knowledge', 'college_medicine', 'global_facts', 'human_aging', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting', 'professional_medicine', 'virology'] +SUBJECTS = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 
'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions'] LANGUAGE = ```` diff --git a/docs/tasks/MMLUMC.md b/docs/tasks/MMLUMC.md index 0791f54b..b007edd4 100644 --- a/docs/tasks/MMLUMC.md +++ b/docs/tasks/MMLUMC.md @@ -7,7 +7,7 @@ SAMPLE_SPLIT = test FEWSHOT_SPLIT = dev RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] -SUBJECTS = ['abstract_algebra', 'astronomy', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_physics', 'computer_security', 'conceptual_physics', 'electrical_engineering', 'elementary_mathematics', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_mathematics', 'high_school_physics', 'high_school_statistics', 'machine_learning', 'formal_logic', 'high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'international_law', 'jurisprudence', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'prehistory', 'professional_law', 'world_religions', 'econometrics', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_microeconomics', 'high_school_psychology', 'human_sexuality', 'professional_psychology', 'public_relations', 
'security_studies', 'sociology', 'us_foreign_policy', 'anatomy', 'business_ethics', 'clinical_knowledge', 'college_medicine', 'global_facts', 'human_aging', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting', 'professional_medicine', 'virology'] +SUBJECTS = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions'] LANGUAGE = ```` diff --git a/docs/tasks/MMLU_COT.md b/docs/tasks/MMLU_COT.md index e7af5cae..52eab322 100644 --- a/docs/tasks/MMLU_COT.md +++ b/docs/tasks/MMLU_COT.md @@ -7,7 +7,7 @@ SAMPLE_SPLIT = test FEWSHOT_SPLIT = dev RESPONSE_TYPE = COMPLETION METRICS = [AccuracyCompletion] -SUBJECTS = ['abstract_algebra', 'astronomy', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_physics', 'computer_security', 
'conceptual_physics', 'electrical_engineering', 'elementary_mathematics', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_mathematics', 'high_school_physics', 'high_school_statistics', 'machine_learning', 'formal_logic', 'high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'international_law', 'jurisprudence', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'prehistory', 'professional_law', 'world_religions', 'econometrics', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_microeconomics', 'high_school_psychology', 'human_sexuality', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'anatomy', 'business_ethics', 'clinical_knowledge', 'college_medicine', 'global_facts', 'human_aging', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting', 'professional_medicine', 'virology'] +SUBJECTS = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 
'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions'] LANGUAGE = ```` diff --git a/docs/tasks/MMLU_EU20_DE.md b/docs/tasks/MMLU_EU20_DE.md index 6195cdd7..27029293 100644 --- a/docs/tasks/MMLU_EU20_DE.md +++ b/docs/tasks/MMLU_EU20_DE.md @@ -7,7 +7,7 @@ SAMPLE_SPLIT = test FEWSHOT_SPLIT = dev RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] -SUBJECTS = ['abstract_algebra_DE', 'astronomy_DE', 'college_biology_DE', 'college_chemistry_DE', 'college_computer_science_DE', 'college_mathematics_DE', 'college_physics_DE', 'computer_security_DE', 'conceptual_physics_DE', 'electrical_engineering_DE', 'elementary_mathematics_DE', 'high_school_biology_DE', 'high_school_chemistry_DE', 'high_school_computer_science_DE', 'high_school_mathematics_DE', 'high_school_physics_DE', 'high_school_statistics_DE', 'machine_learning_DE', 'formal_logic_DE', 'high_school_european_history_DE', 'high_school_us_history_DE', 'high_school_world_history_DE', 'international_law_DE', 'jurisprudence_DE', 'logical_fallacies_DE', 'moral_disputes_DE', 'moral_scenarios_DE', 'philosophy_DE', 'prehistory_DE', 'professional_law_DE', 'world_religions_DE', 'econometrics_DE', 'high_school_geography_DE', 'high_school_government_and_politics_DE', 'high_school_macroeconomics_DE', 'high_school_microeconomics_DE', 'high_school_psychology_DE', 'human_sexuality_DE', 'professional_psychology_DE', 'public_relations_DE', 'security_studies_DE', 'sociology_DE', 'us_foreign_policy_DE', 'anatomy_DE', 'business_ethics_DE', 'clinical_knowledge_DE', 'college_medicine_DE', 'global_facts_DE', 'human_aging_DE', 'management_DE', 'marketing_DE', 'medical_genetics_DE', 'miscellaneous_DE', 'nutrition_DE', 'professional_accounting_DE', 
'professional_medicine_DE', 'virology_DE'] +SUBJECTS = ['abstract_algebra_DE', 'anatomy_DE', 'astronomy_DE', 'business_ethics_DE', 'clinical_knowledge_DE', 'college_biology_DE', 'college_chemistry_DE', 'college_computer_science_DE', 'college_mathematics_DE', 'college_medicine_DE', 'college_physics_DE', 'computer_security_DE', 'conceptual_physics_DE', 'econometrics_DE', 'electrical_engineering_DE', 'elementary_mathematics_DE', 'formal_logic_DE', 'global_facts_DE', 'high_school_biology_DE', 'high_school_chemistry_DE', 'high_school_computer_science_DE', 'high_school_european_history_DE', 'high_school_geography_DE', 'high_school_government_and_politics_DE', 'high_school_macroeconomics_DE', 'high_school_mathematics_DE', 'high_school_microeconomics_DE', 'high_school_physics_DE', 'high_school_psychology_DE', 'high_school_statistics_DE', 'high_school_us_history_DE', 'high_school_world_history_DE', 'human_aging_DE', 'human_sexuality_DE', 'international_law_DE', 'jurisprudence_DE', 'logical_fallacies_DE', 'machine_learning_DE', 'management_DE', 'marketing_DE', 'medical_genetics_DE', 'miscellaneous_DE', 'moral_disputes_DE', 'moral_scenarios_DE', 'nutrition_DE', 'philosophy_DE', 'prehistory_DE', 'professional_accounting_DE', 'professional_law_DE', 'professional_medicine_DE', 'professional_psychology_DE', 'public_relations_DE', 'security_studies_DE', 'sociology_DE', 'us_foreign_policy_DE', 'virology_DE', 'world_religions_DE'] LANGUAGE = ```` diff --git a/docs/tasks/MMLU_EU20_FR.md b/docs/tasks/MMLU_EU20_FR.md index 3eb430c2..27effe3a 100644 --- a/docs/tasks/MMLU_EU20_FR.md +++ b/docs/tasks/MMLU_EU20_FR.md @@ -7,7 +7,7 @@ SAMPLE_SPLIT = test FEWSHOT_SPLIT = dev RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] -SUBJECTS = ['abstract_algebra_FR', 'astronomy_FR', 'college_biology_FR', 'college_chemistry_FR', 'college_computer_science_FR', 'college_mathematics_FR', 'college_physics_FR', 'computer_security_FR', 
'conceptual_physics_FR', 'electrical_engineering_FR', 'elementary_mathematics_FR', 'high_school_biology_FR', 'high_school_chemistry_FR', 'high_school_computer_science_FR', 'high_school_mathematics_FR', 'high_school_physics_FR', 'high_school_statistics_FR', 'machine_learning_FR', 'formal_logic_FR', 'high_school_european_history_FR', 'high_school_us_history_FR', 'high_school_world_history_FR', 'international_law_FR', 'jurisprudence_FR', 'logical_fallacies_FR', 'moral_disputes_FR', 'moral_scenarios_FR', 'philosophy_FR', 'prehistory_FR', 'professional_law_FR', 'world_religions_FR', 'econometrics_FR', 'high_school_geography_FR', 'high_school_government_and_politics_FR', 'high_school_macroeconomics_FR', 'high_school_microeconomics_FR', 'high_school_psychology_FR', 'human_sexuality_FR', 'professional_psychology_FR', 'public_relations_FR', 'security_studies_FR', 'sociology_FR', 'us_foreign_policy_FR', 'anatomy_FR', 'business_ethics_FR', 'clinical_knowledge_FR', 'college_medicine_FR', 'global_facts_FR', 'human_aging_FR', 'management_FR', 'marketing_FR', 'medical_genetics_FR', 'miscellaneous_FR', 'nutrition_FR', 'professional_accounting_FR', 'professional_medicine_FR', 'virology_FR'] +SUBJECTS = ['abstract_algebra_FR', 'anatomy_FR', 'astronomy_FR', 'business_ethics_FR', 'clinical_knowledge_FR', 'college_biology_FR', 'college_chemistry_FR', 'college_computer_science_FR', 'college_mathematics_FR', 'college_medicine_FR', 'college_physics_FR', 'computer_security_FR', 'conceptual_physics_FR', 'econometrics_FR', 'electrical_engineering_FR', 'elementary_mathematics_FR', 'formal_logic_FR', 'global_facts_FR', 'high_school_biology_FR', 'high_school_chemistry_FR', 'high_school_computer_science_FR', 'high_school_european_history_FR', 'high_school_geography_FR', 'high_school_government_and_politics_FR', 'high_school_macroeconomics_FR', 'high_school_mathematics_FR', 'high_school_microeconomics_FR', 'high_school_physics_FR', 'high_school_psychology_FR', 'high_school_statistics_FR', 
'high_school_us_history_FR', 'high_school_world_history_FR', 'human_aging_FR', 'human_sexuality_FR', 'international_law_FR', 'jurisprudence_FR', 'logical_fallacies_FR', 'machine_learning_FR', 'management_FR', 'marketing_FR', 'medical_genetics_FR', 'miscellaneous_FR', 'moral_disputes_FR', 'moral_scenarios_FR', 'nutrition_FR', 'philosophy_FR', 'prehistory_FR', 'professional_accounting_FR', 'professional_law_FR', 'professional_medicine_FR', 'professional_psychology_FR', 'public_relations_FR', 'security_studies_FR', 'sociology_FR', 'us_foreign_policy_FR', 'virology_FR', 'world_religions_FR'] LANGUAGE = ```` diff --git a/docs/tasks/MMLU_IDK.md b/docs/tasks/MMLU_IDK.md index 46c97c48..f854b7a4 100644 --- a/docs/tasks/MMLU_IDK.md +++ b/docs/tasks/MMLU_IDK.md @@ -7,7 +7,7 @@ SAMPLE_SPLIT = test FEWSHOT_SPLIT = dev RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, ConfidenceWeightedAccuracy, DistributionalCorrectnessScore, TernaryScore] -SUBJECTS = ['abstract_algebra', 'astronomy', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_physics', 'computer_security', 'conceptual_physics', 'electrical_engineering', 'elementary_mathematics', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_mathematics', 'high_school_physics', 'high_school_statistics', 'machine_learning', 'formal_logic', 'high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'international_law', 'jurisprudence', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'prehistory', 'professional_law', 'world_religions', 'econometrics', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_microeconomics', 'high_school_psychology', 'human_sexuality', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'anatomy', 'business_ethics', 
'clinical_knowledge', 'college_medicine', 'global_facts', 'human_aging', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting', 'professional_medicine', 'virology'] +SUBJECTS = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions'] LANGUAGE = ```` diff --git a/docs/tasks/MMLU_OLMES.md b/docs/tasks/MMLU_OLMES.md index 47e7d2e6..4d98b7db 100644 --- a/docs/tasks/MMLU_OLMES.md +++ b/docs/tasks/MMLU_OLMES.md @@ -7,7 +7,7 @@ SAMPLE_SPLIT = test FEWSHOT_SPLIT = dev RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] -SUBJECTS = ['abstract_algebra', 'astronomy', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_physics', 'computer_security', 'conceptual_physics', 
'electrical_engineering', 'elementary_mathematics', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_mathematics', 'high_school_physics', 'high_school_statistics', 'machine_learning', 'formal_logic', 'high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'international_law', 'jurisprudence', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'prehistory', 'professional_law', 'world_religions', 'econometrics', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_microeconomics', 'high_school_psychology', 'human_sexuality', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'anatomy', 'business_ethics', 'clinical_knowledge', 'college_medicine', 'global_facts', 'human_aging', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting', 'professional_medicine', 'virology'] +SUBJECTS = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 
'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions'] LANGUAGE = ```` diff --git a/docs/tasks/MMMLU.md b/docs/tasks/MMMLU.md index 6cb8f52d..2684cf5a 100644 --- a/docs/tasks/MMMLU.md +++ b/docs/tasks/MMMLU.md @@ -7,7 +7,7 @@ SAMPLE_SPLIT = test FEWSHOT_SPLIT = test RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood] -SUBJECTS = [('FR_FR', 'abstract_algebra'), ('FR_FR', 'astronomy'), ('FR_FR', 'college_biology'), ('FR_FR', 'college_chemistry'), ('FR_FR', 'college_computer_science'), ('FR_FR', 'college_mathematics'), ('FR_FR', 'college_physics'), ('FR_FR', 'computer_security'), ('FR_FR', 'conceptual_physics'), ('FR_FR', 'electrical_engineering'), ('FR_FR', 'elementary_mathematics'), ('FR_FR', 'high_school_biology'), ('FR_FR', 'high_school_chemistry'), ('FR_FR', 'high_school_computer_science'), ('FR_FR', 'high_school_mathematics'), ('FR_FR', 'high_school_physics'), ('FR_FR', 'high_school_statistics'), ('FR_FR', 'machine_learning'), ('FR_FR', 'formal_logic'), ('FR_FR', 'high_school_european_history'), ('FR_FR', 'high_school_us_history'), ('FR_FR', 'high_school_world_history'), ('FR_FR', 'international_law'), ('FR_FR', 'jurisprudence'), ('FR_FR', 'logical_fallacies'), ('FR_FR', 'moral_disputes'), ('FR_FR', 'moral_scenarios'), ('FR_FR', 'philosophy'), ('FR_FR', 'prehistory'), ('FR_FR', 'professional_law'), ('FR_FR', 'world_religions'), ('FR_FR', 'econometrics'), ('FR_FR', 'high_school_geography'), ('FR_FR', 'high_school_government_and_politics'), ('FR_FR', 'high_school_macroeconomics'), ('FR_FR', 'high_school_microeconomics'), ('FR_FR', 'high_school_psychology'), ('FR_FR', 'human_sexuality'), ('FR_FR', 'professional_psychology'), ('FR_FR', 'public_relations'), ('FR_FR', 'security_studies'), ('FR_FR', 'sociology'), ('FR_FR', 
'us_foreign_policy'), ('FR_FR', 'anatomy'), ('FR_FR', 'business_ethics'), ('FR_FR', 'clinical_knowledge'), ('FR_FR', 'college_medicine'), ('FR_FR', 'global_facts'), ('FR_FR', 'human_aging'), ('FR_FR', 'management'), ('FR_FR', 'marketing'), ('FR_FR', 'medical_genetics'), ('FR_FR', 'miscellaneous'), ('FR_FR', 'nutrition'), ('FR_FR', 'professional_accounting'), ('FR_FR', 'professional_medicine'), ('FR_FR', 'virology'), ('DE_DE', 'abstract_algebra'), ('DE_DE', 'astronomy'), ('DE_DE', 'college_biology'), ('DE_DE', 'college_chemistry'), ('DE_DE', 'college_computer_science'), ('DE_DE', 'college_mathematics'), ('DE_DE', 'college_physics'), ('DE_DE', 'computer_security'), ('DE_DE', 'conceptual_physics'), ('DE_DE', 'electrical_engineering'), ('DE_DE', 'elementary_mathematics'), ('DE_DE', 'high_school_biology'), ('DE_DE', 'high_school_chemistry'), ('DE_DE', 'high_school_computer_science'), ('DE_DE', 'high_school_mathematics'), ('DE_DE', 'high_school_physics'), ('DE_DE', 'high_school_statistics'), ('DE_DE', 'machine_learning'), ('DE_DE', 'formal_logic'), ('DE_DE', 'high_school_european_history'), ('DE_DE', 'high_school_us_history'), ('DE_DE', 'high_school_world_history'), ('DE_DE', 'international_law'), ('DE_DE', 'jurisprudence'), ('DE_DE', 'logical_fallacies'), ('DE_DE', 'moral_disputes'), ('DE_DE', 'moral_scenarios'), ('DE_DE', 'philosophy'), ('DE_DE', 'prehistory'), ('DE_DE', 'professional_law'), ('DE_DE', 'world_religions'), ('DE_DE', 'econometrics'), ('DE_DE', 'high_school_geography'), ('DE_DE', 'high_school_government_and_politics'), ('DE_DE', 'high_school_macroeconomics'), ('DE_DE', 'high_school_microeconomics'), ('DE_DE', 'high_school_psychology'), ('DE_DE', 'human_sexuality'), ('DE_DE', 'professional_psychology'), ('DE_DE', 'public_relations'), ('DE_DE', 'security_studies'), ('DE_DE', 'sociology'), ('DE_DE', 'us_foreign_policy'), ('DE_DE', 'anatomy'), ('DE_DE', 'business_ethics'), ('DE_DE', 'clinical_knowledge'), ('DE_DE', 'college_medicine'), ('DE_DE', 
'global_facts'), ('DE_DE', 'human_aging'), ('DE_DE', 'management'), ('DE_DE', 'marketing'), ('DE_DE', 'medical_genetics'), ('DE_DE', 'miscellaneous'), ('DE_DE', 'nutrition'), ('DE_DE', 'professional_accounting'), ('DE_DE', 'professional_medicine'), ('DE_DE', 'virology'), ('ES_LA', 'abstract_algebra'), ('ES_LA', 'astronomy'), ('ES_LA', 'college_biology'), ('ES_LA', 'college_chemistry'), ('ES_LA', 'college_computer_science'), ('ES_LA', 'college_mathematics'), ('ES_LA', 'college_physics'), ('ES_LA', 'computer_security'), ('ES_LA', 'conceptual_physics'), ('ES_LA', 'electrical_engineering'), ('ES_LA', 'elementary_mathematics'), ('ES_LA', 'high_school_biology'), ('ES_LA', 'high_school_chemistry'), ('ES_LA', 'high_school_computer_science'), ('ES_LA', 'high_school_mathematics'), ('ES_LA', 'high_school_physics'), ('ES_LA', 'high_school_statistics'), ('ES_LA', 'machine_learning'), ('ES_LA', 'formal_logic'), ('ES_LA', 'high_school_european_history'), ('ES_LA', 'high_school_us_history'), ('ES_LA', 'high_school_world_history'), ('ES_LA', 'international_law'), ('ES_LA', 'jurisprudence'), ('ES_LA', 'logical_fallacies'), ('ES_LA', 'moral_disputes'), ('ES_LA', 'moral_scenarios'), ('ES_LA', 'philosophy'), ('ES_LA', 'prehistory'), ('ES_LA', 'professional_law'), ('ES_LA', 'world_religions'), ('ES_LA', 'econometrics'), ('ES_LA', 'high_school_geography'), ('ES_LA', 'high_school_government_and_politics'), ('ES_LA', 'high_school_macroeconomics'), ('ES_LA', 'high_school_microeconomics'), ('ES_LA', 'high_school_psychology'), ('ES_LA', 'human_sexuality'), ('ES_LA', 'professional_psychology'), ('ES_LA', 'public_relations'), ('ES_LA', 'security_studies'), ('ES_LA', 'sociology'), ('ES_LA', 'us_foreign_policy'), ('ES_LA', 'anatomy'), ('ES_LA', 'business_ethics'), ('ES_LA', 'clinical_knowledge'), ('ES_LA', 'college_medicine'), ('ES_LA', 'global_facts'), ('ES_LA', 'human_aging'), ('ES_LA', 'management'), ('ES_LA', 'marketing'), ('ES_LA', 'medical_genetics'), ('ES_LA', 'miscellaneous'), ('ES_LA', 
'nutrition'), ('ES_LA', 'professional_accounting'), ('ES_LA', 'professional_medicine'), ('ES_LA', 'virology'), ('IT_IT', 'abstract_algebra'), ('IT_IT', 'astronomy'), ('IT_IT', 'college_biology'), ('IT_IT', 'college_chemistry'), ('IT_IT', 'college_computer_science'), ('IT_IT', 'college_mathematics'), ('IT_IT', 'college_physics'), ('IT_IT', 'computer_security'), ('IT_IT', 'conceptual_physics'), ('IT_IT', 'electrical_engineering'), ('IT_IT', 'elementary_mathematics'), ('IT_IT', 'high_school_biology'), ('IT_IT', 'high_school_chemistry'), ('IT_IT', 'high_school_computer_science'), ('IT_IT', 'high_school_mathematics'), ('IT_IT', 'high_school_physics'), ('IT_IT', 'high_school_statistics'), ('IT_IT', 'machine_learning'), ('IT_IT', 'formal_logic'), ('IT_IT', 'high_school_european_history'), ('IT_IT', 'high_school_us_history'), ('IT_IT', 'high_school_world_history'), ('IT_IT', 'international_law'), ('IT_IT', 'jurisprudence'), ('IT_IT', 'logical_fallacies'), ('IT_IT', 'moral_disputes'), ('IT_IT', 'moral_scenarios'), ('IT_IT', 'philosophy'), ('IT_IT', 'prehistory'), ('IT_IT', 'professional_law'), ('IT_IT', 'world_religions'), ('IT_IT', 'econometrics'), ('IT_IT', 'high_school_geography'), ('IT_IT', 'high_school_government_and_politics'), ('IT_IT', 'high_school_macroeconomics'), ('IT_IT', 'high_school_microeconomics'), ('IT_IT', 'high_school_psychology'), ('IT_IT', 'human_sexuality'), ('IT_IT', 'professional_psychology'), ('IT_IT', 'public_relations'), ('IT_IT', 'security_studies'), ('IT_IT', 'sociology'), ('IT_IT', 'us_foreign_policy'), ('IT_IT', 'anatomy'), ('IT_IT', 'business_ethics'), ('IT_IT', 'clinical_knowledge'), ('IT_IT', 'college_medicine'), ('IT_IT', 'global_facts'), ('IT_IT', 'human_aging'), ('IT_IT', 'management'), ('IT_IT', 'marketing'), ('IT_IT', 'medical_genetics'), ('IT_IT', 'miscellaneous'), ('IT_IT', 'nutrition'), ('IT_IT', 'professional_accounting'), ('IT_IT', 'professional_medicine'), ('IT_IT', 'virology'), ('PT_BR', 'abstract_algebra'), ('PT_BR', 
'astronomy'), ('PT_BR', 'college_biology'), ('PT_BR', 'college_chemistry'), ('PT_BR', 'college_computer_science'), ('PT_BR', 'college_mathematics'), ('PT_BR', 'college_physics'), ('PT_BR', 'computer_security'), ('PT_BR', 'conceptual_physics'), ('PT_BR', 'electrical_engineering'), ('PT_BR', 'elementary_mathematics'), ('PT_BR', 'high_school_biology'), ('PT_BR', 'high_school_chemistry'), ('PT_BR', 'high_school_computer_science'), ('PT_BR', 'high_school_mathematics'), ('PT_BR', 'high_school_physics'), ('PT_BR', 'high_school_statistics'), ('PT_BR', 'machine_learning'), ('PT_BR', 'formal_logic'), ('PT_BR', 'high_school_european_history'), ('PT_BR', 'high_school_us_history'), ('PT_BR', 'high_school_world_history'), ('PT_BR', 'international_law'), ('PT_BR', 'jurisprudence'), ('PT_BR', 'logical_fallacies'), ('PT_BR', 'moral_disputes'), ('PT_BR', 'moral_scenarios'), ('PT_BR', 'philosophy'), ('PT_BR', 'prehistory'), ('PT_BR', 'professional_law'), ('PT_BR', 'world_religions'), ('PT_BR', 'econometrics'), ('PT_BR', 'high_school_geography'), ('PT_BR', 'high_school_government_and_politics'), ('PT_BR', 'high_school_macroeconomics'), ('PT_BR', 'high_school_microeconomics'), ('PT_BR', 'high_school_psychology'), ('PT_BR', 'human_sexuality'), ('PT_BR', 'professional_psychology'), ('PT_BR', 'public_relations'), ('PT_BR', 'security_studies'), ('PT_BR', 'sociology'), ('PT_BR', 'us_foreign_policy'), ('PT_BR', 'anatomy'), ('PT_BR', 'business_ethics'), ('PT_BR', 'clinical_knowledge'), ('PT_BR', 'college_medicine'), ('PT_BR', 'global_facts'), ('PT_BR', 'human_aging'), ('PT_BR', 'management'), ('PT_BR', 'marketing'), ('PT_BR', 'medical_genetics'), ('PT_BR', 'miscellaneous'), ('PT_BR', 'nutrition'), ('PT_BR', 'professional_accounting'), ('PT_BR', 'professional_medicine'), ('PT_BR', 'virology'), ('AR_XY', 'abstract_algebra'), ('AR_XY', 'astronomy'), ('AR_XY', 'college_biology'), ('AR_XY', 'college_chemistry'), ('AR_XY', 'college_computer_science'), ('AR_XY', 'college_mathematics'), ('AR_XY', 
'college_physics'), ('AR_XY', 'computer_security'), ('AR_XY', 'conceptual_physics'), ('AR_XY', 'electrical_engineering'), ('AR_XY', 'elementary_mathematics'), ('AR_XY', 'high_school_biology'), ('AR_XY', 'high_school_chemistry'), ('AR_XY', 'high_school_computer_science'), ('AR_XY', 'high_school_mathematics'), ('AR_XY', 'high_school_physics'), ('AR_XY', 'high_school_statistics'), ('AR_XY', 'machine_learning'), ('AR_XY', 'formal_logic'), ('AR_XY', 'high_school_european_history'), ('AR_XY', 'high_school_us_history'), ('AR_XY', 'high_school_world_history'), ('AR_XY', 'international_law'), ('AR_XY', 'jurisprudence'), ('AR_XY', 'logical_fallacies'), ('AR_XY', 'moral_disputes'), ('AR_XY', 'moral_scenarios'), ('AR_XY', 'philosophy'), ('AR_XY', 'prehistory'), ('AR_XY', 'professional_law'), ('AR_XY', 'world_religions'), ('AR_XY', 'econometrics'), ('AR_XY', 'high_school_geography'), ('AR_XY', 'high_school_government_and_politics'), ('AR_XY', 'high_school_macroeconomics'), ('AR_XY', 'high_school_microeconomics'), ('AR_XY', 'high_school_psychology'), ('AR_XY', 'human_sexuality'), ('AR_XY', 'professional_psychology'), ('AR_XY', 'public_relations'), ('AR_XY', 'security_studies'), ('AR_XY', 'sociology'), ('AR_XY', 'us_foreign_policy'), ('AR_XY', 'anatomy'), ('AR_XY', 'business_ethics'), ('AR_XY', 'clinical_knowledge'), ('AR_XY', 'college_medicine'), ('AR_XY', 'global_facts'), ('AR_XY', 'human_aging'), ('AR_XY', 'management'), ('AR_XY', 'marketing'), ('AR_XY', 'medical_genetics'), ('AR_XY', 'miscellaneous'), ('AR_XY', 'nutrition'), ('AR_XY', 'professional_accounting'), ('AR_XY', 'professional_medicine'), ('AR_XY', 'virology')] +SUBJECTS = [('FR_FR', 'abstract_algebra'), ('FR_FR', 'anatomy'), ('FR_FR', 'astronomy'), ('FR_FR', 'business_ethics'), ('FR_FR', 'clinical_knowledge'), ('FR_FR', 'college_biology'), ('FR_FR', 'college_chemistry'), ('FR_FR', 'college_computer_science'), ('FR_FR', 'college_mathematics'), ('FR_FR', 'college_medicine'), ('FR_FR', 'college_physics'), ('FR_FR', 
'computer_security'), ('FR_FR', 'conceptual_physics'), ('FR_FR', 'econometrics'), ('FR_FR', 'electrical_engineering'), ('FR_FR', 'elementary_mathematics'), ('FR_FR', 'formal_logic'), ('FR_FR', 'global_facts'), ('FR_FR', 'high_school_biology'), ('FR_FR', 'high_school_chemistry'), ('FR_FR', 'high_school_computer_science'), ('FR_FR', 'high_school_european_history'), ('FR_FR', 'high_school_geography'), ('FR_FR', 'high_school_government_and_politics'), ('FR_FR', 'high_school_macroeconomics'), ('FR_FR', 'high_school_mathematics'), ('FR_FR', 'high_school_microeconomics'), ('FR_FR', 'high_school_physics'), ('FR_FR', 'high_school_psychology'), ('FR_FR', 'high_school_statistics'), ('FR_FR', 'high_school_us_history'), ('FR_FR', 'high_school_world_history'), ('FR_FR', 'human_aging'), ('FR_FR', 'human_sexuality'), ('FR_FR', 'international_law'), ('FR_FR', 'jurisprudence'), ('FR_FR', 'logical_fallacies'), ('FR_FR', 'machine_learning'), ('FR_FR', 'management'), ('FR_FR', 'marketing'), ('FR_FR', 'medical_genetics'), ('FR_FR', 'miscellaneous'), ('FR_FR', 'moral_disputes'), ('FR_FR', 'moral_scenarios'), ('FR_FR', 'nutrition'), ('FR_FR', 'philosophy'), ('FR_FR', 'prehistory'), ('FR_FR', 'professional_accounting'), ('FR_FR', 'professional_law'), ('FR_FR', 'professional_medicine'), ('FR_FR', 'professional_psychology'), ('FR_FR', 'public_relations'), ('FR_FR', 'security_studies'), ('FR_FR', 'sociology'), ('FR_FR', 'us_foreign_policy'), ('FR_FR', 'virology'), ('FR_FR', 'world_religions'), ('DE_DE', 'abstract_algebra'), ('DE_DE', 'anatomy'), ('DE_DE', 'astronomy'), ('DE_DE', 'business_ethics'), ('DE_DE', 'clinical_knowledge'), ('DE_DE', 'college_biology'), ('DE_DE', 'college_chemistry'), ('DE_DE', 'college_computer_science'), ('DE_DE', 'college_mathematics'), ('DE_DE', 'college_medicine'), ('DE_DE', 'college_physics'), ('DE_DE', 'computer_security'), ('DE_DE', 'conceptual_physics'), ('DE_DE', 'econometrics'), ('DE_DE', 'electrical_engineering'), ('DE_DE', 'elementary_mathematics'), 
('DE_DE', 'formal_logic'), ('DE_DE', 'global_facts'), ('DE_DE', 'high_school_biology'), ('DE_DE', 'high_school_chemistry'), ('DE_DE', 'high_school_computer_science'), ('DE_DE', 'high_school_european_history'), ('DE_DE', 'high_school_geography'), ('DE_DE', 'high_school_government_and_politics'), ('DE_DE', 'high_school_macroeconomics'), ('DE_DE', 'high_school_mathematics'), ('DE_DE', 'high_school_microeconomics'), ('DE_DE', 'high_school_physics'), ('DE_DE', 'high_school_psychology'), ('DE_DE', 'high_school_statistics'), ('DE_DE', 'high_school_us_history'), ('DE_DE', 'high_school_world_history'), ('DE_DE', 'human_aging'), ('DE_DE', 'human_sexuality'), ('DE_DE', 'international_law'), ('DE_DE', 'jurisprudence'), ('DE_DE', 'logical_fallacies'), ('DE_DE', 'machine_learning'), ('DE_DE', 'management'), ('DE_DE', 'marketing'), ('DE_DE', 'medical_genetics'), ('DE_DE', 'miscellaneous'), ('DE_DE', 'moral_disputes'), ('DE_DE', 'moral_scenarios'), ('DE_DE', 'nutrition'), ('DE_DE', 'philosophy'), ('DE_DE', 'prehistory'), ('DE_DE', 'professional_accounting'), ('DE_DE', 'professional_law'), ('DE_DE', 'professional_medicine'), ('DE_DE', 'professional_psychology'), ('DE_DE', 'public_relations'), ('DE_DE', 'security_studies'), ('DE_DE', 'sociology'), ('DE_DE', 'us_foreign_policy'), ('DE_DE', 'virology'), ('DE_DE', 'world_religions'), ('ES_LA', 'abstract_algebra'), ('ES_LA', 'anatomy'), ('ES_LA', 'astronomy'), ('ES_LA', 'business_ethics'), ('ES_LA', 'clinical_knowledge'), ('ES_LA', 'college_biology'), ('ES_LA', 'college_chemistry'), ('ES_LA', 'college_computer_science'), ('ES_LA', 'college_mathematics'), ('ES_LA', 'college_medicine'), ('ES_LA', 'college_physics'), ('ES_LA', 'computer_security'), ('ES_LA', 'conceptual_physics'), ('ES_LA', 'econometrics'), ('ES_LA', 'electrical_engineering'), ('ES_LA', 'elementary_mathematics'), ('ES_LA', 'formal_logic'), ('ES_LA', 'global_facts'), ('ES_LA', 'high_school_biology'), ('ES_LA', 'high_school_chemistry'), ('ES_LA', 
'high_school_computer_science'), ('ES_LA', 'high_school_european_history'), ('ES_LA', 'high_school_geography'), ('ES_LA', 'high_school_government_and_politics'), ('ES_LA', 'high_school_macroeconomics'), ('ES_LA', 'high_school_mathematics'), ('ES_LA', 'high_school_microeconomics'), ('ES_LA', 'high_school_physics'), ('ES_LA', 'high_school_psychology'), ('ES_LA', 'high_school_statistics'), ('ES_LA', 'high_school_us_history'), ('ES_LA', 'high_school_world_history'), ('ES_LA', 'human_aging'), ('ES_LA', 'human_sexuality'), ('ES_LA', 'international_law'), ('ES_LA', 'jurisprudence'), ('ES_LA', 'logical_fallacies'), ('ES_LA', 'machine_learning'), ('ES_LA', 'management'), ('ES_LA', 'marketing'), ('ES_LA', 'medical_genetics'), ('ES_LA', 'miscellaneous'), ('ES_LA', 'moral_disputes'), ('ES_LA', 'moral_scenarios'), ('ES_LA', 'nutrition'), ('ES_LA', 'philosophy'), ('ES_LA', 'prehistory'), ('ES_LA', 'professional_accounting'), ('ES_LA', 'professional_law'), ('ES_LA', 'professional_medicine'), ('ES_LA', 'professional_psychology'), ('ES_LA', 'public_relations'), ('ES_LA', 'security_studies'), ('ES_LA', 'sociology'), ('ES_LA', 'us_foreign_policy'), ('ES_LA', 'virology'), ('ES_LA', 'world_religions'), ('IT_IT', 'abstract_algebra'), ('IT_IT', 'anatomy'), ('IT_IT', 'astronomy'), ('IT_IT', 'business_ethics'), ('IT_IT', 'clinical_knowledge'), ('IT_IT', 'college_biology'), ('IT_IT', 'college_chemistry'), ('IT_IT', 'college_computer_science'), ('IT_IT', 'college_mathematics'), ('IT_IT', 'college_medicine'), ('IT_IT', 'college_physics'), ('IT_IT', 'computer_security'), ('IT_IT', 'conceptual_physics'), ('IT_IT', 'econometrics'), ('IT_IT', 'electrical_engineering'), ('IT_IT', 'elementary_mathematics'), ('IT_IT', 'formal_logic'), ('IT_IT', 'global_facts'), ('IT_IT', 'high_school_biology'), ('IT_IT', 'high_school_chemistry'), ('IT_IT', 'high_school_computer_science'), ('IT_IT', 'high_school_european_history'), ('IT_IT', 'high_school_geography'), ('IT_IT', 'high_school_government_and_politics'), 
('IT_IT', 'high_school_macroeconomics'), ('IT_IT', 'high_school_mathematics'), ('IT_IT', 'high_school_microeconomics'), ('IT_IT', 'high_school_physics'), ('IT_IT', 'high_school_psychology'), ('IT_IT', 'high_school_statistics'), ('IT_IT', 'high_school_us_history'), ('IT_IT', 'high_school_world_history'), ('IT_IT', 'human_aging'), ('IT_IT', 'human_sexuality'), ('IT_IT', 'international_law'), ('IT_IT', 'jurisprudence'), ('IT_IT', 'logical_fallacies'), ('IT_IT', 'machine_learning'), ('IT_IT', 'management'), ('IT_IT', 'marketing'), ('IT_IT', 'medical_genetics'), ('IT_IT', 'miscellaneous'), ('IT_IT', 'moral_disputes'), ('IT_IT', 'moral_scenarios'), ('IT_IT', 'nutrition'), ('IT_IT', 'philosophy'), ('IT_IT', 'prehistory'), ('IT_IT', 'professional_accounting'), ('IT_IT', 'professional_law'), ('IT_IT', 'professional_medicine'), ('IT_IT', 'professional_psychology'), ('IT_IT', 'public_relations'), ('IT_IT', 'security_studies'), ('IT_IT', 'sociology'), ('IT_IT', 'us_foreign_policy'), ('IT_IT', 'virology'), ('IT_IT', 'world_religions'), ('PT_BR', 'abstract_algebra'), ('PT_BR', 'anatomy'), ('PT_BR', 'astronomy'), ('PT_BR', 'business_ethics'), ('PT_BR', 'clinical_knowledge'), ('PT_BR', 'college_biology'), ('PT_BR', 'college_chemistry'), ('PT_BR', 'college_computer_science'), ('PT_BR', 'college_mathematics'), ('PT_BR', 'college_medicine'), ('PT_BR', 'college_physics'), ('PT_BR', 'computer_security'), ('PT_BR', 'conceptual_physics'), ('PT_BR', 'econometrics'), ('PT_BR', 'electrical_engineering'), ('PT_BR', 'elementary_mathematics'), ('PT_BR', 'formal_logic'), ('PT_BR', 'global_facts'), ('PT_BR', 'high_school_biology'), ('PT_BR', 'high_school_chemistry'), ('PT_BR', 'high_school_computer_science'), ('PT_BR', 'high_school_european_history'), ('PT_BR', 'high_school_geography'), ('PT_BR', 'high_school_government_and_politics'), ('PT_BR', 'high_school_macroeconomics'), ('PT_BR', 'high_school_mathematics'), ('PT_BR', 'high_school_microeconomics'), ('PT_BR', 'high_school_physics'), 
('PT_BR', 'high_school_psychology'), ('PT_BR', 'high_school_statistics'), ('PT_BR', 'high_school_us_history'), ('PT_BR', 'high_school_world_history'), ('PT_BR', 'human_aging'), ('PT_BR', 'human_sexuality'), ('PT_BR', 'international_law'), ('PT_BR', 'jurisprudence'), ('PT_BR', 'logical_fallacies'), ('PT_BR', 'machine_learning'), ('PT_BR', 'management'), ('PT_BR', 'marketing'), ('PT_BR', 'medical_genetics'), ('PT_BR', 'miscellaneous'), ('PT_BR', 'moral_disputes'), ('PT_BR', 'moral_scenarios'), ('PT_BR', 'nutrition'), ('PT_BR', 'philosophy'), ('PT_BR', 'prehistory'), ('PT_BR', 'professional_accounting'), ('PT_BR', 'professional_law'), ('PT_BR', 'professional_medicine'), ('PT_BR', 'professional_psychology'), ('PT_BR', 'public_relations'), ('PT_BR', 'security_studies'), ('PT_BR', 'sociology'), ('PT_BR', 'us_foreign_policy'), ('PT_BR', 'virology'), ('PT_BR', 'world_religions'), ('AR_XY', 'abstract_algebra'), ('AR_XY', 'anatomy'), ('AR_XY', 'astronomy'), ('AR_XY', 'business_ethics'), ('AR_XY', 'clinical_knowledge'), ('AR_XY', 'college_biology'), ('AR_XY', 'college_chemistry'), ('AR_XY', 'college_computer_science'), ('AR_XY', 'college_mathematics'), ('AR_XY', 'college_medicine'), ('AR_XY', 'college_physics'), ('AR_XY', 'computer_security'), ('AR_XY', 'conceptual_physics'), ('AR_XY', 'econometrics'), ('AR_XY', 'electrical_engineering'), ('AR_XY', 'elementary_mathematics'), ('AR_XY', 'formal_logic'), ('AR_XY', 'global_facts'), ('AR_XY', 'high_school_biology'), ('AR_XY', 'high_school_chemistry'), ('AR_XY', 'high_school_computer_science'), ('AR_XY', 'high_school_european_history'), ('AR_XY', 'high_school_geography'), ('AR_XY', 'high_school_government_and_politics'), ('AR_XY', 'high_school_macroeconomics'), ('AR_XY', 'high_school_mathematics'), ('AR_XY', 'high_school_microeconomics'), ('AR_XY', 'high_school_physics'), ('AR_XY', 'high_school_psychology'), ('AR_XY', 'high_school_statistics'), ('AR_XY', 'high_school_us_history'), ('AR_XY', 'high_school_world_history'), ('AR_XY', 
'human_aging'), ('AR_XY', 'human_sexuality'), ('AR_XY', 'international_law'), ('AR_XY', 'jurisprudence'), ('AR_XY', 'logical_fallacies'), ('AR_XY', 'machine_learning'), ('AR_XY', 'management'), ('AR_XY', 'marketing'), ('AR_XY', 'medical_genetics'), ('AR_XY', 'miscellaneous'), ('AR_XY', 'moral_disputes'), ('AR_XY', 'moral_scenarios'), ('AR_XY', 'nutrition'), ('AR_XY', 'philosophy'), ('AR_XY', 'prehistory'), ('AR_XY', 'professional_accounting'), ('AR_XY', 'professional_law'), ('AR_XY', 'professional_medicine'), ('AR_XY', 'professional_psychology'), ('AR_XY', 'public_relations'), ('AR_XY', 'security_studies'), ('AR_XY', 'sociology'), ('AR_XY', 'us_foreign_policy'), ('AR_XY', 'virology'), ('AR_XY', 'world_religions')] LANGUAGE = {"('FR', 'abstract_algebra')": , "('FR', 'anatomy')": , "('FR', 'astronomy')": , "('FR', 'business_ethics')": , "('FR', 'clinical_knowledge')": , "('FR', 'college_biology')": , "('FR', 'college_chemistry')": , "('FR', 'college_computer_science')": , "('FR', 'college_mathematics')": , "('FR', 'college_medicine')": , "('FR', 'college_physics')": , "('FR', 'computer_security')": , "('FR', 'conceptual_physics')": , "('FR', 'econometrics')": , "('FR', 'electrical_engineering')": , "('FR', 'elementary_mathematics')": , "('FR', 'formal_logic')": , "('FR', 'global_facts')": , "('FR', 'high_school_biology')": , "('FR', 'high_school_chemistry')": , "('FR', 'high_school_computer_science')": , "('FR', 'high_school_european_history')": , "('FR', 'high_school_geography')": , "('FR', 'high_school_government_and_politics')": , "('FR', 'high_school_macroeconomics')": , "('FR', 'high_school_mathematics')": , "('FR', 'high_school_microeconomics')": , "('FR', 'high_school_physics')": , "('FR', 'high_school_psychology')": , "('FR', 'high_school_statistics')": , "('FR', 'high_school_us_history')": , "('FR', 'high_school_world_history')": , "('FR', 'human_aging')": , "('FR', 'human_sexuality')": , "('FR', 'international_law')": , "('FR', 'jurisprudence')": , "('FR', 
'logical_fallacies')": , "('FR', 'machine_learning')": , "('FR', 'management')": , "('FR', 'marketing')": , "('FR', 'medical_genetics')": , "('FR', 'miscellaneous')": , "('FR', 'moral_disputes')": , "('FR', 'moral_scenarios')": , "('FR', 'nutrition')": , "('FR', 'philosophy')": , "('FR', 'prehistory')": , "('FR', 'professional_accounting')": , "('FR', 'professional_law')": , "('FR', 'professional_medicine')": , "('FR', 'professional_psychology')": , "('FR', 'public_relations')": , "('FR', 'security_studies')": , "('FR', 'sociology')": , "('FR', 'us_foreign_policy')": , "('FR', 'virology')": , "('FR', 'world_religions')": , "('DE', 'abstract_algebra')": , "('DE', 'anatomy')": , "('DE', 'astronomy')": , "('DE', 'business_ethics')": , "('DE', 'clinical_knowledge')": , "('DE', 'college_biology')": , "('DE', 'college_chemistry')": , "('DE', 'college_computer_science')": , "('DE', 'college_mathematics')": , "('DE', 'college_medicine')": , "('DE', 'college_physics')": , "('DE', 'computer_security')": , "('DE', 'conceptual_physics')": , "('DE', 'econometrics')": , "('DE', 'electrical_engineering')": , "('DE', 'elementary_mathematics')": , "('DE', 'formal_logic')": , "('DE', 'global_facts')": , "('DE', 'high_school_biology')": , "('DE', 'high_school_chemistry')": , "('DE', 'high_school_computer_science')": , "('DE', 'high_school_european_history')": , "('DE', 'high_school_geography')": , "('DE', 'high_school_government_and_politics')": , "('DE', 'high_school_macroeconomics')": , "('DE', 'high_school_mathematics')": , "('DE', 'high_school_microeconomics')": , "('DE', 'high_school_physics')": , "('DE', 'high_school_psychology')": , "('DE', 'high_school_statistics')": , "('DE', 'high_school_us_history')": , "('DE', 'high_school_world_history')": , "('DE', 'human_aging')": , "('DE', 'human_sexuality')": , "('DE', 'international_law')": , "('DE', 'jurisprudence')": , "('DE', 'logical_fallacies')": , "('DE', 'machine_learning')": , "('DE', 'management')": , "('DE', 'marketing')": 
, "('DE', 'medical_genetics')": , "('DE', 'miscellaneous')": , "('DE', 'moral_disputes')": , "('DE', 'moral_scenarios')": , "('DE', 'nutrition')": , "('DE', 'philosophy')": , "('DE', 'prehistory')": , "('DE', 'professional_accounting')": , "('DE', 'professional_law')": , "('DE', 'professional_medicine')": , "('DE', 'professional_psychology')": , "('DE', 'public_relations')": , "('DE', 'security_studies')": , "('DE', 'sociology')": , "('DE', 'us_foreign_policy')": , "('DE', 'virology')": , "('DE', 'world_religions')": , "('ES', 'abstract_algebra')": , "('ES', 'anatomy')": , "('ES', 'astronomy')": , "('ES', 'business_ethics')": , "('ES', 'clinical_knowledge')": , "('ES', 'college_biology')": , "('ES', 'college_chemistry')": , "('ES', 'college_computer_science')": , "('ES', 'college_mathematics')": , "('ES', 'college_medicine')": , "('ES', 'college_physics')": , "('ES', 'computer_security')": , "('ES', 'conceptual_physics')": , "('ES', 'econometrics')": , "('ES', 'electrical_engineering')": , "('ES', 'elementary_mathematics')": , "('ES', 'formal_logic')": , "('ES', 'global_facts')": , "('ES', 'high_school_biology')": , "('ES', 'high_school_chemistry')": , "('ES', 'high_school_computer_science')": , "('ES', 'high_school_european_history')": , "('ES', 'high_school_geography')": , "('ES', 'high_school_government_and_politics')": , "('ES', 'high_school_macroeconomics')": , "('ES', 'high_school_mathematics')": , "('ES', 'high_school_microeconomics')": , "('ES', 'high_school_physics')": , "('ES', 'high_school_psychology')": , "('ES', 'high_school_statistics')": , "('ES', 'high_school_us_history')": , "('ES', 'high_school_world_history')": , "('ES', 'human_aging')": , "('ES', 'human_sexuality')": , "('ES', 'international_law')": , "('ES', 'jurisprudence')": , "('ES', 'logical_fallacies')": , "('ES', 'machine_learning')": , "('ES', 'management')": , "('ES', 'marketing')": , "('ES', 'medical_genetics')": , "('ES', 'miscellaneous')": , "('ES', 'moral_disputes')": , "('ES', 
'moral_scenarios')": , "('ES', 'nutrition')": , "('ES', 'philosophy')": , "('ES', 'prehistory')": , "('ES', 'professional_accounting')": , "('ES', 'professional_law')": , "('ES', 'professional_medicine')": , "('ES', 'professional_psychology')": , "('ES', 'public_relations')": , "('ES', 'security_studies')": , "('ES', 'sociology')": , "('ES', 'us_foreign_policy')": , "('ES', 'virology')": , "('ES', 'world_religions')": , "('IT', 'abstract_algebra')": , "('IT', 'anatomy')": , "('IT', 'astronomy')": , "('IT', 'business_ethics')": , "('IT', 'clinical_knowledge')": , "('IT', 'college_biology')": , "('IT', 'college_chemistry')": , "('IT', 'college_computer_science')": , "('IT', 'college_mathematics')": , "('IT', 'college_medicine')": , "('IT', 'college_physics')": , "('IT', 'computer_security')": , "('IT', 'conceptual_physics')": , "('IT', 'econometrics')": , "('IT', 'electrical_engineering')": , "('IT', 'elementary_mathematics')": , "('IT', 'formal_logic')": , "('IT', 'global_facts')": , "('IT', 'high_school_biology')": , "('IT', 'high_school_chemistry')": , "('IT', 'high_school_computer_science')": , "('IT', 'high_school_european_history')": , "('IT', 'high_school_geography')": , "('IT', 'high_school_government_and_politics')": , "('IT', 'high_school_macroeconomics')": , "('IT', 'high_school_mathematics')": , "('IT', 'high_school_microeconomics')": , "('IT', 'high_school_physics')": , "('IT', 'high_school_psychology')": , "('IT', 'high_school_statistics')": , "('IT', 'high_school_us_history')": , "('IT', 'high_school_world_history')": , "('IT', 'human_aging')": , "('IT', 'human_sexuality')": , "('IT', 'international_law')": , "('IT', 'jurisprudence')": , "('IT', 'logical_fallacies')": , "('IT', 'machine_learning')": , "('IT', 'management')": , "('IT', 'marketing')": , "('IT', 'medical_genetics')": , "('IT', 'miscellaneous')": , "('IT', 'moral_disputes')": , "('IT', 'moral_scenarios')": , "('IT', 'nutrition')": , "('IT', 'philosophy')": , "('IT', 'prehistory')": , 
"('IT', 'professional_accounting')": , "('IT', 'professional_law')": , "('IT', 'professional_medicine')": , "('IT', 'professional_psychology')": , "('IT', 'public_relations')": , "('IT', 'security_studies')": , "('IT', 'sociology')": , "('IT', 'us_foreign_policy')": , "('IT', 'virology')": , "('IT', 'world_religions')": , "('PT', 'abstract_algebra')": , "('PT', 'anatomy')": , "('PT', 'astronomy')": , "('PT', 'business_ethics')": , "('PT', 'clinical_knowledge')": , "('PT', 'college_biology')": , "('PT', 'college_chemistry')": , "('PT', 'college_computer_science')": , "('PT', 'college_mathematics')": , "('PT', 'college_medicine')": , "('PT', 'college_physics')": , "('PT', 'computer_security')": , "('PT', 'conceptual_physics')": , "('PT', 'econometrics')": , "('PT', 'electrical_engineering')": , "('PT', 'elementary_mathematics')": , "('PT', 'formal_logic')": , "('PT', 'global_facts')": , "('PT', 'high_school_biology')": , "('PT', 'high_school_chemistry')": , "('PT', 'high_school_computer_science')": , "('PT', 'high_school_european_history')": , "('PT', 'high_school_geography')": , "('PT', 'high_school_government_and_politics')": , "('PT', 'high_school_macroeconomics')": , "('PT', 'high_school_mathematics')": , "('PT', 'high_school_microeconomics')": , "('PT', 'high_school_physics')": , "('PT', 'high_school_psychology')": , "('PT', 'high_school_statistics')": , "('PT', 'high_school_us_history')": , "('PT', 'high_school_world_history')": , "('PT', 'human_aging')": , "('PT', 'human_sexuality')": , "('PT', 'international_law')": , "('PT', 'jurisprudence')": , "('PT', 'logical_fallacies')": , "('PT', 'machine_learning')": , "('PT', 'management')": , "('PT', 'marketing')": , "('PT', 'medical_genetics')": , "('PT', 'miscellaneous')": , "('PT', 'moral_disputes')": , "('PT', 'moral_scenarios')": , "('PT', 'nutrition')": , "('PT', 'philosophy')": , "('PT', 'prehistory')": , "('PT', 'professional_accounting')": , "('PT', 'professional_law')": , "('PT', 'professional_medicine')": 
, "('PT', 'professional_psychology')": , "('PT', 'public_relations')": , "('PT', 'security_studies')": , "('PT', 'sociology')": , "('PT', 'us_foreign_policy')": , "('PT', 'virology')": , "('PT', 'world_religions')": , "('AR', 'abstract_algebra')": , "('AR', 'anatomy')": , "('AR', 'astronomy')": , "('AR', 'business_ethics')": , "('AR', 'clinical_knowledge')": , "('AR', 'college_biology')": , "('AR', 'college_chemistry')": , "('AR', 'college_computer_science')": , "('AR', 'college_mathematics')": , "('AR', 'college_medicine')": , "('AR', 'college_physics')": , "('AR', 'computer_security')": , "('AR', 'conceptual_physics')": , "('AR', 'econometrics')": , "('AR', 'electrical_engineering')": , "('AR', 'elementary_mathematics')": , "('AR', 'formal_logic')": , "('AR', 'global_facts')": , "('AR', 'high_school_biology')": , "('AR', 'high_school_chemistry')": , "('AR', 'high_school_computer_science')": , "('AR', 'high_school_european_history')": , "('AR', 'high_school_geography')": , "('AR', 'high_school_government_and_politics')": , "('AR', 'high_school_macroeconomics')": , "('AR', 'high_school_mathematics')": , "('AR', 'high_school_microeconomics')": , "('AR', 'high_school_physics')": , "('AR', 'high_school_psychology')": , "('AR', 'high_school_statistics')": , "('AR', 'high_school_us_history')": , "('AR', 'high_school_world_history')": , "('AR', 'human_aging')": , "('AR', 'human_sexuality')": , "('AR', 'international_law')": , "('AR', 'jurisprudence')": , "('AR', 'logical_fallacies')": , "('AR', 'machine_learning')": , "('AR', 'management')": , "('AR', 'marketing')": , "('AR', 'medical_genetics')": , "('AR', 'miscellaneous')": , "('AR', 'moral_disputes')": , "('AR', 'moral_scenarios')": , "('AR', 'nutrition')": , "('AR', 'philosophy')": , "('AR', 'prehistory')": , "('AR', 'professional_accounting')": , "('AR', 'professional_law')": , "('AR', 'professional_medicine')": , "('AR', 'professional_psychology')": , "('AR', 'public_relations')": , "('AR', 'security_studies')": , 
"('AR', 'sociology')": , "('AR', 'us_foreign_policy')": , "('AR', 'virology')": , "('AR', 'world_religions')": } ```` diff --git a/docs/tasks/MMMLU_GERMAN_COT.md b/docs/tasks/MMMLU_GERMAN_COT.md index fb1fef17..75297bcc 100644 --- a/docs/tasks/MMMLU_GERMAN_COT.md +++ b/docs/tasks/MMMLU_GERMAN_COT.md @@ -7,8 +7,8 @@ SAMPLE_SPLIT = test FEWSHOT_SPLIT = test RESPONSE_TYPE = COMPLETION METRICS = [AccuracyCompletion, GermanCompletionChecker] -SUBJECTS = [('DE_DE', 'abstract_algebra'), ('DE_DE', 'astronomy'), ('DE_DE', 'college_biology'), ('DE_DE', 'college_chemistry'), ('DE_DE', 'college_computer_science'), ('DE_DE', 'college_mathematics'), ('DE_DE', 'college_physics'), ('DE_DE', 'computer_security'), ('DE_DE', 'conceptual_physics'), ('DE_DE', 'electrical_engineering'), ('DE_DE', 'elementary_mathematics'), ('DE_DE', 'high_school_biology'), ('DE_DE', 'high_school_chemistry'), ('DE_DE', 'high_school_computer_science'), ('DE_DE', 'high_school_mathematics'), ('DE_DE', 'high_school_physics'), ('DE_DE', 'high_school_statistics'), ('DE_DE', 'machine_learning'), ('DE_DE', 'formal_logic'), ('DE_DE', 'high_school_european_history'), ('DE_DE', 'high_school_us_history'), ('DE_DE', 'high_school_world_history'), ('DE_DE', 'international_law'), ('DE_DE', 'jurisprudence'), ('DE_DE', 'logical_fallacies'), ('DE_DE', 'moral_disputes'), ('DE_DE', 'moral_scenarios'), ('DE_DE', 'philosophy'), ('DE_DE', 'prehistory'), ('DE_DE', 'professional_law'), ('DE_DE', 'world_religions'), ('DE_DE', 'econometrics'), ('DE_DE', 'high_school_geography'), ('DE_DE', 'high_school_government_and_politics'), ('DE_DE', 'high_school_macroeconomics'), ('DE_DE', 'high_school_microeconomics'), ('DE_DE', 'high_school_psychology'), ('DE_DE', 'human_sexuality'), ('DE_DE', 'professional_psychology'), ('DE_DE', 'public_relations'), ('DE_DE', 'security_studies'), ('DE_DE', 'sociology'), ('DE_DE', 'us_foreign_policy'), ('DE_DE', 'anatomy'), ('DE_DE', 'business_ethics'), ('DE_DE', 'clinical_knowledge'), ('DE_DE', 
'college_medicine'), ('DE_DE', 'global_facts'), ('DE_DE', 'human_aging'), ('DE_DE', 'management'), ('DE_DE', 'marketing'), ('DE_DE', 'medical_genetics'), ('DE_DE', 'miscellaneous'), ('DE_DE', 'nutrition'), ('DE_DE', 'professional_accounting'), ('DE_DE', 'professional_medicine'), ('DE_DE', 'virology')] -LANGUAGE = {"('de', 'abstract_algebra')": , "('de', 'astronomy')": , "('de', 'college_biology')": , "('de', 'college_chemistry')": , "('de', 'college_computer_science')": , "('de', 'college_mathematics')": , "('de', 'college_physics')": , "('de', 'computer_security')": , "('de', 'conceptual_physics')": , "('de', 'electrical_engineering')": , "('de', 'elementary_mathematics')": , "('de', 'high_school_biology')": , "('de', 'high_school_chemistry')": , "('de', 'high_school_computer_science')": , "('de', 'high_school_mathematics')": , "('de', 'high_school_physics')": , "('de', 'high_school_statistics')": , "('de', 'machine_learning')": , "('de', 'formal_logic')": , "('de', 'high_school_european_history')": , "('de', 'high_school_us_history')": , "('de', 'high_school_world_history')": , "('de', 'international_law')": , "('de', 'jurisprudence')": , "('de', 'logical_fallacies')": , "('de', 'moral_disputes')": , "('de', 'moral_scenarios')": , "('de', 'philosophy')": , "('de', 'prehistory')": , "('de', 'professional_law')": , "('de', 'world_religions')": , "('de', 'econometrics')": , "('de', 'high_school_geography')": , "('de', 'high_school_government_and_politics')": , "('de', 'high_school_macroeconomics')": , "('de', 'high_school_microeconomics')": , "('de', 'high_school_psychology')": , "('de', 'human_sexuality')": , "('de', 'professional_psychology')": , "('de', 'public_relations')": , "('de', 'security_studies')": , "('de', 'sociology')": , "('de', 'us_foreign_policy')": , "('de', 'anatomy')": , "('de', 'business_ethics')": , "('de', 'clinical_knowledge')": , "('de', 'college_medicine')": , "('de', 'global_facts')": , "('de', 'human_aging')": , "('de', 'management')": , 
"('de', 'marketing')": , "('de', 'medical_genetics')": , "('de', 'miscellaneous')": , "('de', 'nutrition')": , "('de', 'professional_accounting')": , "('de', 'professional_medicine')": , "('de', 'virology')": } +SUBJECTS = [('DE_DE', 'abstract_algebra'), ('DE_DE', 'anatomy'), ('DE_DE', 'astronomy'), ('DE_DE', 'business_ethics'), ('DE_DE', 'clinical_knowledge'), ('DE_DE', 'college_biology'), ('DE_DE', 'college_chemistry'), ('DE_DE', 'college_computer_science'), ('DE_DE', 'college_mathematics'), ('DE_DE', 'college_medicine'), ('DE_DE', 'college_physics'), ('DE_DE', 'computer_security'), ('DE_DE', 'conceptual_physics'), ('DE_DE', 'econometrics'), ('DE_DE', 'electrical_engineering'), ('DE_DE', 'elementary_mathematics'), ('DE_DE', 'formal_logic'), ('DE_DE', 'global_facts'), ('DE_DE', 'high_school_biology'), ('DE_DE', 'high_school_chemistry'), ('DE_DE', 'high_school_computer_science'), ('DE_DE', 'high_school_european_history'), ('DE_DE', 'high_school_geography'), ('DE_DE', 'high_school_government_and_politics'), ('DE_DE', 'high_school_macroeconomics'), ('DE_DE', 'high_school_mathematics'), ('DE_DE', 'high_school_microeconomics'), ('DE_DE', 'high_school_physics'), ('DE_DE', 'high_school_psychology'), ('DE_DE', 'high_school_statistics'), ('DE_DE', 'high_school_us_history'), ('DE_DE', 'high_school_world_history'), ('DE_DE', 'human_aging'), ('DE_DE', 'human_sexuality'), ('DE_DE', 'international_law'), ('DE_DE', 'jurisprudence'), ('DE_DE', 'logical_fallacies'), ('DE_DE', 'machine_learning'), ('DE_DE', 'management'), ('DE_DE', 'marketing'), ('DE_DE', 'medical_genetics'), ('DE_DE', 'miscellaneous'), ('DE_DE', 'moral_disputes'), ('DE_DE', 'moral_scenarios'), ('DE_DE', 'nutrition'), ('DE_DE', 'philosophy'), ('DE_DE', 'prehistory'), ('DE_DE', 'professional_accounting'), ('DE_DE', 'professional_law'), ('DE_DE', 'professional_medicine'), ('DE_DE', 'professional_psychology'), ('DE_DE', 'public_relations'), ('DE_DE', 'security_studies'), ('DE_DE', 'sociology'), ('DE_DE', 
'us_foreign_policy'), ('DE_DE', 'virology'), ('DE_DE', 'world_religions')] +LANGUAGE = {"('de', 'abstract_algebra')": , "('de', 'anatomy')": , "('de', 'astronomy')": , "('de', 'business_ethics')": , "('de', 'clinical_knowledge')": , "('de', 'college_biology')": , "('de', 'college_chemistry')": , "('de', 'college_computer_science')": , "('de', 'college_mathematics')": , "('de', 'college_medicine')": , "('de', 'college_physics')": , "('de', 'computer_security')": , "('de', 'conceptual_physics')": , "('de', 'econometrics')": , "('de', 'electrical_engineering')": , "('de', 'elementary_mathematics')": , "('de', 'formal_logic')": , "('de', 'global_facts')": , "('de', 'high_school_biology')": , "('de', 'high_school_chemistry')": , "('de', 'high_school_computer_science')": , "('de', 'high_school_european_history')": , "('de', 'high_school_geography')": , "('de', 'high_school_government_and_politics')": , "('de', 'high_school_macroeconomics')": , "('de', 'high_school_mathematics')": , "('de', 'high_school_microeconomics')": , "('de', 'high_school_physics')": , "('de', 'high_school_psychology')": , "('de', 'high_school_statistics')": , "('de', 'high_school_us_history')": , "('de', 'high_school_world_history')": , "('de', 'human_aging')": , "('de', 'human_sexuality')": , "('de', 'international_law')": , "('de', 'jurisprudence')": , "('de', 'logical_fallacies')": , "('de', 'machine_learning')": , "('de', 'management')": , "('de', 'marketing')": , "('de', 'medical_genetics')": , "('de', 'miscellaneous')": , "('de', 'moral_disputes')": , "('de', 'moral_scenarios')": , "('de', 'nutrition')": , "('de', 'philosophy')": , "('de', 'prehistory')": , "('de', 'professional_accounting')": , "('de', 'professional_law')": , "('de', 'professional_medicine')": , "('de', 'professional_psychology')": , "('de', 'public_relations')": , "('de', 'security_studies')": , "('de', 'sociology')": , "('de', 'us_foreign_policy')": , "('de', 'virology')": , "('de', 'world_religions')": } ```` - Module: 
`eval_framework.tasks.benchmarks.mmmlu` diff --git a/docs/tasks/MMMLU_German.md b/docs/tasks/MMMLU_German.md index 139bcb51..db02bf99 100644 --- a/docs/tasks/MMMLU_German.md +++ b/docs/tasks/MMMLU_German.md @@ -7,7 +7,7 @@ SAMPLE_SPLIT = test FEWSHOT_SPLIT = test RESPONSE_TYPE = LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood] -SUBJECTS = [('DE_DE', 'abstract_algebra'), ('DE_DE', 'astronomy'), ('DE_DE', 'college_biology'), ('DE_DE', 'college_chemistry'), ('DE_DE', 'college_computer_science'), ('DE_DE', 'college_mathematics'), ('DE_DE', 'college_physics'), ('DE_DE', 'computer_security'), ('DE_DE', 'conceptual_physics'), ('DE_DE', 'electrical_engineering'), ('DE_DE', 'elementary_mathematics'), ('DE_DE', 'high_school_biology'), ('DE_DE', 'high_school_chemistry'), ('DE_DE', 'high_school_computer_science'), ('DE_DE', 'high_school_mathematics'), ('DE_DE', 'high_school_physics'), ('DE_DE', 'high_school_statistics'), ('DE_DE', 'machine_learning'), ('DE_DE', 'formal_logic'), ('DE_DE', 'high_school_european_history'), ('DE_DE', 'high_school_us_history'), ('DE_DE', 'high_school_world_history'), ('DE_DE', 'international_law'), ('DE_DE', 'jurisprudence'), ('DE_DE', 'logical_fallacies'), ('DE_DE', 'moral_disputes'), ('DE_DE', 'moral_scenarios'), ('DE_DE', 'philosophy'), ('DE_DE', 'prehistory'), ('DE_DE', 'professional_law'), ('DE_DE', 'world_religions'), ('DE_DE', 'econometrics'), ('DE_DE', 'high_school_geography'), ('DE_DE', 'high_school_government_and_politics'), ('DE_DE', 'high_school_macroeconomics'), ('DE_DE', 'high_school_microeconomics'), ('DE_DE', 'high_school_psychology'), ('DE_DE', 'human_sexuality'), ('DE_DE', 'professional_psychology'), ('DE_DE', 'public_relations'), ('DE_DE', 'security_studies'), ('DE_DE', 'sociology'), ('DE_DE', 'us_foreign_policy'), ('DE_DE', 'anatomy'), ('DE_DE', 'business_ethics'), ('DE_DE', 'clinical_knowledge'), ('DE_DE', 'college_medicine'), ('DE_DE', 'global_facts'), ('DE_DE', 'human_aging'), ('DE_DE', 
'management'), ('DE_DE', 'marketing'), ('DE_DE', 'medical_genetics'), ('DE_DE', 'miscellaneous'), ('DE_DE', 'nutrition'), ('DE_DE', 'professional_accounting'), ('DE_DE', 'professional_medicine'), ('DE_DE', 'virology')] +SUBJECTS = [('DE_DE', 'abstract_algebra'), ('DE_DE', 'anatomy'), ('DE_DE', 'astronomy'), ('DE_DE', 'business_ethics'), ('DE_DE', 'clinical_knowledge'), ('DE_DE', 'college_biology'), ('DE_DE', 'college_chemistry'), ('DE_DE', 'college_computer_science'), ('DE_DE', 'college_mathematics'), ('DE_DE', 'college_medicine'), ('DE_DE', 'college_physics'), ('DE_DE', 'computer_security'), ('DE_DE', 'conceptual_physics'), ('DE_DE', 'econometrics'), ('DE_DE', 'electrical_engineering'), ('DE_DE', 'elementary_mathematics'), ('DE_DE', 'formal_logic'), ('DE_DE', 'global_facts'), ('DE_DE', 'high_school_biology'), ('DE_DE', 'high_school_chemistry'), ('DE_DE', 'high_school_computer_science'), ('DE_DE', 'high_school_european_history'), ('DE_DE', 'high_school_geography'), ('DE_DE', 'high_school_government_and_politics'), ('DE_DE', 'high_school_macroeconomics'), ('DE_DE', 'high_school_mathematics'), ('DE_DE', 'high_school_microeconomics'), ('DE_DE', 'high_school_physics'), ('DE_DE', 'high_school_psychology'), ('DE_DE', 'high_school_statistics'), ('DE_DE', 'high_school_us_history'), ('DE_DE', 'high_school_world_history'), ('DE_DE', 'human_aging'), ('DE_DE', 'human_sexuality'), ('DE_DE', 'international_law'), ('DE_DE', 'jurisprudence'), ('DE_DE', 'logical_fallacies'), ('DE_DE', 'machine_learning'), ('DE_DE', 'management'), ('DE_DE', 'marketing'), ('DE_DE', 'medical_genetics'), ('DE_DE', 'miscellaneous'), ('DE_DE', 'moral_disputes'), ('DE_DE', 'moral_scenarios'), ('DE_DE', 'nutrition'), ('DE_DE', 'philosophy'), ('DE_DE', 'prehistory'), ('DE_DE', 'professional_accounting'), ('DE_DE', 'professional_law'), ('DE_DE', 'professional_medicine'), ('DE_DE', 'professional_psychology'), ('DE_DE', 'public_relations'), ('DE_DE', 'security_studies'), ('DE_DE', 'sociology'), ('DE_DE', 
'us_foreign_policy'), ('DE_DE', 'virology'), ('DE_DE', 'world_religions')] LANGUAGE = ```` diff --git a/src/eval_framework/tasks/benchmarks/arc.py b/src/eval_framework/tasks/benchmarks/arc.py index c8b52a1e..b047c7c7 100644 --- a/src/eval_framework/tasks/benchmarks/arc.py +++ b/src/eval_framework/tasks/benchmarks/arc.py @@ -12,241 +12,6 @@ from eval_framework.tasks.task_style import BPBStyle, ClozeStyle, MCStyle, answer_key_to_index from eval_framework.tasks.utils import get_n_letters -# OLMES fixed fewshot sources, keyed by HF subject name. -# Source: https://github.com/allenai/olmes (FEWSHOT_SOURCES["OLMES:ARC-*"]) -_ARC_FEWSHOT_SOURCES: dict[str, list[dict[str, Any]]] = { - "ARC-Easy": [ - { - "id": "MCAS_2007_8_5189", - "question": "Lichens are symbiotic organisms made of green algae and fungi. What do the green algae supply " - "to the fungi in this symbiotic relationship?", - "choices": {"text": ["carbon dioxide", "food", "protection", "water"], "label": ["A", "B", "C", "D"]}, - "answerKey": "B", - }, - { - "id": "Mercury_SC_401169", - "question": "When a switch is used in an electrical circuit, the switch can", - "choices": { - "text": [ - "cause the charge to build.", - "increase and decrease the voltage.", - "cause the current to change direction.", - "stop and start the flow of current.", - ], - "label": ["A", "B", "C", "D"], - }, - "answerKey": "D", - }, - { - "id": "MCAS_2004_8_27", - "question": "Which of the following is an example of an assistive device?", - "choices": { - "text": ["contact lens", "motorcycle", "raincoat", "coffee pot"], - "label": ["A", "B", "C", "D"], - }, - "answerKey": "A", - }, - { - "id": "NYSEDREGENTS_2006_8_10", - "question": "Rocks are classified as igneous, metamorphic, or sedimentary according to", - "choices": { - "text": ["their color", "their shape", "how they formed", "the minerals they contain"], - "label": ["1", "2", "3", "4"], - }, - "answerKey": "3", - }, - { - "id": "Mercury_7013388", - "question": "A chewable 
calcium carbonate tablet is a common treatment for stomach discomfort. Calcium " - "carbonate is most likely used as this type of medicine because calcium carbonate", - "choices": { - "text": [ - "has a pleasant flavor.", - "is inexpensive to produce.", - "neutralizes digestive acid.", - "occurs naturally in the body.", - ], - "label": ["A", "B", "C", "D"], - }, - "answerKey": "C", - }, - { - "id": "Mercury_7179953", - "question": "Which two body systems are directly involved in movement?", - "choices": { - "text": [ - "muscular and skeletal", - "digestive and muscular", - "skeletal and respiratory", - "respiratory and digestive", - ], - "label": ["A", "B", "C", "D"], - }, - "answerKey": "A", - }, - { - "id": "Mercury_7205118", - "question": "Which change in the state of water particles causes the particles to become arranged in a" - " fixed position?", - "choices": {"text": ["boiling", "melting", "freezing", "evaporating"], "label": ["A", "B", "C", "D"]}, - "answerKey": "C", - }, - { - "id": "MCAS_2016_8_13", - "question": "Earth's core is primarily composed of which of the following materials?", - "choices": {"text": ["basalt", "iron", "magma", "quartz"], "label": ["A", "B", "C", "D"]}, - "answerKey": "B", - }, - ], - "ARC-Challenge": [ - { - "id": "Mercury_SC_415702", - "question": "George wants to warm his hands quickly by rubbing them. 
Which skin surface will produce the " - "most heat?", - "choices": { - "text": ["dry palms", "wet palms", "palms covered with oil", "palms covered with lotion"], - "label": ["A", "B", "C", "D"], - }, - "answerKey": "A", - }, - { - "id": "MCAS_2009_5_6516", - "question": "Which of the following statements best explains why magnets usually stick to a refrigerator " - "door?", - "choices": { - "text": [ - "The refrigerator door is smooth.", - "The refrigerator door contains iron.", - "The refrigerator door is a good conductor.", - "The refrigerator door has electric wires in it.", - ], - "label": ["A", "B", "C", "D"], - }, - "answerKey": "B", - }, - { - "id": "Mercury_7233695", - "question": "A fold observed in layers of sedimentary rock most likely resulted from the", - "choices": { - "text": [ - "cooling of flowing magma.", - "converging of crustal plates.", - "deposition of river sediments.", - "solution of carbonate minerals.", - ], - "label": ["A", "B", "C", "D"], - }, - "answerKey": "B", - }, - { - "id": "Mercury_7041615", - "question": "Which of these do scientists offer as the most recent explanation as to why many plants and " - "animals died out at the end of the Mesozoic era?", - "choices": { - "text": [ - "worldwide disease", - "global mountain building", - "rise of mammals that preyed upon plants and animals", - "impact of an asteroid created dust that blocked the sunlight", - ], - "label": ["A", "B", "C", "D"], - }, - "answerKey": "D", - }, - { - "id": "MCAS_1998_4_3", - "question": "Which of the following is a trait that a dog does NOT inherit from its parents?", - "choices": { - "text": [ - "the length of its fur", - "the shape of its nose", - "the size of its appetite", - "the color of its fur", - ], - "label": ["A", "B", "C", "D"], - }, - "answerKey": "C", - }, - { - "id": "Mercury_7041860", - "question": "A boat is acted on by a river current flowing north and by wind blowing on its sails. The boat" - " travels northeast. 
In which direction is the wind most likely applying force to the sails of the boat?", - "choices": {"text": ["west", "east", "north", "south"], "label": ["A", "B", "C", "D"]}, - "answerKey": "B", - }, - { - "id": "ACTAAP_2013_5_11", - "question": "As part of an experiment, an astronaut takes a scale to the Moon and weighs himself. The scale" - " reads 31 pounds. If the astronaut has a mass of about 84 kilograms, which are the approximate weight " - "and mass of the astronaut when standing on the Earth?", - "choices": { - "text": [ - "31 pounds and 14 kilograms", - "31 pounds and 84 kilograms", - "186 pounds and 14 kilograms", - "186 pounds and 84 kilograms", - ], - "label": ["A", "B", "C", "D"], - }, - "answerKey": "D", - }, - { - "id": "MDSA_2008_5_30", - "question": "On Earth, water can be a solid, a liquid, or a gas. Which energy source has the greatest " - "influence on the state of matter of water?", - "choices": { - "text": ["the sun", "the wind", "ocean currents", "the metal core"], - "label": ["A", "B", "C", "D"], - }, - "answerKey": "A", - }, - { - "id": "MEA_2016_8_14", - "question": "Which statement best compares single-celled and multi-celled organisms?", - "choices": { - "text": [ - "Tissues in a single-celled organism are like the cells in a multi-celled organism.", - "The nucleus in a single-celled organism is like the skin of a multi-celled organism.", - "Organelles in a single-celled organism are like the organs in a multi-celled organism.", - "The cytoplasm in a single-celled organism is like the nervous system in a multi-celled organism.", - ], - "label": ["A", "B", "C", "D"], - }, - "answerKey": "C", - }, - { - "id": "Mercury_SC_401653", - "question": "Which land form is the result of the constructive force of a glacier?", - "choices": { - "text": [ - "valleys carved by a moving glacier", - "piles of rocks deposited by a melting glacier", - "grooves created in a granite surface by a glacier", - "bedrock hills roughened by the passing of a 
glacier", - ], - "label": ["A", "B", "C", "D"], - }, - "answerKey": "B", - }, - { - "id": "Mercury_7106908", - "question": "Hatchling sea turtles are typically dark in color. Occasionally, a sea turtle hatches that " - "is almost white in color. When crawling from the nest on the beach to the ocean, this light-colored sea " - "turtle could be at risk for sunburn. The light color of the turtles would most likely", - "choices": { - "text": [ - "help the turtles have better chances at reproducing.", - "cause the shell of the sea turtles to become stronger.", - "reduce the chances of turtles surviving to reproduce.", - "help in the development of a new species of sea turtles.", - ], - "label": ["A", "B", "C", "D"], - }, - "answerKey": "C", - }, - ], -} # noqa: E501 - class ARC(BaseTask[str]): """ARC dataset: https://huggingface.co/datasets/allenai/ai2_arc""" @@ -354,10 +119,6 @@ def _get_choices(self, item: dict[str, Any]) -> list[str]: def _get_correct_index(self, item: dict[str, Any]) -> int: return answer_key_to_index(item["answerKey"]) - def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict[str, Any]]: - subject = item.get("subject", "") - return _ARC_FEWSHOT_SOURCES.get(subject, [])[: self.num_fewshot] - class ARCCloze(_ARCChoice_Base): NAME = "ARCCloze" diff --git a/src/eval_framework/tasks/benchmarks/hellaswag.py b/src/eval_framework/tasks/benchmarks/hellaswag.py index 4b54955b..3d69945c 100644 --- a/src/eval_framework/tasks/benchmarks/hellaswag.py +++ b/src/eval_framework/tasks/benchmarks/hellaswag.py @@ -12,24 +12,6 @@ from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType from eval_framework.tasks.task_style import BPBStyle, ClozeStyle, MCStyle -# fmt: off -# OLMES fixed fewshot sources for HellaSwag. 
-# Source: https://github.com/allenai/olmes (FEWSHOT_SOURCES["OLMES:HellaSwag"]) -_HELLASWAG_FEWSHOTS: list[dict[str, Any]] = [ - {"ind": 12, "activity_label": "Health", "ctx_a": "[header] How to cope with suicidal thoughts [title] Put off any plans. [step] Promise yourself that you'll wait 48 hours before doing anything. Remember, thoughts don't have the power to force you to act.", "ctx_b": "", "endings": ["Even when you do, there may be a small image of the future still lurking around your brain. [substeps] For instance, don't tell yourself that you can't make it.", "You're doing something, and no one can force you to act. It's completely natural to feel negative thoughts before you act.", "Do not panic if people talk to you (even if it's about quitting smoking). Have a plan for how you're going to react to a group of people who bring on suicidal thoughts.", "Sometimes extreme pain can distort our perception. Waiting before taking action will give your mind time to clear."], "label": "3"},#noqa - {"ind": 39, "activity_label": "Education and Communications", "ctx_a": "[header] How to make a liquid into a solid [title] Place a small open container of water in the freezer compartment of a class or home refrigerator. [title] Leave the water there for several hours or overnight. [title] Remove from the freezer and note what has occurred.", "ctx_b": "", "endings": ["[step] Water changes state from liquid to solid when it reaches a temperature of 0 degrees celsius, or 32 degrees fahrenheit. This is a simple example of changing from liquid to solid, or freezing.", "[substeps] Check that the container is completely dry, but no ice has formed. You should get a sample before disposing of it.", "[step] Don't drink and continue making liquid. [title] Separate the ice water if you're not used to using water.", "[title] Set a timer to check on the reaction. 
[step] The liquid should be safe to use again once the water has frozen completely and the food appears firm."], "label": "0"}, #noqa - {"ind": 9, "activity_label": "Baking cookies", "ctx_a": "A female chef in white uniform shows a stack of baking pans in a large kitchen presenting them. The pans are filled with pastries and loaded into the oven.", "ctx_b": "a knife", "endings": ["is seen moving on a board and cutting out its contents.", "hits the peeled cheesecake, followed by sliced custard and still cooked ice cream.", "etches a shape into the inside of the baked pans.", "is used to cut cylinder shaped dough into rounds."], "label": "3"},#noqa - {"ind": 47, "activity_label": "Starting a campfire", "ctx_a": "He takes his lighter and lights the newspaper in several places to start the fire. The bonfire starts burning and continues to burn.", "ctx_b": "he", "endings": ["plays with the dog and makes two cookies.", "adds a few more twigs to keep the flames burning.", "gets up and attempts to put a flag on it, fails and makes a complete ass out of himself.", "puts on equipment and stools."], "label": "1"},#noqa - {"ind": 38, "activity_label": "Finance and Business", "ctx_a": "[header] How to write a method statement [title] Prepare to write by conducting a risk assessment--an in-depth examination of the task or process. [substeps] Identify the work hazards (those that could potentially cause poor health or personal harm) that are inherent in the task. Analyze what has been done about these hazards and if these measures are enough to reduce the harm potential to an acceptable level.", "ctx_b": "", "endings": ["Determine if there are further steps you would like to take. For example, if you want to write about looking as though you've truly experienced the problem in practice, doing a risk assessment may help you so further in mental illness.", "Review the information presented to the project and get an understanding of the hazards. 
[title] Organize and plan a rest period that will help the sanitation industry and forest service team manage the task more effectively.", "Decide what additional measures need to be taken to reduce harm if an acceptable level has not been met. [title] Begin to write your method statement, starting at the header.", "[title] Write the search code (cnet) heading. [step] To write an article or report, simply write the following code (cnet: alternative sources and outcomes."], "label": "2"},#noqa - {"ind": 38, "activity_label": "Arm wrestling", "ctx_a": "Two bodybuilder women are seated at a table. They are arm wrestling, vieing to win.", "ctx_b": "when there", "endings": ["'s another wrestler, they finish wrestling him.", "is a winner they go cheer each other on.", "is a victor, the two women shake hands.", "is not a winner, they get a huge kick in the face and continue wrestling as the crowd cheers on."], "label": "2"},#noqa - {"ind": 51, "activity_label": "Painting", "ctx_a": "A lady named linda, creator of paint along is demonstrating how to do an acrylic painting.", "ctx_b": "she", "endings": ["extensively paints from fabric and paint horse tails on a painting screen.", "starts with a one inch flat brush and yellow and white acrylic paint.", "shows off her paint thinner and begins to tell her story about the underground bottle of magenta paints.", "demonstrates how to bring a window down from the wall."], "label": "1"},#noqa - {"ind": 63, "activity_label": "Fixing the roof", "ctx_a": "A woman with long, black, curly hair is wearing casual wear, talking, and squatting on a roof.", "ctx_b": "the woman", "endings": ["then stands up and walks to a part of the roof where she lifts up a black shingle on the roof.", "turns on a machine attached to a hand cart with multiple metal rails and drives it underneath a large roof.", "raise her left leg to the graffiti, move it partially along, and just gets herself started climbing the tiles.", "holds her back while she works on 
the roof, she holds her legs behind her legs."], "label": "0"},#noqa - {"ind": 4, "activity_label": "Removing ice from car", "ctx_a": "Then, the man writes over the snow covering the window of a car, and a woman wearing winter clothes smiles.", "ctx_b": "then", "endings": [", the man adds wax to the windshield and cuts it.", ", a person board a ski lift, while two men supporting the head of the person wearing winter clothes snow as the we girls sled.", ", the man puts on a christmas coat, knitted with netting.", ", the man continues removing the snow on his car."], "label": "3"},#noqa - {"ind": 30, "activity_label": "Getting a haircut", "ctx_a": "The man in the blue shirt sits on the chair next to the sink. The other man begins washing his hair. He scrubs in the shampoo and then washes it off.", "ctx_b": "he", "endings": ["then combs it and blow dries his hair after styling it with gel.", "shows the razor that he has for shaving his hair.", "hair is now dry, he is on his way to the barber.", "moves the bucket to the other side of the sink and continues washing his hair."], "label": "0"},#noqa - {"ind": 61, "activity_label": "Brushing teeth", "ctx_a": "A little boy walk toward the sink.", "ctx_b": "the boy", "endings": ["falling shits his pants from the bottom out.", "stands water to rinse his mouth.", "stands on front the sink and puts toothpaste on the brush, and then brush the teeth.", "rinses his cup in the pot, then put glasses on it."], "label": "2"},#noqa -] -# fmt: on - class HELLASWAG(BaseTask[str]): """Hellaswag dataset: https://huggingface.co/datasets/Rowan/hellaswag @@ -129,9 +111,6 @@ def _get_raw_question(self, item: dict[str, Any]) -> str: def _get_correct_index(self, item: dict[str, Any]) -> int: return int(item["label"] if item["label"] != "" else 0) - def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict[str, Any]]: - return _HELLASWAG_FEWSHOTS[: self.num_fewshot] - class HELLASWAGCloze(_HELLASWAG_Base): NAME = "HELLASWAGCloze" diff 
--git a/src/eval_framework/tasks/benchmarks/humaneval.py b/src/eval_framework/tasks/benchmarks/humaneval.py index 0b469266..ba5ccc0c 100644 --- a/src/eval_framework/tasks/benchmarks/humaneval.py +++ b/src/eval_framework/tasks/benchmarks/humaneval.py @@ -143,23 +143,7 @@ def _get_cue_text(self, item: dict[str, Any]) -> str: class _CodexHumanEval_Base(BaseTask[str]): - """Shared base for codex_humaneval_gold_bpb_3shot-compatible HumanEval variants. - - Follows the TASK_STYLER pattern (like ARC): - - ``_get_raw_question`` → ``item["prompt"]`` (function signature + docstring) - - ``_get_choices`` → ``[item["canonical_solution"]]`` - - ``_get_correct_index`` → ``0`` - - ``RESPONSE_TYPE`` and ``METRICS`` are provided by the ``TASK_STYLER``. - - BPBStyle normally prepends ``" "`` to the scored completion, but HumanEval - prompts already end with ``"\\n"`` which ConcatFormatter strips from the last - USER message. ``_get_possible_completions`` is therefore overridden to omit - that space so the completion starts directly with the four-space indent of - the function body, matching the olmo_eval reference. The fewshot *target* - retains the leading space via ``BPBStyle.get_fewshot_target_text`` because - those messages are not the final USER turn (no stripping). - """ + """Shared base for codex_humaneval_gold_bpb_3shot-compatible HumanEval variants.""" DATASET_PATH = "openai/openai_humaneval" SAMPLE_SPLIT = "test" diff --git a/src/eval_framework/tasks/benchmarks/mbpp.py b/src/eval_framework/tasks/benchmarks/mbpp.py index 0e480f08..3e3666e4 100644 --- a/src/eval_framework/tasks/benchmarks/mbpp.py +++ b/src/eval_framework/tasks/benchmarks/mbpp.py @@ -351,26 +351,6 @@ class MBPP_BPB(_MBPP_Base): TASK_STYLER = BPBStyle(question_prefix="", cue_text=BEGIN) -# fmt: off -# Fixed 3-shot fewshot examples matching codex_mbpp_gold_bpb_3shot. -# Source: MBPP "full" prompt split, task_ids 3, 9, 4 (in that order). 
-_CODEX_MBPP_FEWSHOTS: list[dict[str, Any]] = [ - { - "text": "Write a python function to identify non-prime numbers.", - "code": "import math\ndef is_not_prime(n):\n result = False\n for i in range(2,int(math.sqrt(n)) + 1):\n if n % i == 0:\n result = True\n return result",#noqa - }, - { - "text": "Write a python function to find the minimum number of rotations required to get the same string.", - "code": "def find_Rotations(str): \n tmp = str + str\n n = len(str) \n for i in range(1,n + 1): \n substring = tmp[i: i+n] \n if (str == substring): \n return i \n return n",#noqa - }, - { - "text": "Write a function to find the largest integers from a given list of numbers using heap queue algorithm.",#noqa - "code": "import heapq as hq\ndef heap_queue_largest(nums,n):\n largest_nums = hq.nlargest(n, nums)\n return largest_nums",#noqa - }, -] -# fmt: on - - class _CodexMBPP_Base(BaseTask[str]): """Shared base for the codex_mbpp_gold_bpb_3shot-compatible MBPP variants. @@ -410,9 +390,6 @@ def _get_fewshot_target_text(self, item: dict[str, Any]) -> str: code = self._normalize_code(item["code"]) return f"```python\n{code}\n```" - def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict[str, Any]]: - return _CODEX_MBPP_FEWSHOTS[: self.num_fewshot] - class CodexMBPP_BPB(_CodexMBPP_Base): """BPB-only MBPP variant that matches the codex_mbpp_gold_bpb_3shot reference. 
diff --git a/src/eval_framework/tasks/benchmarks/mmlu.py b/src/eval_framework/tasks/benchmarks/mmlu.py index 97b2b791..8bca317b 100644 --- a/src/eval_framework/tasks/benchmarks/mmlu.py +++ b/src/eval_framework/tasks/benchmarks/mmlu.py @@ -83,7 +83,7 @@ "virology", ] -MMLU_SUBJECTS = MMLU_STEM + MMLU_HUMANITIES + MMLU_SOCIAL_SCIENCES + MMLU_OTHER +MMLU_SUBJECTS = sorted(MMLU_STEM + MMLU_HUMANITIES + MMLU_SOCIAL_SCIENCES + MMLU_OTHER) class MMLU(BaseTask[str]): From 95f36c633ac2b19880f306b5fcbdb765602328b3 Mon Sep 17 00:00:00 2001 From: Prabhu Sivaprasad Date: Thu, 9 Apr 2026 13:39:08 +0000 Subject: [PATCH 13/13] hashes --- .../tasks/task-prompts-hashes.json | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/tests_eval_framework/tasks/task-prompts-hashes.json b/tests/tests_eval_framework/tasks/task-prompts-hashes.json index f823e21e..a56525ae 100644 --- a/tests/tests_eval_framework/tasks/task-prompts-hashes.json +++ b/tests/tests_eval_framework/tasks/task-prompts-hashes.json @@ -7,6 +7,12 @@ "AIME2026.Llama3Formatter": "3ff7afee4d41646990b5cc24272db494", "ARC.ConcatFormatter": "bd30651bf7141f65afcfa89cb449fe80", "ARC.Llama3Formatter": "0027b1a525b202c4bd3809d6be54f8fa", + "ARCBPB.ConcatFormatter": "1f52205996ca708a0a0a54309e2ccf44", + "ARCBPB.Llama3Formatter": "b22269e9e72763d8d5fa248577de4cab", + "ARCCloze.ConcatFormatter": "bd30651bf7141f65afcfa89cb449fe80", + "ARCCloze.Llama3Formatter": "0027b1a525b202c4bd3809d6be54f8fa", + "ARCMC.ConcatFormatter": "1ed0a2e8375df1d6856932346e8e40b2", + "ARCMC.Llama3Formatter": "0777ec98b33498d916ed22fbfe68ee94", "ARC_DE.ConcatFormatter": "bb58420112447c9e7f6b3b92cd3adc9f", "ARC_DE.Llama3Formatter": "71ed6989a95477588fbd80b6ae3bff68", "ARC_EU20_DE.ConcatFormatter": "8b931a0504d5e41c0ebdf2c799268e58", @@ -45,6 +51,10 @@ "COPA_OLMES.Llama3Formatter": "65456e820526b80949bec9dc00a8e33f", "ChemBench.ConcatFormatter": "4a5c849a20cba792c46ac4af8ed88e8d", "ChemBench.Llama3Formatter": 
"f36935a9d4c8900a5a74731b46e5b1b3", + "CodexHumanEval_BPB.ConcatFormatter": "6f3202cc06e81ed52eaf95c012642f3e", + "CodexHumanEval_BPB.Llama3Formatter": "134128d0706d25c4ecbc7e624754c843", + "CodexMBPP_BPB.ConcatFormatter": "ac5d042d10fc5486ceaae1e145bf20f0", + "CodexMBPP_BPB.Llama3Formatter": "bd2fc979902ee9c80d5d4cedac9aa5ce", "CommonsenseQACloze.ConcatFormatter": "c644c3c2a3395d83fca2edc3fc31844b", "CommonsenseQACloze.Llama3Formatter": "393a85d1b86304b22b57038f430302b0", "CommonsenseQAFullTextCloze.ConcatFormatter": "9f01052649ab337b9b7181c9302a4fc3", @@ -102,6 +112,12 @@ "GridDifference.EXTRACTED_GRID": "c3463fc837dbf31c6d20e4c6a135f14f", "HELLASWAG.ConcatFormatter": "c8f069fff818335c99bb92288a237d92", "HELLASWAG.Llama3Formatter": "75fc2b5e4e6161a1bb8b8050cbb716d2", + "HELLASWAGBPB.ConcatFormatter": "a6335b585e97fc2c60e35c9c01d09443", + "HELLASWAGBPB.Llama3Formatter": "c336c651ed15a445a910e1a5c3f1ca66", + "HELLASWAGCloze.ConcatFormatter": "c2cb6e4097b2a03a5c9397dbd3b82b0e", + "HELLASWAGCloze.Llama3Formatter": "b6b1b011514e12f4e52c43b18386ca1f", + "HELLASWAGMC.ConcatFormatter": "27935199d13dc7fa98ae3f5f79437685", + "HELLASWAGMC.Llama3Formatter": "24cb1d7d083f43df3098a7957ff353ec", "HELLASWAG_DE.ConcatFormatter": "9fb70ef960b7dc1401ef81b3991d6fea", "HELLASWAG_DE.Llama3Formatter": "69da70417ee40c99529b991eaa04a776", "HELLASWAG_EU20_DE.ConcatFormatter": "fcea590f6d03494e6da65bf3c274d0d0", @@ -158,6 +174,8 @@ "MATH500.Llama3Formatter": "c8624982f58f346c68622e2687a46965", "MATH500Minerva.ConcatFormatter": "4822e1d31c2a8b3b129d08c5974f3fe9", "MATH500Minerva.Llama3Formatter": "794e344523118fe325de091e455bec00", + "MATH500Minerva_BPB.ConcatFormatter": "a9bc27d9cfa622bb2d8ed28ab69c2819", + "MATH500Minerva_BPB.Llama3Formatter": "822d483aad1d2fe85cfb82627c2d375a", "MATHLvl5.ConcatFormatter": "82feee2e24f2f96f668d22c0d4554c4a", "MATHLvl5.Llama3Formatter": "6fdf0835ce969239a843c088c9104fe4", "MATHMinerva.ConcatFormatter": "817591afbe9426c45cddf82be7e11e07", @@ -182,6 +200,20 
@@ "MBPP_SANITIZED.Llama3Formatter": "c3fa6d5b9126c9e320b95a0c504c2ef1", "MMLU.ConcatFormatter": "d8b543f6e31659e1e0bf9f90f51a3ce7", "MMLU.Llama3Formatter": "61546963de15da149c4a7ec0e321bc48", + "MMLUBPB.ConcatFormatter": "27a2b31b1250ca88f6242b30a1a3d26c", + "MMLUBPB.Llama3Formatter": "fbc252c7f431bbea1a0a968c5c2573ae", + "MMLUCloze.ConcatFormatter": "3f759a116bd57c5032c34bf150d5b81c", + "MMLUCloze.Llama3Formatter": "e583009d7b038f3a1de05f06d0b6c3ee", + "MMLUHumanitiesBPB.ConcatFormatter": "0f6a5e9955f1b8978ae818287ee5ddea", + "MMLUHumanitiesBPB.Llama3Formatter": "2864c96d60f466eb36ae0c8738e0df04", + "MMLUMC.ConcatFormatter": "faf81b862db8eca4f6cc98076fc8ac80", + "MMLUMC.Llama3Formatter": "b6e534972d97620e5a5d4c94bf217fc9", + "MMLUOtherBPB.ConcatFormatter": "9735fe03eb7c412a05138fc8ad3f62bd", + "MMLUOtherBPB.Llama3Formatter": "7a29e34f4576c958fe164bfdc9d2c871", + "MMLUSocialSciencesBPB.ConcatFormatter": "685731111e72dea393f6255734791fdd", + "MMLUSocialSciencesBPB.Llama3Formatter": "c2f374a9190aa2428ce9be8aac70d402", + "MMLUStemBPB.ConcatFormatter": "27a2b31b1250ca88f6242b30a1a3d26c", + "MMLUStemBPB.Llama3Formatter": "fbc252c7f431bbea1a0a968c5c2573ae", "MMLU_COT.ConcatFormatter": "158044a1336658a19faf45d116ea66e6", "MMLU_COT.Llama3Formatter": "a21859e78e8eae37a66a388708fe3a18", "MMLU_DE.ConcatFormatter": "b448e01092dd94cb83f788590b28b08b",