From d1decf15aeba09990f28fc6d734d7a4a344ab6dc Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Fri, 10 Apr 2026 10:12:24 +0000 Subject: [PATCH 1/4] refactor: more robust metadata retrieval in BaseTask with TaskStyler Introduced new methods for response type and metrics retrieval, ensuring backwards compatibility with existing properties. --- src/eval_framework/response_generator.py | 2 +- src/eval_framework/tasks/base.py | 29 ++++++++++++++++++------ src/eval_framework/tasks/eval_config.py | 2 +- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/src/eval_framework/response_generator.py b/src/eval_framework/response_generator.py index c8168eed..f8cc2171 100644 --- a/src/eval_framework/response_generator.py +++ b/src/eval_framework/response_generator.py @@ -78,7 +78,7 @@ def __init__(self, llm: BaseLLM, config: EvalConfig, result_processor: ResultsFi custom_hf_revision=self.config.hf_revision, ) - self.response_type, _ = self.task._get_type_and_metrics() + self.response_type = self.task.get_response_type() def _llm_task_param_precedence(self) -> tuple[list[str] | None, int | None]: """ diff --git a/src/eval_framework/tasks/base.py b/src/eval_framework/tasks/base.py index 16fb94b8..fab35028 100644 --- a/src/eval_framework/tasks/base.py +++ b/src/eval_framework/tasks/base.py @@ -332,14 +332,12 @@ def _get_context(self, item: dict[str, Any]) -> BaseMetricContext | list[BaseMet return None def get_metadata(self) -> dict[str, str | list[str]]: - response_type, metrics = self._get_type_and_metrics() - meta: dict[str, str | list[str]] = { "dataset_path": self.DATASET_PATH, "sample_split": self.SAMPLE_SPLIT, "fewshot_split": self.FEWSHOT_SPLIT, - "response_type": response_type.value, - "metrics": [m.NAME for m in metrics], + "response_type": self.get_response_type().value, + "metrics": [m.NAME for m in self.get_metrics()], "subjects": [str(s) for s in self.SUBJECTS], } if hasattr(self, "TASK_STYLER"): @@ -420,7 +418,24 @@ def generate_completions( ) return 
completion_list - def _get_type_and_metrics(self) -> tuple[ResponseType, list[type["BaseMetric"]]]: + def get_response_type(self) -> ResponseType: + """Return the response type of the task (or the styler if it exists).""" + if hasattr(self, "TASK_STYLER"): + return self.TASK_STYLER.response_type + return self.RESPONSE_TYPE + + def get_metrics(self) -> list[type["BaseMetric"]]: + """Return the metrics of the task (or the styler if it exists).""" if hasattr(self, "TASK_STYLER"): - return self.TASK_STYLER.response_type, self.TASK_STYLER.metrics - return self.RESPONSE_TYPE, self.METRICS + return self.TASK_STYLER.metrics + return self.METRICS + + @property + def RESPONSE_TYPE(self) -> ResponseType: + """For backwards compatibility.""" + return self.get_response_type() + + @property + def METRICS(self) -> list[type["BaseMetric"]]: + """For backwards compatibility.""" + return self.get_metrics() diff --git a/src/eval_framework/tasks/eval_config.py b/src/eval_framework/tasks/eval_config.py index 58bc2b4d..99f88a6b 100644 --- a/src/eval_framework/tasks/eval_config.py +++ b/src/eval_framework/tasks/eval_config.py @@ -112,7 +112,7 @@ def validate_judge_model_args(cls, value: dict[str, Any]) -> dict[str, Any]: @model_validator(mode="after") def validate_llm_judge_defined(self) -> "EvalConfig": task = get_task(self.task_name) - _, task_metrics = task(num_fewshot=0)._get_type_and_metrics() + task_metrics = task(num_fewshot=0).get_metrics() for metric_class in task_metrics: if issubclass(metric_class, BaseLLMJudgeMetric): assert self.llm_judge_class is not None, "The LLM Judge must be defined for this evaluation task." 
From 6350f8f473cf5631064e9fe1549acb85cb02a746 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Fri, 10 Apr 2026 10:17:48 +0000 Subject: [PATCH 2/4] test: add unit test for properties with taskstyler --- tests/tests_eval_framework/tasks/test_task_style.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/tests_eval_framework/tasks/test_task_style.py b/tests/tests_eval_framework/tasks/test_task_style.py index 5951fbdb..c33c5070 100644 --- a/tests/tests_eval_framework/tasks/test_task_style.py +++ b/tests/tests_eval_framework/tasks/test_task_style.py @@ -506,3 +506,11 @@ def test_metadata_task_style(self) -> None: def test_metadata_metrics_bpb_only(self) -> None: meta = self.task.get_metadata() assert meta["metrics"] == ["BitsPerByte"] + + +def test_instance_properties_are_styler_backed() -> None: + task = _ConcreteMCTask() + + # Check compatibility access points for metadata. + assert task.RESPONSE_TYPE == ResponseType.LOGLIKELIHOODS + assert task.METRICS == task.TASK_STYLER.metrics From 46c541797c57162f457c7232e2845ed51ccb2b7d Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Fri, 10 Apr 2026 11:40:16 +0000 Subject: [PATCH 3/4] refactor: avoid mypy issue for redef --- src/eval_framework/tasks/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/eval_framework/tasks/base.py b/src/eval_framework/tasks/base.py index fab35028..fc7f6a82 100644 --- a/src/eval_framework/tasks/base.py +++ b/src/eval_framework/tasks/base.py @@ -91,8 +91,6 @@ class BaseTask[SubjectType](ABC): DATASET_PATH: str SAMPLE_SPLIT: str FEWSHOT_SPLIT: str - RESPONSE_TYPE: ResponseType - METRICS: list[type["BaseMetric"]] SUBJECTS: list[SubjectType] HF_REVISION: str | None = None # tag name, or branch name, or commit hash to ensure reproducibility @@ -104,6 +102,9 @@ class BaseTask[SubjectType](ABC): # language by subtopic, or `None` (for tasks not specific to a single language). 
LANGUAGE: Language | dict[str, Language] | dict[str, tuple[Language, Language]] | None + # Note: We don't declare RESPONSE_TYPE/METRICS here, instead exposing them as properties. + # This allows for backwards compatibility (accessing via task.METRICS) both when using a styler and when not. + def __init__(self, num_fewshot: int = 0) -> None: self.num_fewshot = num_fewshot self.stop_sequences: list[str] | None = None From 4663adc0200df150ccc255ef50fe7c457fdcea2c Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Fri, 10 Apr 2026 12:21:07 +0000 Subject: [PATCH 4/4] refactor: use classproperty for BaseTask metrics and response type --- src/eval_framework/tasks/base.py | 37 +++++++++++++++++-------------- src/eval_framework/tasks/utils.py | 20 ++++++++++++++++- 2 files changed, 39 insertions(+), 18 deletions(-) diff --git a/src/eval_framework/tasks/base.py b/src/eval_framework/tasks/base.py index fc7f6a82..07c21909 100644 --- a/src/eval_framework/tasks/base.py +++ b/src/eval_framework/tasks/base.py @@ -102,8 +102,9 @@ class BaseTask[SubjectType](ABC): # language by subtopic, or `None` (for tasks not specific to a single language). LANGUAGE: Language | dict[str, Language] | dict[str, tuple[Language, Language]] | None - # Note: We don't declare RESPONSE_TYPE/METRICS here, instead exposing them as properties. - # This allows for backwards compatibility (accessing via task.METRICS) both when using a styler and when not. + # RESPONSE_TYPE and METRICS are exposed as classproperties, so you can access them via either + # `TaskClass.*` or `task.*` (or `task.get_metrics()`). This avoids mypy conflicts from re-declaring class vars. 
+ # By default, these values come from TASK_STYLER if set, otherwise from legacy class attributes. def __init__(self, num_fewshot: int = 0) -> None: self.num_fewshot = num_fewshot @@ -419,24 +420,26 @@ def generate_completions( ) return completion_list - def get_response_type(self) -> ResponseType: + @classmethod + def get_response_type(cls) -> ResponseType: """Return the response type of the task (or the styler if it exists).""" - if hasattr(self, "TASK_STYLER"): - return self.TASK_STYLER.response_type - return self.RESPONSE_TYPE + if hasattr(cls, "TASK_STYLER"): + return cls.TASK_STYLER.response_type + return cls.RESPONSE_TYPE - def get_metrics(self) -> list[type["BaseMetric"]]: + @classmethod + def get_metrics(cls) -> list[type["BaseMetric"]]: """Return the metrics of the task (or the styler if it exists).""" - if hasattr(self, "TASK_STYLER"): - return self.TASK_STYLER.metrics - return self.METRICS + if hasattr(cls, "TASK_STYLER"): + return cls.TASK_STYLER.metrics + return cls.METRICS - @property - def RESPONSE_TYPE(self) -> ResponseType: + @classproperty + def RESPONSE_TYPE(cls) -> ResponseType: """For backwards compatibility.""" - return self.get_response_type() + return cls.get_response_type() - @property - def METRICS(self) -> list[type["BaseMetric"]]: + @classproperty + def METRICS(cls) -> list[type["BaseMetric"]]: """For backwards compatibility.""" - return self.get_metrics() + return cls.get_metrics() diff --git a/src/eval_framework/tasks/utils.py b/src/eval_framework/tasks/utils.py index 5752f1e7..d414444d 100644 --- a/src/eval_framework/tasks/utils.py +++ b/src/eval_framework/tasks/utils.py @@ -8,7 +8,7 @@ import threading from collections.abc import Callable from pathlib import Path -from typing import Any, Literal, NamedTuple +from typing import Any, Literal, NamedTuple, overload import dill import numpy as np @@ -22,6 +22,24 @@ RANDOM_SEED = 42 # hacky way to get around circular import redis_warning_printed = False + +class classproperty[T]: + 
"""Descriptor supporting property-like access on classes and instances.""" + + def __init__(self, fget: Callable[[Any], T]) -> None: + self.fget = fget + + @overload + def __get__(self, obj: None, owner: type[Any]) -> T: ... + + @overload + def __get__(self, obj: object, owner: type[Any] | None = None) -> T: ... + + def __get__(self, obj: object | None, owner: type[Any] | None = None) -> T: + cls = owner if owner is not None else type(obj) + return self.fget(cls) + + _pools: dict[tuple[str | None, tuple[str, ...] | None], ContainerPoolManager] = {} _pools_lock = threading.Lock()