diff --git a/README.md b/README.md index 262a2ca11..114f5312d 100644 --- a/README.md +++ b/README.md @@ -155,6 +155,18 @@ For agentic evaluations (e.g., SWE-Bench, GAIA), the aggregate schema captures c At the instance level, agentic evaluations use `interaction_type: "agentic"` with full tool call traces recorded in the `messages` array. See the [Inspect AI test fixture](tests/data/inspect/) for a GAIA example with docker sandbox and tool usage. +### Text-to-Image Evaluations + +The schema supports text-to-image (T2I) generation models (FLUX, SDXL, Imagen, …) alongside LLMs. Three small additions cover it; everything else (sampler args, image dimensions, sha256, rater pools, …) goes through the existing `additional_details` escape hatches. + +- **`modality`** — optional enum (`"text"` | `"text_to_image"`) on each `evaluation_results[]` entry and on each instance record. When absent, it is treated as `"text"` for backwards compatibility. +- **`output.media: MediaRef[]`** — generated artifacts on the instance record. A `MediaRef` is just `{media_type, uri}` plus an `additional_details` bag (sha256, mime_type, width/height, seed, index, …). Required and non-empty when `modality == "text_to_image"`; this rule is enforced by the `validate_modality_consistency` Pydantic validator rather than by the JSON schema itself. `output.raw` stays required: fill it with one placeholder string (e.g. `""`) per generated artifact so its indexes align with `output.media[]`. +- **`evaluation.is_correct`** is now `boolean | null` — the field must still be present, but set it to `null` when correctness is not well-defined, e.g. when the metric is continuous (FID, CLIPScore, ImageReward, etc.). + +T2I uses `interaction_type: "single_turn"`; `modality` is the orthogonal axis. Sampler args (`num_inference_steps`, `guidance_scale`, `width`, `height`, `scheduler`, `seed`, …) go in `generation_config.additional_details` as stringified key-value pairs. Human-rater pools (MTurk Likert critique, pairwise photorealism comparisons à la HEIM) go in `metric_config.additional_details` until a follow-up PR adds first-class structure for them. + +See [`tests/data/t2i/`](tests/data/t2i/) for a GenEval / SDXL worked example. + ## ✅ Data Validation Validation uses Pydantic models generated from the JSON schemas. 
This validates aggregate `.json` files against `EvaluationLog` and instance-level `_samples.jsonl` files line-by-line against `InstanceLevelEvaluationLog`. Requires [uv](https://docs.astral.sh/uv/). diff --git a/every_eval_ever/eval_types.py b/every_eval_ever/eval_types.py index 40035403b..00264357d 100644 --- a/every_eval_ever/eval_types.py +++ b/every_eval_ever/eval_types.py @@ -1,21 +1,13 @@ # generated by datamodel-codegen: # filename: eval.schema.json -# timestamp: 2026-03-19T20:30:15+00:00 +# timestamp: 2026-05-18T09:44:21+00:00 from __future__ import annotations from enum import Enum from typing import Annotated, Literal -from pydantic import ( - BaseModel, - ConfigDict, - Discriminator, - Field, - confloat, - conint, - model_validator, -) +from pydantic import BaseModel, ConfigDict, Field, confloat, conint, model_validator, Discriminator class SourceType(Enum): @@ -73,6 +65,11 @@ class EvalLibrary(BaseModel): ) +class Modality(Enum): + text = 'text' + text_to_image = 'text_to_image' + + class ScoreType(Enum): binary = 'binary' continuous = 'continuous' @@ -443,15 +440,13 @@ class MetricConfig(BaseModel): # --- validators (added by post_codegen.py) --- - @model_validator(mode='after') + @model_validator(mode="after") def validate_score_type_requirements(self): if self.score_type == ScoreType.levels: if self.level_names is None: raise ValueError("score_type 'levels' requires level_names") if self.has_unknown_level is None: - raise ValueError( - "score_type 'levels' requires has_unknown_level" - ) + raise ValueError("score_type 'levels' requires has_unknown_level") elif self.score_type == ScoreType.continuous: if self.min_score is None: raise ValueError("score_type 'continuous' requires min_score") @@ -459,17 +454,17 @@ def validate_score_type_requirements(self): raise ValueError("score_type 'continuous' requires max_score") return self - class EvaluationResult(BaseModel): evaluation_result_id: str | None = Field( None, description='Stable identifier for this 
metric result inside an evaluation run. Recommended deterministic join key for instance-level records.', ) evaluation_name: str = Field(..., description='Name of the evaluation') - source_data: Annotated[ - SourceDataUrl | SourceDataHf | SourceDataPrivate, - Discriminator('source_type'), - ] = Field( + modality: Modality | None = Field( + None, + description="Modality of the task being evaluated. Absent means 'text' for backwards compatibility. Use 'text_to_image' for prompt-to-image generation tasks. Future modalities will be added to this enum.", + ) + source_data: Annotated[SourceDataUrl | SourceDataHf | SourceDataPrivate, Discriminator("source_type")] = Field( ..., description='Source of dataset for this evaluation: URL, HuggingFace dataset, or private/custom dataset.', ) diff --git a/every_eval_ever/instance_level_types.py b/every_eval_ever/instance_level_types.py index ff78d3420..84913c273 100644 --- a/every_eval_ever/instance_level_types.py +++ b/every_eval_ever/instance_level_types.py @@ -1,20 +1,13 @@ # generated by datamodel-codegen: # filename: instance_level_eval.schema.json -# timestamp: 2026-03-19T20:30:15+00:00 +# timestamp: 2026-05-18T09:44:23+00:00 from __future__ import annotations from enum import Enum from typing import Any -from pydantic import ( - BaseModel, - ConfigDict, - Field, - confloat, - conint, - model_validator, -) +from pydantic import BaseModel, ConfigDict, Field, confloat, conint, model_validator class InteractionType(Enum): @@ -23,6 +16,11 @@ class InteractionType(Enum): agentic = 'agentic' +class Modality(Enum): + text = 'text' + text_to_image = 'text_to_image' + + class Input(BaseModel): raw: str = Field(..., description='The raw input as defined in the eval') formatted: str | None = Field( @@ -39,14 +37,6 @@ class Input(BaseModel): ) -class Output(BaseModel): - raw: list[str] = Field(..., description='Complete model responses') - reasoning_trace: list[str] | None = Field( - None, - description='Reasoning traces of the model if 
applicable (e.g. chain-of-thought tokens)', - ) - - class ToolCall(BaseModel): id: str = Field(..., description='Unique identifier for the tool call') name: str = Field(..., description='Name of tool/function') @@ -104,8 +94,9 @@ class AnswerAttributionItem(BaseModel): class Evaluation(BaseModel): score: float = Field(..., description='Instance-level score') - is_correct: bool = Field( - ..., description='Whether the final answer is correct' + is_correct: bool | None = Field( + ..., + description='Whether the final answer is correct. Required to be present, but may be null when correctness is not well-defined (e.g. continuous T2I metrics like FID or CLIPScore).', ) num_turns: conint(ge=1) | None = Field( None, description='Number of turns in the interaction' @@ -150,6 +141,42 @@ class Performance(BaseModel): ) +class MediaType(Enum): + image = 'image' + video = 'video' + audio = 'audio' + + +class MediaRef(BaseModel): + model_config = ConfigDict( + extra='forbid', + ) + media_type: MediaType + uri: str = Field( + ..., + description="Location of the artifact: 'file://...', 'https://...', 'hf://...', 's3://...', or 'data:...;base64,...' for inline.", + ) + additional_details: dict[str, str] | None = Field( + None, + description='Per-artifact extras (key-value pairs, all values must be strings). Use for sha256, mime_type, width/height, seed, index, etc.', + ) + + +class Output(BaseModel): + raw: list[str] = Field( + ..., + description="Complete model responses. For text_to_image modality, populate with one placeholder string per generated artifact (e.g. '') so indexes align with output.media[].", + ) + reasoning_trace: list[str] | None = Field( + None, + description='Reasoning traces of the model if applicable (e.g. chain-of-thought tokens)', + ) + media: list[MediaRef] | None = Field( + None, + description="Generated media artifacts (image, video, audio). 
Required when modality == 'text_to_image'.", + ) + + class InstanceLevelEvaluationLog(BaseModel): model_config = ConfigDict( extra='forbid', @@ -183,7 +210,11 @@ class InstanceLevelEvaluationLog(BaseModel): ) interaction_type: InteractionType = Field( ..., - description='Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents', + description="Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents. Text-to-image evaluations use 'single_turn'; modality is the orthogonal axis.", + ) + modality: Modality | None = Field( + None, + description="Modality of this sample. Should match the modality on the linked aggregate evaluation_result. Absent means 'text' for backwards compatibility.", ) input: Input = Field( ..., description='Input data for the evaluation sample' @@ -220,22 +251,31 @@ class InstanceLevelEvaluationLog(BaseModel): # --- validators (added by post_codegen.py) --- - @model_validator(mode='after') + @model_validator(mode="after") def validate_interaction_type_consistency(self): if self.interaction_type == InteractionType.single_turn: if self.output is None: - raise ValueError('single_turn interaction_type requires output') + raise ValueError("single_turn interaction_type requires output") if self.messages is not None: raise ValueError( - 'single_turn interaction_type must not have messages' + "single_turn interaction_type must not have messages" ) else: if self.messages is None: raise ValueError( - f'{self.interaction_type.value} interaction_type requires messages' + f"{self.interaction_type.value} interaction_type requires messages" ) if self.output is not None: raise ValueError( - f'{self.interaction_type.value} interaction_type must not have output' + f"{self.interaction_type.value} interaction_type must not have output" + ) + return self + + @model_validator(mode="after") + def validate_modality_consistency(self): + if self.modality == 
Modality.text_to_image: + if self.output is None or not self.output.media: + raise ValueError( + "modality 'text_to_image' requires output.media to be a non-empty list" ) return self diff --git a/every_eval_ever/schemas/eval.schema.json b/every_eval_ever/schemas/eval.schema.json index 1c7f17271..ff09c7c6f 100644 --- a/every_eval_ever/schemas/eval.schema.json +++ b/every_eval_ever/schemas/eval.schema.json @@ -130,6 +130,14 @@ "type": "string", "description": "Name of the evaluation" }, + "modality": { + "type": "string", + "enum": [ + "text", + "text_to_image" + ], + "description": "Modality of the task being evaluated. Absent means 'text' for backwards compatibility. Use 'text_to_image' for prompt-to-image generation tasks. Future modalities will be added to this enum." + }, "source_data": { "description": "Source of dataset for this evaluation: URL, HuggingFace dataset, or private/custom dataset.", "oneOf": [ diff --git a/every_eval_ever/schemas/instance_level_eval.schema.json b/every_eval_ever/schemas/instance_level_eval.schema.json index 8701eb8f0..a7e4bf4c2 100644 --- a/every_eval_ever/schemas/instance_level_eval.schema.json +++ b/every_eval_ever/schemas/instance_level_eval.schema.json @@ -46,7 +46,12 @@ "interaction_type": { "type": "string", "enum": ["single_turn", "multi_turn", "agentic"], - "description": "Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents" + "description": "Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents. Text-to-image evaluations use 'single_turn'; modality is the orthogonal axis." + }, + "modality": { + "type": "string", + "enum": ["text", "text_to_image"], + "description": "Modality of this sample. Should match the modality on the linked aggregate evaluation_result. Absent means 'text' for backwards compatibility." 
}, "input": { "type": "object", @@ -82,13 +87,18 @@ "properties": { "raw": { "type": "array", - "description": "Complete model responses", + "description": "Complete model responses. For text_to_image modality, populate with one placeholder string per generated artifact (e.g. '') so indexes align with output.media[].", "items": { "type": "string" } }, "reasoning_trace": { "type": ["array", "null"], "description": "Reasoning traces of the model if applicable (e.g. chain-of-thought tokens)", "items": { "type": "string" } + }, + "media": { + "type": ["array", "null"], + "description": "Generated media artifacts (image, video, audio). Required when modality == 'text_to_image'.", + "items": { "$ref": "#/$defs/media_ref" } } } }, @@ -192,8 +202,8 @@ "description": "Instance-level score" }, "is_correct": { - "type": "boolean", - "description": "Whether the final answer is correct" + "type": ["boolean", "null"], + "description": "Whether the final answer is correct. Required to be present, but may be null when correctness is not well-defined (e.g. continuous T2I metrics like FID or CLIPScore)." }, "num_turns": { "type": ["integer", "null"], @@ -334,5 +344,28 @@ } } } - ] + ], + "$defs": { + "media_ref": { + "type": "object", + "description": "Reference to a generated media artifact. Required fields are intentionally minimal; record extras (mime_type, sha256, width/height, seed, index, ...) in additional_details.", + "required": ["media_type", "uri"], + "additionalProperties": false, + "properties": { + "media_type": { + "type": "string", + "enum": ["image", "video", "audio"] + }, + "uri": { + "type": "string", + "description": "Location of the artifact: 'file://...', 'https://...', 'hf://...', 's3://...', or 'data:...;base64,...' for inline." + }, + "additional_details": { + "type": "object", + "description": "Per-artifact extras (key-value pairs, all values must be strings). 
Use for sha256, mime_type, width/height, seed, index, etc.", + "additionalProperties": {"type": "string"} + } + } + } + } } diff --git a/post_codegen.py b/post_codegen.py index c355536e6..c6e963138 100644 --- a/post_codegen.py +++ b/post_codegen.py @@ -66,6 +66,21 @@ def validate_score_type_requirements(self): if self.max_score is None: raise ValueError("score_type 'continuous' requires max_score") return self +""", + }, + { + 'file': 'every_eval_ever/instance_level_types.py', + 'import_add': 'model_validator', + 'class_name': 'InstanceLevelEvaluationLog', + 'validator': """ + @model_validator(mode="after") + def validate_modality_consistency(self): + if self.modality == Modality.text_to_image: + if self.output is None or not self.output.media: + raise ValueError( + "modality 'text_to_image' requires output.media to be a non-empty list" + ) + return self """, }, ] @@ -126,9 +141,9 @@ def patch_file(patch: dict) -> None: path = Path(__file__).parent / patch['file'] content = path.read_text() - # Check if already patched - if 'post_codegen.py' in content: - print(f' {patch["file"]}: already patched, skipping') + validator_def = re.search(r'def (\w+)\(self', patch['validator']) + if validator_def and f'def {validator_def.group(1)}(self' in content: + print(f' {patch["file"]}: {patch["class_name"]}.{validator_def.group(1)} already patched, skipping') return content = add_import(content, patch['import_add']) diff --git a/tests/data/t2i/geneval_sdxl_example.json b/tests/data/t2i/geneval_sdxl_example.json new file mode 100644 index 000000000..e21488f3f --- /dev/null +++ b/tests/data/t2i/geneval_sdxl_example.json @@ -0,0 +1,58 @@ +{ + "schema_version": "0.2.2", + "evaluation_id": "geneval/stability-ai/stable-diffusion-xl-base-1.0/1747312345", + "retrieved_timestamp": "1747312345.0", + "source_metadata": { + "source_type": "evaluation_run", + "source_organization_name": "EvalEval Coalition", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": 
"geneval", + "version": "unknown" + }, + "model_info": { + "name": "Stable Diffusion XL Base 1.0", + "id": "stabilityai/stable-diffusion-xl-base-1.0", + "developer": "Stability AI", + "inference_engine": { + "name": "diffusers", + "version": "0.30.0" + } + }, + "evaluation_results": [ + { + "evaluation_result_id": "geneval_overall_sdxl", + "evaluation_name": "geneval_overall", + "modality": "text_to_image", + "source_data": { + "dataset_name": "GenEval", + "source_type": "hf_dataset", + "hf_repo": "djghosh/wds_geneval" + }, + "metric_config": { + "metric_id": "geneval.overall", + "metric_name": "GenEval Overall", + "metric_kind": "vqa_score", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0, + "max_score": 1 + }, + "score_details": { + "score": 0.55 + }, + "generation_config": { + "additional_details": { + "num_inference_steps": "50", + "guidance_scale": "7.5", + "width": "1024", + "height": "1024", + "num_images_per_prompt": "4", + "scheduler": "EulerDiscreteScheduler", + "seed": "42" + } + } + } + ] +} diff --git a/tests/data/t2i/geneval_sdxl_example_samples.jsonl b/tests/data/t2i/geneval_sdxl_example_samples.jsonl new file mode 100644 index 000000000..7c2098c61 --- /dev/null +++ b/tests/data/t2i/geneval_sdxl_example_samples.jsonl @@ -0,0 +1 @@ +{"schema_version": "instance_level_eval_0.2.2", "evaluation_id": "geneval/stability-ai/stable-diffusion-xl-base-1.0/1747312345", "evaluation_result_id": "geneval_overall_sdxl", "model_id": "stabilityai/stable-diffusion-xl-base-1.0", "evaluation_name": "geneval_overall", "sample_id": "geneval_0001", "interaction_type": "single_turn", "modality": "text_to_image", "input": {"raw": "a photo of a red car and a blue motorcycle", "reference": ["red car", "blue motorcycle"]}, "output": {"raw": ["", ""], "media": [{"media_type": "image", "uri": "file://./images/geneval_0001_0.png", "additional_details": {"sha256": "1111111111111111111111111111111111111111111111111111111111111111", "width": "1024", 
"height": "1024", "seed": "42", "index": "0"}}, {"media_type": "image", "uri": "file://./images/geneval_0001_1.png", "additional_details": {"sha256": "2222222222222222222222222222222222222222222222222222222222222222", "width": "1024", "height": "1024", "seed": "43", "index": "1"}}]}, "answer_attribution": [], "evaluation": {"score": 0.75, "is_correct": null}} diff --git a/tests/test_validate.py b/tests/test_validate.py index edb2d5b6a..49cdaa838 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -376,3 +376,68 @@ def test_exit_code_1_on_failure(self, tmp_path: Path): fp = _write_json(tmp_path, 'fail.json', data) report = validate_file(fp) assert report.valid is False + + +# =================================================================== +# Text-to-image (modality: text_to_image) tests +# =================================================================== + + +T2I_FIXTURE_DIR = Path(__file__).parent / 'data' / 't2i' + + +class TestT2I: + def test_geneval_fixture_passes(self): + agg = validate_file(T2I_FIXTURE_DIR / 'geneval_sdxl_example.json') + samples = validate_file( + T2I_FIXTURE_DIR / 'geneval_sdxl_example_samples.jsonl' + ) + assert agg.valid is True, agg.errors + assert samples.valid is True, samples.errors + + def test_t2i_record_requires_media(self, tmp_path: Path): + rec = { + 'schema_version': 'instance_level_eval_0.2.2', + 'evaluation_id': 'x', 'model_id': 'x/y', 'evaluation_name': 'x', + 'sample_id': '1', 'interaction_type': 'single_turn', + 'modality': 'text_to_image', + 'input': {'raw': 'a cat', 'reference': []}, + 'output': {'raw': ['']}, + 'answer_attribution': [], + 'evaluation': {'score': 0.5, 'is_correct': None}, + } + fp = _write_jsonl(tmp_path, 't.jsonl', [rec]) + report = validate_file(fp) + assert report.valid is False + assert any('media' in e['msg'] for e in report.errors) + + def test_t2i_null_is_correct_passes(self, tmp_path: Path): + rec = { + 'schema_version': 'instance_level_eval_0.2.2', + 'evaluation_id': 'x', 
'model_id': 'x/y', 'evaluation_name': 'x', + 'sample_id': '1', 'interaction_type': 'single_turn', + 'modality': 'text_to_image', + 'input': {'raw': 'a cat', 'reference': []}, + 'output': { + 'raw': [''], + 'media': [{'media_type': 'image', 'uri': 'file://./a.png'}], + }, + 'answer_attribution': [], + 'evaluation': {'score': 0.87, 'is_correct': None}, + } + fp = _write_jsonl(tmp_path, 't.jsonl', [rec]) + report = validate_file(fp) + assert report.valid is True, report.errors + + def test_modality_unknown_value_fails(self, tmp_path: Path): + data = json.loads(json.dumps(VALID_AGGREGATE)) + data['evaluation_results'][0]['modality'] = 'image_edit' + fp = _write_json(tmp_path, 'agg.json', data) + report = validate_file(fp) + assert report.valid is False + + def test_existing_records_without_modality_still_pass(self, tmp_path: Path): + fp_agg = _write_json(tmp_path, 'agg.json', VALID_AGGREGATE) + fp_inst = _write_jsonl(tmp_path, 'inst.jsonl', [VALID_SINGLE_TURN]) + assert validate_file(fp_agg).valid is True + assert validate_file(fp_inst).valid is True