diff --git a/README.md b/README.md
index 262a2ca11..114f5312d 100644
--- a/README.md
+++ b/README.md
@@ -155,6 +155,18 @@ For agentic evaluations (e.g., SWE-Bench, GAIA), the aggregate schema captures c
 
 At the instance level, agentic evaluations use `interaction_type: "agentic"` with full tool call traces recorded in the `messages` array. See the [Inspect AI test fixture](tests/data/inspect/) for a GAIA example with docker sandbox and tool usage.
 
+### Text-to-Image Evaluations
+
+The schema supports text-to-image (T2I) generation models (FLUX, SDXL, Imagen, …) alongside LLMs. Three small additions cover it; everything else (sampler args, image dimensions, sha256, rater pools, …) goes through the existing `additional_details` escape hatches.
+
+- **`modality`** — optional enum (`"text"` | `"text_to_image"`) on each `evaluation_results[]` entry and on each instance record. Absent means `"text"` (backwards compatibility). The instance-level value should match the `modality` of the aggregate result it links to.
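+- **`output.media: MediaRef[]`** — generated artifacts on the instance record. A `MediaRef` is just `{media_type, uri}` (with `media_type` one of `"image"`, `"video"`, `"audio"`) plus an optional `additional_details` bag of string values (sha256, mime_type, width/height, seed, index, …). Required and non-empty when `modality == "text_to_image"`. `output.raw` is still required: populate it with one placeholder string per artifact (e.g. `"<image:0>"`) so its indexes align with `output.media[]`.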
+- **`evaluation.is_correct`** is now `boolean | null` — the key must still be present, but set it to `null` when correctness is not well-defined, e.g. when the metric is continuous (FID, CLIPScore, ImageReward, etc.).
+
+T2I uses `interaction_type: "single_turn"`; `modality` is the orthogonal axis. Sampler args (`num_inference_steps`, `guidance_scale`, `width/height`, `scheduler`, `seed`, …) go in `generation_config.additional_details` as stringified key-value pairs. Human-rater pools (MTurk Likert critique, pairwise photorealism comparisons à la HEIM) go in `metric_config.additional_details` until a follow-up PR adds first-class structure for them.
+
+See [`tests/data/t2i/`](tests/data/t2i/) for a GenEval / SDXL worked example.
+
 ## ✅ Data Validation
 
 Validation uses Pydantic models generated from the JSON schemas. This validates aggregate `.json` files against `EvaluationLog` and instance-level `_samples.jsonl` files line-by-line against `InstanceLevelEvaluationLog`. Requires [uv](https://docs.astral.sh/uv/).
diff --git a/every_eval_ever/eval_types.py b/every_eval_ever/eval_types.py
index 40035403b..00264357d 100644
--- a/every_eval_ever/eval_types.py
+++ b/every_eval_ever/eval_types.py
@@ -1,21 +1,13 @@
 # generated by datamodel-codegen:
 #   filename:  eval.schema.json
-#   timestamp: 2026-03-19T20:30:15+00:00
+#   timestamp: 2026-05-18T09:44:21+00:00
 
 from __future__ import annotations
 
 from enum import Enum
 from typing import Annotated, Literal
 
-from pydantic import (
-    BaseModel,
-    ConfigDict,
-    Discriminator,
-    Field,
-    confloat,
-    conint,
-    model_validator,
-)
+from pydantic import BaseModel, ConfigDict, Field, confloat, conint, model_validator, Discriminator
 
 
 class SourceType(Enum):
@@ -73,6 +65,11 @@ class EvalLibrary(BaseModel):
     )
 
 
+class Modality(Enum):
+    text = 'text'
+    text_to_image = 'text_to_image'
+
+
 class ScoreType(Enum):
     binary = 'binary'
     continuous = 'continuous'
@@ -443,15 +440,13 @@ class MetricConfig(BaseModel):
 
     # --- validators (added by post_codegen.py) ---
 
-    @model_validator(mode='after')
+    @model_validator(mode="after")
     def validate_score_type_requirements(self):
         if self.score_type == ScoreType.levels:
             if self.level_names is None:
                 raise ValueError("score_type 'levels' requires level_names")
             if self.has_unknown_level is None:
-                raise ValueError(
-                    "score_type 'levels' requires has_unknown_level"
-                )
+                raise ValueError("score_type 'levels' requires has_unknown_level")
         elif self.score_type == ScoreType.continuous:
             if self.min_score is None:
                 raise ValueError("score_type 'continuous' requires min_score")
@@ -459,17 +454,17 @@ def validate_score_type_requirements(self):
                 raise ValueError("score_type 'continuous' requires max_score")
         return self
 
-
 class EvaluationResult(BaseModel):
     evaluation_result_id: str | None = Field(
         None,
         description='Stable identifier for this metric result inside an evaluation run. Recommended deterministic join key for instance-level records.',
     )
     evaluation_name: str = Field(..., description='Name of the evaluation')
-    source_data: Annotated[
-        SourceDataUrl | SourceDataHf | SourceDataPrivate,
-        Discriminator('source_type'),
-    ] = Field(
+    modality: Modality | None = Field(
+        None,
+        description="Modality of the task being evaluated. Absent means 'text' for backwards compatibility. Use 'text_to_image' for prompt-to-image generation tasks. Future modalities will be added to this enum.",
+    )
+    source_data: Annotated[SourceDataUrl | SourceDataHf | SourceDataPrivate, Discriminator("source_type")] = Field(
         ...,
         description='Source of dataset for this evaluation: URL, HuggingFace dataset, or private/custom dataset.',
     )
diff --git a/every_eval_ever/instance_level_types.py b/every_eval_ever/instance_level_types.py
index ff78d3420..84913c273 100644
--- a/every_eval_ever/instance_level_types.py
+++ b/every_eval_ever/instance_level_types.py
@@ -1,20 +1,13 @@
 # generated by datamodel-codegen:
 #   filename:  instance_level_eval.schema.json
-#   timestamp: 2026-03-19T20:30:15+00:00
+#   timestamp: 2026-05-18T09:44:23+00:00
 
 from __future__ import annotations
 
 from enum import Enum
 from typing import Any
 
-from pydantic import (
-    BaseModel,
-    ConfigDict,
-    Field,
-    confloat,
-    conint,
-    model_validator,
-)
+from pydantic import BaseModel, ConfigDict, Field, confloat, conint, model_validator
 
 
 class InteractionType(Enum):
@@ -23,6 +16,11 @@ class InteractionType(Enum):
     agentic = 'agentic'
 
 
+class Modality(Enum):
+    text = 'text'
+    text_to_image = 'text_to_image'
+
+
 class Input(BaseModel):
     raw: str = Field(..., description='The raw input as defined in the eval')
     formatted: str | None = Field(
@@ -39,14 +37,6 @@ class Input(BaseModel):
     )
 
 
-class Output(BaseModel):
-    raw: list[str] = Field(..., description='Complete model responses')
-    reasoning_trace: list[str] | None = Field(
-        None,
-        description='Reasoning traces of the model if applicable (e.g. chain-of-thought tokens)',
-    )
-
-
 class ToolCall(BaseModel):
     id: str = Field(..., description='Unique identifier for the tool call')
     name: str = Field(..., description='Name of tool/function')
@@ -104,8 +94,9 @@ class AnswerAttributionItem(BaseModel):
 
 class Evaluation(BaseModel):
     score: float = Field(..., description='Instance-level score')
-    is_correct: bool = Field(
-        ..., description='Whether the final answer is correct'
+    is_correct: bool | None = Field(
+        ...,
+        description='Whether the final answer is correct. Required to be present, but may be null when correctness is not well-defined (e.g. continuous T2I metrics like FID or CLIPScore).',
     )
     num_turns: conint(ge=1) | None = Field(
         None, description='Number of turns in the interaction'
@@ -150,6 +141,42 @@ class Performance(BaseModel):
     )
 
 
+class MediaType(Enum):
+    image = 'image'
+    video = 'video'
+    audio = 'audio'
+
+
+class MediaRef(BaseModel):
+    model_config = ConfigDict(
+        extra='forbid',
+    )
+    media_type: MediaType
+    uri: str = Field(
+        ...,
+        description="Location of the artifact: 'file://...', 'https://...', 'hf://...', 's3://...', or 'data:...;base64,...' for inline.",
+    )
+    additional_details: dict[str, str] | None = Field(
+        None,
+        description='Per-artifact extras (key-value pairs, all values must be strings). Use for sha256, mime_type, width/height, seed, index, etc.',
+    )
+
+
+class Output(BaseModel):
+    raw: list[str] = Field(
+        ...,
+        description="Complete model responses. For text_to_image modality, populate with one placeholder string per generated artifact (e.g. '<image:0>') so indexes align with output.media[].",
+    )
+    reasoning_trace: list[str] | None = Field(
+        None,
+        description='Reasoning traces of the model if applicable (e.g. chain-of-thought tokens)',
+    )
+    media: list[MediaRef] | None = Field(
+        None,
+        description="Generated media artifacts (image, video, audio). Required when modality == 'text_to_image'.",
+    )
+
+
 class InstanceLevelEvaluationLog(BaseModel):
     model_config = ConfigDict(
         extra='forbid',
@@ -183,7 +210,11 @@ class InstanceLevelEvaluationLog(BaseModel):
     )
     interaction_type: InteractionType = Field(
         ...,
-        description='Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents',
+        description="Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents. Text-to-image evaluations use 'single_turn'; modality is the orthogonal axis.",
+    )
+    modality: Modality | None = Field(
+        None,
+        description="Modality of this sample. Should match the modality on the linked aggregate evaluation_result. Absent means 'text' for backwards compatibility.",
     )
     input: Input = Field(
         ..., description='Input data for the evaluation sample'
@@ -220,22 +251,31 @@ class InstanceLevelEvaluationLog(BaseModel):
 
     # --- validators (added by post_codegen.py) ---
 
-    @model_validator(mode='after')
+    @model_validator(mode="after")
     def validate_interaction_type_consistency(self):
         if self.interaction_type == InteractionType.single_turn:
             if self.output is None:
-                raise ValueError('single_turn interaction_type requires output')
+                raise ValueError("single_turn interaction_type requires output")
             if self.messages is not None:
                 raise ValueError(
-                    'single_turn interaction_type must not have messages'
+                    "single_turn interaction_type must not have messages"
                 )
         else:
             if self.messages is None:
                 raise ValueError(
-                    f'{self.interaction_type.value} interaction_type requires messages'
+                    f"{self.interaction_type.value} interaction_type requires messages"
                 )
             if self.output is not None:
                 raise ValueError(
-                    f'{self.interaction_type.value} interaction_type must not have output'
+                    f"{self.interaction_type.value} interaction_type must not have output"
+                )
+        return self
+
+    @model_validator(mode="after")
+    def validate_modality_consistency(self):
+        if self.modality == Modality.text_to_image:
+            if self.output is None or not self.output.media:
+                raise ValueError(
+                    "modality 'text_to_image' requires output.media to be a non-empty list"
                 )
         return self
diff --git a/every_eval_ever/schemas/eval.schema.json b/every_eval_ever/schemas/eval.schema.json
index 1c7f17271..ff09c7c6f 100644
--- a/every_eval_ever/schemas/eval.schema.json
+++ b/every_eval_ever/schemas/eval.schema.json
@@ -130,6 +130,14 @@
                         "type": "string",
                         "description": "Name of the evaluation"
                     },
+                    "modality": {
+                        "type": "string",
+                        "enum": [
+                            "text",
+                            "text_to_image"
+                        ],
+                        "description": "Modality of the task being evaluated. Absent means 'text' for backwards compatibility. Use 'text_to_image' for prompt-to-image generation tasks. Future modalities will be added to this enum."
+                    },
                     "source_data": {
                         "description": "Source of dataset for this evaluation: URL, HuggingFace dataset, or private/custom dataset.",
                         "oneOf": [
diff --git a/every_eval_ever/schemas/instance_level_eval.schema.json b/every_eval_ever/schemas/instance_level_eval.schema.json
index 8701eb8f0..a7e4bf4c2 100644
--- a/every_eval_ever/schemas/instance_level_eval.schema.json
+++ b/every_eval_ever/schemas/instance_level_eval.schema.json
@@ -46,7 +46,12 @@
         "interaction_type": {
             "type": "string",
             "enum": ["single_turn", "multi_turn", "agentic"],
-            "description": "Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents"
+            "description": "Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents. Text-to-image evaluations use 'single_turn'; modality is the orthogonal axis."
+        },
+        "modality": {
+            "type": "string",
+            "enum": ["text", "text_to_image"],
+            "description": "Modality of this sample. Should match the modality on the linked aggregate evaluation_result. Absent means 'text' for backwards compatibility."
         },
         "input": {
             "type": "object",
@@ -82,13 +87,18 @@
             "properties": {
                 "raw": {
                     "type": "array",
-                    "description": "Complete model responses",
+                    "description": "Complete model responses. For text_to_image modality, populate with one placeholder string per generated artifact (e.g. '<image:0>') so indexes align with output.media[].",
                     "items": { "type": "string" }
                 },
                 "reasoning_trace": {
                     "type": ["array", "null"],
                     "description": "Reasoning traces of the model if applicable (e.g. chain-of-thought tokens)",
                     "items": { "type": "string" }
+                },
+                "media": {
+                    "type": ["array", "null"],
+                    "description": "Generated media artifacts (image, video, audio). Required when modality == 'text_to_image'.",
+                    "items": { "$ref": "#/$defs/media_ref" }
                 }
             }
         },
@@ -192,8 +202,8 @@
                     "description": "Instance-level score"
                 },
                 "is_correct": {
-                    "type": "boolean",
-                    "description": "Whether the final answer is correct"
+                    "type": ["boolean", "null"],
+                    "description": "Whether the final answer is correct. Required to be present, but may be null when correctness is not well-defined (e.g. continuous T2I metrics like FID or CLIPScore)."
                 },
                 "num_turns": {
                     "type": ["integer", "null"],
@@ -334,5 +344,28 @@
                 }
             }
         }
-    ]
+    ],
+    "$defs": {
+        "media_ref": {
+            "type": "object",
+            "description": "Reference to a generated media artifact. Required fields are intentionally minimal; record extras (mime_type, sha256, width/height, seed, index, ...) in additional_details.",
+            "required": ["media_type", "uri"],
+            "additionalProperties": false,
+            "properties": {
+                "media_type": {
+                    "type": "string",
+                    "enum": ["image", "video", "audio"]
+                },
+                "uri": {
+                    "type": "string",
+                    "description": "Location of the artifact: 'file://...', 'https://...', 'hf://...', 's3://...', or 'data:...;base64,...' for inline."
+                },
+                "additional_details": {
+                    "type": "object",
+                    "description": "Per-artifact extras (key-value pairs, all values must be strings). Use for sha256, mime_type, width/height, seed, index, etc.",
+                    "additionalProperties": {"type": "string"}
+                }
+            }
+        }
+    }
 }
diff --git a/post_codegen.py b/post_codegen.py
index c355536e6..c6e963138 100644
--- a/post_codegen.py
+++ b/post_codegen.py
@@ -66,6 +66,21 @@ def validate_score_type_requirements(self):
             if self.max_score is None:
                 raise ValueError("score_type 'continuous' requires max_score")
         return self
+""",
+    },
+    {
+        'file': 'every_eval_ever/instance_level_types.py',
+        'import_add': 'model_validator',
+        'class_name': 'InstanceLevelEvaluationLog',
+        'validator': """
+    @model_validator(mode="after")
+    def validate_modality_consistency(self):
+        if self.modality == Modality.text_to_image:
+            if self.output is None or not self.output.media:
+                raise ValueError(
+                    "modality 'text_to_image' requires output.media to be a non-empty list"
+                )
+        return self
 """,
     },
 ]
@@ -126,9 +141,9 @@ def patch_file(patch: dict) -> None:
     path = Path(__file__).parent / patch['file']
     content = path.read_text()
 
-    # Check if already patched
-    if 'post_codegen.py' in content:
-        print(f'  {patch["file"]}: already patched, skipping')
+    validator_def = re.search(r'def (\w+)\(self', patch['validator'])
+    if validator_def and f'def {validator_def.group(1)}(self' in content:
+        print(f'  {patch["file"]}: {patch["class_name"]}.{validator_def.group(1)} already patched, skipping')
         return
 
     content = add_import(content, patch['import_add'])
diff --git a/tests/data/t2i/geneval_sdxl_example.json b/tests/data/t2i/geneval_sdxl_example.json
new file mode 100644
index 000000000..e21488f3f
--- /dev/null
+++ b/tests/data/t2i/geneval_sdxl_example.json
@@ -0,0 +1,58 @@
+{
+    "schema_version": "0.2.2",
+    "evaluation_id": "geneval/stability-ai/stable-diffusion-xl-base-1.0/1747312345",
+    "retrieved_timestamp": "1747312345.0",
+    "source_metadata": {
+        "source_type": "evaluation_run",
+        "source_organization_name": "EvalEval Coalition",
+        "evaluator_relationship": "third_party"
+    },
+    "eval_library": {
+        "name": "geneval",
+        "version": "unknown"
+    },
+    "model_info": {
+        "name": "Stable Diffusion XL Base 1.0",
+        "id": "stabilityai/stable-diffusion-xl-base-1.0",
+        "developer": "Stability AI",
+        "inference_engine": {
+            "name": "diffusers",
+            "version": "0.30.0"
+        }
+    },
+    "evaluation_results": [
+        {
+            "evaluation_result_id": "geneval_overall_sdxl",
+            "evaluation_name": "geneval_overall",
+            "modality": "text_to_image",
+            "source_data": {
+                "dataset_name": "GenEval",
+                "source_type": "hf_dataset",
+                "hf_repo": "djghosh/wds_geneval"
+            },
+            "metric_config": {
+                "metric_id": "geneval.overall",
+                "metric_name": "GenEval Overall",
+                "metric_kind": "vqa_score",
+                "lower_is_better": false,
+                "score_type": "continuous",
+                "min_score": 0,
+                "max_score": 1
+            },
+            "score_details": {
+                "score": 0.55
+            },
+            "generation_config": {
+                "additional_details": {
+                    "num_inference_steps": "50",
+                    "guidance_scale": "7.5",
+                    "width": "1024",
+                    "height": "1024",
+                    "num_images_per_prompt": "4",
+                    "scheduler": "EulerDiscreteScheduler",
+                    "seed": "42"
+                }
+            }
+        }
+    ]
+}
diff --git a/tests/data/t2i/geneval_sdxl_example_samples.jsonl b/tests/data/t2i/geneval_sdxl_example_samples.jsonl
new file mode 100644
index 000000000..7c2098c61
--- /dev/null
+++ b/tests/data/t2i/geneval_sdxl_example_samples.jsonl
@@ -0,0 +1 @@
+{"schema_version": "instance_level_eval_0.2.2", "evaluation_id": "geneval/stability-ai/stable-diffusion-xl-base-1.0/1747312345", "evaluation_result_id": "geneval_overall_sdxl", "model_id": "stabilityai/stable-diffusion-xl-base-1.0", "evaluation_name": "geneval_overall", "sample_id": "geneval_0001", "interaction_type": "single_turn", "modality": "text_to_image", "input": {"raw": "a photo of a red car and a blue motorcycle", "reference": ["red car", "blue motorcycle"]}, "output": {"raw": ["<image:0>", "<image:1>"], "media": [{"media_type": "image", "uri": "file://./images/geneval_0001_0.png", "additional_details": {"sha256": "1111111111111111111111111111111111111111111111111111111111111111", "width": "1024", "height": "1024", "seed": "42", "index": "0"}}, {"media_type": "image", "uri": "file://./images/geneval_0001_1.png", "additional_details": {"sha256": "2222222222222222222222222222222222222222222222222222222222222222", "width": "1024", "height": "1024", "seed": "43", "index": "1"}}]}, "answer_attribution": [], "evaluation": {"score": 0.75, "is_correct": null}}
diff --git a/tests/test_validate.py b/tests/test_validate.py
index edb2d5b6a..49cdaa838 100644
--- a/tests/test_validate.py
+++ b/tests/test_validate.py
@@ -376,3 +376,68 @@ def test_exit_code_1_on_failure(self, tmp_path: Path):
         fp = _write_json(tmp_path, 'fail.json', data)
         report = validate_file(fp)
         assert report.valid is False
+
+
+# ===================================================================
+# Text-to-image (modality: text_to_image) tests
+# ===================================================================
+
+
+T2I_FIXTURE_DIR = Path(__file__).parent / 'data' / 't2i'
+
+
+class TestT2I:
+    """Validation of records that declare ``modality: text_to_image``."""
+
+    @staticmethod
+    def _t2i_record(output: dict) -> dict:
+        """Return a minimal single-turn T2I instance record using *output*."""
+        return {
+            'schema_version': 'instance_level_eval_0.2.2',
+            'evaluation_id': 'x', 'model_id': 'x/y', 'evaluation_name': 'x',
+            'sample_id': '1', 'interaction_type': 'single_turn',
+            'modality': 'text_to_image',
+            'input': {'raw': 'a cat', 'reference': []},
+            'output': output,
+            'answer_attribution': [],
+            'evaluation': {'score': 0.5, 'is_correct': None},
+        }
+
+    def test_geneval_fixture_passes(self):
+        agg = validate_file(T2I_FIXTURE_DIR / 'geneval_sdxl_example.json')
+        samples = validate_file(
+            T2I_FIXTURE_DIR / 'geneval_sdxl_example_samples.jsonl'
+        )
+        assert agg.valid is True, agg.errors
+        assert samples.valid is True, samples.errors
+
+    def test_t2i_record_requires_media(self, tmp_path: Path):
+        # Absent, null and empty media must all be rejected for T2I records.
+        raw = ['<image:0>']
+        for idx, extra in enumerate([{}, {'media': None}, {'media': []}]):
+            rec = self._t2i_record({'raw': raw, **extra})
+            report = validate_file(_write_jsonl(tmp_path, f't{idx}.jsonl', [rec]))
+            assert report.valid is False, extra
+            assert any('media' in e['msg'] for e in report.errors), extra
+
+    def test_t2i_null_is_correct_passes(self, tmp_path: Path):
+        # A null is_correct is valid as long as the key itself is present.
+        media = [{'media_type': 'image', 'uri': 'file://./a.png'}]
+        rec = self._t2i_record({'raw': ['<image:0>'], 'media': media})
+        assert rec['evaluation']['is_correct'] is None
+        report = validate_file(_write_jsonl(tmp_path, 't.jsonl', [rec]))
+        assert report.valid is True, report.errors
+
+    def test_modality_unknown_value_fails(self, tmp_path: Path):
+        # The modality enum is closed: values outside it must fail validation.
+        data = json.loads(json.dumps(VALID_AGGREGATE))
+        data['evaluation_results'][0]['modality'] = 'image_edit'
+        fp = _write_json(tmp_path, 'agg.json', data)
+        report = validate_file(fp)
+        assert report.valid is False
+
+    def test_existing_records_without_modality_still_pass(self, tmp_path: Path):
+        fp_agg = _write_json(tmp_path, 'agg.json', VALID_AGGREGATE)
+        fp_inst = _write_jsonl(tmp_path, 'inst.jsonl', [VALID_SINGLE_TURN])
+        assert validate_file(fp_agg).valid is True
+        assert validate_file(fp_inst).valid is True