diff --git a/.gitignore b/.gitignore
index d493519d7..198f151dc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,8 @@
 # Local data (generated by running adapters)
 # data/
+plan/
+misc/
+*.tmp
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/.tmp/audit_after.json b/.tmp/audit_after.json
new file mode 100644
index 000000000..0dda99be0
--- /dev/null
+++ b/.tmp/audit_after.json
@@ -0,0 +1,262 @@
+{
+  "files_scanned": 6448,
+  "results_scanned": 49659,
+  "missing": {
+    "metric_id": 1021,
+    "metric_name": 1021,
+    "metric_kind": 1021,
+    "metric_unit": 1021
+  },
+  "malformed": {},
+  "top_missing_by_benchmark": {
+    "evaluation_result_id": [],
+    "metric_id": [
+      [
+        "fibble_arena",
+        336
+      ],
+      [
+        "helm_classic",
+        201
+      ],
+      [
+        "helm_lite",
+        182
+      ],
+      [
+        "livecodebenchpro",
+        87
+      ],
+      [
+        "helm_capabilities",
+        68
+      ],
+      [
+        "ace",
+        32
+      ],
+      [
+        "apex-v1",
+        19
+      ],
+      [
+        "appworld_test_normal",
+        15
+      ],
+      [
+        "browsecompplus",
+        15
+      ],
+      [
+        "swe-bench",
+        15
+      ],
+      [
+        "tau-bench-2_airline",
+        15
+      ],
+      [
+        "tau-bench-2_retail",
+        15
+      ],
+      [
+        "tau-bench-2_telecom",
+        15
+      ],
+      [
+        "la_leaderboard",
+        5
+      ],
+      [
+        "theory_of_mind",
+        1
+      ]
+    ],
+    "metric_name": [
+      [
+        "fibble_arena",
+        336
+      ],
+      [
+        "helm_classic",
+        201
+      ],
+      [
+        "helm_lite",
+        182
+      ],
+      [
+        "livecodebenchpro",
+        87
+      ],
+      [
+        "helm_capabilities",
+        68
+      ],
+      [
+        "ace",
+        32
+      ],
+      [
+        "apex-v1",
+        19
+      ],
+      [
+        "appworld_test_normal",
+        15
+      ],
+      [
+        "browsecompplus",
+        15
+      ],
+      [
+        "swe-bench",
+        15
+      ],
+      [
+        "tau-bench-2_airline",
+        15
+      ],
+      [
+        "tau-bench-2_retail",
+        15
+      ],
+      [
+        "tau-bench-2_telecom",
+        15
+      ],
+      [
+        "la_leaderboard",
+        5
+      ],
+      [
+        "theory_of_mind",
+        1
+      ]
+    ],
+    "metric_kind": [
+      [
+        "fibble_arena",
+        336
+      ],
+      [
+        "helm_classic",
+        201
+      ],
+      [
+        "helm_lite",
+        182
+      ],
+      [
+        "livecodebenchpro",
+        87
+      ],
+      [
+        "helm_capabilities",
+        68
+      ],
+      [
+        "ace",
+        32
+      ],
+      [
+        "apex-v1",
+        19
+      ],
+      [
+        "appworld_test_normal",
+        15
+      ],
+      [
+        "browsecompplus",
+        15
+      ],
+      [
+        "swe-bench",
+        15
+      ],
+      [
+        "tau-bench-2_airline",
+        15
+      ],
+      [
+        "tau-bench-2_retail",
+        15
+      ],
+      [
+        "tau-bench-2_telecom",
+        15
+      ],
+      [
+        "la_leaderboard",
+        5
+      ],
+      [
+        "theory_of_mind",
+        1
+      ]
+    ],
+    "metric_unit": [
+      [
+        "fibble_arena",
+        336
+      ],
+      [
+        "helm_classic",
+        201
+      ],
+      [
+        "helm_lite",
+        182
+      ],
+      [
+        "livecodebenchpro",
+        87
+      ],
+      [
+        "helm_capabilities",
+        68
+      ],
+      [
+        "ace",
+        32
+      ],
+      [
+        "apex-v1",
+        19
+      ],
+      [
+        "appworld_test_normal",
+        15
+      ],
+      [
+        "browsecompplus",
+        15
+      ],
+      [
+        "swe-bench",
+        15
+      ],
+      [
+        "tau-bench-2_airline",
+        15
+      ],
+      [
+        "tau-bench-2_retail",
+        15
+      ],
+      [
+        "tau-bench-2_telecom",
+        15
+      ],
+      [
+        "la_leaderboard",
+        5
+      ],
+      [
+        "theory_of_mind",
+        1
+      ]
+    ]
+  }
+}
\ No newline at end of file
diff --git a/.tmp/audit_before.json b/.tmp/audit_before.json
new file mode 100644
index 000000000..7b432cb88
--- /dev/null
+++ b/.tmp/audit_before.json
@@ -0,0 +1,426 @@
+{
+  "files_scanned": 6448,
+  "results_scanned": 49659,
+  "missing": {
+    "evaluation_result_id": 37071,
+    "metric_id": 37071,
+    "metric_name": 37071,
+    "metric_kind": 37071,
+    "metric_unit": 37071
+  },
+  "malformed": {
+    "evaluation_result_id_pattern": 12588
+  },
+  "top_missing_by_benchmark": {
+    "evaluation_result_id": [
+      [
+        "hfopenllm_v2",
+        27444
+      ],
+      [
+        "helm_mmlu",
+        2844
+      ],
+      [
+        "reward-bench",
+        2404
+      ],
+      [
+        "helm_classic",
+        1005
+      ],
+      [
+        "global-mmlu-lite",
+        912
+      ],
+      [
+        "helm_lite",
+        910
+      ],
+      [
+        "fibble_arena",
+        559
+      ],
+      [
+        "helm_capabilities",
+        408
+      ],
+      [
+        "wordle_arena",
+        134
+      ],
+      [
+        "terminal-bench-2.0",
+        115
+      ],
+      [
+        "livecodebenchpro",
+        87
+      ],
+      [
+        "apex-agents",
+        74
+      ],
+      [
+        "ace",
+        32
+      ],
+      [
+        "helm_instruct",
+        28
+      ],
+      [
+        "apex-v1",
+        19
+      ],
+      [
+        "appworld_test_normal",
+        15
+      ],
+      [
+        "browsecompplus",
+        15
+      ],
+      [
+        "swe-bench",
+        15
+      ],
+      [
+        "tau-bench-2_airline",
+        15
+      ],
+      [
+        "tau-bench-2_retail",
+        15
+      ]
+    ],
+    "metric_id": [
+      [
+        "hfopenllm_v2",
+        27444
+      ],
+      [
+        "helm_mmlu",
+        2844
+      ],
+      [
+        "reward-bench",
+        2404
+      ],
+      [
+        "helm_classic",
+        1005
+      ],
+      [
+        "global-mmlu-lite",
+        912
+      ],
+      [
+        "helm_lite",
+        910
+      ],
+      [
+        "fibble_arena",
+        559
+      ],
+      [
+        "helm_capabilities",
+        408
+      ],
+      [
+        "wordle_arena",
+        134
+      ],
+      [
+        "terminal-bench-2.0",
+        115
+      ],
+      [
+        "livecodebenchpro",
+        87
+      ],
+      [
+        "apex-agents",
+        74
+      ],
+      [
+        "ace",
+        32
+      ],
+      [
+        "helm_instruct",
+        28
+      ],
+      [
+        "apex-v1",
+        19
+      ],
+      [
+        "appworld_test_normal",
+        15
+      ],
+      [
+        "browsecompplus",
+        15
+      ],
+      [
+        "swe-bench",
+        15
+      ],
+      [
+        "tau-bench-2_airline",
+        15
+      ],
+      [
+        "tau-bench-2_retail",
+        15
+      ]
+    ],
+    "metric_name": [
+      [
+        "hfopenllm_v2",
+        27444
+      ],
+      [
+        "helm_mmlu",
+        2844
+      ],
+      [
+        "reward-bench",
+        2404
+      ],
+      [
+        "helm_classic",
+        1005
+      ],
+      [
+        "global-mmlu-lite",
+        912
+      ],
+      [
+        "helm_lite",
+        910
+      ],
+      [
+        "fibble_arena",
+        559
+      ],
+      [
+        "helm_capabilities",
+        408
+      ],
+      [
+        "wordle_arena",
+        134
+      ],
+      [
+        "terminal-bench-2.0",
+        115
+      ],
+      [
+        "livecodebenchpro",
+        87
+      ],
+      [
+        "apex-agents",
+        74
+      ],
+      [
+        "ace",
+        32
+      ],
+      [
+        "helm_instruct",
+        28
+      ],
+      [
+        "apex-v1",
+        19
+      ],
+      [
+        "appworld_test_normal",
+        15
+      ],
+      [
+        "browsecompplus",
+        15
+      ],
+      [
+        "swe-bench",
+        15
+      ],
+      [
+        "tau-bench-2_airline",
+        15
+      ],
+      [
+        "tau-bench-2_retail",
+        15
+      ]
+    ],
+    "metric_kind": [
+      [
+        "hfopenllm_v2",
+        27444
+      ],
+      [
+        "helm_mmlu",
+        2844
+      ],
+      [
+        "reward-bench",
+        2404
+      ],
+      [
+        "helm_classic",
+        1005
+      ],
+      [
+        "global-mmlu-lite",
+        912
+      ],
+      [
+        "helm_lite",
+        910
+      ],
+      [
+        "fibble_arena",
+        559
+      ],
+      [
+        "helm_capabilities",
+        408
+      ],
+      [
+        "wordle_arena",
+        134
+      ],
+      [
+        "terminal-bench-2.0",
+        115
+      ],
+      [
+        "livecodebenchpro",
+        87
+      ],
+      [
+        "apex-agents",
+        74
+      ],
+      [
+        "ace",
+        32
+      ],
+      [
+        "helm_instruct",
+        28
+      ],
+      [
+        "apex-v1",
+        19
+      ],
+      [
+        "appworld_test_normal",
+        15
+      ],
+      [
+        "browsecompplus",
+        15
+      ],
+      [
+        "swe-bench",
+        15
+      ],
+      [
+        "tau-bench-2_airline",
+        15
+      ],
+      [
+        "tau-bench-2_retail",
+        15
+      ]
+    ],
+    "metric_unit": [
+      [
+        "hfopenllm_v2",
+        27444
+      ],
+      [
+        "helm_mmlu",
+        2844
+      ],
+      [
+        "reward-bench",
+        2404
+      ],
+      [
+        "helm_classic",
+        1005
+      ],
+      [
+        "global-mmlu-lite",
+        912
+      ],
+      [
+        "helm_lite",
+        910
+      ],
+      [
+        "fibble_arena",
+        559
+      ],
+      [
+        "helm_capabilities",
+        408
+      ],
+      [
+        "wordle_arena",
+        134
+      ],
+      [
+        "terminal-bench-2.0",
+        115
+      ],
+      [
+        "livecodebenchpro",
+        87
+      ],
+      [
+        "apex-agents",
+        74
+      ],
+      [
+        "ace",
+        32
+      ],
+      [
+        "helm_instruct",
+        28
+      ],
+      [
+        "apex-v1",
+        19
+      ],
+      [
+        "appworld_test_normal",
+        15
+      ],
+      [
+        "browsecompplus",
+        15
+      ],
+      [
+        "swe-bench",
+        15
+      ],
+      [
+        "tau-bench-2_airline",
+        15
+      ],
+      [
+        "tau-bench-2_retail",
+        15
+      ]
+    ]
+  }
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index 262a2ca11..bc2f12997 100644
--- a/README.md
+++ b/README.md
@@ -199,6 +199,24 @@ uv run python -m every_eval_ever validate --format github data/
 
 Exit code is `0` if all files pass and `1` if any fail.
 
+### Canonical identity backfill + audit
+
+For legacy aggregate files missing canonical metric identity fields, you can backfill and then audit coverage:
+
+```sh
+# Dry-run canonical augmentation
+uv run python -m every_eval_ever augment-canonical-identity data/
+
+# Apply canonical augmentation in place (also updates companion *_samples.jsonl when present)
+uv run python -m every_eval_ever augment-canonical-identity data/ --write
+
+# Audit missing/malformed canonical identity fields
+uv run python -m every_eval_ever check-canonical-identity data/
+
+# CI mode (exit 1 if issues remain)
+uv run python -m every_eval_ever check-canonical-identity data/ --fail-on-issues
+```
+
 ## 🗂️ Data Structure
 
 Evaluation data is hosted on the [Hugging Face datastore](https://huggingface.co/datasets/evaleval/EEE_datastore). The folder structure is:
diff --git a/every_eval_ever/augment_canonical_identity.py b/every_eval_ever/augment_canonical_identity.py
new file mode 100644
index 000000000..4be663643
--- /dev/null
+++ b/every_eval_ever/augment_canonical_identity.py
@@ -0,0 +1,1249 @@
+"""Backfill canonical metric/eval identity into existing datastore files."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from every_eval_ever.validate import expand_paths
+
+# Canonical metric identity for each legacy Wordle-style metric suffix.
+# Keys are the raw suffixes looked up by `_wordle_patch` and `_fibble_patch`;
+# each value supplies the metric_id / metric_name / metric_kind / metric_unit
+# those builders copy into a CanonicalPatch.
+WORDLE_METRICS: dict[str, dict[str, Any]] = {
+    'win_rate': {
+        'metric_id': 'win_rate',
+        'metric_name': 'Win Rate',
+        'metric_kind': 'win_rate',
+        'metric_unit': 'proportion',
+    },
+    'avg_attempts': {
+        'metric_id': 'mean_attempts',
+        'metric_name': 'Average Attempts',
+        'metric_kind': 'count',
+        'metric_unit': 'attempts',
+    },
+    'avg_latency_ms': {
+        'metric_id': 'latency_mean',
+        'metric_name': 'Average Latency',
+        'metric_kind': 'latency',
+        'metric_unit': 'ms',
+    },
+}
+
+# Canonical metric identity keyed by benchmark display name.
+# Every entry is a proportion; 'MATH Level 5' is the only one mapped to
+# exact_match instead of accuracy.
+# NOTE(review): presumably keyed by the raw evaluation_name of hfopenllm
+# results -- the lookup site is outside this view; confirm against it.
+HFOPENLLM_METRICS: dict[str, dict[str, Any]] = {
+    'IFEval': {
+        'metric_id': 'accuracy',
+        'metric_name': 'Accuracy',
+        'metric_kind': 'accuracy',
+        'metric_unit': 'proportion',
+    },
+    'BBH': {
+        'metric_id': 'accuracy',
+        'metric_name': 'Accuracy',
+        'metric_kind': 'accuracy',
+        'metric_unit': 'proportion',
+    },
+    'MATH Level 5': {
+        'metric_id': 'exact_match',
+        'metric_name': 'Exact Match',
+        'metric_kind': 'exact_match',
+        'metric_unit': 'proportion',
+    },
+    'GPQA': {
+        'metric_id': 'accuracy',
+        'metric_name': 'Accuracy',
+        'metric_kind': 'accuracy',
+        'metric_unit': 'proportion',
+    },
+    'MUSR': {
+        'metric_id': 'accuracy',
+        'metric_name': 'Accuracy',
+        'metric_kind': 'accuracy',
+        'metric_unit': 'proportion',
+    },
+    'MMLU-PRO': {
+        'metric_id': 'accuracy',
+        'metric_name': 'Accuracy',
+        'metric_kind': 'accuracy',
+        'metric_unit': 'proportion',
+    },
+}
+
+# Benchmark family names treated as a group by the augmentation logic.
+# NOTE(review): presumably the families whose results carry one generic score
+# (handled by `_single_score_patch`); the membership check is outside this
+# view -- confirm against the dispatch code.
+SINGLE_SCORE_BENCHMARK_FAMILIES = {
+    'appworld_test_normal',
+    'browsecompplus',
+    'la_leaderboard',
+    'multi-swe-bench-leaderboard',
+    'swe-bench',
+    'swe-polybench-leaderboard',
+}
+
+# Benchmark family names treated as a group by the augmentation logic.
+# Names are case-sensitive as written (e.g. 'GAIA' vs 'gsm8k').
+# NOTE(review): presumably the Inspect-produced families routed through
+# `_generic_inspect_eval_patch`; the membership check is outside this view --
+# confirm against the dispatch code.
+INSPECT_ACCURACY_BENCHMARK_FAMILIES = {
+    'GAIA',
+    'MathVista',
+    'MMLU-Pro',
+    'MMMU-Multiple-Choice',
+    'MMMU-Open-Ended',
+    'big_bench_hard',
+    'commonsense_qa',
+    'cybench',
+    'cyse2_interpreter_abuse',
+    'cyse2_prompt_injection',
+    'cyse2_vulnerability_exploit',
+    'gdm_intercode_ctf',
+    'gpqa_diamond',
+    'gsm8k',
+    'hellaswag',
+    'mbpp',
+    'openai_humaneval',
+    'piqa',
+}
+
+
+@dataclass
+class CanonicalPatch:
+    """Canonical identity values derived for one evaluation result.
+
+    Every field defaults to ``None``; the patch builders in this module
+    populate only the fields they can derive from the legacy data.
+    """
+
+    # Canonical evaluation (benchmark or slice) name.
+    evaluation_name: str | None = None
+    # Identifier of the metric, e.g. 'accuracy' or 'pass_at_k'.
+    metric_id: str | None = None
+    # Display name of the metric, e.g. 'Accuracy' or 'Pass@1'.
+    metric_name: str | None = None
+    # Metric category, e.g. 'accuracy', 'score', 'latency'.
+    metric_kind: str | None = None
+    # Unit of the reported value, e.g. 'proportion', 'points', 'ms'.
+    metric_unit: str | None = None
+    # Extra parameters of the metric, e.g. {'k': 1} for Pass@1.
+    metric_parameters: dict[str, str | float | bool | None] | None = None
+
+
+@dataclass
+class AugmentationReport:
+    """Per-file record of what the augmentation changed.
+
+    NOTE(review): field meanings below are read from the names; the code
+    that fills this report is outside this view -- confirm against it.
+    """
+
+    # Presumably the path the aggregate file had before augmentation.
+    original_file_path: Path
+    # Presumably the path of the aggregate file after augmentation.
+    file_path: Path
+    benchmark_family: str
+    aggregate_changed: bool
+    # Companion samples file, or None when there is none.
+    sample_file_path: Path | None
+    sample_changed: bool
+    changed_results: int
+    sample_rows: int
+    path_changed: bool = False
+
+    @property
+    def changed(self) -> bool:
+        """Return True when any of the three change flags is set."""
+        return self.aggregate_changed or self.sample_changed or self.path_changed
+
+
+def _slug(value: str) -> str:
+    """Return a lowercase identifier built from ``value``.
+
+    Each run of characters other than ``a-z`` / ``0-9`` collapses to one
+    underscore and leading/trailing underscores are dropped. When nothing
+    is left, ``'unknown'`` is returned.
+    """
+    collapsed = re.sub(r'[^a-z0-9]+', '_', value.lower())
+    trimmed = collapsed.strip('_')
+    if not trimmed:
+        return 'unknown'
+    return trimmed
+
+
+def _store_raw_evaluation_name(
+    metric_config: dict[str, Any], raw_evaluation_name: str
+) -> None:
+    """Record the legacy evaluation name under ``additional_details``.
+
+    An existing ``raw_evaluation_name`` entry is kept as is. When
+    ``additional_details`` is missing or not a dict it is replaced by a new
+    dict holding only the raw name.
+    """
+    details = metric_config.get('additional_details')
+    if isinstance(details, dict):
+        # setdefault: never overwrite a name recorded earlier.
+        details.setdefault('raw_evaluation_name', raw_evaluation_name)
+        return
+    metric_config['additional_details'] = {
+        'raw_evaluation_name': raw_evaluation_name
+    }
+
+
+def _set_metric_field(
+    metric_config: dict[str, Any], key: str, value: Any
+) -> bool:
+    """Fill ``metric_config[key]`` only when it is currently blank.
+
+    Blank means the key is absent, ``None`` or ``''``. A ``None`` value is
+    never written. Returns True when the field was written, else False.
+    """
+    already_set = metric_config.get(key) not in (None, '')
+    if value is None or already_set:
+        return False
+    metric_config[key] = value
+    return True
+
+
+def _benchmark_metric_namespace(benchmark_family: str) -> str:
+    """Return the metric-id namespace for ``benchmark_family``.
+
+    The family name is lowercased, each run of characters other than
+    ``a-z`` / ``0-9`` becomes one underscore, and edge underscores are
+    dropped. An empty result falls back to ``'benchmark'``.
+    """
+    collapsed = re.sub(r'[^a-z0-9]+', '_', benchmark_family.lower())
+    trimmed = collapsed.strip('_')
+    if not trimmed:
+        return 'benchmark'
+    return trimmed
+
+
+def _score_metric_unit(metric_config: dict[str, Any]) -> str:
+    """Return the unit implied by the score bounds in ``metric_config``.
+
+    Returns ``'proportion'`` when ``min_score`` is 0 and ``max_score`` is 1,
+    and ``'points'`` for every other combination, including missing bounds.
+    """
+    min_score = metric_config.get('min_score')
+    max_score = metric_config.get('max_score')
+
+    # Plain equality instead of set membership: an unhashable bound (for
+    # example a list read from malformed JSON) compares unequal and falls
+    # through to 'points' rather than raising TypeError.
+    if min_score == 0 and max_score == 1:
+        return 'proportion'
+    # A 0-100 range is reported as 'points' like any other range, so it needs
+    # no branch of its own.
+    return 'points'
+
+
+# Groups of exact (already normalized) phrases and the
+# (metric_id, metric_name, metric_kind, metric_unit) they resolve to.
+_PHRASE_METRIC_GROUPS = (
+    (
+        ('acc', 'accuracy', 'cot correct', 'correct'),
+        ('accuracy', 'Accuracy', 'accuracy', 'proportion'),
+    ),
+    (
+        ('equivalent', 'equivalent (cot)'),
+        ('equivalent_cot', 'Equivalent (CoT)', 'accuracy', 'proportion'),
+    ),
+    (('em',), ('exact_match', 'Exact Match', 'exact_match', 'proportion')),
+    (('f1',), ('f1', 'F1', 'f1', 'proportion')),
+    (
+        ('mean win rate', 'win rate'),
+        ('win_rate', 'Win Rate', 'win_rate', 'proportion'),
+    ),
+    (('mean',), ('mean_score', 'Mean Score', 'score', 'proportion')),
+    (
+        ('mean score', 'score', 'wb score'),
+        ('score', 'Score', 'score', 'proportion'),
+    ),
+    (
+        ('stderr', 'standard error'),
+        ('standard_error', 'Standard Error', 'standard_error', 'proportion'),
+    ),
+    (
+        ('std', 'stddev', 'standard deviation'),
+        (
+            'standard_deviation',
+            'Standard Deviation',
+            'standard_deviation',
+            'proportion',
+        ),
+    ),
+    (('harmlessness',), ('harmlessness', 'Harmlessness', 'score', 'points')),
+)
+
+# Flattened lookup: one entry per exact phrase.
+_PHRASE_METRIC_FIELDS = {
+    phrase: fields
+    for phrases, fields in _PHRASE_METRIC_GROUPS
+    for phrase in phrases
+}
+
+
+def _metric_from_phrase(phrase: str) -> CanonicalPatch | None:
+    """Map a free-text metric phrase to a canonical metric patch.
+
+    The phrase is stripped, lowercased and relieved of trailing ``:`` / ``.``
+    before matching. Parameterized forms (``pass@K``, ``ndcg@K``,
+    ``rouge-N``, ``bleu-N``) are recognized first, then exact phrases, then
+    any phrase containing ``'strict acc'``.
+
+    Returns a new ``CanonicalPatch`` on every call (callers mutate it), with
+    ``evaluation_name`` left unset, or ``None`` for an unknown phrase.
+    """
+    normalized = phrase.strip().lower().rstrip(':.')
+
+    pass_hit = re.fullmatch(r'pass@(?P<k>\d+)', normalized)
+    if pass_hit is not None:
+        cutoff = int(pass_hit.group('k'))
+        return CanonicalPatch(
+            metric_id='pass_at_k',
+            metric_name=f'Pass@{cutoff}',
+            metric_kind='pass_rate',
+            metric_unit='proportion',
+            metric_parameters={'k': cutoff},
+        )
+
+    ndcg_hit = re.fullmatch(r'ndcg@(?P<k>\d+)', normalized)
+    if ndcg_hit is not None:
+        cutoff = int(ndcg_hit.group('k'))
+        return CanonicalPatch(
+            metric_id='ndcg',
+            metric_name=f'NDCG@{cutoff}',
+            metric_kind='ndcg',
+            metric_unit='proportion',
+            metric_parameters={'k': cutoff},
+        )
+
+    rouge_hit = re.fullmatch(r'rouge[-_ ]?(?P<n>\d+)', normalized)
+    if rouge_hit is not None:
+        order = int(rouge_hit.group('n'))
+        return CanonicalPatch(
+            metric_id=f'rouge_{order}',
+            metric_name=f'ROUGE-{order}',
+            metric_kind='rouge',
+            metric_unit='proportion',
+            metric_parameters={'n': order},
+        )
+
+    bleu_hit = re.fullmatch(r'bleu[-_ ]?(?P<n>\d+)', normalized)
+    if bleu_hit is not None:
+        order = int(bleu_hit.group('n'))
+        return CanonicalPatch(
+            metric_id=f'bleu_{order}',
+            metric_name=f'BLEU-{order}',
+            metric_kind='bleu',
+            metric_unit='proportion',
+            metric_parameters={'n': order},
+        )
+
+    fields = _PHRASE_METRIC_FIELDS.get(normalized)
+    # No exact phrase contains 'strict acc', so checking the substring rule
+    # after the table gives the same result as checking it in between.
+    if fields is None and 'strict acc' in normalized:
+        fields = ('strict_accuracy', 'Strict Accuracy', 'accuracy', 'proportion')
+    if fields is None:
+        return None
+
+    metric_id, metric_name, metric_kind, metric_unit = fields
+    return CanonicalPatch(
+        metric_id=metric_id,
+        metric_name=metric_name,
+        metric_kind=metric_kind,
+        metric_unit=metric_unit,
+    )
+
+
+def _wordle_patch(raw_evaluation_name: str) -> CanonicalPatch | None:
+    """Build the patch for a legacy ``wordle_arena_<metric>`` name.
+
+    Returns ``None`` when the name lacks the ``wordle_arena_`` prefix or the
+    remainder is not a key of ``WORDLE_METRICS``.
+    """
+    marker = 'wordle_arena_'
+    if not raw_evaluation_name.startswith(marker):
+        return None
+
+    fields = WORDLE_METRICS.get(raw_evaluation_name[len(marker) :])
+    if fields is None:
+        return None
+
+    return CanonicalPatch(
+        evaluation_name='wordle_arena',
+        metric_id=fields['metric_id'],
+        metric_name=fields['metric_name'],
+        metric_kind=fields['metric_kind'],
+        metric_unit=fields['metric_unit'],
+    )
+
+
+def _fibble_patch(raw_evaluation_name: str) -> CanonicalPatch | None:
+    """Build the patch for a legacy Fibble arena evaluation name.
+
+    Two name shapes are accepted:
+
+    * ``fibble_arena_<1lie|Nlies>_<metric>`` -- the evaluation name becomes
+      ``fibble_arena_<variant>``;
+    * ``fibble_arena_<metric>`` or ``fibble<1-5>_arena_<metric>`` -- the
+      evaluation name is the family part.
+
+    Returns ``None`` when neither shape matches or ``<metric>`` is not a key
+    of ``WORDLE_METRICS``. A name matching the first shape is never retried
+    against the second.
+    """
+    legacy = re.fullmatch(
+        r'fibble_arena_(?P<variant>1lie|[2-9]lies)_(?P<metric>.+)',
+        raw_evaluation_name,
+    )
+    if legacy is not None:
+        variant = legacy.group('variant')
+        evaluation_name = f'fibble_arena_{variant}'
+        metric_key = legacy.group('metric')
+    else:
+        family = re.fullmatch(
+            r'(?P<family>fibble(?:[1-5])?_arena)_(?P<metric>.+)',
+            raw_evaluation_name,
+        )
+        if family is None:
+            return None
+        evaluation_name = family.group('family')
+        metric_key = family.group('metric')
+
+    fields = WORDLE_METRICS.get(metric_key)
+    if fields is None:
+        return None
+
+    return CanonicalPatch(
+        evaluation_name=evaluation_name,
+        metric_id=fields['metric_id'],
+        metric_name=fields['metric_name'],
+        metric_kind=fields['metric_kind'],
+        metric_unit=fields['metric_unit'],
+    )
+
+
+def _ifeval_patch(raw_evaluation_name: str) -> CanonicalPatch | None:
+    """Build the patch for an Inspect IFEval evaluation name.
+
+    Expects ``<slice>_<acc|stderr> on inspect_evals/ifeval`` with an
+    optional `` for scorer ...`` tail. The slice becomes the evaluation
+    name; ``acc`` maps to accuracy and ``stderr`` to standard error.
+    Returns ``None`` for any other name.
+    """
+    parsed = re.fullmatch(
+        r'(?P<slice>prompt_strict|prompt_loose|inst_strict|inst_loose|final)'
+        r'_(?P<metric>acc|stderr)'
+        r' on inspect_evals/ifeval(?: for scorer .+)?',
+        raw_evaluation_name,
+    )
+    if parsed is None:
+        return None
+
+    if parsed.group('metric') == 'acc':
+        phrase = 'accuracy'
+    else:
+        phrase = 'standard error'
+
+    patch = _metric_from_phrase(phrase)
+    if patch is not None:
+        patch.evaluation_name = parsed.group('slice')
+    return patch
+
+
+def _agentharm_patch(raw_evaluation_name: str) -> CanonicalPatch | None:
+    """Build the patch for an Inspect AgentHarm evaluation name.
+
+    Overall names (``inspect_evals/avg_... on inspect_evals/agentharm``) get
+    the evaluation name ``'agentharm'``. Per-category names
+    (``<Category>_avg_<scores|refusals> on inspect_evals/agentharm``) use
+    the category as the evaluation name. Both shapes allow an optional
+    `` for scorer ...`` tail and every metric is a proportion.
+    Returns ``None`` when neither shape matches.
+    """
+    # Raw overall metric key -> (metric_id, metric_name, metric_kind).
+    overall_fields = {
+        'avg_score': ('average_score', 'Average Score', 'score'),
+        'avg_full_score': (
+            'average_full_score',
+            'Average Full Score',
+            'score',
+        ),
+        'avg_refusals': (
+            'average_refusal_rate',
+            'Average Refusal Rate',
+            'refusal_rate',
+        ),
+        'avg_score_non_refusals': (
+            'average_score_non_refusals',
+            'Average Score (Non-Refusals)',
+            'score',
+        ),
+    }
+
+    overall = re.fullmatch(
+        r'inspect_evals/(?P<metric>'
+        r'avg_score|avg_full_score|avg_refusals|avg_score_non_refusals'
+        r') on inspect_evals/agentharm(?: for scorer .+)?',
+        raw_evaluation_name,
+    )
+    if overall is not None:
+        evaluation_name = 'agentharm'
+        # The pattern only admits the four keys of the table.
+        fields = overall_fields[overall.group('metric')]
+    else:
+        by_category = re.fullmatch(
+            r'(?P<category>[A-Za-z]+)_avg_(?P<metric>scores|refusals)'
+            r' on inspect_evals/agentharm(?: for scorer .+)?',
+            raw_evaluation_name,
+        )
+        if by_category is None:
+            return None
+        evaluation_name = by_category.group('category')
+        if by_category.group('metric') == 'scores':
+            fields = ('average_score', 'Average Score', 'score')
+        else:
+            fields = (
+                'average_refusal_rate',
+                'Average Refusal Rate',
+                'refusal_rate',
+            )
+
+    metric_id, metric_name, metric_kind = fields
+    return CanonicalPatch(
+        evaluation_name=evaluation_name,
+        metric_id=metric_id,
+        metric_name=metric_name,
+        metric_kind=metric_kind,
+        metric_unit='proportion',
+    )
+
+
+def _generic_inspect_eval_patch(
+    benchmark_family: str,
+    raw_evaluation_name: str,
+    metric_config: dict[str, Any],
+) -> CanonicalPatch | None:
+    """Build the patch for a generic Inspect evaluation name.
+
+    When the name reads ``<accuracy|mean|std> on [inspect_evals/]<task>``
+    (case-insensitive, optional `` for scorer ...`` tail) the leading word
+    selects the metric; otherwise the whole ``evaluation_description`` is
+    used as the metric phrase. The evaluation name is always set to
+    ``benchmark_family``; the task captured from the name is not used.
+    Returns ``None`` when the phrase is not a known metric.
+    """
+    named = re.fullmatch(
+        r'(?P<metric>accuracy|mean|std) on '
+        r'(?:(?:inspect_evals/)?(?P<slice>[^ ]+))'
+        r'(?: for scorer .+)?',
+        raw_evaluation_name,
+        flags=re.IGNORECASE,
+    )
+    if named is not None:
+        phrase = named.group('metric')
+    else:
+        phrase = str(metric_config.get('evaluation_description') or '')
+
+    patch = _metric_from_phrase(phrase)
+    if patch is not None:
+        patch.evaluation_name = benchmark_family
+    return patch
+
+
+def _alphaxiv_patch(
+    payload: dict[str, Any], result: dict[str, Any]
+) -> CanonicalPatch:
+    """Build the canonical patch for an alphaxiv result.
+
+    The metric label is the result's legacy ``evaluation_name``, falling
+    back to ``additional_details['alphaxiv_y_axis']`` and then ``'Score'``.
+    The evaluation name is ``source_data['dataset_name']``, falling back to
+    the text before the first ``/`` of the payload's ``evaluation_id`` and
+    then ``'alphaxiv'``.
+
+    Side effect: inserts an empty ``metric_config`` dict into ``result``
+    when that key is absent.
+    """
+    metric_config = result.setdefault('metric_config', {})
+
+    metric_label = str(result.get('evaluation_name') or '').strip()
+    if not metric_label:
+        details = metric_config.get('additional_details')
+        if isinstance(details, dict):
+            metric_label = str(details.get('alphaxiv_y_axis') or '').strip()
+    metric_label = metric_label or 'Score'
+
+    dataset_name = ''
+    source_data = result.get('source_data')
+    if isinstance(source_data, dict):
+        dataset_name = str(source_data.get('dataset_name') or '').strip()
+    if not dataset_name:
+        evaluation_id = str(payload.get('evaluation_id') or '')
+        head, separator, _ = evaluation_id.partition('/')
+        # Only trust the prefix when a '/' was actually present.
+        if separator:
+            dataset_name = head.strip()
+    dataset_name = dataset_name or 'alphaxiv'
+
+    return CanonicalPatch(
+        evaluation_name=dataset_name,
+        metric_id=_slug(metric_label),
+        metric_name=metric_label,
+        metric_kind='score',
+        metric_unit=_score_metric_unit(metric_config),
+    )
+
+
+def _apex_agents_patch(raw_evaluation_name: str) -> CanonicalPatch | None:
+    """Build the patch for an APEX agents evaluation name.
+
+    ``<slice> Pass@K`` yields a ``pass_at_k`` metric with parameter ``k``;
+    ``<slice> Mean Score`` yields a ``mean_score`` metric. In both cases
+    ``<slice>`` becomes the evaluation name. Returns ``None`` otherwise.
+    """
+    pass_hit = re.fullmatch(
+        r'(?P<slice>.+) Pass@(?P<k>\d+)', raw_evaluation_name
+    )
+    if pass_hit is not None:
+        attempts = int(pass_hit.group('k'))
+        return CanonicalPatch(
+            evaluation_name=pass_hit.group('slice'),
+            metric_id='pass_at_k',
+            metric_name=f'Pass@{attempts}',
+            metric_kind='pass_rate',
+            metric_unit='proportion',
+            metric_parameters={'k': attempts},
+        )
+
+    mean_hit = re.fullmatch(r'(?P<slice>.+) Mean Score', raw_evaluation_name)
+    if mean_hit is None:
+        return None
+
+    return CanonicalPatch(
+        evaluation_name=mean_hit.group('slice'),
+        metric_id='mean_score',
+        metric_name='Mean Score',
+        metric_kind='score',
+        metric_unit='proportion',
+    )
+
+
+def _bfcl_patch(
+    raw_evaluation_name: str, metric_config: dict[str, Any]
+) -> CanonicalPatch | None:
+    """Derive the evaluation name from an existing ``bfcl.*`` metric id.
+
+    For a ``metric_id`` of the form ``bfcl.<slice...>.<last>`` the patch
+    carries only ``evaluation_name = <slice...>`` (the dot-joined middle
+    segments). Returns ``None`` when the id is missing, is not a ``bfcl.``
+    string, has fewer than three segments, has an empty slice, or when the
+    raw evaluation name already equals the slice.
+    """
+    metric_id = metric_config.get('metric_id')
+    if not (isinstance(metric_id, str) and metric_id.startswith('bfcl.')):
+        return None
+
+    segments = metric_id.split('.')
+    if len(segments) < 3:
+        return None
+
+    eval_slice = '.'.join(segments[1:-1])
+    # The slice is always shorter than metric_id, so a name equal to the
+    # slice necessarily differs from metric_id; one comparison is enough.
+    if not eval_slice or raw_evaluation_name == eval_slice:
+        return None
+
+    return CanonicalPatch(evaluation_name=eval_slice)
+
+
+def _rewardbench_patch(
+    raw_evaluation_name: str, metric_config: dict[str, Any]
+) -> CanonicalPatch | None:
+    """Build the patch for a RewardBench result.
+
+    A result named exactly ``'Score'`` becomes the ``rewardbench.score``
+    metric of the ``'RewardBench'`` evaluation. Otherwise the lowercased
+    ``evaluation_description`` decides: containing ``'accuracy'`` wins over
+    containing ``'score'``, and the evaluation name is left unset.
+    Returns ``None`` when none of these apply.
+    """
+    if raw_evaluation_name == 'Score':
+        return CanonicalPatch(
+            evaluation_name='RewardBench',
+            metric_id='rewardbench.score',
+            metric_name='Score',
+            metric_kind='score',
+            metric_unit='proportion',
+        )
+
+    description = str(metric_config.get('evaluation_description') or '')
+    lowered = description.lower()
+    # Order matters: 'accuracy' is tested before 'score'.
+    for keyword, display_name in (('accuracy', 'Accuracy'), ('score', 'Score')):
+        if keyword in lowered:
+            return CanonicalPatch(
+                metric_id=keyword,
+                metric_name=display_name,
+                metric_kind=keyword,
+                metric_unit='proportion',
+            )
+
+    return None
+
+
+def _description_metric_patch(metric_config: dict[str, Any]) -> CanonicalPatch | None:
+    """Derive the metric from a ``<metric> on <...>`` description.
+
+    Uses the text before the first ``' on '`` in ``evaluation_description``
+    as the metric phrase. Returns ``None`` when the description has no
+    ``' on '`` or the phrase is not a known metric.
+    """
+    description = str(metric_config.get('evaluation_description') or '')
+    head, separator, _ = description.partition(' on ')
+    if not separator:
+        return None
+    return _metric_from_phrase(head)
+
+
+def _score_suffix_patch(
+    benchmark_family: str,
+    raw_evaluation_name: str,
+    metric_config: dict[str, Any],
+) -> CanonicalPatch | None:
+    """Build the patch for a legacy ``<slice> Score`` evaluation name.
+
+    The slice becomes the evaluation name, except that ``Overall`` (any
+    case) maps to ``benchmark_family``. The metric id is
+    ``<namespace>.score`` and the unit follows the score bounds in
+    ``metric_config``. Returns ``None`` when the name lacks the suffix.
+    """
+    suffix_hit = re.fullmatch(r'(?P<slice>.+?) Score', raw_evaluation_name)
+    if suffix_hit is None:
+        return None
+
+    label = suffix_hit.group('slice').strip()
+    if label.lower() == 'overall':
+        evaluation_name = benchmark_family
+    else:
+        evaluation_name = label
+
+    prefix = _benchmark_metric_namespace(benchmark_family)
+    return CanonicalPatch(
+        evaluation_name=evaluation_name,
+        metric_id=f'{prefix}.score',
+        metric_name='Score',
+        metric_kind='score',
+        metric_unit=_score_metric_unit(metric_config),
+    )
+
+
+def _single_score_patch(
+    benchmark_family: str, metric_config: dict[str, Any]
+) -> CanonicalPatch:
+    """Build a generic ``<namespace>.score`` patch for ``benchmark_family``.
+
+    The evaluation name is left unset; the unit follows the score bounds in
+    ``metric_config``.
+    """
+    prefix = _benchmark_metric_namespace(benchmark_family)
+    unit = _score_metric_unit(metric_config)
+    return CanonicalPatch(
+        metric_id=f'{prefix}.score',
+        metric_name='Score',
+        metric_kind='score',
+        metric_unit=unit,
+    )
+
+
+def _theory_of_mind_patch(
+    raw_evaluation_name: str, metric_config: dict[str, Any]
+) -> CanonicalPatch | None:
+    """Build the patch for a theory-of-mind evaluation name.
+
+    For ``<metric> on <slice>`` (optional `` for scorer ...`` tail) with a
+    known metric phrase, the stripped slice becomes the evaluation name.
+    Otherwise the whole ``evaluation_description`` is tried as the metric
+    phrase and the evaluation name is left unset. Returns ``None`` when
+    neither yields a known metric.
+    """
+    parsed = re.fullmatch(
+        r'(?P<metric>.+?) on (?P<slice>.+?)(?: for scorer .+)?',
+        raw_evaluation_name,
+    )
+    patch = None
+    if parsed is not None:
+        patch = _metric_from_phrase(parsed.group('metric'))
+
+    if patch is None:
+        description = str(metric_config.get('evaluation_description') or '')
+        return _metric_from_phrase(description)
+
+    patch.evaluation_name = parsed.group('slice').strip()
+    return patch
+
+
+def _helm_patch(
+    benchmark_family: str,
+    raw_evaluation_name: str,
+    metric_config: dict[str, Any],
+) -> CanonicalPatch | None:
+    """Infer HELM metric identity from a 'Mean ...' name or the description.
+
+    Names starting with 'mean ' (any case) are renamed to the benchmark family.
+    """
+    if raw_evaluation_name.lower().startswith('mean '):
+        aggregate = _metric_from_phrase(raw_evaluation_name)
+        if aggregate is not None:
+            aggregate.evaluation_name = benchmark_family
+        return aggregate
+
+    phrase, separator, _ = str(
+        metric_config.get('evaluation_description') or ''
+    ).partition(' on ')
+    return _metric_from_phrase(phrase) if separator else None
+
+
+def _swe_bench_verified_mini_patch(
+    benchmark_family: str,
+    raw_evaluation_name: str,
+    metric_config: dict[str, Any],
+) -> CanonicalPatch | None:
+    """Map swe_bench_verified_mini 'mean'/'std' rows to canonical metrics.
+
+    The statistic comes from the raw name when it matches the inspect_evals
+    pattern, otherwise from ``evaluation_description``; anything other than
+    'mean' or 'std' yields None.
+    """
+    parsed = re.fullmatch(
+        r'(?P<metric>mean|std) on inspect_evals/swe_bench_verified_mini(?: for scorer .+)?',
+        raw_evaluation_name,
+        flags=re.IGNORECASE,
+    )
+    if parsed is not None:
+        statistic = parsed.group('metric')
+    else:
+        statistic = str(metric_config.get('evaluation_description') or '')
+
+    # statistic -> (metric_id, metric_name, metric_kind)
+    identity = {
+        'mean': ('mean_score', 'Mean Score', 'score'),
+        'std': ('standard_deviation', 'Standard Deviation', 'standard_deviation'),
+    }.get(statistic.strip().lower())
+    if identity is None:
+        return None
+    metric_id, metric_name, metric_kind = identity
+    return CanonicalPatch(
+        evaluation_name=benchmark_family, metric_id=metric_id, metric_name=metric_name,
+        metric_kind=metric_kind, metric_unit=_score_metric_unit(metric_config),
+    )
+
+
+def _canonical_patch_for_result(
+    benchmark_family: str,
+    result: dict[str, Any],
+    *,
+    payload: dict[str, Any] | None = None,
+) -> CanonicalPatch | None:
+    """Pick the canonical identity patch for one result of a benchmark family.
+
+    Args:
+        benchmark_family: Family name used to select the patching rule.
+        result: Result row; gains an empty ``metric_config`` if it has none.
+        payload: Enclosing aggregate payload, passed on only for 'alphaxiv'.
+
+    Returns:
+        The patch to apply, or None when no rule covers the family or the
+        selected rule produces no patch.
+    """
+    family = benchmark_family
+    raw_name = str(result.get('evaluation_name') or '')
+    config = result.setdefault('metric_config', {})
+
+    def accuracy_patch(metric_name: str, metric_unit: str) -> CanonicalPatch:
+        """Accuracy identity with the given display name and unit."""
+        return CanonicalPatch(
+            metric_id='accuracy',
+            metric_name=metric_name,
+            metric_kind='accuracy',
+            metric_unit=metric_unit,
+        )
+
+    # Rules are tried top to bottom and the first matching family wins.
+    if family == 'alphaxiv':
+        return _alphaxiv_patch(payload or {}, result)
+
+    if family == 'global-mmlu-lite':
+        return accuracy_patch('Accuracy', 'proportion')
+
+    if family == 'hfopenllm_v2':
+        known = HFOPENLLM_METRICS.get(raw_name)
+        if known is None:
+            return None
+        return CanonicalPatch(
+            metric_id=known['metric_id'],
+            metric_name=known['metric_name'],
+            metric_kind=known['metric_kind'],
+            metric_unit=known['metric_unit'],
+        )
+
+    if family == 'reward-bench':
+        return _rewardbench_patch(raw_name, config)
+
+    if family == 'terminal-bench-2.0':
+        return accuracy_patch('Task Resolution Accuracy', 'percentage')
+
+    if family == 'wordle_arena':
+        return _wordle_patch(raw_name)
+
+    if family == 'fibble_arena' or re.fullmatch(r'fibble[1-5]_arena', family):
+        return _fibble_patch(raw_name)
+
+    if family == 'apex-agents':
+        return _apex_agents_patch(raw_name)
+
+    if family in {'ace', 'apex-v1'}:
+        return _score_suffix_patch(
+            benchmark_family=family,
+            raw_evaluation_name=raw_name,
+            metric_config=config,
+        )
+
+    if family == 'bfcl':
+        return _bfcl_patch(raw_name, config)
+
+    if family.startswith('helm_'):
+        return _helm_patch(
+            benchmark_family=family,
+            raw_evaluation_name=raw_name,
+            metric_config=config,
+        )
+
+    if family == 'livecodebenchpro':
+        return _description_metric_patch(config)
+
+    if family == 'swe-bench-verified-mini':
+        return _swe_bench_verified_mini_patch(
+            benchmark_family=family,
+            raw_evaluation_name=raw_name,
+            metric_config=config,
+        )
+
+    is_single_score = family in SINGLE_SCORE_BENCHMARK_FAMILIES
+    if is_single_score or family.startswith('tau-bench-2_'):
+        return _single_score_patch(family, config)
+
+    if family == 'IFEval':
+        return _ifeval_patch(raw_name)
+
+    if family == 'agentharm':
+        return _agentharm_patch(raw_name)
+
+    if family == 'cvebench' or family in INSPECT_ACCURACY_BENCHMARK_FAMILIES:
+        return _generic_inspect_eval_patch(
+            benchmark_family=family,
+            raw_evaluation_name=raw_name,
+            metric_config=config,
+        )
+
+    if family == 'theory_of_mind':
+        return _theory_of_mind_patch(raw_name, config)
+    return None
+
+
+def _apply_patch(
+    result: dict[str, Any], patch: CanonicalPatch | None
+) -> tuple[bool, str]:
+    """Apply ``patch`` to ``result`` in place.
+
+    Args:
+        result: Result row to mutate; gains a ``metric_config`` dict if absent.
+        patch: Canonical values to apply, or None to leave the row untouched.
+
+    Returns:
+        ``(changed, raw_evaluation_name)`` where the name is the row's
+        ``evaluation_name`` as it was before any rename.
+    """
+    original_name = str(result.get('evaluation_name') or '')
+    if patch is None:
+        return False, original_name
+
+    config = result.setdefault('metric_config', {})
+    changed = False
+
+    new_name = patch.evaluation_name
+    if new_name is not None and result.get('evaluation_name') != new_name:
+        # Hand the pre-rename name to the helper before overwriting it.
+        _store_raw_evaluation_name(config, original_name)
+        result['evaluation_name'] = new_name
+        changed = True
+
+    for field_name in ('metric_id', 'metric_name', 'metric_kind', 'metric_unit'):
+        changed |= _set_metric_field(config, field_name, getattr(patch, field_name))
+
+    # Parameters are only filled in when the row has none yet
+    # (missing or an empty dict).
+    if patch.metric_parameters and config.get('metric_parameters') in (None, {}):
+        config['metric_parameters'] = patch.metric_parameters
+        changed = True
+
+    return changed, original_name
+
+
+def _metric_fragment(metric_config: dict[str, Any]) -> str:
+    """Return the slug fragment identifying a metric within a result id.
+
+    A non-dict ``metric_config`` (e.g. JSON null) is treated as empty, so the
+    'score' default applies instead of id assignment crashing on such rows.
+    """
+    cfg = metric_config if isinstance(metric_config, dict) else {}
+    fragment = _slug(str(cfg.get('metric_id') or cfg.get('metric_name') or 'score'))
+    parameters = cfg.get('metric_parameters')
+    pairs = sorted(parameters.items()) if isinstance(parameters, dict) else []
+    # Sorted parameters keep the id independent of key insertion order.
+    return fragment + ''.join(f'__{_slug(str(k))}_{_slug(str(v))}' for k, v in pairs)
+
+
+def _raw_evaluation_name(result: dict[str, Any]) -> str:
+    """Return the pre-canonicalization evaluation name of ``result``.
+
+    Prefers ``metric_config.additional_details.raw_evaluation_name`` and falls
+    back to the row's current ``evaluation_name`` ('' when missing).
+    """
+    current_name = str(result.get('evaluation_name') or '')
+    config = result.get('metric_config', {})
+    details = config.get('additional_details') if isinstance(config, dict) else None
+    stored = details.get('raw_evaluation_name') if isinstance(details, dict) else None
+    # Empty or missing stored names fall through to the current name.
+    return str(stored) if stored else current_name
+
+
+def _assign_evaluation_result_ids(
+    payload: dict[str, Any],
+    sample_updates: dict[str, dict[str, str]],
+) -> bool:
+    """Give every result a '<evaluation_id>#<eval>#<metric>' identifier.
+
+    Repeated ids within one payload get a '__<n>' suffix from the second
+    occurrence on. ``sample_updates`` is filled in place, keyed by each
+    result's raw evaluation name, and the return value tells whether any
+    ``evaluation_result_id`` was changed.
+    """
+    evaluation_id = str(payload.get('evaluation_id') or '')
+    seen: dict[str, int] = {}
+    any_changed = False
+
+    for result in payload.get('evaluation_results', []):
+        raw_name = _raw_evaluation_name(result)
+        name = str(result.get('evaluation_name') or '')
+        name_slug = _slug(name)
+        metric = _metric_fragment(result.get("metric_config", {}))
+        stem = f'{evaluation_id}#{name_slug}#{metric}'
+        occurrence = seen[stem] = seen.get(stem, 0) + 1
+        result_id = stem if occurrence == 1 else f'{stem}__{occurrence}'
+        if result.get('evaluation_result_id') != result_id:
+            result['evaluation_result_id'] = result_id
+            any_changed = True
+        sample_updates[raw_name] = {
+            'evaluation_name': name,
+            'evaluation_result_id': result_id,
+            'evaluation_id': evaluation_id,
+        }
+    return any_changed
+
+
+def _add_sample_alias_updates(
+    benchmark_family: str,
+    payload: dict[str, Any],
+    sample_updates: dict[str, dict[str, str]],
+) -> None:
+    """Alias bare swe_bench_verified_mini sample names to the mean-score result.
+
+    Only acts for the 'swe-bench-verified-mini' family when some result has
+    ``metric_id`` 'mean_score'; two task-name keys in ``sample_updates`` are
+    then set (overwriting), each to its own copy of the update.
+    """
+    if benchmark_family != 'swe-bench-verified-mini':
+        return
+
+    mean_result = next(
+        (
+            candidate
+            for candidate in payload.get('evaluation_results', [])
+            if candidate.get('metric_config', {}).get('metric_id') == 'mean_score'
+        ),
+        None,
+    )
+    if mean_result is None:
+        return
+
+    alias = {
+        'evaluation_name': str(mean_result.get('evaluation_name') or benchmark_family),
+        'evaluation_result_id': str(mean_result.get('evaluation_result_id') or ''),
+        'evaluation_id': str(payload.get('evaluation_id') or ''),
+    }
+    aliases = ('inspect_evals/swe_bench_verified_mini', 'swe_bench_verified_mini')
+    for task_name in aliases:
+        sample_updates[task_name] = alias.copy()
+
+
+def infer_benchmark_family(
+    file_path: Path, payload: dict[str, Any] | None = None
+) -> str:
+    """Guess the benchmark family from the path, then from ``evaluation_id``.
+
+    The directory right after the first 'data' path component wins; otherwise
+    the text before the first '/' of ``evaluation_id``; otherwise 'unknown'.
+    """
+    components = file_path.parts
+    if 'data' in components[:-1]:
+        return components[components.index('data') + 1]
+
+    evaluation_id = '' if payload is None else str(payload.get('evaluation_id') or '')
+    family, slash, _ = evaluation_id.partition('/')
+    # Without a '/' there is no family prefix to extract.
+    return family if slash else 'unknown'
+
+
+def _ensure_eval_library(
+    payload: dict[str, Any], benchmark_family: str
+) -> bool:
+    """Backfill ``eval_library`` name/version for 'alphaxiv' payloads.
+
+    Other families are left alone. Returns True when the payload was mutated.
+    """
+    if benchmark_family != 'alphaxiv':
+        return False
+
+    library = payload.get('eval_library')
+    mutated = not isinstance(library, dict)
+    if mutated:
+        # Replace a missing or non-dict value with a fresh mapping.
+        library = payload['eval_library'] = {}
+
+    for key, default in (('name', 'alphaxiv'), ('version', 'unknown')):
+        if library.get(key) in (None, ''):
+            library[key] = default
+            mutated = True
+    return mutated
+
+
+def augment_aggregate_payload(
+    payload: dict[str, Any], benchmark_family: str | None = None
+) -> tuple[dict[str, Any], int, dict[str, dict[str, str]], bool]:
+    """Canonicalize every result of an aggregate payload in place.
+
+    Returns:
+        ``(payload, changed_results, sample_updates, other_changed)``: the
+        mutated payload, how many results a patch altered, the per-raw-name
+        updates for sample rows, and whether result ids or ``eval_library``
+        changed.
+    """
+    family = benchmark_family or infer_benchmark_family(Path('.'), payload)
+    sample_updates: dict[str, dict[str, str]] = {}
+    library_changed = _ensure_eval_library(payload, family)
+
+    changed_results = 0
+    for result in payload.get('evaluation_results', []):
+        patch = _canonical_patch_for_result(family, result, payload=payload)
+        changed_results += 1 if _apply_patch(result, patch)[0] else 0
+
+    ids_changed = _assign_evaluation_result_ids(payload, sample_updates)
+    _add_sample_alias_updates(family, payload, sample_updates)
+    return payload, changed_results, sample_updates, (ids_changed or library_changed)
+
+
+def _read_jsonl(path: Path) -> list[dict[str, Any]]:
+    """Parse a UTF-8 JSON Lines file, skipping blank lines."""
+    with path.open(encoding='utf-8') as handle:
+        # Strip each line once; whitespace-only lines carry no record.
+        return [
+            json.loads(text)
+            for text in (line.strip() for line in handle)
+            if text
+        ]
+
+
+def _write_json(path: Path, payload: dict[str, Any]) -> None:
+    """Write ``payload`` as 2-space-indented UTF-8 JSON ending in a newline."""
+    text = json.dumps(payload, indent=2, ensure_ascii=False)
+    # ensure_ascii=False keeps non-ASCII characters unescaped in the file.
+    path.write_text(f'{text}\n', encoding='utf-8')
+
+
+def _write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None:
+    """Overwrite ``path`` with one single-line UTF-8 JSON document per row."""
+    with path.open('w', encoding='utf-8') as handle:
+        handle.writelines(json.dumps(row, ensure_ascii=False) + '\n' for row in rows)
+
+
+def canonicalize_datastore_path(file_path: Path) -> Path:
+    """Flatten an over-nested datastore path to data/<family>/<dev>/<model>/<file>.
+
+    Paths without a 'data' component, or with at most four components after
+    the first one, are returned unchanged. Otherwise the first component after
+    'data' and the last three components are kept.
+    """
+    components = file_path.parts
+    if 'data' not in components:
+        return file_path
+
+    data_end = components.index('data') + 1
+    if len(components) - data_end <= 4:
+        return file_path
+    return Path(*components[: data_end + 1], *components[-3:])
+
+
+def _move_file(source: Path, target: Path) -> None:
+    """Rename ``source`` to ``target``; no-op if equal, never overwrites."""
+    if source != target:
+        if target.exists():
+            raise FileExistsError(
+                f'Cannot normalize path for {source}: target already exists at {target}'
+            )
+        target.parent.mkdir(parents=True, exist_ok=True)
+        source.rename(target)
+
+
+def augment_sample_rows(
+    rows: list[dict[str, Any]],
+    sample_updates: dict[str, dict[str, str]],
+) -> tuple[list[dict[str, Any]], bool]:
+    """Copy canonical identity fields onto matching sample rows in place.
+
+    Args:
+        rows: Sample rows; matched by their current ``evaluation_name``.
+        sample_updates: Replacement fields keyed by raw evaluation name.
+
+    Returns:
+        The same ``rows`` list and whether any row was modified.
+    """
+    modified = False
+    for row in rows:
+        update = sample_updates.get(str(row.get('evaluation_name') or ''))
+        if update is None:
+            continue
+
+        # evaluation_name and evaluation_result_id are mandatory in an update;
+        # evaluation_id is synced only when the update provides one.
+        required = ('evaluation_name', 'evaluation_result_id')
+        optional = ('evaluation_id',) if update.get('evaluation_id') is not None else ()
+        for key in required + optional:
+            if row.get(key) != update[key]:
+                row[key] = update[key]
+                modified = True
+
+    return rows, modified
+
+
+def augment_aggregate_file(
+    file_path: Path,
+    *,
+    write_changes: bool = False,
+    update_samples: bool = True,
+    normalize_paths: bool = True,
+) -> AugmentationReport:
+    """Canonicalize one aggregate JSON file and its companion samples file.
+
+    With ``write_changes`` False this is a dry run: nothing is written or
+    moved, and the report describes what would change.
+
+    Raises:
+        FileExistsError: A normalized target already exists. Every target is
+            checked before any file is moved, so a pre-existing conflict
+            cannot separate an aggregate from its samples file.
+    """
+    payload = json.loads(file_path.read_text(encoding='utf-8'))
+    benchmark_family = infer_benchmark_family(file_path, payload)
+    payload, changed_results, sample_updates, ids_changed = (
+        augment_aggregate_payload(payload, benchmark_family=benchmark_family)
+    )
+    aggregate_changed = changed_results > 0 or ids_changed
+
+    sample_file_path = file_path.with_name(f'{file_path.stem}_samples.jsonl')
+    sample_exists = sample_file_path.exists()
+    sample_changed = False
+    sample_rows = 0
+
+    if update_samples and sample_exists:
+        rows = _read_jsonl(sample_file_path)
+        sample_rows = len(rows)
+        rows, sample_changed = augment_sample_rows(rows, sample_updates)
+
+        # Keep the aggregate's recorded row count in step with the samples.
+        detailed = payload.get('detailed_evaluation_results')
+        if isinstance(detailed, dict) and detailed.get('total_rows') != sample_rows:
+            detailed['total_rows'] = sample_rows
+            aggregate_changed = True
+        if write_changes and sample_changed:
+            _write_jsonl(sample_file_path, rows)
+
+    if write_changes and aggregate_changed:
+        _write_json(file_path, payload)
+
+    normalize = canonicalize_datastore_path if normalize_paths else (lambda path: path)
+    target_file_path = normalize(file_path)
+    target_sample_file_path = normalize(sample_file_path) if sample_exists else None
+    path_changed = target_file_path != file_path
+
+    if write_changes and path_changed:
+        moves = [(file_path, target_file_path)]
+        if sample_exists:
+            moves.insert(0, (sample_file_path, target_sample_file_path))
+        # Pre-flight every target so a conflict aborts before any file moves.
+        for source, target in moves:
+            if source != target and target.exists():
+                raise FileExistsError(
+                    f'Cannot normalize path for {source}: target already exists at {target}'
+                )
+        for source, target in moves:
+            _move_file(source, target)
+
+    return AugmentationReport(
+        original_file_path=file_path, file_path=target_file_path,
+        benchmark_family=benchmark_family, aggregate_changed=aggregate_changed,
+        sample_file_path=target_sample_file_path, sample_changed=sample_changed,
+        changed_results=changed_results, sample_rows=sample_rows,
+        path_changed=path_changed,
+    )
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Create the argument parser for the augment-canonical-identity command."""
+    parser = argparse.ArgumentParser(
+        prog='every_eval_ever augment-canonical-identity',
+        description=(
+            'Backfill metric identity and canonical eval names into existing '
+            'EEE aggregate JSON files.'
+        ),
+    )
+
+    parser.add_argument(
+        'paths',
+        nargs='+',
+        help='Aggregate JSON files or directories containing datastore files.',
+    )
+
+    # Every option below is a boolean switch (store_true), so they share one
+    # registration loop; dict order fixes the order shown in --help.
+    switches = {
+        '--write': 'Write changes in place. Without this flag, run as a dry run.',
+        '--skip-samples': 'Do not update companion *_samples.jsonl files.',
+        '--skip-path-normalization': (
+            'Do not flatten over-nested datastore paths to benchmark/developer/model.'
+        ),
+    }
+    for flag, help_text in switches.items():
+        parser.add_argument(flag, action='store_true', help=help_text)
+
+    return parser
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Run the augment-canonical-identity command and print a summary.
+
+    Args:
+        argv: Command-line arguments; None lets argparse read ``sys.argv``.
+
+    Returns:
+        Always 0.
+    """
+    args = build_parser().parse_args(argv)
+
+    # Resolve the complete file list first: processing may move files on disk.
+    aggregate_files = [
+        candidate
+        for candidate in expand_paths(args.paths)
+        if candidate.suffix == '.json' and not candidate.name.endswith('_samples.json')
+    ]
+    reports = [
+        augment_aggregate_file(
+            aggregate_file,
+            write_changes=args.write,
+            update_samples=not args.skip_samples,
+            normalize_paths=not args.skip_path_normalization,
+        )
+        for aggregate_file in aggregate_files
+    ]
+    changed_reports = [report for report in reports if report.changed]
+
+    action = 'Updated' if args.write else 'Would update'
+    for report in changed_reports:
+        details = []
+        if report.path_changed:
+            details.append(f'path={report.original_file_path} -> {report.file_path}')
+        details += [
+            f'results={report.changed_results}',
+            f'sample_rows={report.sample_rows}',
+            f'samples_changed={report.sample_changed}',
+        ]
+        summary = ', '.join(details)
+        print(f'{action}: {report.file_path} [{report.benchmark_family}] ({summary})')
+
+    scanned = f'Scanned {len(reports)} aggregate file(s); '
+    total = len(changed_reports)
+    outcome = f'updated {total}.' if args.write else f'{total} would change.'
+    print(scanned + outcome)
+    return 0
diff --git a/every_eval_ever/check_canonical_identity.py b/every_eval_ever/check_canonical_identity.py
new file mode 100644
index 000000000..bd2b21014
--- /dev/null
+++ b/every_eval_ever/check_canonical_identity.py
@@ -0,0 +1,242 @@
+"""Audit canonical metric/result identity coverage in aggregate datastore JSON."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+from collections import Counter, defaultdict
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from every_eval_ever.validate import expand_paths
+
+IDENTITY_FIELDS = (  # identity fields audited on every result row
+    'evaluation_result_id',  # read from the result row itself
+    'metric_id',  # this and the remaining entries are read from metric_config
+    'metric_name',
+    'metric_kind',
+    'metric_unit',
+)
+
+EVALUATION_RESULT_ID_PATTERN = re.compile(r'.+#.+#.+')  # shape check: text#text#text
+
+
+@dataclass
+class CanonicalIdentityReport:
+    """Running totals collected while auditing aggregate files."""
+
+    files_scanned: int = 0
+    results_scanned: int = 0
+    missing: Counter[str] = field(default_factory=Counter)  # field -> rows lacking it
+    malformed: Counter[str] = field(default_factory=Counter)  # problem key -> count
+    benchmark_missing: dict[str, Counter[str]] = field(  # family -> field -> count
+        default_factory=lambda: defaultdict(Counter)
+    )
+
+    @property
+    def has_issues(self) -> bool:
+        """Whether anything was recorded as missing or malformed."""
+        return bool(self.missing) or bool(self.malformed)
+
+    def to_dict(self, top: int = 20) -> dict[str, Any]:
+        """Return a JSON-serializable summary with the ``top`` worst benchmarks.
+
+        Ties sort by benchmark name; a negative ``top`` gives empty lists.
+        """
+        per_family = self.benchmark_missing.items()
+        ranked: dict[str, list[tuple[str, int]]] = {}
+        for name in IDENTITY_FIELDS:
+            pairs = [(b, c[name]) for b, c in per_family if c[name] > 0]
+            ranked[name] = sorted(pairs, key=lambda p: (-p[1], p[0]))[: max(top, 0)]
+        return {
+            'files_scanned': self.files_scanned, 'results_scanned': self.results_scanned,
+            'missing': dict(self.missing), 'malformed': dict(self.malformed),
+            'top_missing_by_benchmark': ranked,
+        }
+
+
+def infer_benchmark_family(file_path: Path, payload: dict[str, Any]) -> str:
+    """Return the benchmark family for ``file_path``, or 'unknown'.
+
+    Uses the component after the first 'data' directory, else the prefix of
+    ``evaluation_id`` before its first '/'.
+    """
+    components = file_path.parts
+    if 'data' in components[:-1]:
+        return components[components.index('data') + 1]
+    family, slash, _ = str(payload.get('evaluation_id') or '').partition('/')
+    return family if slash else 'unknown'
+
+
+def _record_missing(
+    report: CanonicalIdentityReport, benchmark_family: str, field_name: str
+) -> None:
+    report.missing.update([field_name])  # global tally of the missing field
+    report.benchmark_missing[benchmark_family].update([field_name])  # per family
+
+
+def _check_result_fields(
+    report: CanonicalIdentityReport,
+    benchmark_family: str,
+    result: dict[str, Any],
+) -> None:
+    """Tally missing or malformed identity fields of one result row.
+
+    Metric fields that are null or blank count as missing, other non-strings
+    as malformed; a result id must be a non-blank string matching the pattern.
+    """
+    result_id = result.get('evaluation_result_id')
+    if not (isinstance(result_id, str) and result_id.strip()):
+        _record_missing(report, benchmark_family, 'evaluation_result_id')
+    elif EVALUATION_RESULT_ID_PATTERN.fullmatch(result_id) is None:
+        report.malformed['evaluation_result_id_pattern'] += 1
+
+    config = result.get('metric_config')
+    if not isinstance(config, dict):
+        # Without a metric_config object every metric field is missing too.
+        report.malformed['metric_config_not_object'] += 1
+        config = {}
+
+    for field_name in IDENTITY_FIELDS[1:]:
+        value = config.get(field_name)
+        if value is None or (isinstance(value, str) and not value.strip()):
+            _record_missing(report, benchmark_family, field_name)
+        elif not isinstance(value, str):
+            report.malformed[f'{field_name}_type'] += 1
+
+
+def check_file(file_path: Path, report: CanonicalIdentityReport) -> None:
+    """Audit one aggregate JSON file, accumulating findings into ``report``."""
+    try:
+        payload = json.loads(file_path.read_text(encoding='utf-8'))
+    except (OSError, ValueError, RecursionError):
+        # ValueError covers JSONDecodeError and UnicodeDecodeError; any other
+        # failure propagates instead of being miscounted as a parse error.
+        report.malformed['aggregate_json_parse_error'] += 1
+        report.files_scanned += 1
+        return
+    if not isinstance(payload, dict):
+        report.malformed['aggregate_json_not_object'] += 1
+        report.files_scanned += 1
+        return
+
+    evaluation_results = payload.get('evaluation_results')
+    if evaluation_results is None:
+        return  # no results key: skipped, not counted as scanned
+    report.files_scanned += 1
+    if not isinstance(evaluation_results, list):
+        report.malformed['evaluation_results_not_list'] += 1
+        return
+    benchmark_family = infer_benchmark_family(file_path, payload)
+    report.results_scanned += len(evaluation_results)
+    for result in evaluation_results:
+        if isinstance(result, dict):
+            _check_result_fields(report, benchmark_family, result)
+        else:
+            report.malformed['evaluation_result_not_object'] += 1
+
+
+def check_paths(paths: list[str]) -> CanonicalIdentityReport:
+    """Audit every aggregate JSON file found under ``paths``.
+
+    Only ``*.json`` files whose name does not end in '_samples.json' are
+    checked; the combined findings are returned in one report.
+    """
+    report = CanonicalIdentityReport()
+    # Resolve the complete file list before auditing anything.
+    for candidate in list(expand_paths(paths)):
+        if candidate.suffix == '.json' and not candidate.name.endswith('_samples.json'):
+            check_file(candidate, report)
+    return report
+
+
+def render_text(report: CanonicalIdentityReport, top: int = 20) -> str:
+    """Format ``report`` as a plain-text summary ending in a single newline.
+
+    ``top`` caps how many benchmarks are listed per missing field.
+    """
+    summary = report.to_dict(top=top)
+    missing, malformed = summary['missing'], summary['malformed']
+    lines = [
+        f'Scanned {summary["files_scanned"]} aggregate file(s) '
+        f'with {summary["results_scanned"]} result row(s).',
+        '',
+    ]
+
+    if missing:
+        lines.append('Missing canonical fields:')
+        lines += [
+            f'- {f}: {missing[f]}' for f in IDENTITY_FIELDS if missing.get(f)
+        ]
+        lines.append('')
+
+    if malformed:
+        lines.append('Malformed fields:')
+        lines += [f'- {key}: {count}' for key, count in sorted(malformed.items())]
+        lines.append('')
+
+    for name in IDENTITY_FIELDS:
+        ranked = summary['top_missing_by_benchmark'].get(name, [])
+        if ranked:
+            lines.append(f'Top missing benchmarks for {name}:')
+            lines += [f'- {benchmark}: {count}' for benchmark, count in ranked]
+            lines.append('')
+
+    if not report.has_issues:
+        lines.append('No canonical identity issues found.')
+    return '\n'.join(lines).rstrip() + '\n'
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Create the argument parser for the check-canonical-identity command."""
+    parser = argparse.ArgumentParser(
+        prog='every_eval_ever check-canonical-identity',
+        description=(
+            'Audit aggregate JSON files for missing or malformed canonical '
+            'metric/result identity fields.'
+        ),
+    )
+
+    add = parser.add_argument
+    add(
+        'paths',
+        nargs='+',
+        help='Aggregate JSON files or directories containing datastore files.',
+    )
+    add(
+        '--format', choices=['text', 'json'], default='text', dest='output_format',
+        help='Output format.',
+    )
+    add(
+        '--top',
+        type=int,
+        default=20,
+        help='How many benchmarks to show per missing field in text/json output.',
+    )
+    add(
+        '--fail-on-issues',
+        action='store_true',
+        help='Exit with status 1 if any missing/malformed identity fields exist.',
+    )
+    return parser
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Run the audit CLI; return 1 only if --fail-on-issues finds problems."""
+    args = build_parser().parse_args(argv)
+    report = check_paths(args.paths)
+
+    if args.output_format == 'json':
+        rendered = json.dumps(report.to_dict(top=args.top), indent=2) + '\n'
+    else:
+        rendered = render_text(report, top=args.top)
+    # Both renderings already end in a newline, so print must not add one.
+    print(rendered, end='')
+
+    return 1 if args.fail_on_issues and report.has_issues else 0
+
+
+if __name__ == '__main__':
+    raise SystemExit(main())  # main()'s return value becomes the exit status
diff --git a/every_eval_ever/cli.py b/every_eval_ever/cli.py
index c215c89cd..ec5465323 100644
--- a/every_eval_ever/cli.py
+++ b/every_eval_ever/cli.py
@@ -289,6 +289,83 @@ def build_parser() -> argparse.ArgumentParser:
         help='One or more JSON files or directories containing JSON files.',
     )
 
+    check_canonical_parser = subparsers.add_parser(
+        'check-canonical-identity',
+        help='Audit canonical metric/eval identity coverage',
+        description=(
+            'Audit aggregate JSON files for missing or malformed canonical '
+            'metric/result identity fields.'
+        ),
+    )
+    check_canonical_parser.add_argument(
+        'paths',
+        nargs='+',
+        help='One or more aggregate JSON files or directories to audit.',
+    )
+    check_canonical_parser.add_argument(
+        '--format',
+        choices=['text', 'json'],
+        default='text',
+        dest='canonical_output_format',
+        help='Output format.',
+    )
+    check_canonical_parser.add_argument(
+        '--top',
+        type=int,
+        default=20,
+        help='How many benchmarks to show per missing field in the report.',
+    )
+    check_canonical_parser.add_argument(
+        '--fail-on-issues',
+        action='store_true',
+        help='Exit with status 1 if missing/malformed identity fields exist.',
+    )
+
+    augment_parser = subparsers.add_parser(
+        'augment-canonical-identity',
+        help='Backfill canonical metric/eval identity into datastore JSON',
+        description=(
+            'Backfill metric fields, evaluation_result_id values, and '
+            'metric-free evaluation_name values for known legacy '
+            'benchmark families in the datastore.'
+        ),
+    )
+    augment_parser.add_argument(
+        'paths',
+        nargs='+',
+        help='One or more aggregate JSON files or directories to augment.',
+    )
+    augment_parser.add_argument(
+        '--write',
+        action='store_true',
+        help='Write changes in place. Without this flag, run as a dry run.',
+    )
+    augment_parser.add_argument(
+        '--skip-samples',
+        action='store_true',
+        help='Do not update companion *_samples.jsonl files.',
+    )
+
+    schema_version_parser = subparsers.add_parser(
+        'upgrade-schema-version',
+        help='Normalize datastore schema_version fields to 0.2.2',
+        description=(
+            'Update aggregate JSON files to schema_version=0.2.2 and '
+            'instance JSONL rows to '
+            'schema_version=instance_level_eval_0.2.2.'
+        ),
+    )
+    schema_version_parser.add_argument(
+        'paths',
+        nargs='+',
+        help='One or more JSON/JSONL files or directories to normalize.',
+    )
+    schema_version_parser.add_argument(
+        '--write',
+        action='store_true',
+        help='Write changes in place. Without this flag, run as a dry run.',
+    )
+
     convert_parser = subparsers.add_parser(
         'convert',
         help='Convert source eval logs to every_eval_ever',
@@ -412,6 +489,44 @@ def main(argv: list[str] | None = None) -> int:
 
         return check_duplicates_main(args.paths)
 
+    if args.command == 'check-canonical-identity':
+        from every_eval_ever.check_canonical_identity import (
+            main as check_canonical_identity_main,
+        )
+
+        forwarded_args = [
+            *args.paths,
+            '--format',
+            args.canonical_output_format,
+            '--top',
+            str(args.top),
+        ]
+        if args.fail_on_issues:
+            forwarded_args.append('--fail-on-issues')
+        return check_canonical_identity_main(forwarded_args)
+
+    if args.command == 'augment-canonical-identity':
+        from every_eval_ever.augment_canonical_identity import (
+            main as augment_canonical_identity_main,
+        )
+
+        forwarded_args = [*args.paths]
+        if args.write:
+            forwarded_args.append('--write')
+        if args.skip_samples:
+            forwarded_args.append('--skip-samples')
+        return augment_canonical_identity_main(forwarded_args)
+
+    if args.command == 'upgrade-schema-version':
+        from every_eval_ever.upgrade_schema_version import (
+            main as upgrade_schema_version_main,
+        )
+
+        forwarded_args = [*args.paths]
+        if args.write:
+            forwarded_args.append('--write')
+        return upgrade_schema_version_main(forwarded_args)
+
     if args.command == 'convert':
         if args.source == 'lm_eval':
             return _cmd_convert_lm_eval(args)
diff --git a/every_eval_ever/converters/__init__.py b/every_eval_ever/converters/__init__.py
index 1b7ca553f..cffbb2467 100644
--- a/every_eval_ever/converters/__init__.py
+++ b/every_eval_ever/converters/__init__.py
@@ -1 +1,5 @@
-SCHEMA_VERSION = '0.2.2'
+AGGREGATE_SCHEMA_VERSION = '0.2.2'
+INSTANCE_LEVEL_SCHEMA_VERSION = 'instance_level_eval_0.2.2'
+
+# Backwards-compatible alias used by aggregate converters.
+SCHEMA_VERSION = AGGREGATE_SCHEMA_VERSION
diff --git a/every_eval_ever/converters/helm/adapter.py b/every_eval_ever/converters/helm/adapter.py
index e43eb01b9..bf28f1dd9 100644
--- a/every_eval_ever/converters/helm/adapter.py
+++ b/every_eval_ever/converters/helm/adapter.py
@@ -485,10 +485,11 @@ def _transform_single(
 
             instance_level_log_path, instance_level_rows_number = (
                 HELMInstanceLevelDataAdapter(
-                    detailed_results_id,
+                    evaluation_id,
                     Format.jsonl.value,
                     HashAlgorithm.sha256.value,
                     evaluation_dir,
+                    file_stem=detailed_results_id,
                 ).convert_instance_level_logs(
                     dataset_name,
                     model_info.id,
diff --git a/every_eval_ever/converters/helm/instance_level_adapter.py b/every_eval_ever/converters/helm/instance_level_adapter.py
index 037237ecc..815ce27e0 100644
--- a/every_eval_ever/converters/helm/instance_level_adapter.py
+++ b/every_eval_ever/converters/helm/instance_level_adapter.py
@@ -20,7 +20,7 @@ def _require_helm_dependencies() -> None:
         ) from _HELM_IMPORT_ERROR
 
 
-from every_eval_ever.converters import SCHEMA_VERSION
+from every_eval_ever.converters import INSTANCE_LEVEL_SCHEMA_VERSION
 from every_eval_ever.converters.common.utils import sha256_string
 from every_eval_ever.converters.helm.utils import extract_all_reasonings
 from every_eval_ever.instance_level_types import (
@@ -42,13 +42,14 @@ def __init__(
         format: str,
         hash_algorithm: str,
         evaluation_dir: str,
+        file_stem: str | None = None,
     ):
         _require_helm_dependencies()
         self.evaluation_id = evaulation_id
         self.format = format
         self.hash_algorithm = hash_algorithm
         self.evaluation_dir = evaluation_dir
-        self.path = f'{evaluation_dir}/{evaulation_id}.{format}'
+        self.path = f'{evaluation_dir}/{file_stem or evaulation_id}.{format}'
 
     def _save_json(self, items: List[InstanceLevelEvaluationLog]):
         eval_dir_path = Path(self.evaluation_dir)
@@ -157,7 +158,7 @@ def convert_instance_level_logs(
 
             instance_level_logs.append(
                 InstanceLevelEvaluationLog(
-                    schema_version=SCHEMA_VERSION,
+                    schema_version=INSTANCE_LEVEL_SCHEMA_VERSION,
                     evaluation_id=self.evaluation_id,
                     model_id=model_id,
                     evaluation_name=evaluation_name,
diff --git a/every_eval_ever/converters/inspect/instance_level_adapter.py b/every_eval_ever/converters/inspect/instance_level_adapter.py
index 99e2d5170..4059fc2d4 100644
--- a/every_eval_ever/converters/inspect/instance_level_adapter.py
+++ b/every_eval_ever/converters/inspect/instance_level_adapter.py
@@ -29,7 +29,7 @@ def _require_inspect_dependencies() -> None:
         ) from _INSPECT_IMPORT_ERROR
 
 
-from every_eval_ever.converters import SCHEMA_VERSION
+from every_eval_ever.converters import INSTANCE_LEVEL_SCHEMA_VERSION
 from every_eval_ever.converters.common.utils import sha256_string
 from every_eval_ever.instance_level_types import (
     AnswerAttributionItem,
@@ -422,7 +422,7 @@ def convert_instance_level_logs(
                 performance = None
 
             instance_level_log = InstanceLevelEvaluationLog(
-                schema_version=SCHEMA_VERSION,
+                schema_version=INSTANCE_LEVEL_SCHEMA_VERSION,
                 evaluation_id=self.evaluation_id,
                 model_id=model_id,
                 evaluation_name=evaluation_name,
diff --git a/every_eval_ever/converters/lm_eval/instance_level_adapter.py b/every_eval_ever/converters/lm_eval/instance_level_adapter.py
index 1cc393862..302f7c769 100644
--- a/every_eval_ever/converters/lm_eval/instance_level_adapter.py
+++ b/every_eval_ever/converters/lm_eval/instance_level_adapter.py
@@ -5,7 +5,7 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
-from every_eval_ever.converters import SCHEMA_VERSION
+from every_eval_ever.converters import INSTANCE_LEVEL_SCHEMA_VERSION
 from every_eval_ever.eval_types import (
     DetailedEvaluationResults,
     Format,
@@ -161,7 +161,7 @@ def _transform_sample(
         ]
 
         return InstanceLevelEvaluationLog(
-            schema_version=SCHEMA_VERSION,
+            schema_version=INSTANCE_LEVEL_SCHEMA_VERSION,
             evaluation_id=evaluation_id,
             model_id=model_id,
             evaluation_name=eval_name,
diff --git a/every_eval_ever/eval_types.py b/every_eval_ever/eval_types.py
index 40035403b..5cac80bfa 100644
--- a/every_eval_ever/eval_types.py
+++ b/every_eval_ever/eval_types.py
@@ -489,7 +489,7 @@ class EvaluationLog(BaseModel):
     model_config = ConfigDict(
         extra='forbid',
     )
-    schema_version: str = Field(
+    schema_version: Literal['0.2.2'] = Field(
         ..., description='Version of the schema used for this evaluation data'
     )
     evaluation_id: str = Field(
diff --git a/every_eval_ever/instance_level_types.py b/every_eval_ever/instance_level_types.py
index ff78d3420..8f8b71bbb 100644
--- a/every_eval_ever/instance_level_types.py
+++ b/every_eval_ever/instance_level_types.py
@@ -5,7 +5,7 @@
 from __future__ import annotations
 
 from enum import Enum
-from typing import Any
+from typing import Any, Literal
 
 from pydantic import (
     BaseModel,
@@ -154,7 +154,7 @@ class InstanceLevelEvaluationLog(BaseModel):
     model_config = ConfigDict(
         extra='forbid',
     )
-    schema_version: str = Field(
+    schema_version: Literal['instance_level_eval_0.2.2'] = Field(
         ..., description='Version of the schema used for this instance data'
     )
     evaluation_id: str = Field(
diff --git a/every_eval_ever/upgrade_schema_version.py b/every_eval_ever/upgrade_schema_version.py
new file mode 100644
index 000000000..31c1dd5e1
--- /dev/null
+++ b/every_eval_ever/upgrade_schema_version.py
@@ -0,0 +1,192 @@
+"""Normalize datastore schema_version fields to the current 0.2.2 schemas."""
+
+from __future__ import annotations
+
+import argparse
+import json
+from dataclasses import dataclass
+from pathlib import Path
+
+from every_eval_ever.validate import expand_paths
+
+AGGREGATE_SCHEMA_VERSION = '0.2.2'
+INSTANCE_SCHEMA_VERSION = 'instance_level_eval_0.2.2'
+
+
+@dataclass
+class SchemaVersionUpgradeReport:  # outcome of inspecting one datastore file
+    file_path: Path  # file that was inspected
+    file_type: str  # 'aggregate' (.json) or 'instance' (.jsonl)
+    target_version: str  # schema_version the file is normalized to
+    changed: bool  # True if any schema_version differed (dry runs included)
+    rows_changed: int = 0  # objects/rows whose schema_version differed
+    rows_scanned: int = 0  # objects/rows inspected (always 1 for aggregates)
+    old_versions: tuple[str | None, ...] = ()  # replaced values; None = unset
+
+
+def _write_json(path: Path, payload: dict) -> None:
+    """Overwrite path with payload as indented UTF-8 JSON plus a newline."""
+    # ensure_ascii=False keeps non-ASCII text readable instead of \u-escaped.
+    serialized = json.dumps(payload, indent=2, ensure_ascii=False) + '\n'
+    path.write_text(serialized, encoding='utf-8')
+
+
+def _write_jsonl(path: Path, rows: list[dict]) -> None:
+    """Overwrite path with one compact JSON object per line, UTF-8 encoded."""
+    with path.open('w', encoding='utf-8') as out:
+        out.writelines(json.dumps(r, ensure_ascii=False) + '\n' for r in rows)
+
+
+def upgrade_aggregate_file(
+    file_path: Path, *, write_changes: bool = False
+) -> SchemaVersionUpgradeReport:
+    """Normalize one aggregate JSON file's schema_version and report on it."""
+    document = json.loads(file_path.read_text(encoding='utf-8'))
+    if not isinstance(document, dict):
+        raise ValueError(f'Aggregate payload is not a JSON object: {file_path}')
+    previous = document.get('schema_version')
+    needs_upgrade = previous != AGGREGATE_SCHEMA_VERSION
+    # A dry run (write_changes=False) reports the change but skips the write.
+    if needs_upgrade and write_changes:
+        document['schema_version'] = AGGREGATE_SCHEMA_VERSION
+        _write_json(file_path, document)
+
+    return SchemaVersionUpgradeReport(
+        file_path=file_path,
+        file_type='aggregate',
+        target_version=AGGREGATE_SCHEMA_VERSION,
+        changed=needs_upgrade,
+        rows_scanned=1,
+        rows_changed=int(needs_upgrade),
+        old_versions=(previous,) if needs_upgrade else (),
+    )
+
+
+def upgrade_instance_file(
+    file_path: Path, *, write_changes: bool = False
+) -> SchemaVersionUpgradeReport:
+    """Normalize schema_version on each row of an instance JSONL file."""
+    rows: list[dict] = []
+    old_versions: set[str | None] = set()
+    changed_rows = 0
+    with file_path.open(encoding='utf-8') as handle:
+        for line_number, raw_line in enumerate(handle, start=1):
+            stripped = raw_line.strip()
+            if not stripped:
+                continue  # blank lines are skipped and dropped on rewrite
+            row = json.loads(stripped)
+            if not isinstance(row, dict):
+                raise ValueError(  # name the line so bad rows can be found
+                    'Instance payload line is not a JSON object: '
+                    f'{file_path} (line {line_number})'
+                )
+            old_version = row.get('schema_version')
+            if old_version != INSTANCE_SCHEMA_VERSION:
+                old_versions.add(old_version)
+                row['schema_version'] = INSTANCE_SCHEMA_VERSION
+                changed_rows += 1
+            rows.append(row)
+
+    if write_changes and changed_rows:  # dry runs never touch the file
+        _write_jsonl(file_path, rows)
+    return SchemaVersionUpgradeReport(
+        file_path=file_path,
+        file_type='instance',
+        target_version=INSTANCE_SCHEMA_VERSION,
+        changed=changed_rows > 0,
+        rows_scanned=len(rows),
+        rows_changed=changed_rows,
+        old_versions=tuple(sorted(old_versions, key=str)),
+    )
+
+
+def upgrade_file(
+    file_path: Path, *, write_changes: bool = False
+) -> SchemaVersionUpgradeReport:
+    """Dispatch on suffix: .json -> aggregate, .jsonl -> instance upgrade."""
+    ext = file_path.suffix
+    # Only these exact suffixes are recognised; anything else is an error.
+    table = {'.json': upgrade_aggregate_file, '.jsonl': upgrade_instance_file}
+    if ext not in table:
+        raise ValueError(f"Unsupported file extension '{ext}' for {file_path}")
+    return table[ext](file_path, write_changes=write_changes)
+
+
+def render_text(reports: list[SchemaVersionUpgradeReport]) -> str:
+    """Render the human-readable report for a batch of upgrade results.
+
+    The text starts with overall counts, followed by one 'Updated ...' line
+    per changed file (instance files also show changed/scanned row counts).
+    When nothing changed, a single 'no changes needed' line is emitted
+    instead. The result always ends with exactly one newline.
+    """
+    changed_reports = [report for report in reports if report.changed]
+    aggregate_changed = sum(
+        report.file_type == 'aggregate' for report in changed_reports
+    )
+    instance_changed = sum(
+        report.file_type == 'instance' for report in changed_reports
+    )
+    instance_rows_changed = sum(
+        report.rows_changed
+        for report in reports
+        if report.file_type == 'instance'
+    )
+
+    lines = [
+        f'Scanned {len(reports)} file(s).',
+        f'- Aggregate files updated: {aggregate_changed}',
+        f'- Instance files updated: {instance_changed}',
+        f'- Instance rows updated: {instance_rows_changed}',
+        '',
+    ]
+
+    for report in changed_reports:
+        previous = ', '.join(map(str, report.old_versions))
+        entry = f'Updated {report.file_path}: {previous} '
+        entry += f'-> {report.target_version}'
+        if report.file_type == 'instance':
+            entry += f' ({report.rows_changed}/{report.rows_scanned} rows)'
+        lines.append(entry)
+
+    if not changed_reports:
+        lines.append('No schema_version changes were needed.')
+
+    # Guarantee exactly one trailing newline.
+    return '\n'.join(lines).rstrip() + '\n'
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Create the standalone CLI parser: positional paths plus --write."""
+    parser = argparse.ArgumentParser(
+        prog='every_eval_ever upgrade-schema-version',
+        description=(
+            'Normalize aggregate and instance datastore files to the current '
+            '0.2.2 schema_version values.'
+        ),
+    )
+    paths_help = (
+        'Files or directories containing aggregate JSON and sample JSONL files.'
+    )
+    parser.add_argument('paths', nargs='+', help=paths_help)
+    parser.add_argument(
+        '--write',
+        action='store_true',
+        help='Write changes in place. Without this flag, run as a dry run.',
+    )
+    return parser
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Upgrade every expanded path, print the report, and return 0."""
+    args = build_parser().parse_args(argv)
+    paths = expand_paths(args.paths)
+    reports = [upgrade_file(p, write_changes=args.write) for p in paths]
+    print(render_text(reports), end='')
+    if not args.write and any(r.changed for r in reports):
+        print('Dry run: no files were modified; pass --write to apply.')
+    return 0
+
+
+if __name__ == '__main__':
+    raise SystemExit(main())
diff --git a/tests/test_augment_canonical_identity.py b/tests/test_augment_canonical_identity.py
new file mode 100644
index 000000000..fd3f9f9ea
--- /dev/null
+++ b/tests/test_augment_canonical_identity.py
@@ -0,0 +1,1049 @@
+"""Tests for canonical identity augmentation of legacy datastore payloads."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from every_eval_ever.augment_canonical_identity import (
+    augment_aggregate_file,
+    augment_aggregate_payload,
+)
+from every_eval_ever.check_canonical_identity import check_paths
+from every_eval_ever.upgrade_schema_version import (
+    upgrade_aggregate_file,
+    upgrade_instance_file,
+)
+from every_eval_ever.validate import validate_file
+
+
+def _base_payload(evaluation_id: str, evaluation_results: list[dict]) -> dict:
+    return {  # aggregate payload; only the id and the results list vary
+        'schema_version': '0.2.2',
+        'evaluation_id': evaluation_id,
+        'retrieved_timestamp': '1234567890',  # fixed placeholder value
+        'source_metadata': {
+            'source_type': 'documentation',
+            'source_organization_name': 'TestOrg',
+            'evaluator_relationship': 'third_party',
+        },
+        'eval_library': {'name': 'unknown', 'version': 'unknown'},
+        'model_info': {'name': 'test-model', 'id': 'org/test-model'},
+        'evaluation_results': evaluation_results,
+    }
+
+
+def _base_result(
+    evaluation_name: str,
+    description: str,
+    *,
+    min_score: float = 0.0,
+    max_score: float = 1.0,
+    lower_is_better: bool = False,
+) -> dict:
+    return {  # one evaluation_results entry with no metric_* identity fields
+        'evaluation_name': evaluation_name,
+        'source_data': {
+            'dataset_name': 'test-ds',
+            'source_type': 'hf_dataset',
+            'hf_repo': 'org/test-ds',
+        },
+        'metric_config': {
+            'evaluation_description': description,
+            'lower_is_better': lower_is_better,
+            'score_type': 'continuous',
+            'min_score': min_score,
+            'max_score': max_score,
+        },
+        'score_details': {'score': 0.5},  # same fixed score for every result
+    }
+
+
+def test_global_mmlu_lite_backfills_accuracy_metric():
+    """A metric-less Global MMLU Lite slice gains accuracy metric identity."""
+    legacy_result = _base_result('Arabic', 'Global MMLU Lite - Arabic')
+    payload = _base_payload('global-mmlu-lite/org_model/123', [legacy_result])
+
+    augmented, changed_results, _, ids_changed = augment_aggregate_payload(
+        payload, benchmark_family='global-mmlu-lite'
+    )
+
+    assert changed_results == 1
+    assert ids_changed is True
+
+    result = augmented['evaluation_results'][0]
+    assert result['evaluation_name'] == 'Arabic'
+    assert result['evaluation_result_id'].endswith('#arabic#accuracy')
+
+    expected_metric = {
+        'metric_id': 'accuracy',
+        'metric_name': 'Accuracy',
+        'metric_kind': 'accuracy',
+        'metric_unit': 'proportion',
+    }
+    metric_config = result['metric_config']
+    # Each backfilled metric field is compared with its expected value.
+    for field, expected in expected_metric.items():
+        assert metric_config[field] == expected
+
+
+def test_wordle_arena_updates_aggregate_and_samples(tmp_path: Path):
+    """Wordle Arena results and sample rows are both rewritten on disk."""
+    aggregate_path = (
+        tmp_path
+        / 'data'
+        / 'wordle_arena'
+        / 'anthropic'
+        / 'claude-haiku-4.5'
+        / 'run.json'
+    )
+    aggregate_path.parent.mkdir(parents=True)
+    payload = _base_payload(
+        'wordle_arena/anthropic_claude-haiku-4.5/1776347262.820056',
+        [
+            _base_result(
+                'wordle_arena_win_rate',
+                'Win rate on Wordle Arena',
+            ),
+            _base_result(
+                'wordle_arena_avg_attempts',
+                'Average guesses used per game on Wordle Arena',
+                min_score=1.0,
+                max_score=6.0,
+                lower_is_better=True,
+            ),
+        ],
+    )
+    payload['detailed_evaluation_results'] = {
+        'format': 'jsonl',
+        'file_path': 'run_samples.jsonl',
+        'total_rows': 1,  # stale on purpose: two sample rows are written below
+    }
+    aggregate_path.write_text(json.dumps(payload), encoding='utf-8')
+    # Companion samples: two rows that still use the metric-suffixed name.
+    samples_path = aggregate_path.with_name('run_samples.jsonl')
+    sample_rows = [
+        {
+            'schema_version': 'instance_level_eval_0.2.2',
+            'evaluation_id': payload['evaluation_id'],
+            'model_id': 'org/test-model',
+            'evaluation_name': 'wordle_arena_win_rate',
+            'sample_id': 'sample-1',
+            'interaction_type': 'single_turn',
+            'input': {'raw': 'guess', 'reference': ['guess']},
+            'output': {'raw': ['guess']},
+            'answer_attribution': [
+                {
+                    'turn_idx': 0,
+                    'source': 'output.raw',
+                    'extracted_value': 'guess',
+                    'extraction_method': 'exact_match',
+                    'is_terminal': True,
+                }
+            ],
+            'evaluation': {'score': 1.0, 'is_correct': True},
+        },
+        {
+            'schema_version': 'instance_level_eval_0.2.2',
+            'evaluation_id': payload['evaluation_id'],
+            'model_id': 'org/test-model',
+            'evaluation_name': 'wordle_arena_win_rate',
+            'sample_id': 'sample-2',
+            'interaction_type': 'single_turn',
+            'input': {'raw': 'guess', 'reference': ['guess']},
+            'output': {'raw': ['guess']},
+            'answer_attribution': [
+                {
+                    'turn_idx': 0,
+                    'source': 'output.raw',
+                    'extracted_value': 'guess',
+                    'extraction_method': 'exact_match',
+                    'is_terminal': True,
+                }
+            ],
+            'evaluation': {'score': 0.0, 'is_correct': False},
+        },
+    ]
+    samples_path.write_text(
+        '\n'.join(json.dumps(row) for row in sample_rows) + '\n',
+        encoding='utf-8',
+    )
+
+    report = augment_aggregate_file(aggregate_path, write_changes=True)
+    assert report.aggregate_changed is True
+    assert report.sample_changed is True
+
+    updated_payload = json.loads(aggregate_path.read_text(encoding='utf-8'))
+    win_rate = updated_payload['evaluation_results'][0]
+    avg_attempts = updated_payload['evaluation_results'][1]
+
+    assert win_rate['evaluation_name'] == 'wordle_arena'
+    assert win_rate['metric_config']['metric_id'] == 'win_rate'
+    assert (
+        win_rate['metric_config']['additional_details']['raw_evaluation_name']
+        == 'wordle_arena_win_rate'
+    )
+    assert avg_attempts['evaluation_name'] == 'wordle_arena'
+    assert avg_attempts['metric_config']['metric_id'] == 'mean_attempts'
+    assert updated_payload['detailed_evaluation_results']['total_rows'] == 2
+    # Sample rows take the new name and link to the win_rate result's id.
+    updated_rows = [
+        json.loads(line)
+        for line in samples_path.read_text(encoding='utf-8').splitlines()
+        if line.strip()
+    ]
+    assert all(row['evaluation_name'] == 'wordle_arena' for row in updated_rows)
+    assert all(
+        row['evaluation_result_id'] == win_rate['evaluation_result_id']
+        for row in updated_rows
+    )
+
+
+def test_apex_agents_splits_metric_semantics_from_evaluation_name():
+    """Metric suffixes in APEX names move into metric_config."""
+    legacy_results = [
+        _base_result(
+            'Overall Pass@8',
+            'Overall Pass@8 (dataset card / paper snapshot).',
+        ),
+        _base_result(
+            'Investment Banking Mean Score',
+            'Investment banking world mean rubric score.',
+        ),
+    ]
+    payload = _base_payload('apex-agents/org_model/123', legacy_results)
+
+    augmented, changed_results, _, _ = augment_aggregate_payload(
+        payload, benchmark_family='apex-agents'
+    )
+
+    results = augmented['evaluation_results']
+    pass_at_8, mean_score = results[0], results[1]
+
+    assert changed_results == 2
+    assert pass_at_8['evaluation_name'] == 'Overall'
+    pass_config = pass_at_8['metric_config']
+    assert pass_config['metric_id'] == 'pass_at_k'
+    assert pass_config['metric_parameters'] == {'k': 8}
+    assert mean_score['evaluation_name'] == 'Investment Banking'
+    assert mean_score['metric_config']['metric_id'] == 'mean_score'
+
+
+def test_bfcl_uses_metric_id_to_restore_eval_slice():
+    """BFCL's dotted metric_id yields the slice name; the raw name is kept."""
+    raw_name = 'bfcl.multi_turn.accuracy'
+    legacy_result = _base_result(
+        raw_name,
+        'Multi-turn accuracy',
+        max_score=100.0,
+    )
+
+    # Replace the helper's metric_config wholesale with one that already
+    # carries metric_id/metric_name but has no evaluation_description.
+    legacy_result['metric_config'] = {
+        'metric_id': raw_name,
+        'metric_name': 'Accuracy',
+        'lower_is_better': False,
+        'score_type': 'continuous',
+        'min_score': 0.0,
+        'max_score': 100.0,
+    }
+    payload = _base_payload('bfcl/org_model/123', [legacy_result])
+
+    augmented, changed_results, _, _ = augment_aggregate_payload(
+        payload, benchmark_family='bfcl'
+    )
+
+    result = augmented['evaluation_results'][0]
+    metric_config = result['metric_config']
+    assert changed_results == 1
+    assert result['evaluation_name'] == 'multi_turn'
+    assert metric_config['metric_id'] == raw_name
+    assert (
+        metric_config['additional_details']['raw_evaluation_name']
+        == raw_name
+    )
+
+
+def test_swe_leaderboards_backfill_single_score_metric_identity():
+    """A SWE leaderboard row gains a family-scoped score metric identity."""
+    slice_name = 'SWE-PolyBench Verified (Python)'
+    legacy_result = _base_result(
+        slice_name,
+        'Fraction of Python GitHub issues resolved (0.0–1.0)',
+    )
+    payload = _base_payload(
+        'swe-polybench-leaderboard/org_model/123', [legacy_result]
+    )
+
+    augmented, changed_results, _, ids_changed = augment_aggregate_payload(
+        payload, benchmark_family='swe-polybench-leaderboard'
+    )
+
+    assert changed_results == 1
+    assert ids_changed is True
+
+    result = augmented['evaluation_results'][0]
+    assert result['evaluation_name'] == slice_name
+    assert result['evaluation_result_id'].endswith(
+        '#swe_polybench_verified_python#swe_polybench_leaderboard_score'
+    )
+    metric_config = result['metric_config']
+    assert metric_config['metric_id'] == 'swe_polybench_leaderboard.score'
+    assert metric_config['metric_name'] == 'Score'
+    assert metric_config['metric_kind'] == 'score'
+    assert metric_config['metric_unit'] == 'proportion'
+
+
+def test_nested_datastore_paths_are_normalized_and_samples_move(tmp_path: Path):
+    """The extra 'typescript' directory is dropped; samples move alongside."""
+    aggregate_path = (
+        tmp_path
+        / 'data'
+        / 'multi-swe-bench-leaderboard'
+        / 'typescript'
+        / 'openai'
+        / 'gpt-5'
+        / 'run.json'
+    )
+    aggregate_path.parent.mkdir(parents=True)
+    payload = _base_payload(
+        'multi-swe-bench-leaderboard/openai_gpt-5/123',
+        [
+            _base_result(
+                'Multi-SWE-Bench (TypeScript)',
+                'Fraction of TypeScript issues resolved (0.0-1.0)',
+            )
+        ],
+    )
+    payload['detailed_evaluation_results'] = {
+        'format': 'jsonl',
+        'file_path': 'run_samples.jsonl',
+        'total_rows': 0,
+    }
+    aggregate_path.write_text(json.dumps(payload), encoding='utf-8')
+    # One sample row next to the aggregate; total_rows above is stale (0).
+    samples_path = aggregate_path.with_name('run_samples.jsonl')
+    samples_path.write_text(
+        json.dumps(
+            {
+                'schema_version': 'instance_level_eval_0.2.2',
+                'evaluation_id': payload['evaluation_id'],
+                'model_id': 'openai/gpt-5',
+                'evaluation_name': 'Multi-SWE-Bench (TypeScript)',
+                'sample_id': 'sample-1',
+                'interaction_type': 'single_turn',
+                'input': {'raw': 'issue', 'reference': ['issue']},
+                'output': {'raw': ['patch']},
+                'answer_attribution': [],
+                'evaluation': {'score': 1.0, 'is_correct': True},
+            }
+        )
+        + '\n',
+        encoding='utf-8',
+    )
+
+    report = augment_aggregate_file(aggregate_path, write_changes=True)
+    expected_aggregate_path = (
+        tmp_path
+        / 'data'
+        / 'multi-swe-bench-leaderboard'
+        / 'openai'
+        / 'gpt-5'
+        / 'run.json'
+    )
+    expected_samples_path = expected_aggregate_path.with_name('run_samples.jsonl')
+
+    assert report.path_changed is True
+    assert report.file_path == expected_aggregate_path
+    assert report.sample_file_path == expected_samples_path
+    assert aggregate_path.exists() is False
+    assert samples_path.exists() is False
+    assert expected_aggregate_path.exists() is True
+    assert expected_samples_path.exists() is True
+
+    updated_payload = json.loads(expected_aggregate_path.read_text(encoding='utf-8'))
+    updated_result = updated_payload['evaluation_results'][0]
+    assert updated_result['metric_config']['metric_id'] == (
+        'multi_swe_bench_leaderboard.score'
+    )
+    assert updated_payload['detailed_evaluation_results']['total_rows'] == 1
+
+    updated_rows = [
+        json.loads(line)
+        for line in expected_samples_path.read_text(encoding='utf-8').splitlines()
+        if line.strip()
+    ]
+    assert updated_rows[0]['evaluation_result_id'] == updated_result['evaluation_result_id']
+
+
+def test_nested_datastore_paths_report_target_path_in_dry_run(tmp_path: Path):
+    """Without write_changes the new path is reported but no file moves."""
+    aggregate_path = (
+        tmp_path
+        / 'data'
+        / 'swe-polybench-leaderboard'
+        / 'pb-verified'
+        / 'python'
+        / 'openai'
+        / 'gpt-5'
+        / 'run.json'
+    )
+    aggregate_path.parent.mkdir(parents=True)
+    payload = _base_payload(
+        'swe-polybench-leaderboard/openai_gpt-5/123',
+        [
+            _base_result(
+                'SWE-PolyBench Verified (Python)',
+                'Fraction of Python GitHub issues resolved (0.0-1.0)',
+            )
+        ],
+    )
+    aggregate_path.write_text(json.dumps(payload), encoding='utf-8')
+    report = augment_aggregate_file(aggregate_path, write_changes=False)
+
+    expected_aggregate_path = (
+        tmp_path
+        / 'data'
+        / 'swe-polybench-leaderboard'
+        / 'openai'
+        / 'gpt-5'
+        / 'run.json'
+    )
+    # The report names both paths; on disk only the original file exists.
+    assert report.path_changed is True
+    assert report.original_file_path == aggregate_path
+    assert report.file_path == expected_aggregate_path
+    assert aggregate_path.exists() is True
+    assert expected_aggregate_path.exists() is False
+
+
+def test_pr72_style_faulty_data_is_fully_repaired_by_current_workflow(
+    tmp_path: Path,
+):
+    """A schema-valid file lacking metric identity is repaired by augment."""
+    aggregate_path = (
+        tmp_path
+        / 'data'
+        / 'multi-swe-bench-leaderboard'
+        / 'typescript'
+        / 'openai'
+        / 'gpt-5'
+        / 'run.json'
+    )
+    aggregate_path.parent.mkdir(parents=True)
+    payload = _base_payload(
+        'multi-swe-bench-leaderboard/openai_gpt-5/123',
+        [
+            _base_result(
+                'Multi-SWE-Bench (TypeScript)',
+                'Fraction of TypeScript GitHub issues resolved (0.0-1.0)',
+            )
+        ],
+    )
+    aggregate_path.write_text(json.dumps(payload), encoding='utf-8')
+    # Order matters: record the pre-repair state, then upgrade, then augment.
+    pre_validate = validate_file(aggregate_path)
+    pre_audit = check_paths([str(tmp_path / 'data')])
+    schema_upgrade = upgrade_aggregate_file(aggregate_path, write_changes=True)
+    report = augment_aggregate_file(aggregate_path, write_changes=True)
+    fixed_path = (
+        tmp_path
+        / 'data'
+        / 'multi-swe-bench-leaderboard'
+        / 'openai'
+        / 'gpt-5'
+        / 'run.json'
+    )
+    fixed_payload = json.loads(fixed_path.read_text(encoding='utf-8'))
+    fixed_result = fixed_payload['evaluation_results'][0]
+    post_validate = validate_file(fixed_path)
+    post_audit = check_paths([str(tmp_path / 'data')])
+
+    assert pre_validate.valid is True
+    assert pre_audit.has_issues is True
+    assert schema_upgrade.changed is False
+    assert report.path_changed is True
+    assert aggregate_path.exists() is False
+    assert fixed_path.exists() is True
+    assert fixed_result['metric_config']['metric_id'] == (
+        'multi_swe_bench_leaderboard.score'
+    )
+    assert fixed_result['evaluation_result_id'].endswith(
+        '#multi_swe_bench_typescript#multi_swe_bench_leaderboard_score'
+    )
+    assert post_validate.valid is True
+    assert post_audit.has_issues is False
+
+
+def test_pr57_style_faulty_data_is_fully_repaired_by_current_workflow(
+    tmp_path: Path,
+):  # End-to-end: runs upgrade_instance_file + augment_aggregate_file on a faulty aggregate/samples pair.
+    aggregate_path = (
+        tmp_path
+        / 'data'
+        / 'swe-bench-verified-mini'
+        / 'google'
+        / 'gemini-2.0-flash-001'
+        / 'run.json'
+    )
+    aggregate_path.parent.mkdir(parents=True)
+    payload = _base_payload(  # aggregate rows carry raw inspect-style 'mean' / 'std' names
+        'swe-bench-verified-mini/google_gemini-2.0-flash-001/1744346409.0',
+        [
+            _base_result(
+                'mean on inspect_evals/swe_bench_verified_mini for scorer swe_bench_scorer',
+                'mean',
+            ),
+            _base_result(
+                'std on inspect_evals/swe_bench_verified_mini for scorer swe_bench_scorer',
+                'std',
+            ),
+        ],
+    )
+    payload['detailed_evaluation_results'] = {  # points the aggregate at the samples file written below
+        'format': 'jsonl',
+        'file_path': 'run_samples.jsonl',
+        'total_rows': 1,
+    }
+    aggregate_path.write_text(json.dumps(payload), encoding='utf-8')
+
+    samples_path = aggregate_path.with_name('run_samples.jsonl')  # sibling of run.json
+    samples_path.write_text(
+        json.dumps(
+            {
+                'schema_version': '0.2.2',  # asserted below to become 'instance_level_eval_0.2.2'
+                'evaluation_id': 'run_samples',  # asserted below to match the aggregate's evaluation_id
+                'model_id': 'google/gemini-2.0-flash-001',
+                'evaluation_name': 'inspect_evals/swe_bench_verified_mini',  # asserted below to become 'swe-bench-verified-mini'
+                'evaluation_result_id': None,  # asserted below to equal the mean row's evaluation_result_id
+                'sample_id': 'django__django-11790',
+                'sample_hash': 'abc123',
+                'interaction_type': 'agentic',
+                'input': {'raw': 'issue', 'reference': ['issue']},
+                'output': None,
+                'messages': [
+                    {'turn_idx': 0, 'role': 'user', 'content': 'issue'},
+                    {
+                        'turn_idx': 1,
+                        'role': 'assistant',
+                        'content': 'attempted patch',
+                    },
+                ],
+                'answer_attribution': [
+                    {
+                        'turn_idx': 1,
+                        'source': 'messages[1].content',
+                        'extracted_value': 'attempted patch',
+                        'extraction_method': 'exact_match',
+                        'is_terminal': True,
+                    }
+                ],
+                'evaluation': {
+                    'score': 0.0,
+                    'is_correct': False,
+                    'num_turns': 2,
+                    'tool_calls_count': 0,
+                },
+            }
+        )
+        + '\n',  # one JSON document per line (single-row JSONL file)
+        encoding='utf-8',
+    )
+
+    pre_sample_validate = validate_file(samples_path)  # snapshot taken before any repair call
+    pre_audit = check_paths([str(tmp_path / 'data')])  # snapshot taken before any repair call
+    schema_upgrade = upgrade_instance_file(samples_path, write_changes=True)  # step 1: called on the samples file
+    report = augment_aggregate_file(aggregate_path, write_changes=True)  # step 2: called on the aggregate file
+
+    fixed_payload = json.loads(aggregate_path.read_text(encoding='utf-8'))  # re-read from the original location
+    mean_result, std_result = fixed_payload['evaluation_results']  # unpacking requires exactly two rows
+    fixed_rows = [
+        json.loads(line)
+        for line in samples_path.read_text(encoding='utf-8').splitlines()
+        if line.strip()  # skip blank lines
+    ]
+    fixed_row = fixed_rows[0]
+    post_sample_validate = validate_file(samples_path)  # snapshot taken after both repair calls
+    post_audit = check_paths([str(tmp_path / 'data')])  # snapshot taken after both repair calls
+
+    assert pre_sample_validate.valid is False  # the fixture is expected to start out invalid
+    assert pre_audit.has_issues is True
+    assert schema_upgrade.changed is True
+    assert report.path_changed is False  # the aggregate is expected to stay at its original path
+    assert mean_result['evaluation_name'] == 'swe-bench-verified-mini'
+    assert mean_result['metric_config']['metric_id'] == 'mean_score'
+    assert mean_result['metric_config']['metric_name'] == 'Mean Score'
+    assert mean_result['metric_config']['metric_kind'] == 'score'
+    assert std_result['evaluation_name'] == 'swe-bench-verified-mini'
+    assert std_result['metric_config']['metric_id'] == 'standard_deviation'
+    assert std_result['metric_config']['metric_name'] == 'Standard Deviation'
+    assert std_result['metric_config']['metric_kind'] == 'standard_deviation'
+    assert fixed_row['schema_version'] == 'instance_level_eval_0.2.2'
+    assert fixed_row['evaluation_id'] == fixed_payload['evaluation_id']
+    assert fixed_row['evaluation_name'] == 'swe-bench-verified-mini'
+    assert fixed_row['evaluation_result_id'] == mean_result['evaluation_result_id']
+    assert post_sample_validate.valid is True  # after repair the samples file is expected to validate
+    assert post_audit.has_issues is False
+
+
+def test_alphaxiv_faulty_data_is_fully_repaired_by_current_workflow(
+    tmp_path: Path,
+):  # End-to-end: runs upgrade_aggregate_file + augment_aggregate_file on an alphaXiv-style aggregate.
+    aggregate_path = (
+        tmp_path
+        / 'data'
+        / 'alphaxiv'
+        / 'Gaia2'  # this segment is absent from the expected fixed_path below
+        / 'unknown'
+        / 'GPT-4o'
+        / 'run.json'
+    )
+    aggregate_path.parent.mkdir(parents=True)
+    payload = {  # has no 'eval_library' key; the test asserts one is present after repair
+        'schema_version': '0.2.0',  # schema_upgrade.changed is asserted True below
+        'evaluation_id': 'Gaia2/GPT-4o/1771591481.616601',
+        'retrieved_timestamp': '1771591481.616601',
+        'source_metadata': {
+            'source_name': 'alphaXiv State of the Art',
+            'source_type': 'documentation',
+            'source_organization_name': 'alphaXiv',
+            'evaluator_relationship': 'third_party',
+        },
+        'model_info': {'id': 'GPT-4o', 'name': 'GPT-4o', 'developer': 'unknown'},
+        'evaluation_results': [
+            {
+                'evaluation_name': 'Overall Performance on Gaia2',  # asserted below to be kept as raw_evaluation_name
+                'source_data': {
+                    'dataset_name': 'Gaia2',
+                    'source_type': 'url',
+                    'url': ['https://www.alphaxiv.org/abs/example'],
+                },
+                'metric_config': {  # no metric_id / metric_name / metric_kind / metric_unit yet
+                    'lower_is_better': False,
+                    'score_type': 'continuous',
+                    'min_score': 0.0,
+                    'max_score': 100.0,
+                    'evaluation_description': 'Overall performance on Gaia2.',
+                    'additional_details': {
+                        'alphaxiv_is_primary': 'True',
+                        'alphaxiv_y_axis': 'Score',
+                    },
+                },
+                'score_details': {'score': 67.4},
+            }
+        ],
+    }
+    aggregate_path.write_text(json.dumps(payload), encoding='utf-8')
+
+    pre_validate = validate_file(aggregate_path)  # snapshot taken before any repair call
+    pre_audit = check_paths([str(tmp_path / 'data')])  # snapshot taken before any repair call
+    schema_upgrade = upgrade_aggregate_file(aggregate_path, write_changes=True)  # step 1: schema version
+    report = augment_aggregate_file(aggregate_path, write_changes=True)  # step 2: augmentation (path_changed asserted below)
+
+    fixed_path = (  # expected location afterwards: the original path minus the 'Gaia2' segment
+        tmp_path / 'data' / 'alphaxiv' / 'unknown' / 'GPT-4o' / 'run.json'
+    )
+    fixed_payload = json.loads(fixed_path.read_text(encoding='utf-8'))  # fails here if the file was not relocated
+    fixed_result = fixed_payload['evaluation_results'][0]
+    post_validate = validate_file(fixed_path)  # snapshot taken after both repair calls
+    post_audit = check_paths([str(tmp_path / 'data')])  # snapshot taken after both repair calls
+
+    assert pre_validate.valid is False  # the fixture is expected to start out invalid
+    assert pre_audit.has_issues is True
+    assert schema_upgrade.changed is True
+    assert report.path_changed is True
+    assert aggregate_path.exists() is False  # nothing is expected to remain at the original location
+    assert fixed_path.exists() is True
+    assert fixed_payload['eval_library'] == {
+        'name': 'alphaxiv',
+        'version': 'unknown',
+    }
+    assert fixed_result['evaluation_name'] == 'Gaia2'
+    assert fixed_result['metric_config']['metric_id'] == (
+        'overall_performance_on_gaia2'
+    )
+    assert fixed_result['metric_config']['metric_name'] == (
+        'Overall Performance on Gaia2'
+    )
+    assert fixed_result['metric_config']['metric_kind'] == 'score'
+    assert fixed_result['metric_config']['metric_unit'] == 'points'
+    assert (
+        fixed_result['metric_config']['additional_details']['raw_evaluation_name']
+        == 'Overall Performance on Gaia2'
+    )
+    assert fixed_result['evaluation_result_id'].endswith(
+        '#gaia2#overall_performance_on_gaia2'
+    )
+    assert post_validate.valid is True  # after repair the relocated file is expected to validate
+    assert post_audit.has_issues is False
+
+
+def test_helm_description_drives_metric_identity_and_mean_row_renaming():
+    """HELM rows: metric ids come from descriptions; the mean row is renamed."""
+    rows = [
+        _base_result(
+            'Mean win rate',
+            'How many models this model outperforms on average (over columns).',
+        ),
+        _base_result('NarrativeQA', 'F1 on NarrativeQA'),
+        _base_result('IFEval', 'IFEval Strict Acc on IFEval'),
+    ]
+    raw = _base_payload('helm_lite/org_model/123', rows)
+
+    fixed, n_changed, _, _ = augment_aggregate_payload(
+        raw, benchmark_family='helm_lite'
+    )
+    results = fixed['evaluation_results']
+    win_row = results[0]
+    nqa_row = results[1]
+    ifeval_row = results[2]
+
+    # Every input row is expected to be touched by the augmentation.
+    assert n_changed == 3
+
+    # The aggregate row is renamed to the family and reports a win rate.
+    assert win_row['evaluation_name'] == 'helm_lite'
+    assert win_row['metric_config']['metric_id'] == 'win_rate'
+
+    # Benchmark rows keep their own evaluation name.
+    assert nqa_row['evaluation_name'] == 'NarrativeQA'
+    assert nqa_row['metric_config']['metric_id'] == 'f1'
+
+    # 'Strict Acc' in the description is expected to map to strict_accuracy.
+    assert ifeval_row['metric_config']['metric_id'] == 'strict_accuracy'
+
+
+def test_ifeval_inspect_rows_split_slice_and_metric_identity():
+    """Inspect-style IFEval names split into a slice name plus a metric."""
+    rows = [
+        _base_result(
+            'prompt_strict_acc on inspect_evals/ifeval for scorer instruction_following',
+            'prompt strict accuracy',
+        ),
+        _base_result(
+            'final_stderr on inspect_evals/ifeval for scorer instruction_following',
+            'final standard error',
+        ),
+    ]
+    raw = _base_payload('IFEval/org_model/123', rows)
+
+    fixed, n_changed, _, _ = augment_aggregate_payload(
+        raw, benchmark_family='IFEval'
+    )
+    acc_row = fixed['evaluation_results'][0]
+    stderr_row = fixed['evaluation_results'][1]
+    acc_metric = acc_row['metric_config']
+    stderr_metric = stderr_row['metric_config']
+
+    assert n_changed == 2
+    assert acc_row['evaluation_name'] == 'prompt_strict'
+    assert stderr_row['evaluation_name'] == 'final'
+    assert acc_metric['metric_id'] == 'accuracy'
+    assert acc_metric['metric_name'] == 'Accuracy'
+    assert acc_metric['metric_kind'] == 'accuracy'
+    assert stderr_metric['metric_id'] == 'standard_error'
+    assert stderr_metric['metric_name'] == 'Standard Error'
+    assert stderr_metric['metric_kind'] == 'standard_error'
+
+
+def test_agentharm_inspect_rows_cover_overall_and_category_metrics():
+    """AgentHarm: check one overall score row and one per-category row."""
+    raw = _base_payload(
+        'agentharm/org_model/123',
+        [
+            _base_result(
+                'inspect_evals/avg_full_score on inspect_evals/agentharm for scorer combined_scorer',
+                'Average full score on agentharm',
+            ),
+            _base_result(
+                'Cybercrime_avg_refusals on inspect_evals/agentharm for scorer combined_scorer',
+                'Average refusals on Cybercrime subset',
+            ),
+        ],
+    )
+
+    fixed, n_changed, _, _ = augment_aggregate_payload(
+        raw, benchmark_family='agentharm'
+    )
+
+    # Both input rows are expected to be rewritten.
+    assert n_changed == 2
+
+    # Overall row: the evaluation name is expected to collapse to the
+    # benchmark family.
+    overall = fixed['evaluation_results'][0]
+    overall_metric = overall['metric_config']
+    assert overall['evaluation_name'] == 'agentharm'
+    assert overall_metric['metric_id'] == 'average_full_score'
+    assert overall_metric['metric_name'] == 'Average Full Score'
+    assert overall_metric['metric_kind'] == 'score'
+
+    # Category row: the 'Cybercrime' prefix is expected to become the
+    # evaluation name.
+    cybercrime = fixed['evaluation_results'][1]
+    cybercrime_metric = cybercrime['metric_config']
+    assert cybercrime['evaluation_name'] == 'Cybercrime'
+    assert cybercrime_metric['metric_id'] == 'average_refusal_rate'
+    assert cybercrime_metric['metric_name'] == 'Average Refusal Rate'
+    assert cybercrime_metric['metric_kind'] == 'refusal_rate'
+
+
+def test_generic_inspect_eval_benchmarks_backfill_accuracy_mean_and_std():
+    """Generic inspect_evals rows get accuracy / mean / std identities."""
+    raw = _base_payload(
+        'cyse2_vulnerability_exploit/org_model/123',
+        [
+            _base_result(
+                'accuracy on inspect_evals/cyse2_vulnerability_exploit for scorer vul_exploit_scorer',
+                'accuracy',
+            ),
+            _base_result(
+                'mean on inspect_evals/cyse2_vulnerability_exploit for scorer vul_exploit_scorer',
+                'mean',
+            ),
+            _base_result(
+                'std on inspect_evals/cyse2_vulnerability_exploit for scorer vul_exploit_scorer',
+                'std',
+            ),
+        ],
+    )
+
+    fixed, n_changed, _, _ = augment_aggregate_payload(
+        raw, benchmark_family='cyse2_vulnerability_exploit'
+    )
+    rows = fixed['evaluation_results']
+    accuracy_row, mean_row, std_row = rows[0], rows[1], rows[2]
+    mean_metric = mean_row['metric_config']
+    std_metric = std_row['metric_config']
+
+    assert n_changed == 3
+
+    # All three rows are expected to share the benchmark as evaluation name.
+    assert accuracy_row['evaluation_name'] == 'cyse2_vulnerability_exploit'
+    assert mean_row['evaluation_name'] == 'cyse2_vulnerability_exploit'
+    assert std_row['evaluation_name'] == 'cyse2_vulnerability_exploit'
+
+    # 'accuracy' is expected to be used as the metric id unchanged.
+    assert accuracy_row['metric_config']['metric_id'] == 'accuracy'
+
+    # 'mean' is expected to be spelled out as a mean score.
+    assert mean_metric['metric_id'] == 'mean_score'
+    assert mean_metric['metric_name'] == 'Mean Score'
+    assert mean_metric['metric_kind'] == 'score'
+
+    # 'std' is expected to be spelled out as a standard deviation.
+    assert std_metric['metric_id'] == 'standard_deviation'
+    assert std_metric['metric_name'] == 'Standard Deviation'
+    assert std_metric['metric_kind'] == 'standard_deviation'
+
+
+def test_fibble_arena_supports_2_to_4_lie_variants():
+    """Fibble Arena '<n>lies' rows split into a variant name and a metric."""
+    win_rate_row = _base_result(
+        'fibble_arena_3lies_win_rate',
+        'Win rate on Fibble³ Arena (3 lies)',
+    )
+    attempts_row = _base_result(
+        'fibble_arena_4lies_avg_attempts',
+        'Average guesses used per game on Fibble⁴ Arena (4 lies)',
+        min_score=1.0,
+        max_score=8.0,
+        lower_is_better=True,
+    )
+    raw = _base_payload(
+        'fibble_arena/org_model/123', [win_rate_row, attempts_row]
+    )
+
+    fixed, n_changed, _, _ = augment_aggregate_payload(
+        raw, benchmark_family='fibble_arena'
+    )
+    three_lies = fixed['evaluation_results'][0]
+    four_lies = fixed['evaluation_results'][1]
+
+    assert n_changed == 2
+    # The lie-count variant is expected to stay in the evaluation name.
+    assert three_lies['evaluation_name'] == 'fibble_arena_3lies'
+    assert four_lies['evaluation_name'] == 'fibble_arena_4lies'
+    # The trailing metric token is expected to become the metric id.
+    assert three_lies['metric_config']['metric_id'] == 'win_rate'
+    assert four_lies['metric_config']['metric_id'] == 'mean_attempts'
+
+
+def test_fibble_numbered_families_backfill_metrics_without_renaming_family():
+    """Numbered Fibble families keep their name and gain metric ids."""
+    raw = _base_payload(
+        'fibble5_arena/org_model/123',
+        [
+            _base_result('fibble5_arena_win_rate', 'Win rate on Fibble5 Arena'),
+            _base_result(
+                'fibble5_arena_avg_attempts',
+                'Average attempts on Fibble5 Arena',
+                min_score=1.0,
+                max_score=8.0,
+                lower_is_better=True,
+            ),
+            _base_result(
+                'fibble5_arena_avg_latency_ms',
+                'Average latency on Fibble5 Arena',
+                min_score=0.0,
+                max_score=5000.0,
+                lower_is_better=True,
+            ),
+        ],
+    )
+
+    fixed, n_changed, _, _ = augment_aggregate_payload(
+        raw, benchmark_family='fibble5_arena'
+    )
+    rows = fixed['evaluation_results']
+    win_row, attempts_row, latency_row = rows[0], rows[1], rows[2]
+
+    # All three rows are expected to be augmented.
+    assert n_changed == 3
+
+    # No row is expected to be renamed away from the numbered family.
+    for row in (win_row, attempts_row, latency_row):
+        assert row['evaluation_name'] == 'fibble5_arena'
+
+    # Metric ids are expected to be derived from the raw name's suffix.
+    assert win_row['metric_config']['metric_id'] == 'win_rate'
+    assert attempts_row['metric_config']['metric_id'] == 'mean_attempts'
+    assert latency_row['metric_config']['metric_id'] == 'latency_mean'
+
+
+def test_helm_patch_handles_ndcg_bleu_and_acc_abbreviations():
+    """HELM descriptions with NDCG@k, BLEU-n and 'Acc' resolve to metrics."""
+    rows = [
+        _base_result(
+            'MS MARCO (TREC)',
+            'NDCG@10 on MS MARCO (TREC)',
+        ),
+        _base_result('WMT 2014', 'BLEU-4 on WMT 2014'),
+        _base_result('Omni-MATH', 'Acc on Omni-MATH'),
+    ]
+    raw = _base_payload('helm_classic/org_model/123', rows)
+
+    fixed, n_changed, _, _ = augment_aggregate_payload(
+        raw, benchmark_family='helm_classic'
+    )
+    metric_configs = [
+        fixed['evaluation_results'][index]['metric_config']
+        for index in range(3)
+    ]
+    ndcg_metric, bleu_metric, acc_metric = metric_configs
+
+    assert n_changed == 3
+
+    # '@10' is expected to be lifted into metric_parameters as k.
+    assert ndcg_metric['metric_id'] == 'ndcg'
+    assert ndcg_metric['metric_parameters'] == {'k': 10}
+
+    # 'BLEU-4': the order is expected in the id and as the n parameter.
+    assert bleu_metric['metric_id'] == 'bleu_4'
+    assert bleu_metric['metric_parameters'] == {'n': 4}
+
+    # 'Acc' is expected to expand to the full metric id.
+    assert acc_metric['metric_id'] == 'accuracy'
+
+
+def test_livecodebenchpro_pass_at_k_is_backfilled():
+    """A 'Pass@1' description is expected to yield a pass_at_k metric."""
+    hard_row = _base_result('Hard Problems', 'Pass@1 on Hard Problems')
+    raw = _base_payload('livecodebenchpro/org_model/123', [hard_row])
+
+    fixed, n_changed, _, _ = augment_aggregate_payload(
+        raw, benchmark_family='livecodebenchpro'
+    )
+
+    assert n_changed == 1
+
+    # k is expected in metric_parameters; the display name keeps 'Pass@1'.
+    metric = fixed['evaluation_results'][0]['metric_config']
+    expected = {
+        'metric_id': 'pass_at_k',
+        'metric_name': 'Pass@1',
+        'metric_kind': 'pass_rate',
+        'metric_parameters': {'k': 1},
+    }
+    for field, wanted in expected.items():
+        assert metric[field] == wanted
+
+
+def test_score_suffix_families_and_single_score_families_are_backfilled():
+    """Cover three families whose metric id is expected to end in '.score'."""
+    # ace: rows named '<Something> Score'; 'Overall' is expected to map to the
+    # family itself while 'Gaming' stays a named slice.
+    ace_raw = _base_payload(
+        'ace/org_model/123',
+        [
+            _base_result('Overall Score', 'Overall ACE score.'),
+            _base_result('Gaming Score', 'Gaming domain score.'),
+        ],
+    )
+    ace_fixed, ace_changed, _, _ = augment_aggregate_payload(
+        ace_raw, benchmark_family='ace'
+    )
+    overall_row = ace_fixed['evaluation_results'][0]
+    gaming_row = ace_fixed['evaluation_results'][1]
+
+    assert ace_changed == 2
+    assert overall_row['evaluation_name'] == 'ace'
+    assert gaming_row['evaluation_name'] == 'Gaming'
+    assert overall_row['metric_config']['metric_id'] == 'ace.score'
+    assert gaming_row['metric_config']['metric_id'] == 'ace.score'
+
+    # tau-bench-2 airline: a single row built without explicit score bounds;
+    # the unit is expected to be 'proportion'.
+    tau_row = _base_result(
+        'tau-bench-2/airline',
+        'Tau Bench 2 benchmark evaluation (airline subset)',
+    )
+    tau_raw = _base_payload('tau-bench-2/airline/org_model/123', [tau_row])
+
+    tau_fixed, tau_changed, _, _ = augment_aggregate_payload(
+        tau_raw, benchmark_family='tau-bench-2_airline'
+    )
+    tau_metric = tau_fixed['evaluation_results'][0]['metric_config']
+    assert tau_changed == 1
+    assert tau_metric['metric_id'] == 'tau_bench_2_airline.score'
+    assert tau_metric['metric_unit'] == 'proportion'
+
+    # la_leaderboard: a single row on an explicit 0-100 scale; the unit is
+    # expected to be 'points'.
+    la_row = _base_result(
+        'la_leaderboard',
+        'La Leaderboard benchmark score',
+        min_score=0.0,
+        max_score=100.0,
+    )
+    la_raw = _base_payload('la_leaderboard/org_model/123', [la_row])
+
+    la_fixed, la_changed, _, _ = augment_aggregate_payload(
+        la_raw, benchmark_family='la_leaderboard'
+    )
+    la_metric = la_fixed['evaluation_results'][0]['metric_config']
+    assert la_changed == 1
+    assert la_metric['metric_id'] == 'la_leaderboard.score'
+    assert la_metric['metric_unit'] == 'points'
+
+
+def test_theory_of_mind_patch_splits_metric_from_eval_name():
+    """'<metric> on <benchmark> for scorer ...' splits into name and metric."""
+    raw_row = _base_result(
+        'accuracy on theory_of_mind for scorer model_graded_fact',
+        'accuracy',
+    )
+    raw = _base_payload('theory_of_mind/org_model/123', [raw_row])
+
+    fixed, n_changed, _, _ = augment_aggregate_payload(
+        raw, benchmark_family='theory_of_mind'
+    )
+
+    # The benchmark part is expected to become the evaluation name and the
+    # leading token the metric id.
+    assert n_changed == 1
+
+    row = fixed['evaluation_results'][0]
+    assert row['evaluation_name'] == 'theory_of_mind'
+    assert row['metric_config']['metric_id'] == 'accuracy'
diff --git a/tests/test_check_canonical_identity.py b/tests/test_check_canonical_identity.py
new file mode 100644
index 000000000..3cfecfe76
--- /dev/null
+++ b/tests/test_check_canonical_identity.py
@@ -0,0 +1,153 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from every_eval_ever.check_canonical_identity import check_paths, main
+
+
+def _base_payload(evaluation_id: str, evaluation_results: list[dict]) -> dict:
+    """Return a minimal aggregate payload wrapping ``evaluation_results``."""
+    payload: dict = {'schema_version': '0.2.2'}
+    payload['evaluation_id'] = evaluation_id
+    payload['retrieved_timestamp'] = '1234567890'
+    payload['source_metadata'] = {
+        'source_type': 'documentation',
+        'source_organization_name': 'TestOrg',
+        'evaluator_relationship': 'third_party',
+    }
+    payload['eval_library'] = {'name': 'unknown', 'version': 'unknown'}
+    payload['model_info'] = {'name': 'test-model', 'id': 'org/test-model'}
+    payload['evaluation_results'] = evaluation_results
+    return payload
+
+
+def _result_with_identity(evaluation_name: str) -> dict:
+    """Return one result row with every identity field already filled in."""
+    metric_config = {
+        'evaluation_description': 'accuracy',
+        'metric_id': 'accuracy',
+        'metric_name': 'Accuracy',
+        'metric_kind': 'accuracy',
+        'metric_unit': 'proportion',
+        'lower_is_better': False,
+        'score_type': 'continuous',
+        'min_score': 0.0,
+        'max_score': 1.0,
+    }
+    return {
+        'evaluation_name': evaluation_name,
+        'evaluation_result_id': f'test/model/123#{evaluation_name}#accuracy',
+        'source_data': dict(
+            dataset_name='test-ds', source_type='hf_dataset', hf_repo='org/test-ds'
+        ),
+        'metric_config': metric_config,
+        'score_details': {'score': 0.5},
+    }
+
+
+def test_check_paths_reports_missing_identity_fields(tmp_path: Path):
+    """A row without identity fields is expected to show up as missing."""
+    run_dir = tmp_path / 'data' / 'demo-benchmark' / 'org' / 'model'
+    run_dir.mkdir(parents=True)
+
+    # Blank out the result id and drop every metric identity field.
+    incomplete = _result_with_identity('slice_a')
+    incomplete['evaluation_result_id'] = ''
+    incomplete['metric_config'] = {
+        'lower_is_better': False,
+        'score_type': 'continuous',
+        'min_score': 0.0,
+        'max_score': 1.0,
+    }
+    raw = _base_payload('demo-benchmark/org_model/123', [incomplete])
+    (run_dir / 'run.json').write_text(json.dumps(raw), encoding='utf-8')
+
+    report = check_paths([str(tmp_path)])
+    summary = report.to_dict()
+    missing = summary['missing']
+    top_metric_id = summary['top_missing_by_benchmark']['metric_id']
+
+    # Exactly one file holding one result is expected to be scanned.
+    assert report.files_scanned == 1
+    assert report.results_scanned == 1
+
+    # Both the result id and the metric id are expected to count as missing.
+    assert missing['evaluation_result_id'] == 1
+    assert missing['metric_id'] == 1
+
+    # The per-benchmark breakdown is expected to name 'demo-benchmark' first.
+    expected_top = ('demo-benchmark', 1)
+    assert top_metric_id[0] == expected_top
+
+
+def test_check_paths_passes_for_complete_identity(tmp_path: Path):
+    """A row with every identity field is expected to produce no issues."""
+    run_dir = tmp_path / 'data' / 'clean-benchmark' / 'org' / 'model'
+    run_dir.mkdir(parents=True)
+
+    complete = _result_with_identity('slice_a')
+    raw = _base_payload('clean-benchmark/org_model/123', [complete])
+    (run_dir / 'run.json').write_text(json.dumps(raw), encoding='utf-8')
+
+    report = check_paths([str(tmp_path)])
+
+    # One file, one result, and empty issue collections are expected.
+    assert report.files_scanned == 1
+    assert report.results_scanned == 1
+    assert report.has_issues is False
+    assert report.missing == {}
+    assert report.malformed == {}
+
+
+def test_check_paths_ignores_non_aggregate_json(tmp_path: Path):
+    """A JSON file that is not an aggregate is expected to be left uncounted."""
+    run_dir = tmp_path / 'data' / 'clean-benchmark' / 'org' / 'model'
+    run_dir.mkdir(parents=True)
+
+    # A stray JSON document sitting next to the data directory.
+    stray_text = json.dumps({'generated_by': 'unit-test'})
+    (tmp_path / 'metadata.json').write_text(stray_text, encoding='utf-8')
+
+    complete = _result_with_identity('slice_a')
+    raw = _base_payload('clean-benchmark/org_model/123', [complete])
+    (run_dir / 'run.json').write_text(json.dumps(raw), encoding='utf-8')
+
+    report = check_paths([str(tmp_path)])
+    # Only run.json is expected to be counted.
+    assert report.files_scanned == 1
+    assert report.results_scanned == 1
+    assert report.has_issues is False
+
+
+def test_main_fail_on_issues_returns_nonzero(tmp_path: Path):
+    """With --fail-on-issues, main is expected to return 1 for broken data."""
+    run_dir = tmp_path / 'data' / 'broken-benchmark' / 'org' / 'model'
+    run_dir.mkdir(parents=True)
+
+    template = _result_with_identity('slice_a')
+    broken_row = {
+        **template,
+        'evaluation_result_id': 'not_a_canonical_id',
+        'metric_config': {
+            **template['metric_config'],
+            'metric_id': '',
+        },
+    }
+    raw = _base_payload(
+        'broken-benchmark/org_model/123',
+        [broken_row],
+    )
+    run_file = run_dir / 'run.json'
+    run_file.write_text(json.dumps(raw), encoding='utf-8')
+
+    cli_args = [
+        str(tmp_path),
+        '--format',
+        'json',
+        '--fail-on-issues',
+    ]
+    exit_code = main(cli_args)
+
+    # The row has a non-canonical result id and an empty metric id.
+    assert exit_code == 1
diff --git a/tests/test_helm_instance_level_adapter.py b/tests/test_helm_instance_level_adapter.py
index 4ee46c6c2..275bc739a 100644
--- a/tests/test_helm_instance_level_adapter.py
+++ b/tests/test_helm_instance_level_adapter.py
@@ -63,8 +63,8 @@ def test_mmlu_instance_level():
         assert len(instance_logs) == 10
         log = instance_logs[0]
 
-        assert log.schema_version == '0.2.2'
-        assert log.evaluation_id == 'test_mmlu_samples'
+        assert log.schema_version == 'instance_level_eval_0.2.2'
+        assert log.evaluation_id == converted_eval.evaluation_id
         assert log.model_id == 'openai/gpt2'
         assert log.evaluation_name == 'mmlu'
         assert log.sample_id == 'id147'
@@ -112,7 +112,8 @@ def test_hellaswag_instance_level():
         assert len(instance_logs) == 10
         log = instance_logs[0]
 
-        assert log.schema_version == '0.2.2'
+        assert log.schema_version == 'instance_level_eval_0.2.2'
+        assert log.evaluation_id == converted_eval.evaluation_id
         assert log.model_id == 'eleutherai/pythia-1b-v0'
         assert log.evaluation_name == 'hellaswag'
         assert log.interaction_type == InteractionType.single_turn
@@ -148,7 +149,8 @@ def test_narrativeqa_instance_level():
         assert len(instance_logs) == 5
         log = instance_logs[0]
 
-        assert log.schema_version == '0.2.2'
+        assert log.schema_version == 'instance_level_eval_0.2.2'
+        assert log.evaluation_id == converted_eval.evaluation_id
         assert log.model_id == 'openai/gpt2'
         assert log.evaluation_name == 'narrativeqa'
         assert log.interaction_type == InteractionType.single_turn
diff --git a/tests/test_inspect_instance_level_adapter.py b/tests/test_inspect_instance_level_adapter.py
index ea1deab8e..fcd996f0e 100644
--- a/tests/test_inspect_instance_level_adapter.py
+++ b/tests/test_inspect_instance_level_adapter.py
@@ -67,7 +67,8 @@ def test_pubmedqa_instance_level():
         assert len(instance_logs) == 2
         log = instance_logs[0]
 
-        assert log.schema_version == '0.2.2'
+        assert log.schema_version == 'instance_level_eval_0.2.2'
+        assert log.evaluation_id == converted_eval.evaluation_id
         assert log.model_id == 'openai/gpt-4o-mini-2024-07-18'
         assert log.interaction_type == InteractionType.single_turn
 
@@ -98,7 +99,8 @@ def test_arc_sonnet_instance_level():
         assert len(instance_logs) == 5
         log = instance_logs[0]
 
-        assert log.schema_version == '0.2.2'
+        assert log.schema_version == 'instance_level_eval_0.2.2'
+        assert log.evaluation_id == converted_eval.evaluation_id
         assert log.model_id == 'anthropic/claude-sonnet-4-20250514'
         assert log.interaction_type == InteractionType.single_turn
 
@@ -134,7 +136,8 @@ def test_arc_qwen_instance_level():
         assert len(instance_logs) == 3
         log = instance_logs[0]
 
-        assert log.schema_version == '0.2.2'
+        assert log.schema_version == 'instance_level_eval_0.2.2'
+        assert log.evaluation_id == converted_eval.evaluation_id
         assert log.model_id == 'ollama/qwen2.5-0.5b'
         assert log.interaction_type == InteractionType.single_turn
 
@@ -170,7 +173,8 @@ def test_gaia_instance_level():
         assert len(instance_logs) > 0
         log = instance_logs[0]
 
-        assert log.schema_version == '0.2.2'
+        assert log.schema_version == 'instance_level_eval_0.2.2'
+        assert log.evaluation_id == converted_eval.evaluation_id
         assert log.model_id == 'openai/gpt-4.1-mini-2025-04-14'
 
         assert log.interaction_type == InteractionType.agentic
diff --git a/tests/test_upgrade_schema_version.py b/tests/test_upgrade_schema_version.py
new file mode 100644
index 000000000..a6c02a473
--- /dev/null
+++ b/tests/test_upgrade_schema_version.py
@@ -0,0 +1,73 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from every_eval_ever.upgrade_schema_version import (
+    AGGREGATE_SCHEMA_VERSION,
+    INSTANCE_SCHEMA_VERSION,
+    upgrade_aggregate_file,
+    upgrade_instance_file,
+)
+
+
+def test_upgrade_aggregate_file_rewrites_schema_version(tmp_path: Path):
+    """A '0.2.1' aggregate is expected to be rewritten to the current version."""
+    stale = {
+        'schema_version': '0.2.1',
+        'evaluation_id': 'test/model/123',
+        'retrieved_timestamp': '123',
+        'source_metadata': {
+            'source_type': 'evaluation_run',
+            'source_organization_name': 'TestOrg',
+            'evaluator_relationship': 'first_party',
+        },
+        'eval_library': {'name': 'inspect_ai', 'version': '0.3.0'},
+        'model_info': {'name': 'test-model', 'id': 'org/test-model'},
+        'evaluation_results': [],
+    }
+    target = tmp_path / 'aggregate.json'
+    target.write_text(json.dumps(stale), encoding='utf-8')
+
+    report = upgrade_aggregate_file(target, write_changes=True)
+    assert report.changed is True
+    rewritten = json.loads(target.read_text(encoding='utf-8'))
+    assert rewritten['schema_version'] == AGGREGATE_SCHEMA_VERSION
+
+
+def test_upgrade_instance_file_rewrites_all_rows(tmp_path: Path):
+    """Every JSONL row is expected to end up on INSTANCE_SCHEMA_VERSION."""
+    stale_rows = [
+        {'schema_version': 'instance_level_eval_0.2.1', 'sample_id': 'a'},
+        {'schema_version': '0.2.2', 'sample_id': 'b'},
+        {'schema_version': INSTANCE_SCHEMA_VERSION, 'sample_id': 'c'},
+    ]
+    jsonl_text = ''.join(json.dumps(row) + '\n' for row in stale_rows)
+    target = tmp_path / 'samples.jsonl'
+    target.write_text(jsonl_text, encoding='utf-8')
+
+    report = upgrade_instance_file(target, write_changes=True)
+
+    rewritten_versions = []
+    for line in target.read_text(encoding='utf-8').splitlines():
+        if line.strip():
+            rewritten_versions.append(json.loads(line)['schema_version'])
+
+    # The third row already carries INSTANCE_SCHEMA_VERSION: two rewrites.
+    assert report.changed is True
+    assert report.rows_changed == 2
+    assert all(
+        version == INSTANCE_SCHEMA_VERSION for version in rewritten_versions
+    )
+
+
+def test_upgrade_dry_run_does_not_write(tmp_path: Path):
+    """With write_changes=False the change is reported but not written."""
+    target = tmp_path / 'aggregate.json'
+    target.write_text(json.dumps({'schema_version': '0.2.1'}), encoding='utf-8')
+
+    report = upgrade_aggregate_file(target, write_changes=False)
+
+    assert report.changed is True
+    on_disk = json.loads(target.read_text(encoding='utf-8'))
+    assert on_disk['schema_version'] == '0.2.1'
diff --git a/tests/test_validate.py b/tests/test_validate.py
index edb2d5b6a..70d453ec0 100644
--- a/tests/test_validate.py
+++ b/tests/test_validate.py
@@ -131,6 +131,13 @@ def test_missing_required_field(self, tmp_path: Path):
         assert report.valid is False
         assert any('evaluation_id' in e['loc'] for e in report.errors)
 
+    def test_wrong_schema_version_fails(self, tmp_path: Path):
+        stale = dict(VALID_AGGREGATE, schema_version='0.2.1')
+        stale_path = _write_json(tmp_path, 'wrong_version.json', stale)
+        outcome = validate_aggregate(stale_path)
+        assert outcome.valid is False  # a '0.2.1' aggregate is expected to be rejected
+        assert any('schema_version' in err['loc'] for err in outcome.errors)
+
     def test_extra_field_on_evaluation_log_fails(self, tmp_path: Path):
         data = {**VALID_AGGREGATE, 'unexpected_field': 'oops'}
         fp = _write_json(tmp_path, 'extra.json', data)
@@ -223,6 +230,16 @@ def test_valid_single_turn_passes(self, tmp_path: Path):
         assert report.valid is True
         assert report.line_count == 1
 
+    def test_wrong_schema_version_fails(self, tmp_path: Path):
+        """A row tagged 'instance_level_eval_0.2.1' is expected to fail."""
+        stale_row = dict(VALID_SINGLE_TURN)
+        stale_row['schema_version'] = 'instance_level_eval_0.2.1'
+        stale_path = _write_jsonl(tmp_path, 'wrong_version.jsonl', [stale_row])
+        outcome = validate_instance_file(stale_path)
+
+        assert outcome.valid is False
+        assert any('schema_version' in err['loc'] for err in outcome.errors)
+
     def test_valid_multi_turn_passes(self, tmp_path: Path):
         fp = _write_jsonl(tmp_path, 'multi.jsonl', [VALID_MULTI_TURN])
         report = validate_instance_file(fp)