diff --git a/.gitignore b/.gitignore index d493519d7..198f151dc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ # Local data (generated by running adapters) # data/ +plan/ +misc/ +*.tmp # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/.tmp/audit_after.json b/.tmp/audit_after.json new file mode 100644 index 000000000..0dda99be0 --- /dev/null +++ b/.tmp/audit_after.json @@ -0,0 +1,262 @@ +{ + "files_scanned": 6448, + "results_scanned": 49659, + "missing": { + "metric_id": 1021, + "metric_name": 1021, + "metric_kind": 1021, + "metric_unit": 1021 + }, + "malformed": {}, + "top_missing_by_benchmark": { + "evaluation_result_id": [], + "metric_id": [ + [ + "fibble_arena", + 336 + ], + [ + "helm_classic", + 201 + ], + [ + "helm_lite", + 182 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "helm_capabilities", + 68 + ], + [ + "ace", + 32 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ], + [ + "tau-bench-2_telecom", + 15 + ], + [ + "la_leaderboard", + 5 + ], + [ + "theory_of_mind", + 1 + ] + ], + "metric_name": [ + [ + "fibble_arena", + 336 + ], + [ + "helm_classic", + 201 + ], + [ + "helm_lite", + 182 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "helm_capabilities", + 68 + ], + [ + "ace", + 32 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ], + [ + "tau-bench-2_telecom", + 15 + ], + [ + "la_leaderboard", + 5 + ], + [ + "theory_of_mind", + 1 + ] + ], + "metric_kind": [ + [ + "fibble_arena", + 336 + ], + [ + "helm_classic", + 201 + ], + [ + "helm_lite", + 182 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "helm_capabilities", + 68 + ], + [ + "ace", + 32 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + 
"browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ], + [ + "tau-bench-2_telecom", + 15 + ], + [ + "la_leaderboard", + 5 + ], + [ + "theory_of_mind", + 1 + ] + ], + "metric_unit": [ + [ + "fibble_arena", + 336 + ], + [ + "helm_classic", + 201 + ], + [ + "helm_lite", + 182 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "helm_capabilities", + 68 + ], + [ + "ace", + 32 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ], + [ + "tau-bench-2_telecom", + 15 + ], + [ + "la_leaderboard", + 5 + ], + [ + "theory_of_mind", + 1 + ] + ] + } +} \ No newline at end of file diff --git a/.tmp/audit_before.json b/.tmp/audit_before.json new file mode 100644 index 000000000..7b432cb88 --- /dev/null +++ b/.tmp/audit_before.json @@ -0,0 +1,426 @@ +{ + "files_scanned": 6448, + "results_scanned": 49659, + "missing": { + "evaluation_result_id": 37071, + "metric_id": 37071, + "metric_name": 37071, + "metric_kind": 37071, + "metric_unit": 37071 + }, + "malformed": { + "evaluation_result_id_pattern": 12588 + }, + "top_missing_by_benchmark": { + "evaluation_result_id": [ + [ + "hfopenllm_v2", + 27444 + ], + [ + "helm_mmlu", + 2844 + ], + [ + "reward-bench", + 2404 + ], + [ + "helm_classic", + 1005 + ], + [ + "global-mmlu-lite", + 912 + ], + [ + "helm_lite", + 910 + ], + [ + "fibble_arena", + 559 + ], + [ + "helm_capabilities", + 408 + ], + [ + "wordle_arena", + 134 + ], + [ + "terminal-bench-2.0", + 115 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "apex-agents", + 74 + ], + [ + "ace", + 32 + ], + [ + "helm_instruct", + 28 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ] + ], + "metric_id": [ + [ + 
"hfopenllm_v2", + 27444 + ], + [ + "helm_mmlu", + 2844 + ], + [ + "reward-bench", + 2404 + ], + [ + "helm_classic", + 1005 + ], + [ + "global-mmlu-lite", + 912 + ], + [ + "helm_lite", + 910 + ], + [ + "fibble_arena", + 559 + ], + [ + "helm_capabilities", + 408 + ], + [ + "wordle_arena", + 134 + ], + [ + "terminal-bench-2.0", + 115 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "apex-agents", + 74 + ], + [ + "ace", + 32 + ], + [ + "helm_instruct", + 28 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ] + ], + "metric_name": [ + [ + "hfopenllm_v2", + 27444 + ], + [ + "helm_mmlu", + 2844 + ], + [ + "reward-bench", + 2404 + ], + [ + "helm_classic", + 1005 + ], + [ + "global-mmlu-lite", + 912 + ], + [ + "helm_lite", + 910 + ], + [ + "fibble_arena", + 559 + ], + [ + "helm_capabilities", + 408 + ], + [ + "wordle_arena", + 134 + ], + [ + "terminal-bench-2.0", + 115 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "apex-agents", + 74 + ], + [ + "ace", + 32 + ], + [ + "helm_instruct", + 28 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ] + ], + "metric_kind": [ + [ + "hfopenllm_v2", + 27444 + ], + [ + "helm_mmlu", + 2844 + ], + [ + "reward-bench", + 2404 + ], + [ + "helm_classic", + 1005 + ], + [ + "global-mmlu-lite", + 912 + ], + [ + "helm_lite", + 910 + ], + [ + "fibble_arena", + 559 + ], + [ + "helm_capabilities", + 408 + ], + [ + "wordle_arena", + 134 + ], + [ + "terminal-bench-2.0", + 115 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "apex-agents", + 74 + ], + [ + "ace", + 32 + ], + [ + "helm_instruct", + 28 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + 
"tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ] + ], + "metric_unit": [ + [ + "hfopenllm_v2", + 27444 + ], + [ + "helm_mmlu", + 2844 + ], + [ + "reward-bench", + 2404 + ], + [ + "helm_classic", + 1005 + ], + [ + "global-mmlu-lite", + 912 + ], + [ + "helm_lite", + 910 + ], + [ + "fibble_arena", + 559 + ], + [ + "helm_capabilities", + 408 + ], + [ + "wordle_arena", + 134 + ], + [ + "terminal-bench-2.0", + 115 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "apex-agents", + 74 + ], + [ + "ace", + 32 + ], + [ + "helm_instruct", + 28 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ] + ] + } +} \ No newline at end of file diff --git a/README.md b/README.md index 262a2ca11..bc2f12997 100644 --- a/README.md +++ b/README.md @@ -199,6 +199,24 @@ uv run python -m every_eval_ever validate --format github data/ Exit code is `0` if all files pass and `1` if any fail. +### Canonical identity backfill + audit + +For legacy aggregate files missing canonical metric identity fields, you can backfill and then audit coverage: + +```sh +# Dry-run canonical augmentation +uv run python -m every_eval_ever augment-canonical-identity data/ + +# Apply canonical augmentation in place (also updates companion *_samples.jsonl when present) +uv run python -m every_eval_ever augment-canonical-identity data/ --write + +# Audit missing/malformed canonical identity fields +uv run python -m every_eval_ever check-canonical-identity data/ + +# CI mode (exit 1 if issues remain) +uv run python -m every_eval_ever check-canonical-identity data/ --fail-on-issues +``` + ## 🗂️ Data Structure Evaluation data is hosted on the [Hugging Face datastore](https://huggingface.co/datasets/evaleval/EEE_datastore). 
# Canonical identity for the Wordle-style arena metrics, keyed by the metric
# suffix that follows the arena prefix in a legacy evaluation name
# (for example ``wordle_arena_win_rate`` -> ``win_rate``).
WORDLE_METRICS: dict[str, dict[str, Any]] = {
    'win_rate': {
        'metric_id': 'win_rate',
        'metric_name': 'Win Rate',
        'metric_kind': 'win_rate',
        'metric_unit': 'proportion',
    },
    'avg_attempts': {
        'metric_id': 'mean_attempts',
        'metric_name': 'Average Attempts',
        'metric_kind': 'count',
        'metric_unit': 'attempts',
    },
    'avg_latency_ms': {
        'metric_id': 'latency_mean',
        'metric_name': 'Average Latency',
        'metric_kind': 'latency',
        'metric_unit': 'ms',
    },
}

# Canonical identity for ``hfopenllm_v2`` results, keyed by the raw
# evaluation name stored in the aggregate file.
HFOPENLLM_METRICS: dict[str, dict[str, Any]] = {
    'IFEval': {
        'metric_id': 'accuracy',
        'metric_name': 'Accuracy',
        'metric_kind': 'accuracy',
        'metric_unit': 'proportion',
    },
    'BBH': {
        'metric_id': 'accuracy',
        'metric_name': 'Accuracy',
        'metric_kind': 'accuracy',
        'metric_unit': 'proportion',
    },
    'MATH Level 5': {
        'metric_id': 'exact_match',
        'metric_name': 'Exact Match',
        'metric_kind': 'exact_match',
        'metric_unit': 'proportion',
    },
    'GPQA': {
        'metric_id': 'accuracy',
        'metric_name': 'Accuracy',
        'metric_kind': 'accuracy',
        'metric_unit': 'proportion',
    },
    'MUSR': {
        'metric_id': 'accuracy',
        'metric_name': 'Accuracy',
        'metric_kind': 'accuracy',
        'metric_unit': 'proportion',
    },
    'MMLU-PRO': {
        'metric_id': 'accuracy',
        'metric_name': 'Accuracy',
        'metric_kind': 'accuracy',
        'metric_unit': 'proportion',
    },
}

# Families whose results receive one namespaced ``<family>.score`` metric.
SINGLE_SCORE_BENCHMARK_FAMILIES = {
    'appworld_test_normal',
    'browsecompplus',
    'la_leaderboard',
    'multi-swe-bench-leaderboard',
    'swe-bench',
    'swe-polybench-leaderboard',
}

# Families routed through ``_generic_inspect_eval_patch``.
INSPECT_ACCURACY_BENCHMARK_FAMILIES = {
    'GAIA',
    'MathVista',
    'MMLU-Pro',
    'MMMU-Multiple-Choice',
    'MMMU-Open-Ended',
    'big_bench_hard',
    'commonsense_qa',
    'cybench',
    'cyse2_interpreter_abuse',
    'cyse2_prompt_injection',
    'cyse2_vulnerability_exploit',
    'gdm_intercode_ctf',
    'gpqa_diamond',
    'gsm8k',
    'hellaswag',
    'mbpp',
    'openai_humaneval',
    'piqa',
}

# Exact (already normalized) metric phrases mapped to
# ``(metric_id, metric_name, metric_kind, metric_unit)``.
_EXACT_PHRASE_METRICS: dict[str, tuple[str, str, str, str]] = {
    phrase: fields
    for phrases, fields in (
        (
            ('acc', 'accuracy', 'cot correct', 'correct'),
            ('accuracy', 'Accuracy', 'accuracy', 'proportion'),
        ),
        (
            ('equivalent', 'equivalent (cot)'),
            ('equivalent_cot', 'Equivalent (CoT)', 'accuracy', 'proportion'),
        ),
        (('em',), ('exact_match', 'Exact Match', 'exact_match', 'proportion')),
        (('f1',), ('f1', 'F1', 'f1', 'proportion')),
        (
            ('mean win rate', 'win rate'),
            ('win_rate', 'Win Rate', 'win_rate', 'proportion'),
        ),
        (('mean',), ('mean_score', 'Mean Score', 'score', 'proportion')),
        (
            ('mean score', 'score', 'wb score'),
            ('score', 'Score', 'score', 'proportion'),
        ),
        (
            ('stderr', 'standard error'),
            (
                'standard_error',
                'Standard Error',
                'standard_error',
                'proportion',
            ),
        ),
        (
            ('std', 'stddev', 'standard deviation'),
            (
                'standard_deviation',
                'Standard Deviation',
                'standard_deviation',
                'proportion',
            ),
        ),
        (('harmlessness',), ('harmlessness', 'Harmlessness', 'score', 'points')),
    )
    for phrase in phrases
}

# AgentHarm metric keys mapped to ``(metric_id, metric_name, metric_kind)``;
# every entry is reported with the ``proportion`` unit.
_AGENTHARM_METRICS: dict[str, tuple[str, str, str]] = {
    'avg_score': ('average_score', 'Average Score', 'score'),
    'avg_full_score': ('average_full_score', 'Average Full Score', 'score'),
    'avg_refusals': (
        'average_refusal_rate',
        'Average Refusal Rate',
        'refusal_rate',
    ),
    'avg_score_non_refusals': (
        'average_score_non_refusals',
        'Average Score (Non-Refusals)',
        'score',
    ),
    # Per-category names use the plural forms below.
    'scores': ('average_score', 'Average Score', 'score'),
    'refusals': (
        'average_refusal_rate',
        'Average Refusal Rate',
        'refusal_rate',
    ),
}


@dataclass
class CanonicalPatch:
    """Canonical identity values to merge into one evaluation result.

    A field left as ``None`` means "do not touch that value".
    """

    evaluation_name: str | None = None
    metric_id: str | None = None
    metric_name: str | None = None
    metric_kind: str | None = None
    metric_unit: str | None = None
    metric_parameters: dict[str, str | float | bool | None] | None = None


@dataclass
class AugmentationReport:
    """Summary of what augmenting a single aggregate file changed."""

    original_file_path: Path
    file_path: Path
    benchmark_family: str
    aggregate_changed: bool
    sample_file_path: Path | None
    sample_changed: bool
    changed_results: int
    sample_rows: int
    path_changed: bool = False

    @property
    def changed(self) -> bool:
        """Return True if the aggregate, the samples or the path changed."""
        return (
            self.aggregate_changed or self.sample_changed or self.path_changed
        )


def _slug(value: str) -> str:
    """Lower-case *value* and collapse non-alphanumeric runs to ``_``.

    Returns ``'unknown'`` when nothing alphanumeric remains.
    """
    slug = re.sub(r'[^a-z0-9]+', '_', value.lower()).strip('_')
    return slug or 'unknown'


def _store_raw_evaluation_name(
    metric_config: dict[str, Any], raw_evaluation_name: str
) -> None:
    """Record the pre-canonical evaluation name in ``additional_details``.

    A non-dict ``additional_details`` is replaced by a fresh dict. An already
    recorded ``raw_evaluation_name`` is kept (``setdefault``), so repeated
    runs do not overwrite the first recorded name.
    """
    additional = metric_config.get('additional_details')
    if not isinstance(additional, dict):
        additional = {}
        metric_config['additional_details'] = additional
    additional.setdefault('raw_evaluation_name', raw_evaluation_name)


def _set_metric_field(
    metric_config: dict[str, Any], key: str, value: Any
) -> bool:
    """Fill ``metric_config[key]`` only when it is currently missing/empty.

    Returns True when the value was written.
    """
    if value is None:
        return False
    if metric_config.get(key) in (None, ''):
        metric_config[key] = value
        return True
    return False


def _ensure_metric_config(result: dict[str, Any]) -> dict[str, Any]:
    """Return ``result['metric_config']`` as a dict, creating it if needed.

    A missing or non-dict value (for example JSON ``null``) is replaced by an
    empty dict so the patch helpers can always call ``.get`` on it.
    """
    metric_config = result.get('metric_config')
    if not isinstance(metric_config, dict):
        metric_config = {}
        result['metric_config'] = metric_config
    return metric_config


def _patch_from_metric(
    metric: dict[str, Any], evaluation_name: str | None = None
) -> CanonicalPatch:
    """Build a patch from one ``WORDLE_METRICS``/``HFOPENLLM_METRICS`` entry."""
    return CanonicalPatch(
        evaluation_name=evaluation_name,
        metric_id=metric['metric_id'],
        metric_name=metric['metric_name'],
        metric_kind=metric['metric_kind'],
        metric_unit=metric['metric_unit'],
    )


def _benchmark_metric_namespace(benchmark_family: str) -> str:
    """Slugify *benchmark_family* for use as a metric-id prefix.

    Same normalization as ``_slug`` but falls back to ``'benchmark'``.
    """
    namespace = re.sub(r'[^a-z0-9]+', '_', benchmark_family.lower()).strip(
        '_'
    )
    return namespace or 'benchmark'


def _score_metric_unit(metric_config: dict[str, Any]) -> str:
    """Return ``'proportion'`` for a 0..1 score range, else ``'points'``.

    Any other range, including 0..100 and missing bounds, is reported as
    ``'points'``. Plain equality is used so unhashable bounds cannot raise.
    """
    min_score = metric_config.get('min_score')
    max_score = metric_config.get('max_score')
    if min_score == 0 and max_score == 1:
        return 'proportion'
    return 'points'


def _metric_from_phrase(phrase: str) -> CanonicalPatch | None:
    """Map a free-text metric phrase to a canonical metric patch.

    The phrase is stripped, lower-cased and has trailing ``:``/``.`` removed.
    Parameterized metrics (``pass@k``, ``ndcg@k``, ``rouge-n``, ``bleu-n``)
    are tried first, then exact phrases, then the ``strict acc`` substring.
    Returns ``None`` for unrecognized phrases. A new object is returned on
    every call because callers set ``evaluation_name`` on the result.
    """
    normalized = phrase.strip().lower().rstrip(':.')

    pass_at_k_match = re.fullmatch(r'pass@(?P<k>\d+)', normalized)
    if pass_at_k_match is not None:
        k = int(pass_at_k_match.group('k'))
        return CanonicalPatch(
            metric_id='pass_at_k',
            metric_name=f'Pass@{k}',
            metric_kind='pass_rate',
            metric_unit='proportion',
            metric_parameters={'k': k},
        )

    ndcg_match = re.fullmatch(r'ndcg@(?P<k>\d+)', normalized)
    if ndcg_match is not None:
        k = int(ndcg_match.group('k'))
        return CanonicalPatch(
            metric_id='ndcg',
            metric_name=f'NDCG@{k}',
            metric_kind='ndcg',
            metric_unit='proportion',
            metric_parameters={'k': k},
        )

    rouge_match = re.fullmatch(r'rouge[-_ ]?(?P<n>\d+)', normalized)
    if rouge_match is not None:
        n = int(rouge_match.group('n'))
        return CanonicalPatch(
            metric_id=f'rouge_{n}',
            metric_name=f'ROUGE-{n}',
            metric_kind='rouge',
            metric_unit='proportion',
            metric_parameters={'n': n},
        )

    bleu_match = re.fullmatch(r'bleu[-_ ]?(?P<n>\d+)', normalized)
    if bleu_match is not None:
        n = int(bleu_match.group('n'))
        return CanonicalPatch(
            metric_id=f'bleu_{n}',
            metric_name=f'BLEU-{n}',
            metric_kind='bleu',
            metric_unit='proportion',
            metric_parameters={'n': n},
        )

    fields = _EXACT_PHRASE_METRICS.get(normalized)
    if fields is None and 'strict acc' in normalized:
        fields = ('strict_accuracy', 'Strict Accuracy', 'accuracy', 'proportion')
    if fields is None:
        return None

    metric_id, metric_name, metric_kind, metric_unit = fields
    return CanonicalPatch(
        metric_id=metric_id,
        metric_name=metric_name,
        metric_kind=metric_kind,
        metric_unit=metric_unit,
    )


def _wordle_patch(raw_evaluation_name: str) -> CanonicalPatch | None:
    """Patch for legacy ``wordle_arena_<metric>`` evaluation names."""
    prefix = 'wordle_arena_'
    if not raw_evaluation_name.startswith(prefix):
        return None

    metric = WORDLE_METRICS.get(raw_evaluation_name[len(prefix) :])
    if metric is None:
        return None
    return _patch_from_metric(metric, evaluation_name='wordle_arena')


def _fibble_patch(raw_evaluation_name: str) -> CanonicalPatch | None:
    """Patch for fibble-arena evaluation names.

    Handles the legacy ``fibble_arena_<N>lie(s)_<metric>`` form first, then
    the ``fibble[1-5]_arena_<metric>`` / ``fibble_arena_<metric>`` form.
    """
    legacy_match = re.fullmatch(
        r'fibble_arena_(?P<variant>1lie|[2-9]lies)_(?P<metric>.+)',
        raw_evaluation_name,
    )
    if legacy_match is not None:
        metric = WORDLE_METRICS.get(legacy_match.group('metric'))
        if metric is None:
            return None
        return _patch_from_metric(
            metric,
            evaluation_name=f'fibble_arena_{legacy_match.group("variant")}',
        )

    family_match = re.fullmatch(
        r'(?P<family>fibble(?:[1-5])?_arena)_(?P<metric>.+)',
        raw_evaluation_name,
    )
    if family_match is None:
        return None

    metric = WORDLE_METRICS.get(family_match.group('metric'))
    if metric is None:
        return None
    return _patch_from_metric(
        metric, evaluation_name=family_match.group('family')
    )


def _ifeval_patch(raw_evaluation_name: str) -> CanonicalPatch | None:
    """Patch for ``<slice>_<acc|stderr> on inspect_evals/ifeval`` names."""
    match = re.fullmatch(
        r'(?P<slice>prompt_strict|prompt_loose|inst_strict|inst_loose|final)'
        r'_(?P<metric>acc|stderr)'
        r' on inspect_evals/ifeval(?: for scorer .+)?',
        raw_evaluation_name,
    )
    if match is None:
        return None

    metric = _metric_from_phrase(
        'accuracy' if match.group('metric') == 'acc' else 'standard error'
    )
    if metric is None:
        return None
    metric.evaluation_name = match.group('slice')
    return metric


def _agentharm_patch(raw_evaluation_name: str) -> CanonicalPatch | None:
    """Patch for AgentHarm overall and per-category evaluation names."""
    overall_match = re.fullmatch(
        r'inspect_evals/(?P<metric>'
        r'avg_score|avg_full_score|avg_refusals|avg_score_non_refusals'
        r') on inspect_evals/agentharm(?: for scorer .+)?',
        raw_evaluation_name,
    )
    if overall_match is not None:
        evaluation_name = 'agentharm'
        metric_key = overall_match.group('metric')
    else:
        category_match = re.fullmatch(
            r'(?P<category>[A-Za-z]+)_avg_(?P<metric>scores|refusals)'
            r' on inspect_evals/agentharm(?: for scorer .+)?',
            raw_evaluation_name,
        )
        if category_match is None:
            return None
        evaluation_name = category_match.group('category')
        metric_key = category_match.group('metric')

    metric_id, metric_name, metric_kind = _AGENTHARM_METRICS[metric_key]
    return CanonicalPatch(
        evaluation_name=evaluation_name,
        metric_id=metric_id,
        metric_name=metric_name,
        metric_kind=metric_kind,
        metric_unit='proportion',
    )


def _generic_inspect_eval_patch(
    benchmark_family: str,
    raw_evaluation_name: str,
    metric_config: dict[str, Any],
) -> CanonicalPatch | None:
    """Patch for ``<accuracy|mean|std> on [inspect_evals/]<task>`` names.

    When the name does not match, the whole ``evaluation_description`` is
    used as the metric phrase instead.
    """
    match = re.fullmatch(
        r'(?P<metric>accuracy|mean|std) on '
        r'(?:(?:inspect_evals/)?(?P<task>[^ ]+))'
        r'(?: for scorer .+)?',
        raw_evaluation_name,
        flags=re.IGNORECASE,
    )
    if match is None:
        metric_phrase = str(metric_config.get('evaluation_description') or '')
    else:
        metric_phrase = match.group('metric')

    metric = _metric_from_phrase(metric_phrase)
    if metric is None:
        return None

    metric.evaluation_name = benchmark_family
    return metric


def _alphaxiv_patch(
    payload: dict[str, Any], result: dict[str, Any]
) -> CanonicalPatch:
    """Patch for alphaxiv results.

    The metric name is the raw evaluation name, falling back to
    ``additional_details['alphaxiv_y_axis']`` and then ``'Score'``. The
    evaluation name is ``source_data['dataset_name']``, falling back to the
    first ``/``-separated segment of ``payload['evaluation_id']`` and then
    ``'alphaxiv'``.
    """
    metric_config = _ensure_metric_config(result)

    raw_evaluation_name = str(result.get('evaluation_name') or '').strip()
    if not raw_evaluation_name:
        additional = metric_config.get('additional_details')
        if isinstance(additional, dict):
            raw_evaluation_name = str(
                additional.get('alphaxiv_y_axis') or ''
            ).strip()
    if not raw_evaluation_name:
        raw_evaluation_name = 'Score'

    evaluation_name = ''
    source_data = result.get('source_data')
    if isinstance(source_data, dict):
        evaluation_name = str(source_data.get('dataset_name') or '').strip()
    if not evaluation_name:
        evaluation_id = str(payload.get('evaluation_id') or '')
        if '/' in evaluation_id:
            evaluation_name = evaluation_id.split('/', 1)[0].strip()
    if not evaluation_name:
        evaluation_name = 'alphaxiv'

    return CanonicalPatch(
        evaluation_name=evaluation_name,
        metric_id=_slug(raw_evaluation_name),
        metric_name=raw_evaluation_name,
        metric_kind='score',
        metric_unit=_score_metric_unit(metric_config),
    )


def _apex_agents_patch(raw_evaluation_name: str) -> CanonicalPatch | None:
    """Patch for ``<slice> Pass@<k>`` and ``<slice> Mean Score`` names."""
    pass_match = re.fullmatch(
        r'(?P<slice>.+) Pass@(?P<k>\d+)', raw_evaluation_name
    )
    if pass_match is not None:
        k = int(pass_match.group('k'))
        return CanonicalPatch(
            evaluation_name=pass_match.group('slice'),
            metric_id='pass_at_k',
            metric_name=f'Pass@{k}',
            metric_kind='pass_rate',
            metric_unit='proportion',
            metric_parameters={'k': k},
        )

    score_match = re.fullmatch(r'(?P<slice>.+) Mean Score', raw_evaluation_name)
    if score_match is not None:
        return CanonicalPatch(
            evaluation_name=score_match.group('slice'),
            metric_id='mean_score',
            metric_name='Mean Score',
            metric_kind='score',
            metric_unit='proportion',
        )

    return None


def _bfcl_patch(
    raw_evaluation_name: str, metric_config: dict[str, Any]
) -> CanonicalPatch | None:
    """Derive the evaluation slice from a ``bfcl.<slice...>.<metric>`` id.

    Returns ``None`` when the metric id is not a dotted ``bfcl.`` id with a
    non-empty middle part, or when the evaluation name already equals that
    slice.
    """
    metric_id = metric_config.get('metric_id')
    if not isinstance(metric_id, str) or not metric_id.startswith('bfcl.'):
        return None

    parts = metric_id.split('.')
    if len(parts) < 3:
        return None

    eval_slice = '.'.join(parts[1:-1])
    if not eval_slice:
        return None

    # The slice is a proper sub-part of the id, so a name equal to the slice
    # can never equal the id; one comparison is enough.
    if raw_evaluation_name == eval_slice:
        return None

    return CanonicalPatch(evaluation_name=eval_slice)


def _rewardbench_patch(
    raw_evaluation_name: str, metric_config: dict[str, Any]
) -> CanonicalPatch | None:
    """Patch for reward-bench results.

    The overall ``Score`` row gets a namespaced metric id; other rows are
    classified by whether their description mentions accuracy or score.
    """
    if raw_evaluation_name == 'Score':
        return CanonicalPatch(
            evaluation_name='RewardBench',
            metric_id='rewardbench.score',
            metric_name='Score',
            metric_kind='score',
            metric_unit='proportion',
        )

    lowered = str(metric_config.get('evaluation_description') or '').lower()
    if 'accuracy' in lowered:
        return CanonicalPatch(
            metric_id='accuracy',
            metric_name='Accuracy',
            metric_kind='accuracy',
            metric_unit='proportion',
        )
    if 'score' in lowered:
        return CanonicalPatch(
            metric_id='score',
            metric_name='Score',
            metric_kind='score',
            metric_unit='proportion',
        )
    return None


def _description_metric_patch(
    metric_config: dict[str, Any],
) -> CanonicalPatch | None:
    """Patch from the text before the first ``' on '`` in the description."""
    description = str(metric_config.get('evaluation_description') or '')
    if ' on ' not in description:
        return None
    return _metric_from_phrase(description.split(' on ', 1)[0])


def _score_suffix_patch(
    benchmark_family: str,
    raw_evaluation_name: str,
    metric_config: dict[str, Any],
) -> CanonicalPatch | None:
    """Patch for ``<slice> Score`` names with a namespaced score metric.

    A slice of ``overall`` (any case) maps to the benchmark family itself.
    """
    score_match = re.fullmatch(r'(?P<slice>.+?) Score', raw_evaluation_name)
    if score_match is None:
        return None

    eval_slice = score_match.group('slice').strip()
    if eval_slice.lower() == 'overall':
        evaluation_name = benchmark_family
    else:
        evaluation_name = eval_slice

    namespace = _benchmark_metric_namespace(benchmark_family)
    return CanonicalPatch(
        evaluation_name=evaluation_name,
        metric_id=f'{namespace}.score',
        metric_name='Score',
        metric_kind='score',
        metric_unit=_score_metric_unit(metric_config),
    )


def _single_score_patch(
    benchmark_family: str, metric_config: dict[str, Any]
) -> CanonicalPatch:
    """Namespaced ``<family>.score`` patch; the evaluation name is kept."""
    namespace = _benchmark_metric_namespace(benchmark_family)
    return CanonicalPatch(
        metric_id=f'{namespace}.score',
        metric_name='Score',
        metric_kind='score',
        metric_unit=_score_metric_unit(metric_config),
    )


def _theory_of_mind_patch(
    raw_evaluation_name: str, metric_config: dict[str, Any]
) -> CanonicalPatch | None:
    """Patch for ``<metric> on <slice>[ for scorer ...]`` names.

    Falls back to treating the whole ``evaluation_description`` as the metric
    phrase when the name does not yield a recognized metric.
    """
    match = re.fullmatch(
        r'(?P<metric>.+?) on (?P<slice>.+?)(?: for scorer .+)?',
        raw_evaluation_name,
    )
    if match is not None:
        patch = _metric_from_phrase(match.group('metric'))
        if patch is not None:
            patch.evaluation_name = match.group('slice').strip()
            return patch

    return _metric_from_phrase(
        str(metric_config.get('evaluation_description') or '')
    )


def _helm_patch(
    benchmark_family: str,
    raw_evaluation_name: str,
    metric_config: dict[str, Any],
) -> CanonicalPatch | None:
    """Patch for ``helm_*`` families.

    Names starting with ``mean `` are family-level aggregates and are renamed
    to the benchmark family; other rows take their metric from the
    description prefix before ``' on '``.
    """
    if raw_evaluation_name.lower().startswith('mean '):
        metric = _metric_from_phrase(raw_evaluation_name)
        if metric is None:
            return None
        metric.evaluation_name = benchmark_family
        return metric

    return _description_metric_patch(metric_config)


def _swe_bench_verified_mini_patch(
    benchmark_family: str,
    raw_evaluation_name: str,
    metric_config: dict[str, Any],
) -> CanonicalPatch | None:
    """Patch for ``mean``/``std`` rows of swe_bench_verified_mini.

    When the name does not match, the whole ``evaluation_description`` is
    used as the metric phrase.
    """
    match = re.fullmatch(
        r'(?P<metric>mean|std) on inspect_evals/swe_bench_verified_mini'
        r'(?: for scorer .+)?',
        raw_evaluation_name,
        flags=re.IGNORECASE,
    )
    if match is None:
        metric_phrase = str(metric_config.get('evaluation_description') or '')
    else:
        metric_phrase = match.group('metric')

    metric_phrase = metric_phrase.strip().lower()
    if metric_phrase == 'mean':
        return CanonicalPatch(
            evaluation_name=benchmark_family,
            metric_id='mean_score',
            metric_name='Mean Score',
            metric_kind='score',
            metric_unit=_score_metric_unit(metric_config),
        )
    if metric_phrase == 'std':
        return CanonicalPatch(
            evaluation_name=benchmark_family,
            metric_id='standard_deviation',
            metric_name='Standard Deviation',
            metric_kind='standard_deviation',
            metric_unit=_score_metric_unit(metric_config),
        )
    return None


def _canonical_patch_for_result(
    benchmark_family: str,
    result: dict[str, Any],
    *,
    payload: dict[str, Any] | None = None,
) -> CanonicalPatch | None:
    """Pick the canonical patch for one result of *benchmark_family*.

    Ensures ``result['metric_config']`` is a dict (creating or replacing it
    if necessary) and dispatches to the family-specific helper. Returns
    ``None`` for unknown families or unrecognized evaluation names.
    """
    raw_evaluation_name = str(result.get('evaluation_name') or '')
    metric_config = _ensure_metric_config(result)

    if benchmark_family == 'alphaxiv':
        return _alphaxiv_patch(payload or {}, result)

    if benchmark_family == 'global-mmlu-lite':
        return CanonicalPatch(
            metric_id='accuracy',
            metric_name='Accuracy',
            metric_kind='accuracy',
            metric_unit='proportion',
        )

    if benchmark_family == 'hfopenllm_v2':
        metric = HFOPENLLM_METRICS.get(raw_evaluation_name)
        return None if metric is None else _patch_from_metric(metric)

    if benchmark_family == 'reward-bench':
        return _rewardbench_patch(raw_evaluation_name, metric_config)

    if benchmark_family == 'terminal-bench-2.0':
        return CanonicalPatch(
            metric_id='accuracy',
            metric_name='Task Resolution Accuracy',
            metric_kind='accuracy',
            metric_unit='percentage',
        )

    if benchmark_family == 'wordle_arena':
        return _wordle_patch(raw_evaluation_name)

    if benchmark_family == 'fibble_arena' or re.fullmatch(
        r'fibble[1-5]_arena', benchmark_family
    ):
        return _fibble_patch(raw_evaluation_name)

    if benchmark_family == 'apex-agents':
        return _apex_agents_patch(raw_evaluation_name)

    if benchmark_family in {'ace', 'apex-v1'}:
        return _score_suffix_patch(
            benchmark_family=benchmark_family,
            raw_evaluation_name=raw_evaluation_name,
            metric_config=metric_config,
        )

    if benchmark_family == 'bfcl':
        return _bfcl_patch(raw_evaluation_name, metric_config)

    if benchmark_family.startswith('helm_'):
        return _helm_patch(
            benchmark_family=benchmark_family,
            raw_evaluation_name=raw_evaluation_name,
            metric_config=metric_config,
        )

    if benchmark_family == 'livecodebenchpro':
        return _description_metric_patch(metric_config)

    if benchmark_family == 'swe-bench-verified-mini':
        return _swe_bench_verified_mini_patch(
            benchmark_family=benchmark_family,
            raw_evaluation_name=raw_evaluation_name,
            metric_config=metric_config,
        )

    if (
        benchmark_family in SINGLE_SCORE_BENCHMARK_FAMILIES
        or benchmark_family.startswith('tau-bench-2_')
    ):
        return _single_score_patch(benchmark_family, metric_config)

    if benchmark_family == 'IFEval':
        return _ifeval_patch(raw_evaluation_name)

    if benchmark_family == 'agentharm':
        return _agentharm_patch(raw_evaluation_name)

    if (
        benchmark_family == 'cvebench'
        or benchmark_family in INSPECT_ACCURACY_BENCHMARK_FAMILIES
    ):
        return _generic_inspect_eval_patch(
            benchmark_family=benchmark_family,
            raw_evaluation_name=raw_evaluation_name,
            metric_config=metric_config,
        )

    if benchmark_family == 'theory_of_mind':
        return _theory_of_mind_patch(raw_evaluation_name, metric_config)

    return None
def _apply_patch(
    result: dict[str, Any], patch: CanonicalPatch | None
) -> tuple[bool, str]:
    """Merge *patch* into *result* and report whether anything changed.

    Existing non-empty metric fields are never overwritten. When the
    evaluation name is replaced, the previous name is recorded in
    ``metric_config['additional_details']['raw_evaluation_name']``.

    Returns:
        ``(changed, raw_evaluation_name)`` where the second item is the
        evaluation name the result carried before patching.
    """
    raw_evaluation_name = str(result.get('evaluation_name') or '')
    if patch is None:
        return False, raw_evaluation_name

    # A missing or non-dict metric_config (e.g. JSON null) is replaced so the
    # field setters below always operate on a dict.
    metric_config = result.get('metric_config')
    if not isinstance(metric_config, dict):
        metric_config = {}
        result['metric_config'] = metric_config

    changed = False
    if (
        patch.evaluation_name is not None
        and result.get('evaluation_name') != patch.evaluation_name
    ):
        _store_raw_evaluation_name(metric_config, raw_evaluation_name)
        result['evaluation_name'] = patch.evaluation_name
        changed = True

    for key in ('metric_id', 'metric_name', 'metric_kind', 'metric_unit'):
        changed |= _set_metric_field(metric_config, key, getattr(patch, key))

    if patch.metric_parameters and metric_config.get('metric_parameters') in (
        None,
        {},
    ):
        metric_config['metric_parameters'] = patch.metric_parameters
        changed = True

    return changed, raw_evaluation_name


def _metric_fragment(metric_config: dict[str, Any]) -> str:
    """Build the metric part of an ``evaluation_result_id``.

    Uses the slug of ``metric_id`` (or ``metric_name``, or ``'score'``)
    followed by ``__<key>_<value>`` for every metric parameter in sorted key
    order. A non-dict *metric_config* is treated as empty.
    """
    if not isinstance(metric_config, dict):
        metric_config = {}

    metric_id = str(
        metric_config.get('metric_id')
        or metric_config.get('metric_name')
        or 'score'
    )
    fragment = _slug(metric_id)
    parameters = metric_config.get('metric_parameters')
    if isinstance(parameters, dict):
        for key, value in sorted(parameters.items()):
            fragment += f'__{_slug(str(key))}_{_slug(str(value))}'
    return fragment


def _raw_evaluation_name(result: dict[str, Any]) -> str:
    """Return the pre-canonical evaluation name of *result*.

    Prefers ``additional_details['raw_evaluation_name']`` and falls back to
    the current ``evaluation_name``.
    """
    metric_config = result.get('metric_config', {})
    if isinstance(metric_config, dict):
        additional = metric_config.get('additional_details')
        if isinstance(additional, dict):
            raw_name = additional.get('raw_evaluation_name')
            if raw_name:
                return str(raw_name)

    return str(result.get('evaluation_name') or '')


def _assign_evaluation_result_ids(
    payload: dict[str, Any],
    sample_updates: dict[str, dict[str, str]],
) -> bool:
    """Assign deterministic ``evaluation_result_id`` values to every result.

    The id is ``<evaluation_id>#<eval slug>#<metric fragment>``; repeated
    bases get a ``__<n>`` suffix starting at 2. For each result an entry
    keyed by its raw evaluation name is written to *sample_updates*; results
    sharing a raw name overwrite each other, the last one wins.

    Returns True when at least one id was added or changed.
    """
    evaluation_id = str(payload.get('evaluation_id') or '')
    counts: dict[str, int] = {}
    changed = False

    for result in payload.get('evaluation_results') or []:
        raw_evaluation_name = _raw_evaluation_name(result)
        eval_name = str(result.get('evaluation_name') or '')
        base = (
            f'{evaluation_id}'
            f'#{_slug(eval_name)}'
            f'#{_metric_fragment(result.get("metric_config", {}))}'
        )
        counts[base] = counts.get(base, 0) + 1
        evaluation_result_id = base
        if counts[base] > 1:
            evaluation_result_id = f'{base}__{counts[base]}'

        if result.get('evaluation_result_id') != evaluation_result_id:
            result['evaluation_result_id'] = evaluation_result_id
            changed = True

        sample_updates[raw_evaluation_name] = {
            'evaluation_name': eval_name,
            'evaluation_result_id': evaluation_result_id,
            'evaluation_id': evaluation_id,
        }

    return changed


def _add_sample_alias_updates(
    benchmark_family: str,
    payload: dict[str, Any],
    sample_updates: dict[str, dict[str, str]],
) -> None:
    """Add sample-row aliases for ``swe-bench-verified-mini``.

    Maps the names ``inspect_evals/swe_bench_verified_mini`` and
    ``swe_bench_verified_mini`` to the first result whose metric id is
    ``mean_score``. Does nothing for other families or when no such result
    exists.
    """
    if benchmark_family != 'swe-bench-verified-mini':
        return

    mean_result = None
    for result in payload.get('evaluation_results') or []:
        metric_config = result.get('metric_config')
        if (
            isinstance(metric_config, dict)
            and metric_config.get('metric_id') == 'mean_score'
        ):
            mean_result = result
            break
    if mean_result is None:
        return

    alias_update = {
        'evaluation_name': str(
            mean_result.get('evaluation_name') or benchmark_family
        ),
        'evaluation_result_id': str(
            mean_result.get('evaluation_result_id') or ''
        ),
        'evaluation_id': str(payload.get('evaluation_id') or ''),
    }
    for raw_name in (
        'inspect_evals/swe_bench_verified_mini',
        'swe_bench_verified_mini',
    ):
        sample_updates[raw_name] = alias_update.copy()


def infer_benchmark_family(
    file_path: Path, payload: dict[str, Any] | None = None
) -> str:
    """Infer the benchmark family of an aggregate file.

    Uses the path component right after the first ``data`` component; when
    there is none, uses the text before the first ``/`` of
    ``payload['evaluation_id']``. Returns ``'unknown'`` otherwise.
    """
    parts = list(file_path.parts)
    if 'data' in parts:
        idx = parts.index('data')
        if idx + 1 < len(parts):
            return parts[idx + 1]

    if payload is None:
        return 'unknown'

    evaluation_id = str(payload.get('evaluation_id') or '')
    if '/' in evaluation_id:
        return evaluation_id.split('/', 1)[0]
    return 'unknown'


def _ensure_eval_library(
    payload: dict[str, Any], benchmark_family: str
) -> bool:
    """Fill in ``eval_library`` name/version for alphaxiv payloads.

    Returns True when the payload was modified; other families are left
    untouched.
    """
    if benchmark_family != 'alphaxiv':
        return False

    changed = False
    eval_library = payload.get('eval_library')
    if not isinstance(eval_library, dict):
        eval_library = {}
        payload['eval_library'] = eval_library
        changed = True

    for key, default in (('name', 'alphaxiv'), ('version', 'unknown')):
        if eval_library.get(key) in (None, ''):
            eval_library[key] = default
            changed = True

    return changed


def augment_aggregate_payload(
    payload: dict[str, Any], benchmark_family: str | None = None
) -> tuple[dict[str, Any], int, dict[str, dict[str, str]], bool]:
    """Backfill canonical identity into an aggregate payload in place.

    Args:
        payload: Parsed aggregate JSON document; it is mutated.
        benchmark_family: Family name; inferred from the payload's
            ``evaluation_id`` when omitted.

    Returns:
        ``(payload, changed_results, sample_updates, other_changed)`` where
        ``changed_results`` counts results modified by a canonical patch,
        ``sample_updates`` maps raw evaluation names to the values sample
        rows should receive, and ``other_changed`` is True when result ids
        or ``eval_library`` changed.
    """
    benchmark_family = benchmark_family or infer_benchmark_family(
        Path('.'), payload
    )
    changed_results = 0
    sample_updates: dict[str, dict[str, str]] = {}
    payload_changed = _ensure_eval_library(payload, benchmark_family)

    for result in payload.get('evaluation_results') or []:
        patch = _canonical_patch_for_result(
            benchmark_family, result, payload=payload
        )
        changed, _ = _apply_patch(result, patch)
        if changed:
            changed_results += 1

    ids_changed = _assign_evaluation_result_ids(payload, sample_updates)
    _add_sample_alias_updates(benchmark_family, payload, sample_updates)
    return payload, changed_results, sample_updates, (
        ids_changed or payload_changed
    )


def _read_jsonl(path: Path) -> list[dict[str, Any]]:
    """Read a JSON Lines file, skipping blank lines."""
    with path.open(encoding='utf-8') as handle:
        return [
            json.loads(stripped)
            for stripped in (line.strip() for line in handle)
            if stripped
        ]


def _write_json(path: Path, payload: dict[str, Any]) -> None:
    """Write *payload* as indented UTF-8 JSON with a trailing newline."""
    path.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False) + '\n',
        encoding='utf-8',
    )


def _write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None:
    """Write *rows* as UTF-8 JSON Lines, one object per line."""
    with path.open('w', encoding='utf-8') as handle:
        handle.writelines(
            json.dumps(row, ensure_ascii=False) + '\n' for row in rows
        )


def canonicalize_datastore_path(file_path: Path) -> Path:
    """Collapse a datastore path to ``data/<family>/<dev>/<model>/<file>``.

    Paths without a ``data`` component, or with at most four components
    after it, are returned unchanged. Otherwise every directory between the
    family and the last three components is dropped.
    """
    parts = list(file_path.parts)
    if 'data' not in parts:
        return file_path

    idx = parts.index('data')
    relative = parts[idx + 1 :]
    if len(relative) <= 4:
        return file_path

    benchmark_family = relative[0]
    developer, model_name, filename = relative[-3:]
    return Path(
        *parts[: idx + 1], benchmark_family, developer, model_name, filename
    )


def _move_file(source: Path, target: Path) -> None:
    """Rename *source* to *target*, creating parent directories.

    Raises:
        FileExistsError: If *target* already exists (and differs from
            *source*); nothing is moved in that case.
    """
    if source == target:
        return
    if target.exists():
        raise FileExistsError(
            f'Cannot normalize path for {source}: target already exists at {target}'
        )
    target.parent.mkdir(parents=True, exist_ok=True)
    source.rename(target)


def augment_sample_rows(
    rows: list[dict[str, Any]],
    sample_updates: dict[str, dict[str, str]],
) -> tuple[list[dict[str, Any]], bool]:
    """Apply *sample_updates* to sample rows in place.

    Rows are matched by their current ``evaluation_name``; unmatched rows
    are left untouched.

    Returns:
        ``(rows, changed)`` where ``changed`` is True when any row was
        modified.
    """
    changed = False

    for row in rows:
        update = sample_updates.get(str(row.get('evaluation_name') or ''))
        if update is None:
            continue

        for key in ('evaluation_name', 'evaluation_result_id'):
            if row.get(key) != update[key]:
                row[key] = update[key]
                changed = True

        updated_evaluation_id = update.get('evaluation_id')
        if (
            updated_evaluation_id is not None
            and row.get('evaluation_id') != updated_evaluation_id
        ):
            row['evaluation_id'] = updated_evaluation_id
            changed = True

    return rows, changed
update_samples: bool = True, + normalize_paths: bool = True, +) -> AugmentationReport: + original_file_path = file_path + payload = json.loads(file_path.read_text(encoding='utf-8')) + benchmark_family = infer_benchmark_family(file_path, payload) + payload, changed_results, sample_updates, ids_changed = ( + augment_aggregate_payload(payload, benchmark_family=benchmark_family) + ) + aggregate_changed = changed_results > 0 or ids_changed + + sample_file_path = file_path.with_name(f'{file_path.stem}_samples.jsonl') + sample_exists = sample_file_path.exists() + sample_changed = False + sample_rows = 0 + + if update_samples and sample_exists: + rows = _read_jsonl(sample_file_path) + sample_rows = len(rows) + rows, sample_changed = augment_sample_rows(rows, sample_updates) + + detailed = payload.get('detailed_evaluation_results') + if ( + isinstance(detailed, dict) + and detailed.get('total_rows') != sample_rows + ): + detailed['total_rows'] = sample_rows + aggregate_changed = True + + if write_changes and sample_changed: + _write_jsonl(sample_file_path, rows) + + if write_changes and aggregate_changed: + _write_json(file_path, payload) + + target_file_path = ( + canonicalize_datastore_path(file_path) if normalize_paths else file_path + ) + target_sample_file_path = ( + canonicalize_datastore_path(sample_file_path) + if sample_exists and normalize_paths + else sample_file_path + ) + path_changed = target_file_path != file_path + + reported_file_path = target_file_path if path_changed else file_path + reported_sample_file_path = ( + target_sample_file_path if sample_exists else None + ) + + if write_changes and path_changed: + if sample_exists and target_sample_file_path is not None: + _move_file(sample_file_path, target_sample_file_path) + _move_file(file_path, target_file_path) + + return AugmentationReport( + original_file_path=original_file_path, + file_path=reported_file_path, + benchmark_family=benchmark_family, + aggregate_changed=aggregate_changed, + 
sample_file_path=reported_sample_file_path, + sample_changed=sample_changed, + changed_results=changed_results, + sample_rows=sample_rows, + path_changed=path_changed, + ) + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog='every_eval_ever augment-canonical-identity', + description=( + 'Backfill metric identity and canonical eval names into existing ' + 'EEE aggregate JSON files.' + ), + ) + parser.add_argument( + 'paths', + nargs='+', + help='Aggregate JSON files or directories containing datastore files.', + ) + parser.add_argument( + '--write', + action='store_true', + help='Write changes in place. Without this flag, run as a dry run.', + ) + parser.add_argument( + '--skip-samples', + action='store_true', + help='Do not update companion *_samples.jsonl files.', + ) + parser.add_argument( + '--skip-path-normalization', + action='store_true', + help='Do not flatten over-nested datastore paths to benchmark/developer/model.', + ) + return parser + + +def main(argv: list[str] | None = None) -> int: + args = build_parser().parse_args(argv) + + aggregate_files = [ + path + for path in expand_paths(args.paths) + if path.suffix == '.json' and not path.name.endswith('_samples.json') + ] + + reports = [ + augment_aggregate_file( + path, + write_changes=args.write, + update_samples=not args.skip_samples, + normalize_paths=not args.skip_path_normalization, + ) + for path in aggregate_files + ] + + changed_reports = [report for report in reports if report.changed] + + action = 'Updated' if args.write else 'Would update' + for report in changed_reports: + details = [ + f'results={report.changed_results}', + f'sample_rows={report.sample_rows}', + f'samples_changed={report.sample_changed}', + ] + if report.path_changed: + details.insert( + 0, f'path={report.original_file_path} -> {report.file_path}' + ) + print( + f'{action}: {report.file_path} ' + f'[{report.benchmark_family}] ' + f'({", ".join(details)})' + ) + + print( + f'Scanned 
{len(reports)} aggregate file(s); ' + f'{len(changed_reports)} would change.' + if not args.write + else f'Scanned {len(reports)} aggregate file(s); ' + f'updated {len(changed_reports)}.' + ) + return 0 diff --git a/every_eval_ever/check_canonical_identity.py b/every_eval_ever/check_canonical_identity.py new file mode 100644 index 000000000..bd2b21014 --- /dev/null +++ b/every_eval_ever/check_canonical_identity.py @@ -0,0 +1,242 @@ +"""Audit canonical metric/result identity coverage in aggregate datastore JSON.""" + +from __future__ import annotations + +import argparse +import json +import re +from collections import Counter, defaultdict +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +from every_eval_ever.validate import expand_paths + +IDENTITY_FIELDS = ( + 'evaluation_result_id', + 'metric_id', + 'metric_name', + 'metric_kind', + 'metric_unit', +) + +EVALUATION_RESULT_ID_PATTERN = re.compile(r'.+#.+#.+') + + +@dataclass +class CanonicalIdentityReport: + files_scanned: int = 0 + results_scanned: int = 0 + missing: Counter[str] = field(default_factory=Counter) + malformed: Counter[str] = field(default_factory=Counter) + benchmark_missing: dict[str, Counter[str]] = field( + default_factory=lambda: defaultdict(Counter) + ) + + @property + def has_issues(self) -> bool: + return bool(self.missing) or bool(self.malformed) + + def to_dict(self, top: int = 20) -> dict[str, Any]: + top_missing_by_benchmark: dict[str, list[tuple[str, int]]] = {} + for field_name in IDENTITY_FIELDS: + pairs = [ + (benchmark, counts[field_name]) + for benchmark, counts in self.benchmark_missing.items() + if counts[field_name] > 0 + ] + pairs.sort(key=lambda item: item[1], reverse=True) + top_missing_by_benchmark[field_name] = pairs[:top] + + return { + 'files_scanned': self.files_scanned, + 'results_scanned': self.results_scanned, + 'missing': dict(self.missing), + 'malformed': dict(self.malformed), + 'top_missing_by_benchmark': 
top_missing_by_benchmark, + } + + +def infer_benchmark_family(file_path: Path, payload: dict[str, Any]) -> str: + parts = list(file_path.parts) + if 'data' in parts: + idx = parts.index('data') + if idx + 1 < len(parts): + return parts[idx + 1] + + evaluation_id = str(payload.get('evaluation_id') or '') + if '/' in evaluation_id: + return evaluation_id.split('/', 1)[0] + return 'unknown' + + +def _record_missing( + report: CanonicalIdentityReport, benchmark_family: str, field_name: str +) -> None: + report.missing[field_name] += 1 + report.benchmark_missing[benchmark_family][field_name] += 1 + + +def _check_result_fields( + report: CanonicalIdentityReport, + benchmark_family: str, + result: dict[str, Any], +) -> None: + evaluation_result_id = result.get('evaluation_result_id') + if not isinstance(evaluation_result_id, str) or not evaluation_result_id.strip(): + _record_missing(report, benchmark_family, 'evaluation_result_id') + elif not EVALUATION_RESULT_ID_PATTERN.fullmatch(evaluation_result_id): + report.malformed['evaluation_result_id_pattern'] += 1 + + metric_config = result.get('metric_config') + if not isinstance(metric_config, dict): + report.malformed['metric_config_not_object'] += 1 + for field_name in IDENTITY_FIELDS[1:]: + _record_missing(report, benchmark_family, field_name) + return + + for field_name in IDENTITY_FIELDS[1:]: + value = metric_config.get(field_name) + if value is None: + _record_missing(report, benchmark_family, field_name) + continue + if not isinstance(value, str): + report.malformed[f'{field_name}_type'] += 1 + continue + if not value.strip(): + _record_missing(report, benchmark_family, field_name) + + +def check_file(file_path: Path, report: CanonicalIdentityReport) -> None: + try: + payload = json.loads(file_path.read_text(encoding='utf-8')) + except Exception: + report.malformed['aggregate_json_parse_error'] += 1 + report.files_scanned += 1 + return + + if not isinstance(payload, dict): + 
report.malformed['aggregate_json_not_object'] += 1 + report.files_scanned += 1 + return + + evaluation_results = payload.get('evaluation_results') + if evaluation_results is None: + return + + report.files_scanned += 1 + benchmark_family = infer_benchmark_family(file_path, payload) + if not isinstance(evaluation_results, list): + report.malformed['evaluation_results_not_list'] += 1 + return + + report.results_scanned += len(evaluation_results) + for result in evaluation_results: + if not isinstance(result, dict): + report.malformed['evaluation_result_not_object'] += 1 + continue + _check_result_fields(report, benchmark_family, result) + + +def check_paths(paths: list[str]) -> CanonicalIdentityReport: + report = CanonicalIdentityReport() + aggregate_files = [ + path + for path in expand_paths(paths) + if path.suffix == '.json' and not path.name.endswith('_samples.json') + ] + + for file_path in aggregate_files: + check_file(file_path, report) + + return report + + +def render_text(report: CanonicalIdentityReport, top: int = 20) -> str: + payload = report.to_dict(top=top) + lines = [ + f'Scanned {payload["files_scanned"]} aggregate file(s) ' + f'with {payload["results_scanned"]} result row(s).', + '', + ] + + if payload['missing']: + lines.append('Missing canonical fields:') + for field_name in IDENTITY_FIELDS: + missing_count = payload['missing'].get(field_name, 0) + if missing_count: + lines.append(f'- {field_name}: {missing_count}') + lines.append('') + + if payload['malformed']: + lines.append('Malformed fields:') + for key, value in sorted(payload['malformed'].items()): + lines.append(f'- {key}: {value}') + lines.append('') + + for field_name in IDENTITY_FIELDS: + pairs = payload['top_missing_by_benchmark'].get(field_name, []) + if not pairs: + continue + lines.append(f'Top missing benchmarks for {field_name}:') + for benchmark, count in pairs: + lines.append(f'- {benchmark}: {count}') + lines.append('') + + if not report.has_issues: + lines.append('No canonical 
identity issues found.') + + return '\n'.join(lines).rstrip() + '\n' + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog='every_eval_ever check-canonical-identity', + description=( + 'Audit aggregate JSON files for missing or malformed canonical ' + 'metric/result identity fields.' + ), + ) + parser.add_argument( + 'paths', + nargs='+', + help='Aggregate JSON files or directories containing datastore files.', + ) + parser.add_argument( + '--format', + choices=['text', 'json'], + default='text', + dest='output_format', + help='Output format.', + ) + parser.add_argument( + '--top', + type=int, + default=20, + help='How many benchmarks to show per missing field in text/json output.', + ) + parser.add_argument( + '--fail-on-issues', + action='store_true', + help='Exit with status 1 if any missing/malformed identity fields exist.', + ) + return parser + + +def main(argv: list[str] | None = None) -> int: + args = build_parser().parse_args(argv) + report = check_paths(args.paths) + payload = report.to_dict(top=args.top) + + if args.output_format == 'json': + print(json.dumps(payload, indent=2)) + else: + print(render_text(report, top=args.top), end='') + + if args.fail_on_issues and report.has_issues: + return 1 + return 0 + + +if __name__ == '__main__': + raise SystemExit(main()) diff --git a/every_eval_ever/cli.py b/every_eval_ever/cli.py index c215c89cd..ec5465323 100644 --- a/every_eval_ever/cli.py +++ b/every_eval_ever/cli.py @@ -289,6 +289,83 @@ def build_parser() -> argparse.ArgumentParser: help='One or more JSON files or directories containing JSON files.', ) + check_canonical_parser = subparsers.add_parser( + 'check-canonical-identity', + help='Audit canonical metric/eval identity coverage', + description=( + 'Audit aggregate JSON files for missing or malformed canonical ' + 'metric/result identity fields.' 
+ ), + ) + check_canonical_parser.add_argument( + 'paths', + nargs='+', + help='One or more aggregate JSON files or directories to audit.', + ) + check_canonical_parser.add_argument( + '--format', + choices=['text', 'json'], + default='text', + dest='canonical_output_format', + help='Output format.', + ) + check_canonical_parser.add_argument( + '--top', + type=int, + default=20, + help='How many benchmarks to show per missing field in the report.', + ) + check_canonical_parser.add_argument( + '--fail-on-issues', + action='store_true', + help='Exit with status 1 if missing/malformed identity fields exist.', + ) + + augment_parser = subparsers.add_parser( + 'augment-canonical-identity', + help='Backfill canonical metric/eval identity into datastore JSON', + description=( + 'Backfill metric fields, evaluation_result_id values, and ' + 'metric-free evaluation_name values for known legacy ' + 'benchmark families in the datastore.' + ), + ) + augment_parser.add_argument( + 'paths', + nargs='+', + help='One or more aggregate JSON files or directories to augment.', + ) + augment_parser.add_argument( + '--write', + action='store_true', + help='Write changes in place. Without this flag, run as a dry run.', + ) + augment_parser.add_argument( + '--skip-samples', + action='store_true', + help='Do not update companion *_samples.jsonl files.', + ) + + schema_version_parser = subparsers.add_parser( + 'upgrade-schema-version', + help='Normalize datastore schema_version fields to 0.2.2', + description=( + 'Update aggregate JSON files to schema_version=0.2.2 and ' + 'instance JSONL rows to ' + 'schema_version=instance_level_eval_0.2.2.' + ), + ) + schema_version_parser.add_argument( + 'paths', + nargs='+', + help='One or more JSON/JSONL files or directories to normalize.', + ) + schema_version_parser.add_argument( + '--write', + action='store_true', + help='Write changes in place. 
Without this flag, run as a dry run.', + ) + convert_parser = subparsers.add_parser( 'convert', help='Convert source eval logs to every_eval_ever', @@ -412,6 +489,44 @@ def main(argv: list[str] | None = None) -> int: return check_duplicates_main(args.paths) + if args.command == 'check-canonical-identity': + from every_eval_ever.check_canonical_identity import ( + main as check_canonical_identity_main, + ) + + forwarded_args = [ + *args.paths, + '--format', + args.canonical_output_format, + '--top', + str(args.top), + ] + if args.fail_on_issues: + forwarded_args.append('--fail-on-issues') + return check_canonical_identity_main(forwarded_args) + + if args.command == 'augment-canonical-identity': + from every_eval_ever.augment_canonical_identity import ( + main as augment_canonical_identity_main, + ) + + forwarded_args = [*args.paths] + if args.write: + forwarded_args.append('--write') + if args.skip_samples: + forwarded_args.append('--skip-samples') + return augment_canonical_identity_main(forwarded_args) + + if args.command == 'upgrade-schema-version': + from every_eval_ever.upgrade_schema_version import ( + main as upgrade_schema_version_main, + ) + + forwarded_args = [*args.paths] + if args.write: + forwarded_args.append('--write') + return upgrade_schema_version_main(forwarded_args) + if args.command == 'convert': if args.source == 'lm_eval': return _cmd_convert_lm_eval(args) diff --git a/every_eval_ever/converters/__init__.py b/every_eval_ever/converters/__init__.py index 1b7ca553f..cffbb2467 100644 --- a/every_eval_ever/converters/__init__.py +++ b/every_eval_ever/converters/__init__.py @@ -1 +1,5 @@ -SCHEMA_VERSION = '0.2.2' +AGGREGATE_SCHEMA_VERSION = '0.2.2' +INSTANCE_LEVEL_SCHEMA_VERSION = 'instance_level_eval_0.2.2' + +# Backwards-compatible alias used by aggregate converters. 
+SCHEMA_VERSION = AGGREGATE_SCHEMA_VERSION diff --git a/every_eval_ever/converters/helm/adapter.py b/every_eval_ever/converters/helm/adapter.py index e43eb01b9..bf28f1dd9 100644 --- a/every_eval_ever/converters/helm/adapter.py +++ b/every_eval_ever/converters/helm/adapter.py @@ -485,10 +485,11 @@ def _transform_single( instance_level_log_path, instance_level_rows_number = ( HELMInstanceLevelDataAdapter( - detailed_results_id, + evaluation_id, Format.jsonl.value, HashAlgorithm.sha256.value, evaluation_dir, + file_stem=detailed_results_id, ).convert_instance_level_logs( dataset_name, model_info.id, diff --git a/every_eval_ever/converters/helm/instance_level_adapter.py b/every_eval_ever/converters/helm/instance_level_adapter.py index 037237ecc..815ce27e0 100644 --- a/every_eval_ever/converters/helm/instance_level_adapter.py +++ b/every_eval_ever/converters/helm/instance_level_adapter.py @@ -20,7 +20,7 @@ def _require_helm_dependencies() -> None: ) from _HELM_IMPORT_ERROR -from every_eval_ever.converters import SCHEMA_VERSION +from every_eval_ever.converters import INSTANCE_LEVEL_SCHEMA_VERSION from every_eval_ever.converters.common.utils import sha256_string from every_eval_ever.converters.helm.utils import extract_all_reasonings from every_eval_ever.instance_level_types import ( @@ -42,13 +42,14 @@ def __init__( format: str, hash_algorithm: str, evaluation_dir: str, + file_stem: str | None = None, ): _require_helm_dependencies() self.evaluation_id = evaulation_id self.format = format self.hash_algorithm = hash_algorithm self.evaluation_dir = evaluation_dir - self.path = f'{evaluation_dir}/{evaulation_id}.{format}' + self.path = f'{evaluation_dir}/{file_stem or evaulation_id}.{format}' def _save_json(self, items: List[InstanceLevelEvaluationLog]): eval_dir_path = Path(self.evaluation_dir) @@ -157,7 +158,7 @@ def convert_instance_level_logs( instance_level_logs.append( InstanceLevelEvaluationLog( - schema_version=SCHEMA_VERSION, + 
schema_version=INSTANCE_LEVEL_SCHEMA_VERSION, evaluation_id=self.evaluation_id, model_id=model_id, evaluation_name=evaluation_name, diff --git a/every_eval_ever/converters/inspect/instance_level_adapter.py b/every_eval_ever/converters/inspect/instance_level_adapter.py index 99e2d5170..4059fc2d4 100644 --- a/every_eval_ever/converters/inspect/instance_level_adapter.py +++ b/every_eval_ever/converters/inspect/instance_level_adapter.py @@ -29,7 +29,7 @@ def _require_inspect_dependencies() -> None: ) from _INSPECT_IMPORT_ERROR -from every_eval_ever.converters import SCHEMA_VERSION +from every_eval_ever.converters import INSTANCE_LEVEL_SCHEMA_VERSION from every_eval_ever.converters.common.utils import sha256_string from every_eval_ever.instance_level_types import ( AnswerAttributionItem, @@ -422,7 +422,7 @@ def convert_instance_level_logs( performance = None instance_level_log = InstanceLevelEvaluationLog( - schema_version=SCHEMA_VERSION, + schema_version=INSTANCE_LEVEL_SCHEMA_VERSION, evaluation_id=self.evaluation_id, model_id=model_id, evaluation_name=evaluation_name, diff --git a/every_eval_ever/converters/lm_eval/instance_level_adapter.py b/every_eval_ever/converters/lm_eval/instance_level_adapter.py index 1cc393862..302f7c769 100644 --- a/every_eval_ever/converters/lm_eval/instance_level_adapter.py +++ b/every_eval_ever/converters/lm_eval/instance_level_adapter.py @@ -5,7 +5,7 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Union -from every_eval_ever.converters import SCHEMA_VERSION +from every_eval_ever.converters import INSTANCE_LEVEL_SCHEMA_VERSION from every_eval_ever.eval_types import ( DetailedEvaluationResults, Format, @@ -161,7 +161,7 @@ def _transform_sample( ] return InstanceLevelEvaluationLog( - schema_version=SCHEMA_VERSION, + schema_version=INSTANCE_LEVEL_SCHEMA_VERSION, evaluation_id=evaluation_id, model_id=model_id, evaluation_name=eval_name, diff --git a/every_eval_ever/eval_types.py b/every_eval_ever/eval_types.py index 
40035403b..5cac80bfa 100644 --- a/every_eval_ever/eval_types.py +++ b/every_eval_ever/eval_types.py @@ -489,7 +489,7 @@ class EvaluationLog(BaseModel): model_config = ConfigDict( extra='forbid', ) - schema_version: str = Field( + schema_version: Literal['0.2.2'] = Field( ..., description='Version of the schema used for this evaluation data' ) evaluation_id: str = Field( diff --git a/every_eval_ever/instance_level_types.py b/every_eval_ever/instance_level_types.py index ff78d3420..8f8b71bbb 100644 --- a/every_eval_ever/instance_level_types.py +++ b/every_eval_ever/instance_level_types.py @@ -5,7 +5,7 @@ from __future__ import annotations from enum import Enum -from typing import Any +from typing import Any, Literal from pydantic import ( BaseModel, @@ -154,7 +154,7 @@ class InstanceLevelEvaluationLog(BaseModel): model_config = ConfigDict( extra='forbid', ) - schema_version: str = Field( + schema_version: Literal['instance_level_eval_0.2.2'] = Field( ..., description='Version of the schema used for this instance data' ) evaluation_id: str = Field( diff --git a/every_eval_ever/upgrade_schema_version.py b/every_eval_ever/upgrade_schema_version.py new file mode 100644 index 000000000..31c1dd5e1 --- /dev/null +++ b/every_eval_ever/upgrade_schema_version.py @@ -0,0 +1,192 @@ +"""Normalize datastore schema_version fields to the current 0.2.2 schemas.""" + +from __future__ import annotations + +import argparse +import json +from dataclasses import dataclass +from pathlib import Path + +from every_eval_ever.validate import expand_paths + +AGGREGATE_SCHEMA_VERSION = '0.2.2' +INSTANCE_SCHEMA_VERSION = 'instance_level_eval_0.2.2' + + +@dataclass +class SchemaVersionUpgradeReport: + file_path: Path + file_type: str + target_version: str + changed: bool + rows_changed: int = 0 + rows_scanned: int = 0 + old_versions: tuple[str | None, ...] 
= () + + +def _write_json(path: Path, payload: dict) -> None: + path.write_text( + json.dumps(payload, indent=2, ensure_ascii=False) + '\n', + encoding='utf-8', + ) + + +def _write_jsonl(path: Path, rows: list[dict]) -> None: + with path.open('w', encoding='utf-8') as handle: + for row in rows: + handle.write(json.dumps(row, ensure_ascii=False) + '\n') + + +def upgrade_aggregate_file( + file_path: Path, *, write_changes: bool = False +) -> SchemaVersionUpgradeReport: + payload = json.loads(file_path.read_text(encoding='utf-8')) + if not isinstance(payload, dict): + raise ValueError(f'Aggregate payload is not a JSON object: {file_path}') + + old_version = payload.get('schema_version') + changed = old_version != AGGREGATE_SCHEMA_VERSION + if changed: + payload['schema_version'] = AGGREGATE_SCHEMA_VERSION + if write_changes: + _write_json(file_path, payload) + + return SchemaVersionUpgradeReport( + file_path=file_path, + file_type='aggregate', + target_version=AGGREGATE_SCHEMA_VERSION, + changed=changed, + rows_scanned=1, + rows_changed=1 if changed else 0, + old_versions=(old_version,) if changed else (), + ) + + +def upgrade_instance_file( + file_path: Path, *, write_changes: bool = False +) -> SchemaVersionUpgradeReport: + rows: list[dict] = [] + old_versions: set[str | None] = set() + changed_rows = 0 + + with file_path.open(encoding='utf-8') as handle: + for raw_line in handle: + stripped = raw_line.strip() + if not stripped: + continue + row = json.loads(stripped) + if not isinstance(row, dict): + raise ValueError( + f'Instance payload line is not a JSON object: {file_path}' + ) + old_version = row.get('schema_version') + if old_version != INSTANCE_SCHEMA_VERSION: + old_versions.add(old_version) + row['schema_version'] = INSTANCE_SCHEMA_VERSION + changed_rows += 1 + rows.append(row) + + if write_changes and changed_rows: + _write_jsonl(file_path, rows) + + return SchemaVersionUpgradeReport( + file_path=file_path, + file_type='instance', + 
target_version=INSTANCE_SCHEMA_VERSION, + changed=changed_rows > 0, + rows_scanned=len(rows), + rows_changed=changed_rows, + old_versions=tuple(sorted(old_versions, key=lambda value: str(value))), + ) + + +def upgrade_file( + file_path: Path, *, write_changes: bool = False +) -> SchemaVersionUpgradeReport: + if file_path.suffix == '.json': + return upgrade_aggregate_file(file_path, write_changes=write_changes) + if file_path.suffix == '.jsonl': + return upgrade_instance_file(file_path, write_changes=write_changes) + raise ValueError( + f"Unsupported file extension '{file_path.suffix}' for {file_path}" + ) + + +def render_text(reports: list[SchemaVersionUpgradeReport]) -> str: + aggregate_changed = sum( + 1 + for report in reports + if report.file_type == 'aggregate' and report.changed + ) + instance_changed = sum( + 1 + for report in reports + if report.file_type == 'instance' and report.changed + ) + instance_rows_changed = sum( + report.rows_changed + for report in reports + if report.file_type == 'instance' + ) + + lines = [ + f'Scanned {len(reports)} file(s).', + f'- Aggregate files updated: {aggregate_changed}', + f'- Instance files updated: {instance_changed}', + f'- Instance rows updated: {instance_rows_changed}', + '', + ] + + for report in reports: + if not report.changed: + continue + suffix = ( + f' ({report.rows_changed}/{report.rows_scanned} rows)' + if report.file_type == 'instance' + else '' + ) + previous = ', '.join(str(v) for v in report.old_versions) + lines.append( + f'Updated {report.file_path}: {previous} -> {report.target_version}{suffix}' + ) + + if all(not report.changed for report in reports): + lines.append('No schema_version changes were needed.') + + return '\n'.join(lines).rstrip() + '\n' + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog='every_eval_ever upgrade-schema-version', + description=( + 'Normalize aggregate and instance datastore files to the current ' + '0.2.2 schema_version 
values.' + ), + ) + parser.add_argument( + 'paths', + nargs='+', + help='Files or directories containing aggregate JSON and sample JSONL files.', + ) + parser.add_argument( + '--write', + action='store_true', + help='Write changes in place. Without this flag, run as a dry run.', + ) + return parser + + +def main(argv: list[str] | None = None) -> int: + args = build_parser().parse_args(argv) + file_paths = expand_paths(args.paths) + reports = [ + upgrade_file(file_path, write_changes=args.write) + for file_path in file_paths + ] + print(render_text(reports), end='') + return 0 + + +if __name__ == '__main__': + raise SystemExit(main()) diff --git a/tests/test_augment_canonical_identity.py b/tests/test_augment_canonical_identity.py new file mode 100644 index 000000000..fd3f9f9ea --- /dev/null +++ b/tests/test_augment_canonical_identity.py @@ -0,0 +1,1049 @@ +"""Tests for canonical identity augmentation of legacy datastore payloads.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from every_eval_ever.augment_canonical_identity import ( + augment_aggregate_file, + augment_aggregate_payload, +) +from every_eval_ever.check_canonical_identity import check_paths +from every_eval_ever.upgrade_schema_version import ( + upgrade_aggregate_file, + upgrade_instance_file, +) +from every_eval_ever.validate import validate_file + + +def _base_payload(evaluation_id: str, evaluation_results: list[dict]) -> dict: + return { + 'schema_version': '0.2.2', + 'evaluation_id': evaluation_id, + 'retrieved_timestamp': '1234567890', + 'source_metadata': { + 'source_type': 'documentation', + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': 'third_party', + }, + 'eval_library': {'name': 'unknown', 'version': 'unknown'}, + 'model_info': {'name': 'test-model', 'id': 'org/test-model'}, + 'evaluation_results': evaluation_results, + } + + +def _base_result( + evaluation_name: str, + description: str, + *, + min_score: float = 0.0, + max_score: 
float = 1.0, + lower_is_better: bool = False, +) -> dict: + return { + 'evaluation_name': evaluation_name, + 'source_data': { + 'dataset_name': 'test-ds', + 'source_type': 'hf_dataset', + 'hf_repo': 'org/test-ds', + }, + 'metric_config': { + 'evaluation_description': description, + 'lower_is_better': lower_is_better, + 'score_type': 'continuous', + 'min_score': min_score, + 'max_score': max_score, + }, + 'score_details': {'score': 0.5}, + } + + +def test_global_mmlu_lite_backfills_accuracy_metric(): + payload = _base_payload( + 'global-mmlu-lite/org_model/123', + [ + _base_result( + 'Arabic', + 'Global MMLU Lite - Arabic', + ) + ], + ) + + augmented, changed_results, _, ids_changed = augment_aggregate_payload( + payload, benchmark_family='global-mmlu-lite' + ) + + result = augmented['evaluation_results'][0] + metric_config = result['metric_config'] + + assert changed_results == 1 + assert ids_changed is True + assert result['evaluation_name'] == 'Arabic' + assert metric_config['metric_id'] == 'accuracy' + assert metric_config['metric_name'] == 'Accuracy' + assert metric_config['metric_kind'] == 'accuracy' + assert metric_config['metric_unit'] == 'proportion' + assert result['evaluation_result_id'].endswith('#arabic#accuracy') + + +def test_wordle_arena_updates_aggregate_and_samples(tmp_path: Path): + aggregate_path = ( + tmp_path + / 'data' + / 'wordle_arena' + / 'anthropic' + / 'claude-haiku-4.5' + / 'run.json' + ) + aggregate_path.parent.mkdir(parents=True) + payload = _base_payload( + 'wordle_arena/anthropic_claude-haiku-4.5/1776347262.820056', + [ + _base_result( + 'wordle_arena_win_rate', + 'Win rate on Wordle Arena', + ), + _base_result( + 'wordle_arena_avg_attempts', + 'Average guesses used per game on Wordle Arena', + min_score=1.0, + max_score=6.0, + lower_is_better=True, + ), + ], + ) + payload['detailed_evaluation_results'] = { + 'format': 'jsonl', + 'file_path': 'run_samples.jsonl', + 'total_rows': 1, + } + aggregate_path.write_text(json.dumps(payload), 
encoding='utf-8') + + samples_path = aggregate_path.with_name('run_samples.jsonl') + sample_rows = [ + { + 'schema_version': 'instance_level_eval_0.2.2', + 'evaluation_id': payload['evaluation_id'], + 'model_id': 'org/test-model', + 'evaluation_name': 'wordle_arena_win_rate', + 'sample_id': 'sample-1', + 'interaction_type': 'single_turn', + 'input': {'raw': 'guess', 'reference': ['guess']}, + 'output': {'raw': ['guess']}, + 'answer_attribution': [ + { + 'turn_idx': 0, + 'source': 'output.raw', + 'extracted_value': 'guess', + 'extraction_method': 'exact_match', + 'is_terminal': True, + } + ], + 'evaluation': {'score': 1.0, 'is_correct': True}, + }, + { + 'schema_version': 'instance_level_eval_0.2.2', + 'evaluation_id': payload['evaluation_id'], + 'model_id': 'org/test-model', + 'evaluation_name': 'wordle_arena_win_rate', + 'sample_id': 'sample-2', + 'interaction_type': 'single_turn', + 'input': {'raw': 'guess', 'reference': ['guess']}, + 'output': {'raw': ['guess']}, + 'answer_attribution': [ + { + 'turn_idx': 0, + 'source': 'output.raw', + 'extracted_value': 'guess', + 'extraction_method': 'exact_match', + 'is_terminal': True, + } + ], + 'evaluation': {'score': 0.0, 'is_correct': False}, + }, + ] + samples_path.write_text( + '\n'.join(json.dumps(row) for row in sample_rows) + '\n', + encoding='utf-8', + ) + + report = augment_aggregate_file(aggregate_path, write_changes=True) + + assert report.aggregate_changed is True + assert report.sample_changed is True + + updated_payload = json.loads(aggregate_path.read_text(encoding='utf-8')) + win_rate = updated_payload['evaluation_results'][0] + avg_attempts = updated_payload['evaluation_results'][1] + + assert win_rate['evaluation_name'] == 'wordle_arena' + assert win_rate['metric_config']['metric_id'] == 'win_rate' + assert ( + win_rate['metric_config']['additional_details']['raw_evaluation_name'] + == 'wordle_arena_win_rate' + ) + assert avg_attempts['evaluation_name'] == 'wordle_arena' + assert 
avg_attempts['metric_config']['metric_id'] == 'mean_attempts' + assert updated_payload['detailed_evaluation_results']['total_rows'] == 2 + + updated_rows = [ + json.loads(line) + for line in samples_path.read_text(encoding='utf-8').splitlines() + if line.strip() + ] + assert all(row['evaluation_name'] == 'wordle_arena' for row in updated_rows) + assert all( + row['evaluation_result_id'] == win_rate['evaluation_result_id'] + for row in updated_rows + ) + + +def test_apex_agents_splits_metric_semantics_from_evaluation_name(): + payload = _base_payload( + 'apex-agents/org_model/123', + [ + _base_result( + 'Overall Pass@8', + 'Overall Pass@8 (dataset card / paper snapshot).', + ), + _base_result( + 'Investment Banking Mean Score', + 'Investment banking world mean rubric score.', + ), + ], + ) + + augmented, changed_results, _, _ = augment_aggregate_payload( + payload, benchmark_family='apex-agents' + ) + + pass_at_8 = augmented['evaluation_results'][0] + mean_score = augmented['evaluation_results'][1] + + assert changed_results == 2 + assert pass_at_8['evaluation_name'] == 'Overall' + assert pass_at_8['metric_config']['metric_id'] == 'pass_at_k' + assert pass_at_8['metric_config']['metric_parameters'] == {'k': 8} + assert mean_score['evaluation_name'] == 'Investment Banking' + assert mean_score['metric_config']['metric_id'] == 'mean_score' + + +def test_bfcl_uses_metric_id_to_restore_eval_slice(): + payload = _base_payload( + 'bfcl/org_model/123', + [ + { + **_base_result( + 'bfcl.multi_turn.accuracy', + 'Multi-turn accuracy', + max_score=100.0, + ), + 'metric_config': { + 'metric_id': 'bfcl.multi_turn.accuracy', + 'metric_name': 'Accuracy', + 'lower_is_better': False, + 'score_type': 'continuous', + 'min_score': 0.0, + 'max_score': 100.0, + }, + } + ], + ) + + augmented, changed_results, _, _ = augment_aggregate_payload( + payload, benchmark_family='bfcl' + ) + + result = augmented['evaluation_results'][0] + assert changed_results == 1 + assert 
result['evaluation_name'] == 'multi_turn' + assert result['metric_config']['metric_id'] == 'bfcl.multi_turn.accuracy' + assert ( + result['metric_config']['additional_details']['raw_evaluation_name'] + == 'bfcl.multi_turn.accuracy' + ) + + +def test_swe_leaderboards_backfill_single_score_metric_identity(): + payload = _base_payload( + 'swe-polybench-leaderboard/org_model/123', + [ + _base_result( + 'SWE-PolyBench Verified (Python)', + 'Fraction of Python GitHub issues resolved (0.0–1.0)', + ) + ], + ) + + augmented, changed_results, _, ids_changed = augment_aggregate_payload( + payload, benchmark_family='swe-polybench-leaderboard' + ) + + result = augmented['evaluation_results'][0] + metric_config = result['metric_config'] + + assert changed_results == 1 + assert ids_changed is True + assert result['evaluation_name'] == 'SWE-PolyBench Verified (Python)' + assert metric_config['metric_id'] == 'swe_polybench_leaderboard.score' + assert metric_config['metric_name'] == 'Score' + assert metric_config['metric_kind'] == 'score' + assert metric_config['metric_unit'] == 'proportion' + assert result['evaluation_result_id'].endswith( + '#swe_polybench_verified_python#swe_polybench_leaderboard_score' + ) + + +def test_nested_datastore_paths_are_normalized_and_samples_move(tmp_path: Path): + aggregate_path = ( + tmp_path + / 'data' + / 'multi-swe-bench-leaderboard' + / 'typescript' + / 'openai' + / 'gpt-5' + / 'run.json' + ) + aggregate_path.parent.mkdir(parents=True) + payload = _base_payload( + 'multi-swe-bench-leaderboard/openai_gpt-5/123', + [ + _base_result( + 'Multi-SWE-Bench (TypeScript)', + 'Fraction of TypeScript issues resolved (0.0-1.0)', + ) + ], + ) + payload['detailed_evaluation_results'] = { + 'format': 'jsonl', + 'file_path': 'run_samples.jsonl', + 'total_rows': 0, + } + aggregate_path.write_text(json.dumps(payload), encoding='utf-8') + + samples_path = aggregate_path.with_name('run_samples.jsonl') + samples_path.write_text( + json.dumps( + { + 'schema_version': 
'instance_level_eval_0.2.2', + 'evaluation_id': payload['evaluation_id'], + 'model_id': 'openai/gpt-5', + 'evaluation_name': 'Multi-SWE-Bench (TypeScript)', + 'sample_id': 'sample-1', + 'interaction_type': 'single_turn', + 'input': {'raw': 'issue', 'reference': ['issue']}, + 'output': {'raw': ['patch']}, + 'answer_attribution': [], + 'evaluation': {'score': 1.0, 'is_correct': True}, + } + ) + + '\n', + encoding='utf-8', + ) + + report = augment_aggregate_file(aggregate_path, write_changes=True) + + expected_aggregate_path = ( + tmp_path + / 'data' + / 'multi-swe-bench-leaderboard' + / 'openai' + / 'gpt-5' + / 'run.json' + ) + expected_samples_path = expected_aggregate_path.with_name('run_samples.jsonl') + + assert report.path_changed is True + assert report.file_path == expected_aggregate_path + assert report.sample_file_path == expected_samples_path + assert aggregate_path.exists() is False + assert samples_path.exists() is False + assert expected_aggregate_path.exists() is True + assert expected_samples_path.exists() is True + + updated_payload = json.loads(expected_aggregate_path.read_text(encoding='utf-8')) + updated_result = updated_payload['evaluation_results'][0] + assert updated_result['metric_config']['metric_id'] == ( + 'multi_swe_bench_leaderboard.score' + ) + assert updated_payload['detailed_evaluation_results']['total_rows'] == 1 + + updated_rows = [ + json.loads(line) + for line in expected_samples_path.read_text(encoding='utf-8').splitlines() + if line.strip() + ] + assert updated_rows[0]['evaluation_result_id'] == updated_result['evaluation_result_id'] + + +def test_nested_datastore_paths_report_target_path_in_dry_run(tmp_path: Path): + aggregate_path = ( + tmp_path + / 'data' + / 'swe-polybench-leaderboard' + / 'pb-verified' + / 'python' + / 'openai' + / 'gpt-5' + / 'run.json' + ) + aggregate_path.parent.mkdir(parents=True) + payload = _base_payload( + 'swe-polybench-leaderboard/openai_gpt-5/123', + [ + _base_result( + 'SWE-PolyBench Verified 
(Python)', + 'Fraction of Python GitHub issues resolved (0.0-1.0)', + ) + ], + ) + aggregate_path.write_text(json.dumps(payload), encoding='utf-8') + + report = augment_aggregate_file(aggregate_path, write_changes=False) + + expected_aggregate_path = ( + tmp_path + / 'data' + / 'swe-polybench-leaderboard' + / 'openai' + / 'gpt-5' + / 'run.json' + ) + + assert report.path_changed is True + assert report.original_file_path == aggregate_path + assert report.file_path == expected_aggregate_path + assert aggregate_path.exists() is True + assert expected_aggregate_path.exists() is False + + +def test_pr72_style_faulty_data_is_fully_repaired_by_current_workflow( + tmp_path: Path, +): + aggregate_path = ( + tmp_path + / 'data' + / 'multi-swe-bench-leaderboard' + / 'typescript' + / 'openai' + / 'gpt-5' + / 'run.json' + ) + aggregate_path.parent.mkdir(parents=True) + payload = _base_payload( + 'multi-swe-bench-leaderboard/openai_gpt-5/123', + [ + _base_result( + 'Multi-SWE-Bench (TypeScript)', + 'Fraction of TypeScript GitHub issues resolved (0.0-1.0)', + ) + ], + ) + aggregate_path.write_text(json.dumps(payload), encoding='utf-8') + + pre_validate = validate_file(aggregate_path) + pre_audit = check_paths([str(tmp_path / 'data')]) + schema_upgrade = upgrade_aggregate_file(aggregate_path, write_changes=True) + report = augment_aggregate_file(aggregate_path, write_changes=True) + + fixed_path = ( + tmp_path + / 'data' + / 'multi-swe-bench-leaderboard' + / 'openai' + / 'gpt-5' + / 'run.json' + ) + fixed_payload = json.loads(fixed_path.read_text(encoding='utf-8')) + fixed_result = fixed_payload['evaluation_results'][0] + post_validate = validate_file(fixed_path) + post_audit = check_paths([str(tmp_path / 'data')]) + + assert pre_validate.valid is True + assert pre_audit.has_issues is True + assert schema_upgrade.changed is False + assert report.path_changed is True + assert aggregate_path.exists() is False + assert fixed_path.exists() is True + assert 
fixed_result['metric_config']['metric_id'] == ( + 'multi_swe_bench_leaderboard.score' + ) + assert fixed_result['evaluation_result_id'].endswith( + '#multi_swe_bench_typescript#multi_swe_bench_leaderboard_score' + ) + assert post_validate.valid is True + assert post_audit.has_issues is False + + +def test_pr57_style_faulty_data_is_fully_repaired_by_current_workflow( + tmp_path: Path, +): + aggregate_path = ( + tmp_path + / 'data' + / 'swe-bench-verified-mini' + / 'google' + / 'gemini-2.0-flash-001' + / 'run.json' + ) + aggregate_path.parent.mkdir(parents=True) + payload = _base_payload( + 'swe-bench-verified-mini/google_gemini-2.0-flash-001/1744346409.0', + [ + _base_result( + 'mean on inspect_evals/swe_bench_verified_mini for scorer swe_bench_scorer', + 'mean', + ), + _base_result( + 'std on inspect_evals/swe_bench_verified_mini for scorer swe_bench_scorer', + 'std', + ), + ], + ) + payload['detailed_evaluation_results'] = { + 'format': 'jsonl', + 'file_path': 'run_samples.jsonl', + 'total_rows': 1, + } + aggregate_path.write_text(json.dumps(payload), encoding='utf-8') + + samples_path = aggregate_path.with_name('run_samples.jsonl') + samples_path.write_text( + json.dumps( + { + 'schema_version': '0.2.2', + 'evaluation_id': 'run_samples', + 'model_id': 'google/gemini-2.0-flash-001', + 'evaluation_name': 'inspect_evals/swe_bench_verified_mini', + 'evaluation_result_id': None, + 'sample_id': 'django__django-11790', + 'sample_hash': 'abc123', + 'interaction_type': 'agentic', + 'input': {'raw': 'issue', 'reference': ['issue']}, + 'output': None, + 'messages': [ + {'turn_idx': 0, 'role': 'user', 'content': 'issue'}, + { + 'turn_idx': 1, + 'role': 'assistant', + 'content': 'attempted patch', + }, + ], + 'answer_attribution': [ + { + 'turn_idx': 1, + 'source': 'messages[1].content', + 'extracted_value': 'attempted patch', + 'extraction_method': 'exact_match', + 'is_terminal': True, + } + ], + 'evaluation': { + 'score': 0.0, + 'is_correct': False, + 'num_turns': 2, + 
'tool_calls_count': 0, + }, + } + ) + + '\n', + encoding='utf-8', + ) + + pre_sample_validate = validate_file(samples_path) + pre_audit = check_paths([str(tmp_path / 'data')]) + schema_upgrade = upgrade_instance_file(samples_path, write_changes=True) + report = augment_aggregate_file(aggregate_path, write_changes=True) + + fixed_payload = json.loads(aggregate_path.read_text(encoding='utf-8')) + mean_result, std_result = fixed_payload['evaluation_results'] + fixed_rows = [ + json.loads(line) + for line in samples_path.read_text(encoding='utf-8').splitlines() + if line.strip() + ] + fixed_row = fixed_rows[0] + post_sample_validate = validate_file(samples_path) + post_audit = check_paths([str(tmp_path / 'data')]) + + assert pre_sample_validate.valid is False + assert pre_audit.has_issues is True + assert schema_upgrade.changed is True + assert report.path_changed is False + assert mean_result['evaluation_name'] == 'swe-bench-verified-mini' + assert mean_result['metric_config']['metric_id'] == 'mean_score' + assert mean_result['metric_config']['metric_name'] == 'Mean Score' + assert mean_result['metric_config']['metric_kind'] == 'score' + assert std_result['evaluation_name'] == 'swe-bench-verified-mini' + assert std_result['metric_config']['metric_id'] == 'standard_deviation' + assert std_result['metric_config']['metric_name'] == 'Standard Deviation' + assert std_result['metric_config']['metric_kind'] == 'standard_deviation' + assert fixed_row['schema_version'] == 'instance_level_eval_0.2.2' + assert fixed_row['evaluation_id'] == fixed_payload['evaluation_id'] + assert fixed_row['evaluation_name'] == 'swe-bench-verified-mini' + assert fixed_row['evaluation_result_id'] == mean_result['evaluation_result_id'] + assert post_sample_validate.valid is True + assert post_audit.has_issues is False + + +def test_alphaxiv_faulty_data_is_fully_repaired_by_current_workflow( + tmp_path: Path, +): + aggregate_path = ( + tmp_path + / 'data' + / 'alphaxiv' + / 'Gaia2' + / 'unknown' + / 
'GPT-4o' + / 'run.json' + ) + aggregate_path.parent.mkdir(parents=True) + payload = { + 'schema_version': '0.2.0', + 'evaluation_id': 'Gaia2/GPT-4o/1771591481.616601', + 'retrieved_timestamp': '1771591481.616601', + 'source_metadata': { + 'source_name': 'alphaXiv State of the Art', + 'source_type': 'documentation', + 'source_organization_name': 'alphaXiv', + 'evaluator_relationship': 'third_party', + }, + 'model_info': {'id': 'GPT-4o', 'name': 'GPT-4o', 'developer': 'unknown'}, + 'evaluation_results': [ + { + 'evaluation_name': 'Overall Performance on Gaia2', + 'source_data': { + 'dataset_name': 'Gaia2', + 'source_type': 'url', + 'url': ['https://www.alphaxiv.org/abs/example'], + }, + 'metric_config': { + 'lower_is_better': False, + 'score_type': 'continuous', + 'min_score': 0.0, + 'max_score': 100.0, + 'evaluation_description': 'Overall performance on Gaia2.', + 'additional_details': { + 'alphaxiv_is_primary': 'True', + 'alphaxiv_y_axis': 'Score', + }, + }, + 'score_details': {'score': 67.4}, + } + ], + } + aggregate_path.write_text(json.dumps(payload), encoding='utf-8') + + pre_validate = validate_file(aggregate_path) + pre_audit = check_paths([str(tmp_path / 'data')]) + schema_upgrade = upgrade_aggregate_file(aggregate_path, write_changes=True) + report = augment_aggregate_file(aggregate_path, write_changes=True) + + fixed_path = ( + tmp_path / 'data' / 'alphaxiv' / 'unknown' / 'GPT-4o' / 'run.json' + ) + fixed_payload = json.loads(fixed_path.read_text(encoding='utf-8')) + fixed_result = fixed_payload['evaluation_results'][0] + post_validate = validate_file(fixed_path) + post_audit = check_paths([str(tmp_path / 'data')]) + + assert pre_validate.valid is False + assert pre_audit.has_issues is True + assert schema_upgrade.changed is True + assert report.path_changed is True + assert aggregate_path.exists() is False + assert fixed_path.exists() is True + assert fixed_payload['eval_library'] == { + 'name': 'alphaxiv', + 'version': 'unknown', + } + assert 
fixed_result['evaluation_name'] == 'Gaia2' + assert fixed_result['metric_config']['metric_id'] == ( + 'overall_performance_on_gaia2' + ) + assert fixed_result['metric_config']['metric_name'] == ( + 'Overall Performance on Gaia2' + ) + assert fixed_result['metric_config']['metric_kind'] == 'score' + assert fixed_result['metric_config']['metric_unit'] == 'points' + assert ( + fixed_result['metric_config']['additional_details']['raw_evaluation_name'] + == 'Overall Performance on Gaia2' + ) + assert fixed_result['evaluation_result_id'].endswith( + '#gaia2#overall_performance_on_gaia2' + ) + assert post_validate.valid is True + assert post_audit.has_issues is False + + +def test_helm_description_drives_metric_identity_and_mean_row_renaming(): + payload = _base_payload( + 'helm_lite/org_model/123', + [ + _base_result( + 'Mean win rate', + 'How many models this model outperforms on average (over columns).', + ), + _base_result( + 'NarrativeQA', + 'F1 on NarrativeQA', + ), + _base_result( + 'IFEval', + 'IFEval Strict Acc on IFEval', + ), + ], + ) + + augmented, changed_results, _, _ = augment_aggregate_payload( + payload, benchmark_family='helm_lite' + ) + + mean_win_rate = augmented['evaluation_results'][0] + narrative_qa = augmented['evaluation_results'][1] + ifeval = augmented['evaluation_results'][2] + + assert changed_results == 3 + assert mean_win_rate['evaluation_name'] == 'helm_lite' + assert mean_win_rate['metric_config']['metric_id'] == 'win_rate' + assert narrative_qa['evaluation_name'] == 'NarrativeQA' + assert narrative_qa['metric_config']['metric_id'] == 'f1' + assert ifeval['metric_config']['metric_id'] == 'strict_accuracy' + + +def test_ifeval_inspect_rows_split_slice_and_metric_identity(): + payload = _base_payload( + 'IFEval/org_model/123', + [ + _base_result( + 'prompt_strict_acc on inspect_evals/ifeval for scorer instruction_following', + 'prompt strict accuracy', + ), + _base_result( + 'final_stderr on inspect_evals/ifeval for scorer 
instruction_following', + 'final standard error', + ), + ], + ) + + augmented, changed_results, _, _ = augment_aggregate_payload( + payload, benchmark_family='IFEval' + ) + + strict_acc = augmented['evaluation_results'][0] + final_stderr = augmented['evaluation_results'][1] + + assert changed_results == 2 + assert strict_acc['evaluation_name'] == 'prompt_strict' + assert strict_acc['metric_config']['metric_id'] == 'accuracy' + assert strict_acc['metric_config']['metric_name'] == 'Accuracy' + assert strict_acc['metric_config']['metric_kind'] == 'accuracy' + assert final_stderr['evaluation_name'] == 'final' + assert final_stderr['metric_config']['metric_id'] == 'standard_error' + assert final_stderr['metric_config']['metric_name'] == 'Standard Error' + assert final_stderr['metric_config']['metric_kind'] == 'standard_error' + + +def test_agentharm_inspect_rows_cover_overall_and_category_metrics(): + payload = _base_payload( + 'agentharm/org_model/123', + [ + _base_result( + 'inspect_evals/avg_full_score on inspect_evals/agentharm for scorer combined_scorer', + 'Average full score on agentharm', + ), + _base_result( + 'Cybercrime_avg_refusals on inspect_evals/agentharm for scorer combined_scorer', + 'Average refusals on Cybercrime subset', + ), + ], + ) + + augmented, changed_results, _, _ = augment_aggregate_payload( + payload, benchmark_family='agentharm' + ) + + avg_full_score = augmented['evaluation_results'][0] + cybercrime_refusals = augmented['evaluation_results'][1] + + assert changed_results == 2 + assert avg_full_score['evaluation_name'] == 'agentharm' + assert avg_full_score['metric_config']['metric_id'] == 'average_full_score' + assert avg_full_score['metric_config']['metric_name'] == 'Average Full Score' + assert avg_full_score['metric_config']['metric_kind'] == 'score' + assert cybercrime_refusals['evaluation_name'] == 'Cybercrime' + assert ( + cybercrime_refusals['metric_config']['metric_id'] + == 'average_refusal_rate' + ) + assert ( + 
cybercrime_refusals['metric_config']['metric_name'] + == 'Average Refusal Rate' + ) + assert ( + cybercrime_refusals['metric_config']['metric_kind'] + == 'refusal_rate' + ) + + +def test_generic_inspect_eval_benchmarks_backfill_accuracy_mean_and_std(): + payload = _base_payload( + 'cyse2_vulnerability_exploit/org_model/123', + [ + _base_result( + 'accuracy on inspect_evals/cyse2_vulnerability_exploit for scorer vul_exploit_scorer', + 'accuracy', + ), + _base_result( + 'mean on inspect_evals/cyse2_vulnerability_exploit for scorer vul_exploit_scorer', + 'mean', + ), + _base_result( + 'std on inspect_evals/cyse2_vulnerability_exploit for scorer vul_exploit_scorer', + 'std', + ), + ], + ) + + augmented, changed_results, _, _ = augment_aggregate_payload( + payload, benchmark_family='cyse2_vulnerability_exploit' + ) + + accuracy = augmented['evaluation_results'][0] + mean_score = augmented['evaluation_results'][1] + standard_deviation = augmented['evaluation_results'][2] + + assert changed_results == 3 + assert accuracy['evaluation_name'] == 'cyse2_vulnerability_exploit' + assert accuracy['metric_config']['metric_id'] == 'accuracy' + assert mean_score['evaluation_name'] == 'cyse2_vulnerability_exploit' + assert mean_score['metric_config']['metric_id'] == 'mean_score' + assert mean_score['metric_config']['metric_name'] == 'Mean Score' + assert mean_score['metric_config']['metric_kind'] == 'score' + assert standard_deviation['evaluation_name'] == 'cyse2_vulnerability_exploit' + assert ( + standard_deviation['metric_config']['metric_id'] + == 'standard_deviation' + ) + assert ( + standard_deviation['metric_config']['metric_name'] + == 'Standard Deviation' + ) + assert ( + standard_deviation['metric_config']['metric_kind'] + == 'standard_deviation' + ) + + +def test_fibble_arena_supports_2_to_4_lie_variants(): + payload = _base_payload( + 'fibble_arena/org_model/123', + [ + _base_result( + 'fibble_arena_3lies_win_rate', + 'Win rate on Fibble³ Arena (3 lies)', + ), + 
_base_result( + 'fibble_arena_4lies_avg_attempts', + 'Average guesses used per game on Fibble⁴ Arena (4 lies)', + min_score=1.0, + max_score=8.0, + lower_is_better=True, + ), + ], + ) + + augmented, changed_results, _, _ = augment_aggregate_payload( + payload, benchmark_family='fibble_arena' + ) + + win_rate = augmented['evaluation_results'][0] + avg_attempts = augmented['evaluation_results'][1] + + assert changed_results == 2 + assert win_rate['evaluation_name'] == 'fibble_arena_3lies' + assert win_rate['metric_config']['metric_id'] == 'win_rate' + assert avg_attempts['evaluation_name'] == 'fibble_arena_4lies' + assert avg_attempts['metric_config']['metric_id'] == 'mean_attempts' + + +def test_fibble_numbered_families_backfill_metrics_without_renaming_family(): + payload = _base_payload( + 'fibble5_arena/org_model/123', + [ + _base_result( + 'fibble5_arena_win_rate', + 'Win rate on Fibble5 Arena', + ), + _base_result( + 'fibble5_arena_avg_attempts', + 'Average attempts on Fibble5 Arena', + min_score=1.0, + max_score=8.0, + lower_is_better=True, + ), + _base_result( + 'fibble5_arena_avg_latency_ms', + 'Average latency on Fibble5 Arena', + min_score=0.0, + max_score=5000.0, + lower_is_better=True, + ), + ], + ) + + augmented, changed_results, _, _ = augment_aggregate_payload( + payload, benchmark_family='fibble5_arena' + ) + + win_rate = augmented['evaluation_results'][0] + avg_attempts = augmented['evaluation_results'][1] + avg_latency = augmented['evaluation_results'][2] + + assert changed_results == 3 + assert win_rate['evaluation_name'] == 'fibble5_arena' + assert win_rate['metric_config']['metric_id'] == 'win_rate' + assert avg_attempts['evaluation_name'] == 'fibble5_arena' + assert avg_attempts['metric_config']['metric_id'] == 'mean_attempts' + assert avg_latency['evaluation_name'] == 'fibble5_arena' + assert avg_latency['metric_config']['metric_id'] == 'latency_mean' + + +def test_helm_patch_handles_ndcg_bleu_and_acc_abbreviations(): + payload = 
_base_payload( + 'helm_classic/org_model/123', + [ + _base_result( + 'MS MARCO (TREC)', + 'NDCG@10 on MS MARCO (TREC)', + ), + _base_result( + 'WMT 2014', + 'BLEU-4 on WMT 2014', + ), + _base_result( + 'Omni-MATH', + 'Acc on Omni-MATH', + ), + ], + ) + + augmented, changed_results, _, _ = augment_aggregate_payload( + payload, benchmark_family='helm_classic' + ) + + ndcg = augmented['evaluation_results'][0]['metric_config'] + bleu = augmented['evaluation_results'][1]['metric_config'] + acc = augmented['evaluation_results'][2]['metric_config'] + + assert changed_results == 3 + assert ndcg['metric_id'] == 'ndcg' + assert ndcg['metric_parameters'] == {'k': 10} + assert bleu['metric_id'] == 'bleu_4' + assert bleu['metric_parameters'] == {'n': 4} + assert acc['metric_id'] == 'accuracy' + + +def test_livecodebenchpro_pass_at_k_is_backfilled(): + payload = _base_payload( + 'livecodebenchpro/org_model/123', + [ + _base_result( + 'Hard Problems', + 'Pass@1 on Hard Problems', + ) + ], + ) + + augmented, changed_results, _, _ = augment_aggregate_payload( + payload, benchmark_family='livecodebenchpro' + ) + + metric_config = augmented['evaluation_results'][0]['metric_config'] + assert changed_results == 1 + assert metric_config['metric_id'] == 'pass_at_k' + assert metric_config['metric_name'] == 'Pass@1' + assert metric_config['metric_kind'] == 'pass_rate' + assert metric_config['metric_parameters'] == {'k': 1} + + +def test_score_suffix_families_and_single_score_families_are_backfilled(): + ace_payload = _base_payload( + 'ace/org_model/123', + [ + _base_result('Overall Score', 'Overall ACE score.'), + _base_result('Gaming Score', 'Gaming domain score.'), + ], + ) + ace_augmented, changed_results, _, _ = augment_aggregate_payload( + ace_payload, benchmark_family='ace' + ) + + overall = ace_augmented['evaluation_results'][0] + gaming = ace_augmented['evaluation_results'][1] + + assert changed_results == 2 + assert overall['evaluation_name'] == 'ace' + assert 
overall['metric_config']['metric_id'] == 'ace.score' + assert gaming['evaluation_name'] == 'Gaming' + assert gaming['metric_config']['metric_id'] == 'ace.score' + + tau_payload = _base_payload( + 'tau-bench-2/airline/org_model/123', + [ + _base_result( + 'tau-bench-2/airline', + 'Tau Bench 2 benchmark evaluation (airline subset)', + ) + ], + ) + tau_augmented, tau_changed, _, _ = augment_aggregate_payload( + tau_payload, benchmark_family='tau-bench-2_airline' + ) + tau_metric = tau_augmented['evaluation_results'][0]['metric_config'] + assert tau_changed == 1 + assert tau_metric['metric_id'] == 'tau_bench_2_airline.score' + assert tau_metric['metric_unit'] == 'proportion' + + la_payload = _base_payload( + 'la_leaderboard/org_model/123', + [ + _base_result( + 'la_leaderboard', + 'La Leaderboard benchmark score', + min_score=0.0, + max_score=100.0, + ) + ], + ) + la_augmented, la_changed, _, _ = augment_aggregate_payload( + la_payload, benchmark_family='la_leaderboard' + ) + la_metric = la_augmented['evaluation_results'][0]['metric_config'] + assert la_changed == 1 + assert la_metric['metric_id'] == 'la_leaderboard.score' + assert la_metric['metric_unit'] == 'points' + + +def test_theory_of_mind_patch_splits_metric_from_eval_name(): + payload = _base_payload( + 'theory_of_mind/org_model/123', + [ + _base_result( + 'accuracy on theory_of_mind for scorer model_graded_fact', + 'accuracy', + ) + ], + ) + + augmented, changed_results, _, _ = augment_aggregate_payload( + payload, benchmark_family='theory_of_mind' + ) + + result = augmented['evaluation_results'][0] + assert changed_results == 1 + assert result['evaluation_name'] == 'theory_of_mind' + assert result['metric_config']['metric_id'] == 'accuracy' diff --git a/tests/test_check_canonical_identity.py b/tests/test_check_canonical_identity.py new file mode 100644 index 000000000..3cfecfe76 --- /dev/null +++ b/tests/test_check_canonical_identity.py @@ -0,0 +1,153 @@ +from __future__ import annotations + +import json 
+from pathlib import Path + +from every_eval_ever.check_canonical_identity import check_paths, main + + +def _base_payload(evaluation_id: str, evaluation_results: list[dict]) -> dict: + return { + 'schema_version': '0.2.2', + 'evaluation_id': evaluation_id, + 'retrieved_timestamp': '1234567890', + 'source_metadata': { + 'source_type': 'documentation', + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': 'third_party', + }, + 'eval_library': {'name': 'unknown', 'version': 'unknown'}, + 'model_info': {'name': 'test-model', 'id': 'org/test-model'}, + 'evaluation_results': evaluation_results, + } + + +def _result_with_identity(evaluation_name: str) -> dict: + return { + 'evaluation_name': evaluation_name, + 'evaluation_result_id': f'test/model/123#{evaluation_name}#accuracy', + 'source_data': { + 'dataset_name': 'test-ds', + 'source_type': 'hf_dataset', + 'hf_repo': 'org/test-ds', + }, + 'metric_config': { + 'evaluation_description': 'accuracy', + 'metric_id': 'accuracy', + 'metric_name': 'Accuracy', + 'metric_kind': 'accuracy', + 'metric_unit': 'proportion', + 'lower_is_better': False, + 'score_type': 'continuous', + 'min_score': 0.0, + 'max_score': 1.0, + }, + 'score_details': {'score': 0.5}, + } + + +def test_check_paths_reports_missing_identity_fields(tmp_path: Path): + data_dir = tmp_path / 'data' / 'demo-benchmark' / 'org' / 'model' + data_dir.mkdir(parents=True) + + payload = _base_payload( + 'demo-benchmark/org_model/123', + [ + { + **_result_with_identity('slice_a'), + 'evaluation_result_id': '', + 'metric_config': { + 'lower_is_better': False, + 'score_type': 'continuous', + 'min_score': 0.0, + 'max_score': 1.0, + }, + } + ], + ) + file_path = data_dir / 'run.json' + file_path.write_text(json.dumps(payload), encoding='utf-8') + + report = check_paths([str(tmp_path)]) + as_dict = report.to_dict() + + assert report.files_scanned == 1 + assert report.results_scanned == 1 + assert as_dict['missing']['evaluation_result_id'] == 1 + assert 
as_dict['missing']['metric_id'] == 1 + assert as_dict['top_missing_by_benchmark']['metric_id'][0] == ( + 'demo-benchmark', + 1, + ) + + +def test_check_paths_passes_for_complete_identity(tmp_path: Path): + data_dir = tmp_path / 'data' / 'clean-benchmark' / 'org' / 'model' + data_dir.mkdir(parents=True) + + payload = _base_payload( + 'clean-benchmark/org_model/123', + [_result_with_identity('slice_a')], + ) + file_path = data_dir / 'run.json' + file_path.write_text(json.dumps(payload), encoding='utf-8') + + report = check_paths([str(tmp_path)]) + assert report.files_scanned == 1 + assert report.results_scanned == 1 + assert report.has_issues is False + assert report.missing == {} + assert report.malformed == {} + + +def test_check_paths_ignores_non_aggregate_json(tmp_path: Path): + data_dir = tmp_path / 'data' / 'clean-benchmark' / 'org' / 'model' + data_dir.mkdir(parents=True) + (tmp_path / 'metadata.json').write_text( + json.dumps({'generated_by': 'unit-test'}), + encoding='utf-8', + ) + + payload = _base_payload( + 'clean-benchmark/org_model/123', + [_result_with_identity('slice_a')], + ) + (data_dir / 'run.json').write_text(json.dumps(payload), encoding='utf-8') + + report = check_paths([str(tmp_path)]) + assert report.files_scanned == 1 + assert report.results_scanned == 1 + assert report.has_issues is False + + +def test_main_fail_on_issues_returns_nonzero(tmp_path: Path): + data_dir = tmp_path / 'data' / 'broken-benchmark' / 'org' / 'model' + data_dir.mkdir(parents=True) + + payload = _base_payload( + 'broken-benchmark/org_model/123', + [ + { + **_result_with_identity('slice_a'), + 'evaluation_result_id': 'not_a_canonical_id', + 'metric_config': { + **_result_with_identity('slice_a')['metric_config'], + 'metric_id': '', + }, + } + ], + ) + file_path = data_dir / 'run.json' + file_path.write_text(json.dumps(payload), encoding='utf-8') + + assert ( + main( + [ + str(tmp_path), + '--format', + 'json', + '--fail-on-issues', + ] + ) + == 1 + ) diff --git 
a/tests/test_helm_instance_level_adapter.py b/tests/test_helm_instance_level_adapter.py index 4ee46c6c2..275bc739a 100644 --- a/tests/test_helm_instance_level_adapter.py +++ b/tests/test_helm_instance_level_adapter.py @@ -63,8 +63,8 @@ def test_mmlu_instance_level(): assert len(instance_logs) == 10 log = instance_logs[0] - assert log.schema_version == '0.2.2' - assert log.evaluation_id == 'test_mmlu_samples' + assert log.schema_version == 'instance_level_eval_0.2.2' + assert log.evaluation_id == converted_eval.evaluation_id assert log.model_id == 'openai/gpt2' assert log.evaluation_name == 'mmlu' assert log.sample_id == 'id147' @@ -112,7 +112,8 @@ def test_hellaswag_instance_level(): assert len(instance_logs) == 10 log = instance_logs[0] - assert log.schema_version == '0.2.2' + assert log.schema_version == 'instance_level_eval_0.2.2' + assert log.evaluation_id == converted_eval.evaluation_id assert log.model_id == 'eleutherai/pythia-1b-v0' assert log.evaluation_name == 'hellaswag' assert log.interaction_type == InteractionType.single_turn @@ -148,7 +149,8 @@ def test_narrativeqa_instance_level(): assert len(instance_logs) == 5 log = instance_logs[0] - assert log.schema_version == '0.2.2' + assert log.schema_version == 'instance_level_eval_0.2.2' + assert log.evaluation_id == converted_eval.evaluation_id assert log.model_id == 'openai/gpt2' assert log.evaluation_name == 'narrativeqa' assert log.interaction_type == InteractionType.single_turn diff --git a/tests/test_inspect_instance_level_adapter.py b/tests/test_inspect_instance_level_adapter.py index ea1deab8e..fcd996f0e 100644 --- a/tests/test_inspect_instance_level_adapter.py +++ b/tests/test_inspect_instance_level_adapter.py @@ -67,7 +67,8 @@ def test_pubmedqa_instance_level(): assert len(instance_logs) == 2 log = instance_logs[0] - assert log.schema_version == '0.2.2' + assert log.schema_version == 'instance_level_eval_0.2.2' + assert log.evaluation_id == converted_eval.evaluation_id assert log.model_id == 
'openai/gpt-4o-mini-2024-07-18' assert log.interaction_type == InteractionType.single_turn @@ -98,7 +99,8 @@ def test_arc_sonnet_instance_level(): assert len(instance_logs) == 5 log = instance_logs[0] - assert log.schema_version == '0.2.2' + assert log.schema_version == 'instance_level_eval_0.2.2' + assert log.evaluation_id == converted_eval.evaluation_id assert log.model_id == 'anthropic/claude-sonnet-4-20250514' assert log.interaction_type == InteractionType.single_turn @@ -134,7 +136,8 @@ def test_arc_qwen_instance_level(): assert len(instance_logs) == 3 log = instance_logs[0] - assert log.schema_version == '0.2.2' + assert log.schema_version == 'instance_level_eval_0.2.2' + assert log.evaluation_id == converted_eval.evaluation_id assert log.model_id == 'ollama/qwen2.5-0.5b' assert log.interaction_type == InteractionType.single_turn @@ -170,7 +173,8 @@ def test_gaia_instance_level(): assert len(instance_logs) > 0 log = instance_logs[0] - assert log.schema_version == '0.2.2' + assert log.schema_version == 'instance_level_eval_0.2.2' + assert log.evaluation_id == converted_eval.evaluation_id assert log.model_id == 'openai/gpt-4.1-mini-2025-04-14' assert log.interaction_type == InteractionType.agentic diff --git a/tests/test_upgrade_schema_version.py b/tests/test_upgrade_schema_version.py new file mode 100644 index 000000000..a6c02a473 --- /dev/null +++ b/tests/test_upgrade_schema_version.py @@ -0,0 +1,73 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from every_eval_ever.upgrade_schema_version import ( + AGGREGATE_SCHEMA_VERSION, + INSTANCE_SCHEMA_VERSION, + upgrade_aggregate_file, + upgrade_instance_file, +) + + +def test_upgrade_aggregate_file_rewrites_schema_version(tmp_path: Path): + file_path = tmp_path / 'aggregate.json' + payload = { + 'schema_version': '0.2.1', + 'evaluation_id': 'test/model/123', + 'retrieved_timestamp': '123', + 'source_metadata': { + 'source_type': 'evaluation_run', + 'source_organization_name': 
'TestOrg', + 'evaluator_relationship': 'first_party', + }, + 'eval_library': {'name': 'inspect_ai', 'version': '0.3.0'}, + 'model_info': {'name': 'test-model', 'id': 'org/test-model'}, + 'evaluation_results': [], + } + file_path.write_text(json.dumps(payload), encoding='utf-8') + + report = upgrade_aggregate_file(file_path, write_changes=True) + updated = json.loads(file_path.read_text(encoding='utf-8')) + + assert report.changed is True + assert updated['schema_version'] == AGGREGATE_SCHEMA_VERSION + + +def test_upgrade_instance_file_rewrites_all_rows(tmp_path: Path): + file_path = tmp_path / 'samples.jsonl' + rows = [ + {'schema_version': 'instance_level_eval_0.2.1', 'sample_id': 'a'}, + {'schema_version': '0.2.2', 'sample_id': 'b'}, + {'schema_version': INSTANCE_SCHEMA_VERSION, 'sample_id': 'c'}, + ] + file_path.write_text( + '\n'.join(json.dumps(row) for row in rows) + '\n', + encoding='utf-8', + ) + + report = upgrade_instance_file(file_path, write_changes=True) + updated = [ + json.loads(line) + for line in file_path.read_text(encoding='utf-8').splitlines() + if line.strip() + ] + + assert report.changed is True + assert report.rows_changed == 2 + assert all( + row['schema_version'] == INSTANCE_SCHEMA_VERSION for row in updated + ) + + +def test_upgrade_dry_run_does_not_write(tmp_path: Path): + file_path = tmp_path / 'aggregate.json' + payload = {'schema_version': '0.2.1'} + file_path.write_text(json.dumps(payload), encoding='utf-8') + + report = upgrade_aggregate_file(file_path, write_changes=False) + unchanged = json.loads(file_path.read_text(encoding='utf-8')) + + assert report.changed is True + assert unchanged['schema_version'] == '0.2.1' diff --git a/tests/test_validate.py b/tests/test_validate.py index edb2d5b6a..70d453ec0 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -131,6 +131,13 @@ def test_missing_required_field(self, tmp_path: Path): assert report.valid is False assert any('evaluation_id' in e['loc'] for e in report.errors) 
+ def test_wrong_schema_version_fails(self, tmp_path: Path): + data = {**VALID_AGGREGATE, 'schema_version': '0.2.1'} + fp = _write_json(tmp_path, 'wrong_version.json', data) + report = validate_aggregate(fp) + assert report.valid is False + assert any('schema_version' in e['loc'] for e in report.errors) + def test_extra_field_on_evaluation_log_fails(self, tmp_path: Path): data = {**VALID_AGGREGATE, 'unexpected_field': 'oops'} fp = _write_json(tmp_path, 'extra.json', data) @@ -223,6 +230,16 @@ def test_valid_single_turn_passes(self, tmp_path: Path): assert report.valid is True assert report.line_count == 1 + def test_wrong_schema_version_fails(self, tmp_path: Path): + data = { + **VALID_SINGLE_TURN, + 'schema_version': 'instance_level_eval_0.2.1', + } + fp = _write_jsonl(tmp_path, 'wrong_version.jsonl', [data]) + report = validate_instance_file(fp) + assert report.valid is False + assert any('schema_version' in e['loc'] for e in report.errors) + def test_valid_multi_turn_passes(self, tmp_path: Path): fp = _write_jsonl(tmp_path, 'multi.jsonl', [VALID_MULTI_TURN]) report = validate_instance_file(fp)