From 52030cbe1675136a8eb1f4f58ece0740d171775c Mon Sep 17 00:00:00 2001 From: tommasocerruti Date: Sat, 16 May 2026 20:15:18 +0200 Subject: [PATCH] Fix LLM Stats evaluator provenance --- tests/test_llm_stats_adapter.py | 93 ++++++++- utils/llm_stats/adapter.py | 344 ++++++++++++++++++++++++++++++-- 2 files changed, 421 insertions(+), 16 deletions(-) diff --git a/tests/test_llm_stats_adapter.py b/tests/test_llm_stats_adapter.py index 1733852d2..515928c8f 100644 --- a/tests/test_llm_stats_adapter.py +++ b/tests/test_llm_stats_adapter.py @@ -86,7 +86,6 @@ def sample_payload() -> dict: 'model_id': 'claude-4-opus', 'benchmark_id': 'gpqa-diamond', 'score': 88.5, - 'source_url': 'https://example.org/claude-gpqa', }, ] }, @@ -163,6 +162,10 @@ def test_raw_citation_and_provenance_are_preserved(): first_details = first_result.score_details.details or {} assert first_details['raw_provenance_label'] == 'model_card' assert first_details['raw_verified'] == 'true' + assert first_details['raw_source_organization'] == 'openai' + assert first_details['relationship_inference_reason'] == ( + 'source_matches_model_developer' + ) assert 'https://openai.com/index/gpt-5-system-card/' in json.loads( first_details['source_urls_json'] ) @@ -176,6 +179,9 @@ def test_raw_citation_and_provenance_are_preserved(): other_result = logs['other'].evaluation_results[0] other_details = other_result.score_details.details or {} assert other_details['raw_provenance_label'] == 'unknown' + assert other_details['relationship_inference_reason'] == ( + 'no_provenance_signal' + ) def test_export_paths_follow_datastore_layout(tmp_path: Path): @@ -231,6 +237,91 @@ def test_scores_from_live_benchmark_detail_shape(): assert adapter.relationship_from_score(scores[0]) == 'first_party' +def test_extracts_model_page_score_sources(): + page_html = ( + r'{\"benchmark_id\":\"arc-agi-v2\",\"name\":\"ARC-AGI v2\",' + r'\"score\":0.065,\"self_reported\":false,' + 
r'\"self_reported_source\":\"https://x.com/xai/status/1943158495588815072\"}' + r'{\"benchmark_id\":\"gpqa\",\"name\":\"GPQA\",' + r'\"score\":0.936,\"self_reported\":true,' + r'\"self_reported_source\":\"https://openai.com/index/introducing-gpt-5-5/\"}' + ) + + sources = adapter.extract_model_page_score_sources(page_html) + + assert sources['arc-agi-v2']['self_reported'] is False + assert ( + sources['arc-agi-v2']['self_reported_source'] + == 'https://x.com/xai/status/1943158495588815072' + ) + assert sources['arc-agi-v2']['source_organization'] == 'xai' + assert sources['gpqa']['self_reported'] is True + assert sources['gpqa']['source_organization'] == 'openai' + + +def test_enrich_scores_with_model_page_sources(monkeypatch): + page_html = ( + r'{\"benchmark_id\":\"arc-agi-v2\",\"name\":\"ARC-AGI v2\",' + r'\"score\":0.065,\"self_reported\":false,' + r'\"self_reported_source\":\"https://x.com/xai/status/1943158495588815072\"}' + ) + monkeypatch.setattr(adapter, 'fetch_text', lambda _url: page_html) + scores = [ + { + 'model_id': 'o3-2025-04-16', + 'benchmark_id': 'arc-agi-v2', + 'score': 0.065, + } + ] + + enriched = adapter.enrich_scores_with_model_page_sources(scores) + + assert enriched[0]['self_reported'] is False + assert ( + enriched[0]['source_url'] + == 'https://x.com/xai/status/1943158495588815072' + ) + assert enriched[0]['source_organization'] == 'xai' + + +def test_relationship_uses_score_source_against_model_developer(): + openai_model = { + 'id': 'o3-2025-04-16', + 'name': 'o3', + 'organization_id': 'openai', + 'organization_name': 'OpenAI', + } + + assert ( + adapter.relationship_from_score( + {'source_url': 'https://openai.com/index/o3/', 'score': 0.8}, + openai_model, + ) + == 'first_party' + ) + assert ( + adapter.relationship_from_score( + { + 'source_url': 'https://x.com/xai/status/1943158495588815072', + 'score': 0.065, + }, + openai_model, + ) + == 'third_party' + ) + assert ( + adapter.relationship_from_score( + {'self_reported': False, 
'score': 0.065}, + openai_model, + ) + == 'third_party' + ) + assert ( + adapter.relationship_from_score({'score': 0.065}, openai_model) + == 'other' + ) + + def test_scores_from_live_benchmark_detail_handles_empty_model_id(): detail = { 'benchmark_id': 'gpqa', diff --git a/utils/llm_stats/adapter.py b/utils/llm_stats/adapter.py index 725605350..557985aa4 100644 --- a/utils/llm_stats/adapter.py +++ b/utils/llm_stats/adapter.py @@ -14,6 +14,7 @@ from __future__ import annotations import argparse +import html import json import os import re @@ -22,6 +23,9 @@ from dataclasses import dataclass from pathlib import Path from typing import Any +from urllib.parse import urlparse + +import requests from every_eval_ever.eval_types import ( EvalLibrary, @@ -92,6 +96,8 @@ URL_KEYS = ( 'citation_url', 'citationUrl', + 'self_reported_source', + 'selfReportedSource', 'source_url', 'sourceUrl', 'source_urls', @@ -107,6 +113,17 @@ 'reference_url', 'referenceUrl', ) +SOURCE_ORGANIZATION_KEYS = ( + 'source_organization', + 'sourceOrganization', + 'source_organization_name', + 'sourceOrganizationName', + 'source_org', + 'sourceOrg', + 'evaluator', + 'evaluator_name', + 'evaluatorName', +) MODEL_DETAIL_KEYS = ( 'id', 'slug', @@ -349,6 +366,7 @@ def fetch_payload(api_key: str, base_url: str) -> dict[str, Any]: base_url, headers, ) + scores = enrich_scores_with_model_page_sources(scores) return { 'models': models, @@ -383,6 +401,107 @@ def fetch_benchmark_score_payloads( return scores +def fetch_text(url: str) -> str: + try: + response = requests.get(url, timeout=60) + response.raise_for_status() + return response.text + except requests.exceptions.RequestException as exc: + raise FetchError(f'Failed to fetch {url}: {exc}') from exc + + +def llm_stats_model_page_url(model_id: str) -> str: + return f'https://llm-stats.com/models/{normalize_slug(model_id)}' + + +def extract_model_page_score_sources(page_html: str) -> dict[str, dict[str, Any]]: + text = 
html.unescape(page_html).replace('\\"', '"') + matches = re.finditer( + r'"benchmark_id":"(?P<benchmark_id>[^"]+)".*?' + r'"self_reported":(?P<self_reported>true|false|null).*?' + r'"self_reported_source":(?P<self_reported_source>null|"[^"]*")', + text, + flags=re.DOTALL, + ) + + sources: dict[str, dict[str, Any]] = {} + for match in matches: + benchmark_id = match.group('benchmark_id') + source_token = match.group('self_reported_source') + source_url = None + if source_token != 'null': + try: + source_url = json.loads(source_token) + except json.JSONDecodeError: + source_url = source_token.strip('"') + + self_reported_raw = match.group('self_reported') + self_reported = None + if self_reported_raw == 'true': + self_reported = True + elif self_reported_raw == 'false': + self_reported = False + + source_organization = source_organization_from_url(source_url) + sources[benchmark_id] = stringify_details( + { + 'self_reported': self_reported, + 'self_reported_source': source_url, + 'source_url': source_url, + 'source_organization': source_organization, + 'source_organization_inferred_from_url': bool( + source_organization + ), + 'source_domain': source_domain_from_url(source_url), + } + ) + + # Preserve booleans as booleans for relationship inference. 
+ if self_reported is not None: + sources[benchmark_id]['self_reported'] = self_reported + + return sources + + +def enrich_scores_with_model_page_sources( + scores_payload: Any, +) -> list[dict[str, Any]]: + scores = extract_collection(scores_payload, 'scores') + sources_by_model: dict[str, dict[str, dict[str, Any]]] = {} + + for score in scores: + model_id = score_model_ref(score) + if not model_id or model_id in sources_by_model: + continue + + try: + page_html = fetch_text(llm_stats_model_page_url(model_id)) + except FetchError as exc: + print(f'Skipping LLM Stats model page {model_id!r}: {exc}') + sources_by_model[model_id] = {} + continue + + sources_by_model[model_id] = extract_model_page_score_sources( + page_html + ) + + enriched = [] + for score in scores: + score_copy = dict(score) + model_id = score_model_ref(score_copy) + benchmark_id = score_benchmark_ref(score_copy) + page_source = ( + sources_by_model.get(model_id or '', {}).get(benchmark_id or '') + ) + if page_source: + for key, value in page_source.items(): + if key not in score_copy or score_copy[key] in (None, ''): + score_copy[key] = value + enriched.append(score_copy) + + return enriched + + def scores_from_benchmark_detail( detail: dict[str, Any], benchmark_summary: dict[str, Any] | None = None, @@ -762,20 +881,175 @@ def make_model_details(model: dict[str, Any]) -> dict[str, str]: return stringify_details(details) -def relationship_from_score(score: dict[str, Any]) -> str: +def source_domain_from_url(url: Any) -> str | None: + urls = extract_urls(url) + if not urls: + return None + parsed = urlparse(urls[0]) + host = parsed.netloc.lower().removeprefix('www.') + return host or None + + +def source_organization_from_url(url: Any) -> str | None: + urls = extract_urls(url) + if not urls: + return None + + parsed = urlparse(urls[0]) + host = parsed.netloc.lower().removeprefix('www.') + path_parts = [part for part in parsed.path.split('/') if part] + if host in {'x.com', 'twitter.com'} and 
path_parts: + return normalize_slug(path_parts[0]) + + domain_overrides = { + 'openai.com': 'openai', + 'anthropic.com': 'anthropic', + 'google.com': 'google', + 'blog.google': 'google', + 'deepmind.google': 'google', + 'ai.google.dev': 'google', + 'x.ai': 'xai', + 'artificialanalysis.ai': 'artificial-analysis', + } + if host in domain_overrides: + return domain_overrides[host] + + parts = host.split('.') + if len(parts) >= 2: + return normalize_slug(parts[-2]) + return normalize_slug(host) if host else None + + +def explicit_score_source_organization(score: dict[str, Any]) -> str | None: + inferred_from_url = bool_value( + first_present(score, ('source_organization_inferred_from_url',)) + ) + for key in SOURCE_ORGANIZATION_KEYS: + if key in {'source_organization', 'sourceOrganization'} and inferred_from_url: + continue + value = first_present(score, (key,)) + if value not in (None, ''): + return normalize_slug(value) + return None + + +def url_score_source_organization(score: dict[str, Any]) -> str | None: + for key in URL_KEYS: + source = source_organization_from_url(first_present(score, (key,))) + if source: + return source + return None + + +def score_source_organization(score: dict[str, Any]) -> str | None: + return explicit_score_source_organization( + score + ) or url_score_source_organization(score) + + +def model_organization_candidates( + score: dict[str, Any], + model: dict[str, Any] | None = None, +) -> set[str]: + candidates = set() + rows = [score] + if model is not None: + rows.append(model) + + for row in rows: + provider_slug, provider_name = provider_value(row) + for value in (provider_slug, provider_name): + if value not in (None, ''): + candidates.add(normalize_slug(value)) + for key in ( + 'organization_id', + 'organizationId', + 'organization_name', + 'organizationName', + 'provider_id', + 'providerId', + ): + value = first_present(row, (key,)) + if value not in (None, ''): + candidates.add(normalize_slug(value)) + + if model is not None: + 
raw_developer, _ = split_model_id(model_source_id(model)) + if raw_developer: + candidates.add(normalize_slug(raw_developer)) + + return {candidate for candidate in candidates if candidate != 'unknown'} + + +def bool_value(value: Any) -> bool | None: + if isinstance(value, bool): + return value + if isinstance(value, str): + text = value.strip().lower() + if text in {'true', '1', 'yes'}: + return True + if text in {'false', '0', 'no'}: + return False + return None + + +def score_self_reported(score: dict[str, Any]) -> bool | None: + for key in ( + 'is_self_reported', + 'self_reported', + 'isSelfReported', + 'selfReported', + ): + value = bool_value(first_present(score, (key,))) + if value is not None: + return value + return None + + +def relationship_inference( + score: dict[str, Any], + model: dict[str, Any] | None = None, +) -> tuple[str, str]: explicit_keys = ('evaluator_relationship',) + tuple(PROVENANCE_KEYS) for key in explicit_keys: explicit = first_present(score, (key,)) if isinstance(explicit, str) and explicit in RELATIONSHIP_VALUES: - return explicit + return explicit, f'explicit_{key}' + source_organization = explicit_score_source_organization(score) + model_organizations = model_organization_candidates(score, model) + if source_organization and source_organization in model_organizations: + return ( + EvaluatorRelationship.first_party.value, + 'explicit_source_matches_model_developer', + ) + if source_organization: + return ( + EvaluatorRelationship.third_party.value, + 'explicit_source_differs_from_model_developer', + ) + + url_source_organization = url_score_source_organization(score) if ( - score.get('is_self_reported') is True - or score.get('self_reported') is True + url_source_organization + and url_source_organization in model_organizations ): - return EvaluatorRelationship.first_party.value - if score.get('isSelfReported') is True or score.get('selfReported') is True: - return EvaluatorRelationship.first_party.value + return ( + 
EvaluatorRelationship.first_party.value, + 'source_matches_model_developer', + ) + + self_reported = score_self_reported(score) + if self_reported is True: + return EvaluatorRelationship.first_party.value, 'self_reported_true' + if self_reported is False: + return EvaluatorRelationship.third_party.value, 'self_reported_false' + + if url_source_organization: + return ( + EvaluatorRelationship.third_party.value, + 'source_differs_from_model_developer', + ) labels = [] for key in PROVENANCE_KEYS: @@ -785,7 +1059,7 @@ def relationship_from_score(score: dict[str, Any]) -> str: text = ' '.join(labels).lower().replace('-', '_').replace(' ', '_') if not text: - return EvaluatorRelationship.other.value + return EvaluatorRelationship.other.value, 'no_provenance_signal' if any( marker in text @@ -800,7 +1074,7 @@ def relationship_from_score(score: dict[str, Any]) -> str: 'selfreported', ) ): - return EvaluatorRelationship.first_party.value + return EvaluatorRelationship.first_party.value, 'provenance_text' if any( marker in text for marker in ( @@ -811,20 +1085,56 @@ def relationship_from_score(score: dict[str, Any]) -> str: 'thirdparty', ) ): - return EvaluatorRelationship.third_party.value + return EvaluatorRelationship.third_party.value, 'provenance_text' if 'collaborative' in text or 'joint' in text: - return EvaluatorRelationship.collaborative.value + return EvaluatorRelationship.collaborative.value, 'provenance_text' + + return EvaluatorRelationship.other.value, 'unrecognized_provenance_text' + - return EvaluatorRelationship.other.value +def relationship_from_score( + score: dict[str, Any], + model: dict[str, Any] | None = None, +) -> str: + relationship, _ = relationship_inference(score, model) + return relationship -def provenance_details(score: dict[str, Any]) -> dict[str, str]: +def provenance_details( + score: dict[str, Any], + model: dict[str, Any] | None = None, +) -> dict[str, str]: details: dict[str, Any] = {} raw_fields: dict[str, Any] = {} for key in 
PROVENANCE_KEYS: value = first_present(score, (key,)) if value not in (None, ''): raw_fields[key] = value + for key in ( + 'self_reported', + 'selfReported', + 'is_self_reported', + 'isSelfReported', + 'self_reported_source', + 'selfReportedSource', + 'source_organization', + 'sourceOrganization', + 'source_domain', + 'sourceDomain', + ): + value = first_present(score, (key,)) + if value not in (None, ''): + details[f'raw_{normalize_slug(key).replace("-", "_")}'] = value + if 'raw_source_organization' not in details: + source_organization = score_source_organization(score) + if source_organization: + details['raw_source_organization'] = source_organization + if 'raw_source_domain' not in details: + for key in URL_KEYS: + source_domain = source_domain_from_url(first_present(score, (key,))) + if source_domain: + details['raw_source_domain'] = source_domain + break if raw_fields: details['raw_provenance_fields_json'] = raw_fields details['raw_provenance_label'] = ' '.join( @@ -838,6 +1148,10 @@ def provenance_details(score: dict[str, Any]) -> dict[str, str]: if value not in (None, ''): details[f'raw_{normalize_slug(key).replace("-", "_")}'] = value + relationship, reason = relationship_inference(score, model) + details['inferred_evaluator_relationship'] = relationship + details['relationship_inference_reason'] = reason + return stringify_details(details) @@ -1086,7 +1400,7 @@ def make_score_details( if score_id not in (None, ''): details['raw_score_id'] = score_id - details.update(provenance_details(score)) + details.update(provenance_details(score, model)) return ScoreDetails( score=score_value, details=stringify_details(details), @@ -1210,7 +1524,7 @@ def make_logs( model = resolve_model(score, model_index) benchmark = resolve_benchmark(score, benchmark_index) model_info, developer, model_slug = normalize_model_info(model) - relationship = relationship_from_score(score) + relationship = relationship_from_score(score, model) result = make_evaluation_result(score, 
model, benchmark, base_url) if result is None: continue