From 52030cbe1675136a8eb1f4f58ece0740d171775c Mon Sep 17 00:00:00 2001
From: tommasocerruti <tommasocerruti@gmail.com>
Date: Sat, 16 May 2026 20:15:18 +0200
Subject: [PATCH] Fix LLM Stats evaluator provenance

---
 tests/test_llm_stats_adapter.py |  93 ++++++++-
 utils/llm_stats/adapter.py      | 344 ++++++++++++++++++++++++++++++--
 2 files changed, 421 insertions(+), 16 deletions(-)

diff --git a/tests/test_llm_stats_adapter.py b/tests/test_llm_stats_adapter.py
index 1733852d2..515928c8f 100644
--- a/tests/test_llm_stats_adapter.py
+++ b/tests/test_llm_stats_adapter.py
@@ -86,7 +86,6 @@ def sample_payload() -> dict:
                     'model_id': 'claude-4-opus',
                     'benchmark_id': 'gpqa-diamond',
                     'score': 88.5,
-                    'source_url': 'https://example.org/claude-gpqa',
                 },
             ]
         },
@@ -163,6 +162,10 @@ def test_raw_citation_and_provenance_are_preserved():
     first_details = first_result.score_details.details or {}
     assert first_details['raw_provenance_label'] == 'model_card'
     assert first_details['raw_verified'] == 'true'
+    assert first_details['raw_source_organization'] == 'openai'
+    assert first_details['relationship_inference_reason'] == (
+        'source_matches_model_developer'
+    )
     assert 'https://openai.com/index/gpt-5-system-card/' in json.loads(
         first_details['source_urls_json']
     )
@@ -176,6 +179,9 @@ def test_raw_citation_and_provenance_are_preserved():
     other_result = logs['other'].evaluation_results[0]
     other_details = other_result.score_details.details or {}
     assert other_details['raw_provenance_label'] == 'unknown'
+    assert other_details['relationship_inference_reason'] == (
+        'no_provenance_signal'
+    )
 
 
 def test_export_paths_follow_datastore_layout(tmp_path: Path):
@@ -231,6 +237,91 @@ def test_scores_from_live_benchmark_detail_shape():
     assert adapter.relationship_from_score(scores[0]) == 'first_party'
 
 
+def test_extracts_model_page_score_sources():
+    """Escaped score objects in page HTML are parsed per benchmark id."""
+    page_html = (
+        r'{\"benchmark_id\":\"arc-agi-v2\",\"name\":\"ARC-AGI v2\",'
+        r'\"score\":0.065,\"self_reported\":false,'
+        r'\"self_reported_source\":\"https://x.com/xai/status/1943158495588815072\"}'
+        r'{\"benchmark_id\":\"gpqa\",\"name\":\"GPQA\",'
+        r'\"score\":0.936,\"self_reported\":true,'
+        r'\"self_reported_source\":\"https://openai.com/index/introducing-gpt-5-5/\"}'
+    )
+    parsed = adapter.extract_model_page_score_sources(page_html)
+    arc, gpqa = parsed['arc-agi-v2'], parsed['gpqa']
+    assert arc['self_reported'] is False
+    assert (
+        arc['self_reported_source']
+        == 'https://x.com/xai/status/1943158495588815072'
+    )
+    assert arc['source_organization'] == 'xai'
+    assert gpqa['self_reported'] is True
+    assert gpqa['source_organization'] == 'openai'
+
+
+def test_enrich_scores_with_model_page_sources(monkeypatch):
+    """Fields scraped from the model page are copied onto the score row."""
+    page_html = (
+        r'{\"benchmark_id\":\"arc-agi-v2\",\"name\":\"ARC-AGI v2\",'
+        r'\"score\":0.065,\"self_reported\":false,'
+        r'\"self_reported_source\":\"https://x.com/xai/status/1943158495588815072\"}'
+    )
+    # Serve the canned page instead of touching the network.
+    monkeypatch.setattr(adapter, 'fetch_text', lambda _url: page_html)
+    score_row = {
+        'model_id': 'o3-2025-04-16',
+        'benchmark_id': 'arc-agi-v2',
+        'score': 0.065,
+    }
+
+    result = adapter.enrich_scores_with_model_page_sources([score_row])[0]
+
+    assert result['self_reported'] is False
+    assert (
+        result['source_url']
+        == 'https://x.com/xai/status/1943158495588815072'
+    )
+    assert result['source_organization'] == 'xai'
+
+
+def test_relationship_uses_score_source_against_model_developer():
+    """Score sources are compared with the model developer's identity."""
+    openai_model = {
+        'id': 'o3-2025-04-16',
+        'name': 'o3',
+        'organization_id': 'openai',
+        'organization_name': 'OpenAI',
+    }
+
+    expectations = [
+        # Source URL hosted by the developer itself.
+        (
+            {'source_url': 'https://openai.com/index/o3/', 'score': 0.8},
+            'first_party',
+        ),
+        # Source URL attributed to a different organization.
+        (
+            {
+                'source_url': 'https://x.com/xai/status/1943158495588815072',
+                'score': 0.065,
+            },
+            'third_party',
+        ),
+        # No source, but explicitly flagged as not self-reported.
+        (
+            {'self_reported': False, 'score': 0.065},
+            'third_party',
+        ),
+        # No provenance signal at all.
+        ({'score': 0.065}, 'other'),
+    ]
+
+    # Same call for every case; only the score row varies.
+    for score, expected in expectations:
+        actual = adapter.relationship_from_score(score, openai_model)
+        assert actual == expected
+
+
 def test_scores_from_live_benchmark_detail_handles_empty_model_id():
     detail = {
         'benchmark_id': 'gpqa',
diff --git a/utils/llm_stats/adapter.py b/utils/llm_stats/adapter.py
index 725605350..557985aa4 100644
--- a/utils/llm_stats/adapter.py
+++ b/utils/llm_stats/adapter.py
@@ -14,6 +14,7 @@
 from __future__ import annotations
 
 import argparse
+import html
 import json
 import os
 import re
@@ -22,6 +23,9 @@
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
+from urllib.parse import urlparse
+
+import requests
 
 from every_eval_ever.eval_types import (
     EvalLibrary,
@@ -92,6 +96,8 @@
 URL_KEYS = (
     'citation_url',
     'citationUrl',
+    'self_reported_source',
+    'selfReportedSource',
     'source_url',
     'sourceUrl',
     'source_urls',
@@ -107,6 +113,17 @@
     'reference_url',
     'referenceUrl',
 )
+SOURCE_ORGANIZATION_KEYS = (  # score fields read as an explicit source org
+    'source_organization',
+    'sourceOrganization',
+    'source_organization_name',
+    'sourceOrganizationName',
+    'source_org',
+    'sourceOrg',
+    'evaluator',
+    'evaluator_name',
+    'evaluatorName',
+)
 MODEL_DETAIL_KEYS = (
     'id',
     'slug',
@@ -349,6 +366,7 @@ def fetch_payload(api_key: str, base_url: str) -> dict[str, Any]:
             base_url,
             headers,
         )
+        scores = enrich_scores_with_model_page_sources(scores)
 
     return {
         'models': models,
@@ -383,6 +401,107 @@ def fetch_benchmark_score_payloads(
     return scores
 
 
+def fetch_text(url: str) -> str:  # GET url, return body text, or FetchError
+    try:
+        response = requests.get(url, timeout=60)  # 60 s connect/read timeout
+        response.raise_for_status()  # HTTP error statuses count as failures
+        return response.text
+    except requests.exceptions.RequestException as exc:  # incl. HTTPError
+        raise FetchError(f'Failed to fetch {url}: {exc}') from exc
+
+
+def llm_stats_model_page_url(model_id: str) -> str:  # slugified page URL
+    return f'https://llm-stats.com/models/{normalize_slug(model_id)}'
+
+
+def extract_model_page_score_sources(page_html: str) -> dict[str, dict[str, Any]]:
+    """Map benchmark id -> source fields scraped from a model page's HTML.
+
+    Only objects carrying ``benchmark_id``, ``self_reported`` and
+    ``self_reported_source`` (in that order) are picked up; when an id
+    occurs more than once the last occurrence wins.
+    """
+    text = html.unescape(page_html).replace('\\"', '"')
+    # The gaps must not run past the next "benchmark_id" key: otherwise an
+    # entry missing a field would borrow the provenance of a later benchmark.
+    gap = r'(?:(?!"benchmark_id":).)*?'
+    pattern = (
+        r'"benchmark_id":"(?P<benchmark_id>[^"]+)"' + gap
+        + r'"self_reported":(?P<self_reported>true|false|null)' + gap
+        + r'"self_reported_source":(?P<self_reported_source>null|"[^"]*")'
+    )
+    tristate = {'true': True, 'false': False, 'null': None}
+
+    sources: dict[str, dict[str, Any]] = {}
+    for match in re.finditer(pattern, text, flags=re.DOTALL):
+        source_token = match.group('self_reported_source')
+        source_url = None
+        if source_token != 'null':
+            try:
+                source_url = json.loads(source_token)
+            except json.JSONDecodeError:
+                source_url = source_token.strip('"')
+        self_reported = tristate[match.group('self_reported')]
+        source_organization = source_organization_from_url(source_url)
+        entry = stringify_details(
+            {
+                'self_reported': self_reported,
+                'self_reported_source': source_url,
+                'source_url': source_url,
+                'source_organization': source_organization,
+                'source_organization_inferred_from_url': bool(
+                    source_organization
+                ),
+                'source_domain': source_domain_from_url(source_url),
+            }
+        )
+        # Preserve booleans as booleans for relationship inference.
+        if self_reported is not None:
+            entry['self_reported'] = self_reported
+        sources[match.group('benchmark_id')] = entry
+
+    return sources
+
+
+def enrich_scores_with_model_page_sources(
+    scores_payload: Any,
+) -> list[dict[str, Any]]:
+    """Return score copies with blanks filled from each model's public page.
+
+    Pages are fetched once per model; a failed fetch is reported and skipped.
+    """
+    scores = extract_collection(scores_payload, 'scores')
+    sources_by_model: dict[str, dict[str, dict[str, Any]]] = {}
+    for score in scores:
+        model_id = score_model_ref(score)
+        if not model_id or model_id in sources_by_model:
+            continue
+        try:
+            page_html = fetch_text(llm_stats_model_page_url(model_id))
+        except FetchError as exc:
+            print(f'Skipping LLM Stats model page {model_id!r}: {exc}')
+            sources_by_model[model_id] = {}
+            continue
+        sources_by_model[model_id] = extract_model_page_score_sources(
+            page_html
+        )
+
+    enriched = []
+    for score in scores:
+        row = dict(score)
+        by_benchmark = sources_by_model.get(score_model_ref(row) or '', {})
+        page_source = by_benchmark.get(score_benchmark_ref(row) or '') or {}
+        # An organization the score already names is explicit: never overlay
+        # the page's URL-derived guess or its "inferred" flag on top of it.
+        named = row.get('source_organization') or row.get('sourceOrganization')
+        for key, value in page_source.items():
+            keep_own = named and key.startswith('source_organization')
+            if not keep_own and row.get(key) in (None, ''):
+                row[key] = value
+        enriched.append(row)
+    return enriched
+
+
 def scores_from_benchmark_detail(
     detail: dict[str, Any],
     benchmark_summary: dict[str, Any] | None = None,
@@ -762,20 +881,175 @@ def make_model_details(model: dict[str, Any]) -> dict[str, str]:
     return stringify_details(details)
 
 
-def relationship_from_score(score: dict[str, Any]) -> str:
+def source_domain_from_url(url: Any) -> str | None:
+    """Return the host of the first URL found in *url*, minus any 'www.'."""
+    urls = extract_urls(url)
+    if not urls:
+        return None
+    # hostname, unlike netloc, is lowercased and excludes userinfo and port.
+    return (urlparse(urls[0]).hostname or '').removeprefix('www.') or None
+
+
+def source_organization_from_url(url: Any) -> str | None:
+    """Guess an organization slug from the first URL found in *url*."""
+    urls = extract_urls(url)
+    if not urls:
+        return None
+    parsed = urlparse(urls[0])
+    # hostname (unlike netloc) drops userinfo/port: 'x.ai:443' must map to xai.
+    host = (parsed.hostname or '').removeprefix('www.')
+    path_parts = [part for part in parsed.path.split('/') if part]
+    if host in {'x.com', 'twitter.com'} and path_parts:
+        return normalize_slug(path_parts[0])
+
+    domain_overrides = {
+        'openai.com': 'openai',
+        'anthropic.com': 'anthropic',
+        'google.com': 'google',
+        'blog.google': 'google',
+        'deepmind.google': 'google',
+        'ai.google.dev': 'google',
+        'x.ai': 'xai',
+        'artificialanalysis.ai': 'artificial-analysis',
+    }
+    if host in domain_overrides:
+        return domain_overrides[host]
+    parts = host.split('.')
+    if len(parts) >= 2:
+        return normalize_slug(parts[-2])
+    return normalize_slug(host) if host else None
+
+
+def explicit_score_source_organization(score: dict[str, Any]) -> str | None:
+    """Return the slug of an organization the score names outright, if any."""
+    flag = first_present(score, ('source_organization_inferred_from_url',))
+    # A URL-derived organization is a guess, not an explicit statement.
+    guessed = {'source_organization', 'sourceOrganization'}
+    skipped = guessed if bool_value(flag) else set()
+    for key in SOURCE_ORGANIZATION_KEYS:
+        value = None if key in skipped else first_present(score, (key,))
+        if value not in (None, ''):
+            return normalize_slug(value)
+    return None
+
+
+def url_score_source_organization(score: dict[str, Any]) -> str | None:
+    """Return the first organization derivable from the score's URL fields."""
+    for key in URL_KEYS:
+        if org := source_organization_from_url(first_present(score, (key,))):
+            return org
+    return None
+
+
+def score_source_organization(score: dict[str, Any]) -> str | None:
+    """Prefer an explicitly named organization; fall back to a URL guess."""
+    explicit = explicit_score_source_organization(score)
+    return explicit or url_score_source_organization(score)
+
+
+def model_organization_candidates(
+    score: dict[str, Any],
+    model: dict[str, Any] | None = None,
+) -> set[str]:
+    """Collect normalized slugs that may identify the model's developer.
+
+    Provider and organization fields are read from the score row and, when
+    given, the model row; the first component that split_model_id yields
+    for the model's source id is added too. 'unknown' is never returned.
+    """
+    org_keys = (
+        'organization_id',
+        'organizationId',
+        'organization_name',
+        'organizationName',
+        'provider_id',
+        'providerId',
+    )
+    rows = [score] if model is None else [score, model]
+    raw: list[Any] = []
+    for row in rows:
+        raw.extend(provider_value(row))
+        raw.extend(first_present(row, (key,)) for key in org_keys)
+    if model is not None:
+        raw_developer, _ = split_model_id(model_source_id(model))
+        if raw_developer:
+            raw.append(raw_developer)
+
+    slugs = {
+        normalize_slug(value) for value in raw if value not in (None, '')
+    }
+    return slugs - {'unknown'}
+
+
+def bool_value(value: Any) -> bool | None:
+    """Coerce a bool or a true/false-like string to bool; otherwise None."""
+    if isinstance(value, bool):
+        return value
+    if not isinstance(value, str):
+        return None
+    token = value.strip().lower()
+    if token in {'true', '1', 'yes', 'false', '0', 'no'}:
+        return token in {'true', '1', 'yes'}
+    return None
+
+
+def score_self_reported(score: dict[str, Any]) -> bool | None:
+    """Return the first decisive self-reported flag on the score, else None."""
+    flag_keys = (
+        'is_self_reported',
+        'self_reported',
+        'isSelfReported',
+        'selfReported',
+    )
+    flags = (bool_value(first_present(score, (key,))) for key in flag_keys)
+    # Generators are lazy, so lookup stops at the first non-None flag.
+    return next((flag for flag in flags if flag is not None), None)
+
+
+def relationship_inference(
+    score: dict[str, Any],
+    model: dict[str, Any] | None = None,
+) -> tuple[str, str]:
     explicit_keys = ('evaluator_relationship',) + tuple(PROVENANCE_KEYS)
     for key in explicit_keys:
         explicit = first_present(score, (key,))
         if isinstance(explicit, str) and explicit in RELATIONSHIP_VALUES:
-            return explicit
+            return explicit, f'explicit_{key}'
 
+    source_organization = explicit_score_source_organization(score)
+    model_organizations = model_organization_candidates(score, model)
+    if source_organization and source_organization in model_organizations:
+        return (
+            EvaluatorRelationship.first_party.value,
+            'explicit_source_matches_model_developer',
+        )
+    if source_organization:
+        return (
+            EvaluatorRelationship.third_party.value,
+            'explicit_source_differs_from_model_developer',
+        )
+
+    url_source_organization = url_score_source_organization(score)
     if (
-        score.get('is_self_reported') is True
-        or score.get('self_reported') is True
+        url_source_organization
+        and url_source_organization in model_organizations
     ):
-        return EvaluatorRelationship.first_party.value
-    if score.get('isSelfReported') is True or score.get('selfReported') is True:
-        return EvaluatorRelationship.first_party.value
+        return (
+            EvaluatorRelationship.first_party.value,
+            'source_matches_model_developer',
+        )
+
+    self_reported = score_self_reported(score)
+    if self_reported is True:
+        return EvaluatorRelationship.first_party.value, 'self_reported_true'
+    if self_reported is False:
+        return EvaluatorRelationship.third_party.value, 'self_reported_false'
+
+    if url_source_organization:
+        return (
+            EvaluatorRelationship.third_party.value,
+            'source_differs_from_model_developer',
+        )
 
     labels = []
     for key in PROVENANCE_KEYS:
@@ -785,7 +1059,7 @@ def relationship_from_score(score: dict[str, Any]) -> str:
 
     text = ' '.join(labels).lower().replace('-', '_').replace(' ', '_')
     if not text:
-        return EvaluatorRelationship.other.value
+        return EvaluatorRelationship.other.value, 'no_provenance_signal'
 
     if any(
         marker in text
@@ -800,7 +1074,7 @@ def relationship_from_score(score: dict[str, Any]) -> str:
             'selfreported',
         )
     ):
-        return EvaluatorRelationship.first_party.value
+        return EvaluatorRelationship.first_party.value, 'provenance_text'
     if any(
         marker in text
         for marker in (
@@ -811,20 +1085,56 @@ def relationship_from_score(score: dict[str, Any]) -> str:
             'thirdparty',
         )
     ):
-        return EvaluatorRelationship.third_party.value
+        return EvaluatorRelationship.third_party.value, 'provenance_text'
     if 'collaborative' in text or 'joint' in text:
-        return EvaluatorRelationship.collaborative.value
+        return EvaluatorRelationship.collaborative.value, 'provenance_text'
+
+    return EvaluatorRelationship.other.value, 'unrecognized_provenance_text'
+
 
-    return EvaluatorRelationship.other.value
+def relationship_from_score(
+    score: dict[str, Any],
+    model: dict[str, Any] | None = None,
+) -> str:
+    """Return only the relationship label from relationship_inference."""
+    return relationship_inference(score, model)[0]
 
 
-def provenance_details(score: dict[str, Any]) -> dict[str, str]:
+def provenance_details(
+    score: dict[str, Any],
+    model: dict[str, Any] | None = None,
+) -> dict[str, str]:
     details: dict[str, Any] = {}
     raw_fields: dict[str, Any] = {}
     for key in PROVENANCE_KEYS:
         value = first_present(score, (key,))
         if value not in (None, ''):
             raw_fields[key] = value
+    for key in (
+        'self_reported',
+        'selfReported',
+        'is_self_reported',
+        'isSelfReported',
+        'self_reported_source',
+        'selfReportedSource',
+        'source_organization',
+        'sourceOrganization',
+        'source_domain',
+        'sourceDomain',
+    ):
+        value = first_present(score, (key,))
+        if value not in (None, ''):
+            details[f'raw_{normalize_slug(key).replace("-", "_")}'] = value
+    if 'raw_source_organization' not in details:
+        source_organization = score_source_organization(score)
+        if source_organization:
+            details['raw_source_organization'] = source_organization
+    if 'raw_source_domain' not in details:
+        for key in URL_KEYS:
+            source_domain = source_domain_from_url(first_present(score, (key,)))
+            if source_domain:
+                details['raw_source_domain'] = source_domain
+                break
     if raw_fields:
         details['raw_provenance_fields_json'] = raw_fields
         details['raw_provenance_label'] = ' '.join(
@@ -838,6 +1148,10 @@ def provenance_details(score: dict[str, Any]) -> dict[str, str]:
         if value not in (None, ''):
             details[f'raw_{normalize_slug(key).replace("-", "_")}'] = value
 
+    relationship, reason = relationship_inference(score, model)
+    details['inferred_evaluator_relationship'] = relationship
+    details['relationship_inference_reason'] = reason
+
     return stringify_details(details)
 
 
@@ -1086,7 +1400,7 @@ def make_score_details(
     if score_id not in (None, ''):
         details['raw_score_id'] = score_id
 
-    details.update(provenance_details(score))
+    details.update(provenance_details(score, model))
     return ScoreDetails(
         score=score_value,
         details=stringify_details(details),
@@ -1210,7 +1524,7 @@ def make_logs(
         model = resolve_model(score, model_index)
         benchmark = resolve_benchmark(score, benchmark_index)
         model_info, developer, model_slug = normalize_model_info(model)
-        relationship = relationship_from_score(score)
+        relationship = relationship_from_score(score, model)
         result = make_evaluation_result(score, model, benchmark, base_url)
         if result is None:
             continue