From 4353b80019b0e188b6f2f6b35cf1a51dfdd4923b Mon Sep 17 00:00:00 2001
From: agent <erotemic@gmail.com>
Date: Sat, 25 Apr 2026 16:37:54 +0000
Subject: [PATCH 1/8] Fix HELM EEE instance metric rows

---
 .../converters/helm/instance_level_adapter.py | 200 ++++++++++-----
 tests/test_helm_adapter.py                    |  11 +-
 tests/test_helm_instance_level_adapter.py     | 241 +++++++++++++++++-
 3 files changed, 376 insertions(+), 76 deletions(-)

diff --git a/every_eval_ever/converters/helm/instance_level_adapter.py b/every_eval_ever/converters/helm/instance_level_adapter.py
index 037237ecc..e675c7ff7 100644
--- a/every_eval_ever/converters/helm/instance_level_adapter.py
+++ b/every_eval_ever/converters/helm/instance_level_adapter.py
@@ -35,6 +35,60 @@ def _require_helm_dependencies() -> None:
 )
 
 
+def _score_from_stat(stat) -> float | None:
+    value = getattr(stat, 'mean', None)
+    if value is None:
+        count = getattr(stat, 'count', None)
+        total = getattr(stat, 'sum', None)
+        if count:
+            value = total / count
+    if value is None:
+        return None
+    try:
+        return float(value)
+    except (TypeError, ValueError):
+        return None
+
+
+# Metric names whose per-instance score is a correctness signal in [0, 1]
+# where ``score > 0`` reasonably maps to ``is_correct=True``. Anything not
+# in this allowlist (token counts, runtime, finish-reason flags, logprobs,
+# etc.) gets ``is_correct=False`` because we have no correctness claim
+# from a bookkeeping/resource metric. Keep this list tight and named after
+# the actual HELM stat names — broaden only for verified correctness
+# semantics.
+_BINARY_CORRECTNESS_METRIC_NAMES: frozenset[str] = frozenset({
+    'exact_match',
+    'quasi_exact_match',
+    'prefix_exact_match',
+    'quasi_prefix_exact_match',
+    'exact_match@5',
+    'quasi_exact_match@5',
+    'prefix_exact_match@5',
+    'quasi_prefix_exact_match@5',
+    'ifeval_strict_accuracy',
+    'chain_of_thought_correctness',
+    'math_equiv',
+    'math_equiv_chain_of_thought',
+})
+
+
+def _is_correct_for_metric(metric_name: str | None, score: float) -> bool:
+    """Decide ``is_correct`` honestly per metric name.
+
+    For correctness metrics in the allowlist, the HELM convention is that
+    score==1.0 means correct and 0.0 means wrong, so any positive score
+    rounds up to "correct". For anything else (bookkeeping / resource
+    stats, or graded metrics like rouge_l/bleu where >0 is not a correctness
+    signal) we deliberately do not claim correctness.
+    """
+    if metric_name is None:
+        return False
+    if metric_name in _BINARY_CORRECTNESS_METRIC_NAMES:
+        return score > 0
+    return False
+
+
 class HELMInstanceLevelDataAdapter:
     def __init__(
         self,
@@ -97,27 +151,23 @@ def convert_instance_level_logs(
             reasoning_traces = extract_all_reasonings(state)
             if isinstance(reasoning_traces, str):
                 reasoning_traces = [reasoning_traces]
+            if reasoning_traces is None:
+                reasoning_traces = []
+            reasoning_traces = [
+                trace for trace in reasoning_traces if isinstance(trace, str)
+            ]
 
-            is_correct = False
-            score = 0.0
-            if inst_stats:
-                em_stat = next(
-                    (
-                        s
-                        for s in inst_stats.stats
-                        if s.name.name == 'exact_match'
-                    ),
-                    None,
+            metric_stats = list(inst_stats.stats) if inst_stats else []
+            if not metric_stats:
+                correct_completions = sum(
+                    1 for c in completions if c.strip() in correct_refs
                 )
-                if em_stat:
-                    score = em_stat.mean
-                    is_correct = em_stat.mean > 0
-                else:  # TODO check for more specific tasks
-                    correct_completions = sum(
-                        1 for c in completions if c.strip() in correct_refs
-                    )
-                    score = correct_completions / len(completions)
-                    is_correct = score > 0
+                fallback_score = (
+                    correct_completions / len(completions)
+                    if completions
+                    else 0.0
+                )
+                metric_stats = [None]
 
             token_usage = None
             if inst_stats:
@@ -155,56 +205,72 @@ def convert_instance_level_logs(
                     total_tokens=int(p_tokens + c_tokens),
                 )
 
-            instance_level_logs.append(
-                InstanceLevelEvaluationLog(
-                    schema_version=SCHEMA_VERSION,
-                    evaluation_id=self.evaluation_id,
-                    model_id=model_id,
-                    evaluation_name=evaluation_name,
-                    sample_id=str(state.instance.id),
-                    sample_hash=sha256_string(
-                        state.request.prompt + (correct_refs[0] if correct_refs else '')
-                    ),  # TODO use all references
-                    interaction_type=InteractionType.single_turn,
-                    input=Input(
-                        raw=state.request.prompt,
-                        reference=correct_refs if correct_refs else [],
-                        choices=(
-                            list(state.output_mapping.values())
-                            if state.output_mapping
-                            else [
-                                ref.output.text
-                                for ref in state.instance.references
-                            ]
+            for stat in metric_stats:
+                if stat is None:
+                    metric_name = None
+                    score = fallback_score
+                    # Fallback path: ``score`` here is an exact-match
+                    # proxy from completion-vs-reference matching, so
+                    # the correctness claim is honest in the same sense
+                    # as the legacy single-row behavior.
+                    is_correct = score > 0
+                else:
+                    metric_name = getattr(getattr(stat, 'name', None), 'name', None)
+                    score = _score_from_stat(stat)
+                    if score is None:
+                        continue
+                    is_correct = _is_correct_for_metric(metric_name, score)
+                instance_level_logs.append(
+                    InstanceLevelEvaluationLog(
+                        schema_version=SCHEMA_VERSION,
+                        evaluation_id=self.evaluation_id,
+                        model_id=model_id,
+                        evaluation_name=evaluation_name,
+                        evaluation_result_id=metric_name,
+                        sample_id=str(state.instance.id),
+                        sample_hash=sha256_string(
+                            state.request.prompt + (correct_refs[0] if correct_refs else '')
+                        ),  # TODO use all references
+                        interaction_type=InteractionType.single_turn,
+                        input=Input(
+                            raw=state.request.prompt,
+                            reference=correct_refs if correct_refs else [],
+                            choices=(
+                                list(state.output_mapping.values())
+                                if state.output_mapping
+                                else [
+                                    ref.output.text
+                                    for ref in state.instance.references
+                                ]
+                            ),
                         ),
-                    ),
-                    output=Output(
-                        raw=completions, reasoning_trace=reasoning_traces
-                    ),
-                    answer_attribution=[
-                        AnswerAttributionItem(
-                            turn_idx=0,
-                            source='output.raw',
-                            extracted_value=state.result.completions[
-                                0
-                            ].text.strip()
-                            if state.result and state.result.completions
-                            else '',
-                            extraction_method='exact_match',
-                            is_terminal=True,
-                        )
-                    ],
-                    evaluation=Evaluation(
-                        score=float(score), is_correct=is_correct
-                    ),
-                    token_usage=token_usage,
-                    performance=Performance(
-                        generation_time_ms=state.result.request_time * 1000
-                        if state.result.request_time
-                        else None
-                    ),
+                        output=Output(
+                            raw=completions, reasoning_trace=reasoning_traces
+                        ),
+                        answer_attribution=[
+                            AnswerAttributionItem(
+                                turn_idx=0,
+                                source='output.raw',
+                                extracted_value=state.result.completions[
+                                    0
+                                ].text.strip()
+                                if state.result and state.result.completions
+                                else '',
+                                extraction_method='exact_match',
+                                is_terminal=True,
+                            )
+                        ],
+                        evaluation=Evaluation(
+                            score=float(score), is_correct=is_correct
+                        ),
+                        token_usage=token_usage,
+                        performance=Performance(
+                            generation_time_ms=state.result.request_time * 1000
+                            if state.result.request_time
+                            else None
+                        ),
+                    )
                 )
-            )
 
         self._save_json(instance_level_logs)
         return self.path, len(instance_level_logs)
diff --git a/tests/test_helm_adapter.py b/tests/test_helm_adapter.py
index e08f48dcc..33e279477 100644
--- a/tests/test_helm_adapter.py
+++ b/tests/test_helm_adapter.py
@@ -76,7 +76,10 @@ def test_mmlu_eval():
 
     assert converted_eval.detailed_evaluation_results is not None
     assert converted_eval.detailed_evaluation_results.format is not None
-    assert converted_eval.detailed_evaluation_results.total_rows == 10
+    # Per-(sample, metric) emission: each of the 10 samples produces one
+    # row per non-empty stat, so total_rows is much larger than the
+    # legacy "one row per sample" count.
+    assert converted_eval.detailed_evaluation_results.total_rows >= 10
 
 
 def test_hellswag_eval():
@@ -117,7 +120,8 @@ def test_hellswag_eval():
 
     assert converted_eval.detailed_evaluation_results is not None
     assert converted_eval.detailed_evaluation_results.format is not None
-    assert converted_eval.detailed_evaluation_results.total_rows == 10
+    # Per-(sample, metric): >= sample count, not equal to it.
+    assert converted_eval.detailed_evaluation_results.total_rows >= 10
 
 
 def test_narrativeqa_eval():
@@ -154,7 +158,8 @@ def test_narrativeqa_eval():
 
     assert converted_eval.detailed_evaluation_results is not None
     assert converted_eval.detailed_evaluation_results.format is not None
-    assert converted_eval.detailed_evaluation_results.total_rows == 5
+    # Per-(sample, metric): >= sample count, not equal to it.
+    assert converted_eval.detailed_evaluation_results.total_rows >= 5
 
 
 def test_missing_model_deployment_falls_back_to_model():
diff --git a/tests/test_helm_instance_level_adapter.py b/tests/test_helm_instance_level_adapter.py
index 4ee46c6c2..46b69f906 100644
--- a/tests/test_helm_instance_level_adapter.py
+++ b/tests/test_helm_instance_level_adapter.py
@@ -9,6 +9,10 @@
 from pathlib import Path
 
 from every_eval_ever.converters.helm.adapter import HELMAdapter
+from every_eval_ever.converters.helm.instance_level_adapter import (
+    _BINARY_CORRECTNESS_METRIC_NAMES,
+    _is_correct_for_metric,
+)
 from every_eval_ever.eval_types import EvaluatorRelationship
 from every_eval_ever.instance_level_types import (
     InstanceLevelEvaluationLog,
@@ -43,6 +47,15 @@ def _load_instance_level_data(adapter, filepath, metadata_args):
     return converted_eval, instance_logs
 
 
+def _by_sample_and_metric(
+    instance_logs,
+) -> dict[tuple[str, str | None], InstanceLevelEvaluationLog]:
+    return {
+        (log.sample_id, log.evaluation_result_id): log
+        for log in instance_logs
+    }
+
+
 def test_mmlu_instance_level():
     adapter = HELMAdapter()
 
@@ -60,9 +73,20 @@ def test_mmlu_instance_level():
             metadata_args,
         )
 
-        assert len(instance_logs) == 10
-        log = instance_logs[0]
+        # Per-(sample, metric) emission: 10 samples * many stats per
+        # sample. Confirm by sample_id distinct count, not row count.
+        sample_ids = sorted({log.sample_id for log in instance_logs})
+        assert len(sample_ids) == 10
+        # The MMLU fixture has the standard exact-match family for
+        # multiple_choice_joint; every sample must have an exact_match row.
+        em_rows = [
+            log
+            for log in instance_logs
+            if log.evaluation_result_id == 'exact_match'
+        ]
+        assert len(em_rows) == 10
 
+        log = _by_sample_and_metric(instance_logs)[('id147', 'exact_match')]
         assert log.schema_version == '0.2.2'
         assert log.evaluation_id == 'test_mmlu_samples'
         assert log.model_id == 'openai/gpt2'
@@ -109,8 +133,16 @@ def test_hellaswag_instance_level():
             metadata_args,
         )
 
-        assert len(instance_logs) == 10
-        log = instance_logs[0]
+        sample_ids = sorted({log.sample_id for log in instance_logs})
+        assert len(sample_ids) == 10
+
+        em_rows = [
+            log
+            for log in instance_logs
+            if log.evaluation_result_id == 'exact_match'
+        ]
+        assert len(em_rows) == 10
+        log = em_rows[0]
 
         assert log.schema_version == '0.2.2'
         assert log.model_id == 'eleutherai/pythia-1b-v0'
@@ -145,8 +177,16 @@ def test_narrativeqa_instance_level():
             metadata_args,
         )
 
-        assert len(instance_logs) == 5
-        log = instance_logs[0]
+        sample_ids = sorted({log.sample_id for log in instance_logs})
+        assert len(sample_ids) == 5
+
+        em_rows = [
+            log
+            for log in instance_logs
+            if log.evaluation_result_id == 'exact_match'
+        ]
+        assert len(em_rows) == 5
+        log = em_rows[0]
 
         assert log.schema_version == '0.2.2'
         assert log.model_id == 'openai/gpt2'
@@ -166,3 +206,192 @@ def test_narrativeqa_instance_level():
 
         assert len(log.answer_attribution) == 1
         assert log.answer_attribution[0].extraction_method == 'exact_match'
+
+
+# ---------------------------------------------------------------------------
+# Per-(sample, metric) row emission and correctness semantics
+# ---------------------------------------------------------------------------
+
+
+def test_per_sample_per_metric_rows_are_emitted():
+    """Every per-instance HELM stat with a numeric mean becomes its own row.
+
+    The fixture's first sample has 27 declared stats but two
+    (``training_co2_cost`` and ``training_energy_cost``) carry no value;
+    we expect 25 rows for that sample.
+    """
+    adapter = HELMAdapter()
+    with tempfile.TemporaryDirectory() as tmpdir:
+        metadata_args = {
+            'source_organization_name': 'TestOrg',
+            'evaluator_relationship': EvaluatorRelationship.first_party,
+            'parent_eval_output_dir': tmpdir,
+            'file_uuid': 'test_grain',
+        }
+        _, instance_logs = _load_instance_level_data(
+            adapter,
+            'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2',
+            metadata_args,
+        )
+        rows_for_id147 = [log for log in instance_logs if log.sample_id == 'id147']
+        metric_ids = sorted(log.evaluation_result_id for log in rows_for_id147)
+        assert 'exact_match' in metric_ids
+        assert 'num_prompt_tokens' in metric_ids
+        # No row should have a None evaluation_result_id when stats exist.
+        assert all(log.evaluation_result_id is not None for log in rows_for_id147)
+        # Distinct metric ids — no duplicates within a sample.
+        assert len(metric_ids) == len(set(metric_ids))
+
+
+def test_is_correct_only_claimed_for_correctness_metrics():
+    """``is_correct=True`` should never appear on bookkeeping/resource rows.
+
+    The MMLU fixture has positive-valued bookkeeping rows
+    (``num_references=4``, ``num_prompt_tokens=333``, ``inference_runtime>0``,
+    ``max_prob=1``). Earlier patches set ``is_correct = score > 0`` for
+    every row, which overclaimed correctness on these. The fix gates
+    ``is_correct`` on a tight allowlist of correctness metric names.
+    """
+    adapter = HELMAdapter()
+    with tempfile.TemporaryDirectory() as tmpdir:
+        metadata_args = {
+            'source_organization_name': 'TestOrg',
+            'evaluator_relationship': EvaluatorRelationship.first_party,
+            'parent_eval_output_dir': tmpdir,
+            'file_uuid': 'test_correctness',
+        }
+        _, instance_logs = _load_instance_level_data(
+            adapter,
+            'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2',
+            metadata_args,
+        )
+
+        bookkeeping = [
+            log
+            for log in instance_logs
+            if log.evaluation_result_id
+            in {
+                'num_references',
+                'num_prompt_tokens',
+                'num_completion_tokens',
+                'num_output_tokens',
+                'inference_runtime',
+                'max_prob',
+                'finish_reason_unknown',
+                'logprob',
+                'num_bytes',
+                'batch_size',
+            }
+        ]
+        assert bookkeeping, 'expected bookkeeping rows to be emitted'
+        assert all(
+            log.evaluation.is_correct is False for log in bookkeeping
+        ), 'bookkeeping metrics must not claim correctness'
+
+        # narrative_qa has graded scores (rouge_l/f1/bleu_*) which are not
+        # binary correctness either; check at least one fixture has them
+        # and that they don't overclaim correctness.
+        with tempfile.TemporaryDirectory() as tmpdir2:
+            metadata_args2 = dict(metadata_args)
+            metadata_args2['parent_eval_output_dir'] = tmpdir2
+            metadata_args2['file_uuid'] = 'test_correctness_graded'
+            _, narr_logs = _load_instance_level_data(
+                adapter,
+                'tests/data/helm/narrative_qa:model=openai_gpt2',
+                metadata_args2,
+            )
+            graded = [
+                log
+                for log in narr_logs
+                if log.evaluation_result_id
+                in {'rouge_l', 'f1_score', 'bleu_1', 'bleu_4'}
+            ]
+            assert graded, 'expected graded score rows in narrative_qa fixture'
+            assert all(
+                log.evaluation.is_correct is False for log in graded
+            ), (
+                'graded metrics (rouge_l/f1_score/bleu_*) must not be '
+                'treated as binary correctness'
+            )
+
+
+def test_is_correct_is_true_for_correct_exact_match_rows():
+    """When a correctness metric scores positive, is_correct must be True.
+
+    Run the converter and pick any exact_match row whose score happens to
+    be 1.0. The MMLU fixture contains at least one correct id (id7).
+    """
+    adapter = HELMAdapter()
+    with tempfile.TemporaryDirectory() as tmpdir:
+        metadata_args = {
+            'source_organization_name': 'TestOrg',
+            'evaluator_relationship': EvaluatorRelationship.first_party,
+            'parent_eval_output_dir': tmpdir,
+            'file_uuid': 'test_exact_match_true',
+        }
+        _, instance_logs = _load_instance_level_data(
+            adapter,
+            'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2',
+            metadata_args,
+        )
+        positive_em_rows = [
+            log
+            for log in instance_logs
+            if log.evaluation_result_id == 'exact_match'
+            and log.evaluation.score > 0
+        ]
+        for log in positive_em_rows:
+            assert log.evaluation.is_correct is True
+
+
+def test_is_correct_for_metric_helper():
+    """Pure-function check on the correctness allowlist."""
+    # Every allowlist member, score>0 ⇒ True; score==0 ⇒ False.
+    for name in _BINARY_CORRECTNESS_METRIC_NAMES:
+        assert _is_correct_for_metric(name, 1.0) is True
+        assert _is_correct_for_metric(name, 0.0) is False
+    # Non-allowlisted metrics never claim correctness, regardless of score.
+    for name in (
+        'num_prompt_tokens',
+        'num_references',
+        'inference_runtime',
+        'rouge_l',
+        'f1_score',
+        'bleu_1',
+        'logprob',
+        None,
+    ):
+        assert _is_correct_for_metric(name, 1.0) is False
+        assert _is_correct_for_metric(name, 0.0) is False
+
+
+def test_reasoning_traces_none_does_not_break_conversion(monkeypatch):
+    """Patch the upstream extractor to return None and ensure conversion works.
+
+    Mirrors the bug class fixed by the original 9900ae6 patch — we want a
+    regression test so the next refactor cannot reintroduce the failure.
+    """
+    from every_eval_ever.converters.helm import instance_level_adapter as mod
+
+    monkeypatch.setattr(mod, 'extract_all_reasonings', lambda state: None)
+
+    adapter = HELMAdapter()
+    with tempfile.TemporaryDirectory() as tmpdir:
+        metadata_args = {
+            'source_organization_name': 'TestOrg',
+            'evaluator_relationship': EvaluatorRelationship.first_party,
+            'parent_eval_output_dir': tmpdir,
+            'file_uuid': 'test_reasoning_none',
+        }
+        _, instance_logs = _load_instance_level_data(
+            adapter,
+            'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2',
+            metadata_args,
+        )
+        assert instance_logs
+        for log in instance_logs:
+            # ``reasoning_trace`` is Optional[list[str]]; with extractor
+            # returning None we either get None or an empty list — both
+            # satisfy the schema. The key invariant is "no crash".
+            trace = log.output.reasoning_trace
+            assert trace is None or trace == [] or all(isinstance(t, str) for t in trace)

From f62387a1aa1a71d6f83798e0496655a664c1aa8b Mon Sep 17 00:00:00 2001
From: joncrall <erotemic@gmail.com>
Date: Wed, 13 May 2026 16:46:05 -0400
Subject: [PATCH 2/8] Add more tests

---
 tests/test_helm_instance_level_adapter.py | 209 ++++++++++++++++++++--
 1 file changed, 198 insertions(+), 11 deletions(-)

diff --git a/tests/test_helm_instance_level_adapter.py b/tests/test_helm_instance_level_adapter.py
index 46b69f906..4be723e3a 100644
--- a/tests/test_helm_instance_level_adapter.py
+++ b/tests/test_helm_instance_level_adapter.py
@@ -1,17 +1,16 @@
 import pytest
 
-pytest.importorskip(
-    'helm', reason='crfm-helm not installed; install with: uv sync --extra helm'
-)
-
 import json
 import tempfile
 from pathlib import Path
+from types import SimpleNamespace
 
 from every_eval_ever.converters.helm.adapter import HELMAdapter
 from every_eval_ever.converters.helm.instance_level_adapter import (
+    HELMInstanceLevelDataAdapter,
     _BINARY_CORRECTNESS_METRIC_NAMES,
     _is_correct_for_metric,
+    _score_from_stat,
 )
 from every_eval_ever.eval_types import EvaluatorRelationship
 from every_eval_ever.instance_level_types import (
@@ -20,6 +19,17 @@
 )
 
 
+def _make_helm_adapter():
+    from every_eval_ever.converters.helm import adapter as helm_adapter_mod
+
+    if helm_adapter_mod._HELM_IMPORT_ERROR is not None:
+        pytest.skip(
+            'HELM converter dependencies are missing; install with: '
+            'uv sync --extra helm (or pip install every_eval_ever[helm])'
+        )
+    return HELMAdapter()
+
+
 def _load_instance_level_data(adapter, filepath, metadata_args):
     eval_dirpath = Path(filepath)
     converted_eval_list = adapter.transform_from_directory(
@@ -57,7 +67,7 @@ def _by_sample_and_metric(
 
 
 def test_mmlu_instance_level():
-    adapter = HELMAdapter()
+    adapter = _make_helm_adapter()
 
     with tempfile.TemporaryDirectory() as tmpdir:
         metadata_args = {
@@ -117,7 +127,7 @@ def test_mmlu_instance_level():
 
 
 def test_hellaswag_instance_level():
-    adapter = HELMAdapter()
+    adapter = _make_helm_adapter()
 
     with tempfile.TemporaryDirectory() as tmpdir:
         metadata_args = {
@@ -161,7 +171,7 @@ def test_hellaswag_instance_level():
 
 
 def test_narrativeqa_instance_level():
-    adapter = HELMAdapter()
+    adapter = _make_helm_adapter()
 
     with tempfile.TemporaryDirectory() as tmpdir:
         metadata_args = {
@@ -220,7 +230,7 @@ def test_per_sample_per_metric_rows_are_emitted():
     (``training_co2_cost`` and ``training_energy_cost``) carry no value;
     we expect 25 rows for that sample.
     """
-    adapter = HELMAdapter()
+    adapter = _make_helm_adapter()
     with tempfile.TemporaryDirectory() as tmpdir:
         metadata_args = {
             'source_organization_name': 'TestOrg',
@@ -252,7 +262,7 @@ def test_is_correct_only_claimed_for_correctness_metrics():
     every row, which overclaimed correctness on these. The fix gates
     ``is_correct`` on a tight allowlist of correctness metric names.
     """
-    adapter = HELMAdapter()
+    adapter = _make_helm_adapter()
     with tempfile.TemporaryDirectory() as tmpdir:
         metadata_args = {
             'source_organization_name': 'TestOrg',
@@ -321,7 +331,7 @@ def test_is_correct_is_true_for_correct_exact_match_rows():
     Run the converter and pick any exact_match row whose score happens to
     be 1.0. The MMLU fixture contains at least one correct id (id7).
     """
-    adapter = HELMAdapter()
+    adapter = _make_helm_adapter()
     with tempfile.TemporaryDirectory() as tmpdir:
         metadata_args = {
             'source_organization_name': 'TestOrg',
@@ -365,6 +375,183 @@ def test_is_correct_for_metric_helper():
         assert _is_correct_for_metric(name, 0.0) is False
 
 
+def _score_from_stat_dict(stat_dict):
+    """Mirror the converter's numeric-score policy for fixture assertions."""
+    value = stat_dict.get('mean')
+    if value is None:
+        count = stat_dict.get('count')
+        total = stat_dict.get('sum')
+        if count and total is not None:
+            value = total / count
+    if value is None:
+        return None
+    try:
+        return float(value)
+    except (TypeError, ValueError):
+        return None
+
+
+def _expected_numeric_stat_rows(per_instance_stats_path):
+    """Count rows the per-(sample, numeric stat) policy should emit."""
+    per_instance_stats = json.loads(Path(per_instance_stats_path).read_text())
+    return sum(
+        1
+        for inst_stats in per_instance_stats
+        for stat in inst_stats.get('stats', [])
+        if _score_from_stat_dict(stat) is not None
+    )
+
+
+def test_total_rows_matches_numeric_per_instance_stats():
+    """The row count should be exact, not just >= the sample count."""
+    adapter = _make_helm_adapter()
+    fixture_path = Path(
+        'tests/data/helm/'
+        'mmlu:subject=philosophy,method=multiple_choice_joint,'
+        'model=openai_gpt2'
+    )
+    with tempfile.TemporaryDirectory() as tmpdir:
+        metadata_args = {
+            'source_organization_name': 'TestOrg',
+            'evaluator_relationship': EvaluatorRelationship.first_party,
+            'parent_eval_output_dir': tmpdir,
+            'file_uuid': 'test_exact_total_rows',
+        }
+        converted_eval, instance_logs = _load_instance_level_data(
+            adapter,
+            fixture_path,
+            metadata_args,
+        )
+
+        expected_rows = _expected_numeric_stat_rows(
+            fixture_path / 'per_instance_stats.json'
+        )
+        assert converted_eval.detailed_evaluation_results.total_rows == len(
+            instance_logs
+        )
+        assert len(instance_logs) == expected_rows
+
+        sample_metric_keys = {
+            (log.sample_id, log.evaluation_result_id)
+            for log in instance_logs
+        }
+        assert len(sample_metric_keys) == len(instance_logs)
+
+
+def test_instance_evaluation_result_ids_join_to_aggregate_results():
+    """Every non-null instance result id should join to an aggregate row."""
+    adapter = _make_helm_adapter()
+    with tempfile.TemporaryDirectory() as tmpdir:
+        metadata_args = {
+            'source_organization_name': 'TestOrg',
+            'evaluator_relationship': EvaluatorRelationship.first_party,
+            'parent_eval_output_dir': tmpdir,
+            'file_uuid': 'test_join_keys',
+        }
+        converted_eval, instance_logs = _load_instance_level_data(
+            adapter,
+            'tests/data/helm/mmlu:subject=philosophy,'
+            'method=multiple_choice_joint,model=openai_gpt2',
+            metadata_args,
+        )
+
+        aggregate_ids = {
+            result.evaluation_result_id
+            for result in converted_eval.evaluation_results
+            if result.evaluation_result_id is not None
+        }
+        detail_ids = {
+            log.evaluation_result_id
+            for log in instance_logs
+            if log.evaluation_result_id is not None
+        }
+
+        assert aggregate_ids, (
+            'Aggregate HELM results must set evaluation_result_id when '
+            'instance rows use evaluation_result_id as a join key.'
+        )
+        assert detail_ids <= aggregate_ids
+
+
+def test_score_from_stat_helper_edge_cases():
+    """Cover scalar extraction paths and malformed empty stats."""
+    assert _score_from_stat(
+        SimpleNamespace(mean=0.25, sum=10, count=2)
+    ) == 0.25
+    assert _score_from_stat(
+        SimpleNamespace(mean=None, sum=3, count=2)
+    ) == 1.5
+    assert _score_from_stat(
+        SimpleNamespace(mean=None, sum=0, count=0)
+    ) is None
+    assert _score_from_stat(
+        SimpleNamespace(mean='not-a-number', sum=0, count=1)
+    ) is None
+    assert _score_from_stat(
+        SimpleNamespace(mean=None, sum=None, count=1)
+    ) is None
+
+
+def test_missing_inst_stats_uses_legacy_exact_match_fallback(monkeypatch):
+    """No per-instance stats should still produce one legacy EM row."""
+    correct_reference = SimpleNamespace(
+        output=SimpleNamespace(text='expected answer'),
+        tags=['correct'],
+    )
+    distractor_reference = SimpleNamespace(
+        output=SimpleNamespace(text='distractor'),
+        tags=[],
+    )
+    state = SimpleNamespace(
+        instance=SimpleNamespace(
+            id='sample-1',
+            references=[correct_reference, distractor_reference],
+        ),
+        request=SimpleNamespace(prompt='Question?'),
+        result=SimpleNamespace(
+            completions=[SimpleNamespace(text='expected answer')],
+            request_time=0.25,
+        ),
+        output_mapping=None,
+    )
+
+    from every_eval_ever.converters.helm import instance_level_adapter as mod
+
+    # This test only needs the converter logic and a synthetic state object;
+    # bypass the optional HELM dependency guard so it can run in core installs.
+    monkeypatch.setattr(mod, '_HELM_IMPORT_ERROR', None)
+    monkeypatch.setattr(mod, 'extract_all_reasonings', lambda state: None)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        path, total_rows = HELMInstanceLevelDataAdapter(
+            'fallback_eval',
+            'jsonl',
+            'sha256',
+            tmpdir,
+        ).convert_instance_level_logs(
+            'synthetic_eval',
+            'synthetic/model',
+            [state],
+            [],
+        )
+
+        assert total_rows == 1
+        with Path(path).open('r', encoding='utf-8') as file:
+            rows = [
+                InstanceLevelEvaluationLog.model_validate(json.loads(line))
+                for line in file
+                if line.strip()
+            ]
+
+        assert len(rows) == 1
+        log = rows[0]
+        assert log.evaluation_result_id is None
+        assert log.evaluation.score == 1.0
+        assert log.evaluation.is_correct is True
+        assert log.input.reference == ['expected answer']
+        assert log.output.raw == ['expected answer']
+
+
 def test_reasoning_traces_none_does_not_break_conversion(monkeypatch):
     """Patch the upstream extractor to return None and ensure conversion works.
 
@@ -375,7 +562,7 @@ def test_reasoning_traces_none_does_not_break_conversion(monkeypatch):
 
     monkeypatch.setattr(mod, 'extract_all_reasonings', lambda state: None)
 
-    adapter = HELMAdapter()
+    adapter = _make_helm_adapter()
     with tempfile.TemporaryDirectory() as tmpdir:
         metadata_args = {
             'source_organization_name': 'TestOrg',

From a4f2bb634ba0191c2ef1f0ae90ef17b99db001fb Mon Sep 17 00:00:00 2001
From: joncrall <erotemic@gmail.com>
Date: Wed, 13 May 2026 18:23:12 -0400
Subject: [PATCH 3/8] Fix HELM per-metric instance result joins

---
 every_eval_ever/converters/helm/adapter.py    | 139 ++++---
 .../converters/helm/instance_level_adapter.py |  43 +-
 tests/test_helm_adapter.py                    |  12 +
 tests/test_helm_instance_level_adapter.py     | 374 +++++++++---------
 4 files changed, 306 insertions(+), 262 deletions(-)

diff --git a/every_eval_ever/converters/helm/adapter.py b/every_eval_ever/converters/helm/adapter.py
index e43eb01b9..5b9727f76 100644
--- a/every_eval_ever/converters/helm/adapter.py
+++ b/every_eval_ever/converters/helm/adapter.py
@@ -47,6 +47,8 @@
 from every_eval_ever.converters.common.utils import sha256_file
 from every_eval_ever.converters.helm.instance_level_adapter import (
     HELMInstanceLevelDataAdapter,
+    _evaluation_result_id,
+    _score_from_stat,
 )
 from every_eval_ever.converters.helm.utils import extract_reasoning
 from every_eval_ever.eval_types import (
@@ -402,80 +404,95 @@ def _transform_single(
 
         evaluation_id = f'{source_data.dataset_name}/{model_info.id.replace("/", "_")}/{evaluation_timestamp}'
 
-        metric_names = self._extract_metric_names(run_spec)
-
+        # Build aggregate results from the numeric HELM stats themselves, not
+        # only from run_spec.metric_specs. The instance-level converter emits
+        # one row per numeric per-instance stat, so aggregate IDs must cover
+        # the same stat namespace for detailed rows to be joinable.
         evaluation_results: List[EvaluationResult] = []
+        seen_evaluation_result_ids: set[str] = set()
+
+        for stat in stats_raw:
+            metric_name = getattr(getattr(stat, 'name', None), 'name', None)
+            score = _score_from_stat(stat)
+            if metric_name is None or score is None:
+                continue
+
+            evaluation_result_id = _evaluation_result_id(
+                metric_name,
+                getattr(stat.name, 'split', None),
+                getattr(stat.name, 'perturbation', None),
+            )
+            if evaluation_result_id is None:
+                continue
+            if evaluation_result_id in seen_evaluation_result_ids:
+                continue
+            seen_evaluation_result_ids.add(evaluation_result_id)
 
-        for metric_name in set(metric_names):
             metric_config = MetricConfig(
                 evaluation_description=metric_name,
                 lower_is_better=False,  # TODO schema.json check
                 score_type=ScoreType.continuous,
                 min_score=0,
-                max_score=1,
+                max_score=max(1.0, score),
             )
 
-            matching_stats = [
-                s
-                for s in stats_raw
-                if s.name.name == metric_name and not s.name.perturbation
-            ]
-
-            for stat in matching_stats:
-                evaluation_name = (
-                    f'{metric_name} on {source_data.dataset_name}'
-                    if not stat.name.split
-                    else f'{metric_name} {stat.name.split} on {source_data.dataset_name}'
-                )
+            split = getattr(stat.name, 'split', None)
+            perturbation = getattr(stat.name, 'perturbation', None)
+            name_parts = [metric_name]
+            if split:
+                name_parts.append(str(split))
+            if perturbation:
+                name_parts.append(str(perturbation))
+            evaluation_name = (
+                f'{" ".join(name_parts)} on {source_data.dataset_name}'
+            )
 
-                evaluation_results.append(
-                    EvaluationResult(
-                        evaluation_name=evaluation_name,
-                        source_data=source_data,
-                        evaluation_timestamp=evaluation_timestamp,
-                        metric_config=metric_config,
-                        score_details=ScoreDetails(
-                            score=stat.mean
-                            or (stat.sum / stat.count if stat.count else 0.0),
-                            uncertainty=Uncertainty(
-                                standard_deviation=stat.stddev,
-                                num_samples=adapter_spec.max_eval_instances
-                                or len(request_states),
-                            ),
-                            details={
-                                'count': str(stat.count),
-                                'split': str(stat.name.split)
-                                if stat.name.split
-                                else '',
-                                'perturbation': str(stat.name.perturbation)
-                                if stat.name.perturbation
-                                else '',
-                            },
+            evaluation_results.append(
+                EvaluationResult(
+                    evaluation_result_id=evaluation_result_id,
+                    evaluation_name=evaluation_name,
+                    source_data=source_data,
+                    evaluation_timestamp=evaluation_timestamp,
+                    metric_config=metric_config,
+                    score_details=ScoreDetails(
+                        score=score,
+                        uncertainty=Uncertainty(
+                            standard_deviation=getattr(stat, 'stddev', None),
+                            num_samples=adapter_spec.max_eval_instances
+                            or len(request_states),
                         ),
-                        generation_config=GenerationConfig(
-                            generation_args=self._extract_generation_args(
-                                adapter_spec=adapter_spec,
-                                request_state=request_states[0],
-                            ),
-                            additional_details={
-                                'stop_sequences': json.dumps(
-                                    request_states[0].request.stop_sequences
-                                )
-                                if request_states[0].request.stop_sequences
-                                else '[]',
-                                'presence_penalty': str(
-                                    request_states[0].request.presence_penalty
-                                ),
-                                'frequency_penalty': str(
-                                    request_states[0].request.frequency_penalty
-                                ),
-                                'num_completions': str(
-                                    request_states[0].request.num_completions
-                                ),
-                            },
+                        details={
+                            'count': str(getattr(stat, 'count', '')),
+                            'split': str(split) if split else '',
+                            'perturbation': str(perturbation)
+                            if perturbation
+                            else '',
+                        },
+                    ),
+                    generation_config=GenerationConfig(
+                        generation_args=self._extract_generation_args(
+                            adapter_spec=adapter_spec,
+                            request_state=request_states[0],
                         ),
-                    )
+                        additional_details={
+                            'stop_sequences': json.dumps(
+                                request_states[0].request.stop_sequences
+                            )
+                            if request_states[0].request.stop_sequences
+                            else '[]',
+                            'presence_penalty': str(
+                                request_states[0].request.presence_penalty
+                            ),
+                            'frequency_penalty': str(
+                                request_states[0].request.frequency_penalty
+                            ),
+                            'num_completions': str(
+                                request_states[0].request.num_completions
+                            ),
+                        },
+                    ),
                 )
+            )
 
         if request_states:
             parent_eval_output_dir = metadata_args.get('parent_eval_output_dir')
diff --git a/every_eval_ever/converters/helm/instance_level_adapter.py b/every_eval_ever/converters/helm/instance_level_adapter.py
index e675c7ff7..391bb6c2e 100644
--- a/every_eval_ever/converters/helm/instance_level_adapter.py
+++ b/every_eval_ever/converters/helm/instance_level_adapter.py
@@ -41,7 +41,10 @@ def _score_from_stat(stat) -> float | None:
         count = getattr(stat, 'count', None)
         total = getattr(stat, 'sum', None)
         if count:
-            value = total / count
+            try:
+                value = total / count
+            except (TypeError, ValueError, ZeroDivisionError):
+                return None
     if value is None:
         return None
     try:
@@ -50,6 +53,33 @@ def _score_from_stat(stat) -> float | None:
         return None
 
 
+def _stat_name_part(value) -> str | None:
+    if value is None:
+        return None
+    if isinstance(value, str):
+        return value or None
+    if isinstance(value, dict):
+        return value.get('name') or str(value)
+    return getattr(value, 'name', None) or str(value)
+
+
+def _evaluation_result_id(
+    metric_name: str | None,
+    split=None,
+    perturbation=None,
+) -> str | None:
+    if metric_name is None:
+        return None
+    parts = [metric_name]
+    split_part = _stat_name_part(split)
+    perturbation_part = _stat_name_part(perturbation)
+    if split_part:
+        parts.append(split_part)
+    if perturbation_part:
+        parts.append(perturbation_part)
+    return ':'.join(parts)
+
+
 # Metric names whose per-instance score is a correctness signal in [0, 1]
 # where ``score > 0`` reasonably maps to ``is_correct=True``. Anything not
 # in this allowlist (token counts, runtime, finish-reason flags, logprobs,
@@ -208,6 +238,7 @@ def convert_instance_level_logs(
             for stat in metric_stats:
                 if stat is None:
                     metric_name = None
+                    evaluation_result_id = None
                     score = fallback_score
                     # Fallback path: ``score`` here is an exact-match
                     # proxy from completion-vs-reference matching, so
@@ -215,7 +246,13 @@ def convert_instance_level_logs(
                     # as the legacy single-row behavior.
                     is_correct = score > 0
                 else:
-                    metric_name = getattr(getattr(stat, 'name', None), 'name', None)
+                    stat_name = getattr(stat, 'name', None)
+                    metric_name = getattr(stat_name, 'name', None)
+                    evaluation_result_id = _evaluation_result_id(
+                        metric_name,
+                        getattr(stat_name, 'split', None),
+                        getattr(stat_name, 'perturbation', None),
+                    )
                     score = _score_from_stat(stat)
                     if score is None:
                         continue
@@ -226,7 +263,7 @@ def convert_instance_level_logs(
                         evaluation_id=self.evaluation_id,
                         model_id=model_id,
                         evaluation_name=evaluation_name,
-                        evaluation_result_id=metric_name,
+                        evaluation_result_id=evaluation_result_id,
                         sample_id=str(state.instance.id),
                         sample_hash=sha256_string(
                             state.request.prompt + (correct_refs[0] if correct_refs else '')
diff --git a/tests/test_helm_adapter.py b/tests/test_helm_adapter.py
index 33e279477..a3edd3062 100644
--- a/tests/test_helm_adapter.py
+++ b/tests/test_helm_adapter.py
@@ -39,6 +39,15 @@ def _load_eval(adapter, filepath, metadata_args):
     return converted_eval
 
 
+def _assert_unique_evaluation_result_ids(converted_eval):
+    result_ids = [
+        result.evaluation_result_id
+        for result in converted_eval.evaluation_results
+    ]
+    assert all(result_ids)
+    assert len(result_ids) == len(set(result_ids))
+
+
 def test_mmlu_eval():
     adapter = HELMAdapter()
     metadata_args = {
@@ -73,6 +82,7 @@ def test_mmlu_eval():
     assert len(results) > 0
     assert any('mmlu' in r.evaluation_name.lower() for r in results)
     assert all(r.metric_config is not None for r in results)
+    _assert_unique_evaluation_result_ids(converted_eval)
 
     assert converted_eval.detailed_evaluation_results is not None
     assert converted_eval.detailed_evaluation_results.format is not None
@@ -117,6 +127,7 @@ def test_hellswag_eval():
     assert len(results) > 0
     assert results[0].score_details.score is not None
     assert any('hellaswag' in r.evaluation_name.lower() for r in results)
+    _assert_unique_evaluation_result_ids(converted_eval)
 
     assert converted_eval.detailed_evaluation_results is not None
     assert converted_eval.detailed_evaluation_results.format is not None
@@ -155,6 +166,7 @@ def test_narrativeqa_eval():
     assert len(results) > 0
     assert any('narrativeqa' in r.evaluation_name.lower() for r in results)
     assert all(r.metric_config is not None for r in results)
+    _assert_unique_evaluation_result_ids(converted_eval)
 
     assert converted_eval.detailed_evaluation_results is not None
     assert converted_eval.detailed_evaluation_results.format is not None
diff --git a/tests/test_helm_instance_level_adapter.py b/tests/test_helm_instance_level_adapter.py
index 4be723e3a..1685a656d 100644
--- a/tests/test_helm_instance_level_adapter.py
+++ b/tests/test_helm_instance_level_adapter.py
@@ -1,14 +1,16 @@
-import pytest
-
 import json
 import tempfile
 from pathlib import Path
 from types import SimpleNamespace
 
+import pytest
+
+from every_eval_ever.converters.helm import adapter as helm_adapter_module
 from every_eval_ever.converters.helm.adapter import HELMAdapter
 from every_eval_ever.converters.helm.instance_level_adapter import (
     HELMInstanceLevelDataAdapter,
     _BINARY_CORRECTNESS_METRIC_NAMES,
+    _evaluation_result_id,
     _is_correct_for_metric,
     _score_from_stat,
 )
@@ -19,15 +21,13 @@
 )
 
 
-def _make_helm_adapter():
-    from every_eval_ever.converters.helm import adapter as helm_adapter_mod
-
-    if helm_adapter_mod._HELM_IMPORT_ERROR is not None:
+def _require_helm():
+    import_error = getattr(helm_adapter_module, '_HELM_IMPORT_ERROR', None)
+    if import_error is not None:
         pytest.skip(
-            'HELM converter dependencies are missing; install with: '
-            'uv sync --extra helm (or pip install every_eval_ever[helm])'
+            'HELM converter dependencies are missing: '
+            f'{import_error!r}. Install with: uv sync --extra helm'
         )
-    return HELMAdapter()
 
 
 def _load_instance_level_data(adapter, filepath, metadata_args):
@@ -66,8 +66,44 @@ def _by_sample_and_metric(
     }
 
 
+def _metric_name_from_result_id(result_id: str | None) -> str | None:
+    if result_id is None:
+        return None
+    return result_id.split(':', 1)[0]
+
+
+def _json_score_from_stat(stat: dict) -> float | None:
+    value = stat.get('mean')
+    if value is None:
+        count = stat.get('count')
+        total = stat.get('sum')
+        if count:
+            try:
+                value = total / count
+            except (TypeError, ValueError, ZeroDivisionError):
+                return None
+    if value is None:
+        return None
+    try:
+        return float(value)
+    except (TypeError, ValueError):
+        return None
+
+
+def _expected_numeric_instance_stat_rows(filepath):
+    per_instance_path = Path(filepath) / 'per_instance_stats.json'
+    per_instance_stats = json.loads(per_instance_path.read_text())
+    return sum(
+        1
+        for item in per_instance_stats
+        for stat in item.get('stats', [])
+        if _json_score_from_stat(stat) is not None
+    )
+
+
 def test_mmlu_instance_level():
-    adapter = _make_helm_adapter()
+    _require_helm()
+    adapter = HELMAdapter()
 
     with tempfile.TemporaryDirectory() as tmpdir:
         metadata_args = {
@@ -83,20 +119,19 @@ def test_mmlu_instance_level():
             metadata_args,
         )
 
-        # Per-(sample, metric) emission: 10 samples * many stats per
-        # sample. Confirm by sample_id distinct count, not row count.
         sample_ids = sorted({log.sample_id for log in instance_logs})
         assert len(sample_ids) == 10
-        # The MMLU fixture has the standard exact-match family for
-        # multiple_choice_joint; every sample must have an exact_match row.
         em_rows = [
             log
             for log in instance_logs
-            if log.evaluation_result_id == 'exact_match'
+            if _metric_name_from_result_id(log.evaluation_result_id)
+            == 'exact_match'
         ]
         assert len(em_rows) == 10
 
-        log = _by_sample_and_metric(instance_logs)[('id147', 'exact_match')]
+        log = _by_sample_and_metric(instance_logs)[
+            ('id147', 'exact_match:test')
+        ]
         assert log.schema_version == '0.2.2'
         assert log.evaluation_id == 'test_mmlu_samples'
         assert log.model_id == 'openai/gpt2'
@@ -125,9 +160,12 @@ def test_mmlu_instance_level():
         assert log.token_usage.output_tokens > 0
         assert log.token_usage.total_tokens > 0
 
+        assert converted_eval.evaluation_results
+
 
 def test_hellaswag_instance_level():
-    adapter = _make_helm_adapter()
+    _require_helm()
+    adapter = HELMAdapter()
 
     with tempfile.TemporaryDirectory() as tmpdir:
         metadata_args = {
@@ -137,7 +175,7 @@ def test_hellaswag_instance_level():
             'file_uuid': 'test_hellaswag',
         }
 
-        converted_eval, instance_logs = _load_instance_level_data(
+        _, instance_logs = _load_instance_level_data(
             adapter,
             'tests/data/helm/commonsense:dataset=hellaswag,method=multiple_choice_joint,model=eleutherai_pythia-1b-v0',
             metadata_args,
@@ -149,7 +187,8 @@ def test_hellaswag_instance_level():
         em_rows = [
             log
             for log in instance_logs
-            if log.evaluation_result_id == 'exact_match'
+            if _metric_name_from_result_id(log.evaluation_result_id)
+            == 'exact_match'
         ]
         assert len(em_rows) == 10
         log = em_rows[0]
@@ -171,7 +210,8 @@ def test_hellaswag_instance_level():
 
 
 def test_narrativeqa_instance_level():
-    adapter = _make_helm_adapter()
+    _require_helm()
+    adapter = HELMAdapter()
 
     with tempfile.TemporaryDirectory() as tmpdir:
         metadata_args = {
@@ -181,7 +221,7 @@ def test_narrativeqa_instance_level():
             'file_uuid': 'test_narrativeqa',
         }
 
-        converted_eval, instance_logs = _load_instance_level_data(
+        _, instance_logs = _load_instance_level_data(
             adapter,
             'tests/data/helm/narrative_qa:model=openai_gpt2',
             metadata_args,
@@ -193,7 +233,8 @@ def test_narrativeqa_instance_level():
         em_rows = [
             log
             for log in instance_logs
-            if log.evaluation_result_id == 'exact_match'
+            if _metric_name_from_result_id(log.evaluation_result_id)
+            == 'exact_match'
         ]
         assert len(em_rows) == 5
         log = em_rows[0]
@@ -218,19 +259,9 @@ def test_narrativeqa_instance_level():
         assert log.answer_attribution[0].extraction_method == 'exact_match'
 
 
-# ---------------------------------------------------------------------------
-# Per-(sample, metric) row emission and correctness semantics
-# ---------------------------------------------------------------------------
-
-
 def test_per_sample_per_metric_rows_are_emitted():
-    """Every per-instance HELM stat with a numeric mean becomes its own row.
-
-    The fixture's first sample has 27 declared stats but two
-    (``training_co2_cost`` and ``training_energy_cost``) carry no value;
-    we expect 25 rows for that sample.
-    """
-    adapter = _make_helm_adapter()
+    _require_helm()
+    adapter = HELMAdapter()
     with tempfile.TemporaryDirectory() as tmpdir:
         metadata_args = {
             'source_organization_name': 'TestOrg',
@@ -243,26 +274,19 @@ def test_per_sample_per_metric_rows_are_emitted():
             'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2',
             metadata_args,
         )
-        rows_for_id147 = [log for log in instance_logs if log.sample_id == 'id147']
+        rows_for_id147 = [
+            log for log in instance_logs if log.sample_id == 'id147'
+        ]
         metric_ids = sorted(log.evaluation_result_id for log in rows_for_id147)
-        assert 'exact_match' in metric_ids
-        assert 'num_prompt_tokens' in metric_ids
-        # No row should have a None evaluation_result_id when stats exist.
+        assert 'exact_match:test' in metric_ids
+        assert 'num_prompt_tokens:test' in metric_ids
         assert all(log.evaluation_result_id is not None for log in rows_for_id147)
-        # Distinct metric ids — no duplicates within a sample.
         assert len(metric_ids) == len(set(metric_ids))
 
 
 def test_is_correct_only_claimed_for_correctness_metrics():
-    """``is_correct=True`` should never appear on bookkeeping/resource rows.
-
-    The MMLU fixture has positive-valued bookkeeping rows
-    (``num_references=4``, ``num_prompt_tokens=333``, ``inference_runtime>0``,
-    ``max_prob=1``). Earlier patches set ``is_correct = score > 0`` for
-    every row, which overclaimed correctness on these. The fix gates
-    ``is_correct`` on a tight allowlist of correctness metric names.
-    """
-    adapter = _make_helm_adapter()
+    _require_helm()
+    adapter = HELMAdapter()
     with tempfile.TemporaryDirectory() as tmpdir:
         metadata_args = {
             'source_organization_name': 'TestOrg',
@@ -276,31 +300,29 @@ def test_is_correct_only_claimed_for_correctness_metrics():
             metadata_args,
         )
 
+        bookkeeping_names = {
+            'num_references',
+            'num_prompt_tokens',
+            'num_completion_tokens',
+            'num_output_tokens',
+            'inference_runtime',
+            'max_prob',
+            'finish_reason_unknown',
+            'logprob',
+            'num_bytes',
+            'batch_size',
+        }
         bookkeeping = [
             log
             for log in instance_logs
-            if log.evaluation_result_id
-            in {
-                'num_references',
-                'num_prompt_tokens',
-                'num_completion_tokens',
-                'num_output_tokens',
-                'inference_runtime',
-                'max_prob',
-                'finish_reason_unknown',
-                'logprob',
-                'num_bytes',
-                'batch_size',
-            }
+            if _metric_name_from_result_id(log.evaluation_result_id)
+            in bookkeeping_names
         ]
         assert bookkeeping, 'expected bookkeeping rows to be emitted'
         assert all(
             log.evaluation.is_correct is False for log in bookkeeping
         ), 'bookkeeping metrics must not claim correctness'
 
-        # narrative_qa has graded scores (rouge_l/f1/bleu_*) which are not
-        # binary correctness either; check at least one fixture has them
-        # and that they don't overclaim correctness.
         with tempfile.TemporaryDirectory() as tmpdir2:
             metadata_args2 = dict(metadata_args)
             metadata_args2['parent_eval_output_dir'] = tmpdir2
@@ -313,7 +335,7 @@ def test_is_correct_only_claimed_for_correctness_metrics():
             graded = [
                 log
                 for log in narr_logs
-                if log.evaluation_result_id
+                if _metric_name_from_result_id(log.evaluation_result_id)
                 in {'rouge_l', 'f1_score', 'bleu_1', 'bleu_4'}
             ]
             assert graded, 'expected graded score rows in narrative_qa fixture'
@@ -326,12 +348,8 @@ def test_is_correct_only_claimed_for_correctness_metrics():
 
 
 def test_is_correct_is_true_for_correct_exact_match_rows():
-    """When a correctness metric scores positive, is_correct must be True.
-
-    Run the converter and pick any exact_match row whose score happens to
-    be 1.0. The MMLU fixture contains at least one correct id (id7).
-    """
-    adapter = _make_helm_adapter()
+    _require_helm()
+    adapter = HELMAdapter()
     with tempfile.TemporaryDirectory() as tmpdir:
         metadata_args = {
             'source_organization_name': 'TestOrg',
@@ -347,20 +365,19 @@ def test_is_correct_is_true_for_correct_exact_match_rows():
         positive_em_rows = [
             log
             for log in instance_logs
-            if log.evaluation_result_id == 'exact_match'
+            if _metric_name_from_result_id(log.evaluation_result_id)
+            == 'exact_match'
             and log.evaluation.score > 0
         ]
+        assert positive_em_rows
         for log in positive_em_rows:
             assert log.evaluation.is_correct is True
 
 
 def test_is_correct_for_metric_helper():
-    """Pure-function check on the correctness allowlist."""
-    # Every allowlist member, score>0 ⇒ True; score==0 ⇒ False.
     for name in _BINARY_CORRECTNESS_METRIC_NAMES:
         assert _is_correct_for_metric(name, 1.0) is True
         assert _is_correct_for_metric(name, 0.0) is False
-    # Non-allowlisted metrics never claim correctness, regardless of score.
     for name in (
         'num_prompt_tokens',
         'num_references',
@@ -375,41 +392,31 @@ def test_is_correct_for_metric_helper():
         assert _is_correct_for_metric(name, 0.0) is False
 
 
-def _score_from_stat_dict(stat_dict):
-    """Mirror the converter's numeric-score policy for fixture assertions."""
-    value = stat_dict.get('mean')
-    if value is None:
-        count = stat_dict.get('count')
-        total = stat_dict.get('sum')
-        if count and total is not None:
-            value = total / count
-    if value is None:
-        return None
-    try:
-        return float(value)
-    except (TypeError, ValueError):
-        return None
-
-
-def _expected_numeric_stat_rows(per_instance_stats_path):
-    """Count rows the per-(sample, numeric stat) policy should emit."""
-    per_instance_stats = json.loads(Path(per_instance_stats_path).read_text())
-    return sum(
-        1
-        for inst_stats in per_instance_stats
-        for stat in inst_stats.get('stats', [])
-        if _score_from_stat_dict(stat) is not None
+def test_score_from_stat_helper_edge_cases():
+    assert _score_from_stat(SimpleNamespace(mean=0.25, sum=10, count=2)) == 0.25
+    assert _score_from_stat(SimpleNamespace(mean=None, sum=3, count=2)) == 1.5
+    assert _score_from_stat(SimpleNamespace(mean=None, sum=0, count=0)) is None
+    assert _score_from_stat(SimpleNamespace(mean=None, sum=None, count=1)) is None
+    assert _score_from_stat(SimpleNamespace(mean='bad', sum=1, count=1)) is None
+
+
+def test_evaluation_result_id_helper_disambiguates_split_and_perturbation():
+    assert _evaluation_result_id('exact_match') == 'exact_match'
+    assert _evaluation_result_id('exact_match', 'test') == 'exact_match:test'
+    assert (
+        _evaluation_result_id(
+            'exact_match',
+            'test',
+            SimpleNamespace(name='robustness'),
+        )
+        == 'exact_match:test:robustness'
     )
 
 
 def test_total_rows_matches_numeric_per_instance_stats():
-    """The row count should be exact, not just >= the sample count."""
-    adapter = _make_helm_adapter()
-    fixture_path = Path(
-        'tests/data/helm/'
-        'mmlu:subject=philosophy,method=multiple_choice_joint,'
-        'model=openai_gpt2'
-    )
+    _require_helm()
+    fixture = 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2'
+    adapter = HELMAdapter()
     with tempfile.TemporaryDirectory() as tmpdir:
         metadata_args = {
             'source_organization_name': 'TestOrg',
@@ -418,29 +425,21 @@ def test_total_rows_matches_numeric_per_instance_stats():
             'file_uuid': 'test_exact_total_rows',
         }
         converted_eval, instance_logs = _load_instance_level_data(
-            adapter,
-            fixture_path,
-            metadata_args,
+            adapter, fixture, metadata_args
         )
 
-        expected_rows = _expected_numeric_stat_rows(
-            fixture_path / 'per_instance_stats.json'
-        )
-        assert converted_eval.detailed_evaluation_results.total_rows == len(
-            instance_logs
-        )
+        expected_rows = _expected_numeric_instance_stat_rows(fixture)
+        assert converted_eval.detailed_evaluation_results.total_rows == expected_rows
         assert len(instance_logs) == expected_rows
-
-        sample_metric_keys = {
+        assert len({
             (log.sample_id, log.evaluation_result_id)
             for log in instance_logs
-        }
-        assert len(sample_metric_keys) == len(instance_logs)
+        }) == len(instance_logs)
 
 
 def test_instance_evaluation_result_ids_join_to_aggregate_results():
-    """Every non-null instance result id should join to an aggregate row."""
-    adapter = _make_helm_adapter()
+    _require_helm()
+    adapter = HELMAdapter()
     with tempfile.TemporaryDirectory() as tmpdir:
         metadata_args = {
             'source_organization_name': 'TestOrg',
@@ -450,8 +449,7 @@ def test_instance_evaluation_result_ids_join_to_aggregate_results():
         }
         converted_eval, instance_logs = _load_instance_level_data(
             adapter,
-            'tests/data/helm/mmlu:subject=philosophy,'
-            'method=multiple_choice_joint,model=openai_gpt2',
+            'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2',
             metadata_args,
         )
 
@@ -466,103 +464,84 @@ def test_instance_evaluation_result_ids_join_to_aggregate_results():
             if log.evaluation_result_id is not None
         }
 
-        assert aggregate_ids, (
-            'Aggregate HELM results must set evaluation_result_id when '
-            'instance rows use evaluation_result_id as a join key.'
-        )
+        assert aggregate_ids
+        assert detail_ids
         assert detail_ids <= aggregate_ids
 
 
-def test_score_from_stat_helper_edge_cases():
-    """Cover scalar extraction paths and malformed empty stats."""
-    assert _score_from_stat(
-        SimpleNamespace(mean=0.25, sum=10, count=2)
-    ) == 0.25
-    assert _score_from_stat(
-        SimpleNamespace(mean=None, sum=3, count=2)
-    ) == 1.5
-    assert _score_from_stat(
-        SimpleNamespace(mean=None, sum=0, count=0)
-    ) is None
-    assert _score_from_stat(
-        SimpleNamespace(mean='not-a-number', sum=0, count=1)
-    ) is None
-    assert _score_from_stat(
-        SimpleNamespace(mean=None, sum=None, count=1)
-    ) is None
-
-
-def test_missing_inst_stats_uses_legacy_exact_match_fallback(monkeypatch):
-    """No per-instance stats should still produce one legacy EM row."""
-    correct_reference = SimpleNamespace(
-        output=SimpleNamespace(text='expected answer'),
-        tags=['correct'],
-    )
-    distractor_reference = SimpleNamespace(
-        output=SimpleNamespace(text='distractor'),
-        tags=[],
-    )
+def test_aggregate_evaluation_result_ids_are_unique_and_non_null():
+    _require_helm()
+    adapter = HELMAdapter()
+    with tempfile.TemporaryDirectory() as tmpdir:
+        metadata_args = {
+            'source_organization_name': 'TestOrg',
+            'evaluator_relationship': EvaluatorRelationship.first_party,
+            'parent_eval_output_dir': tmpdir,
+            'file_uuid': 'test_aggregate_ids',
+        }
+        converted_eval, _ = _load_instance_level_data(
+            adapter,
+            'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2',
+            metadata_args,
+        )
+
+        result_ids = [
+            result.evaluation_result_id
+            for result in converted_eval.evaluation_results
+        ]
+        assert all(result_ids)
+        assert len(result_ids) == len(set(result_ids))
+        assert 'exact_match:test' in result_ids
+        assert 'num_prompt_tokens:test' in result_ids
+
+
+def test_missing_inst_stats_uses_legacy_exact_match_fallback():
+    _require_helm()
+    completion = SimpleNamespace(text='answer')
     state = SimpleNamespace(
         instance=SimpleNamespace(
-            id='sample-1',
-            references=[correct_reference, distractor_reference],
-        ),
-        request=SimpleNamespace(prompt='Question?'),
-        result=SimpleNamespace(
-            completions=[SimpleNamespace(text='expected answer')],
-            request_time=0.25,
+            id='sample0',
+            references=[
+                SimpleNamespace(
+                    output=SimpleNamespace(text='answer'),
+                    tags=['correct'],
+                )
+            ],
         ),
-        output_mapping=None,
+        result=SimpleNamespace(completions=[completion], request_time=0.1),
+        request=SimpleNamespace(prompt='question'),
+        output_mapping={},
     )
 
-    from every_eval_ever.converters.helm import instance_level_adapter as mod
-
-    # This test only needs the converter logic and a synthetic state object;
-    # bypass the optional HELM dependency guard so it can run in core installs.
-    monkeypatch.setattr(mod, '_HELM_IMPORT_ERROR', None)
-    monkeypatch.setattr(mod, 'extract_all_reasonings', lambda state: None)
-
     with tempfile.TemporaryDirectory() as tmpdir:
-        path, total_rows = HELMInstanceLevelDataAdapter(
-            'fallback_eval',
+        adapter = HELMInstanceLevelDataAdapter(
+            'fallback_samples',
             'jsonl',
             'sha256',
             tmpdir,
-        ).convert_instance_level_logs(
-            'synthetic_eval',
-            'synthetic/model',
+        )
+        path, count = adapter.convert_instance_level_logs(
+            'tiny',
+            'dev/model',
             [state],
             [],
         )
 
-        assert total_rows == 1
-        with Path(path).open('r', encoding='utf-8') as file:
-            rows = [
-                InstanceLevelEvaluationLog.model_validate(json.loads(line))
-                for line in file
-                if line.strip()
-            ]
-
-        assert len(rows) == 1
-        log = rows[0]
+        assert count == 1
+        data = json.loads(Path(path).read_text().strip())
+        log = InstanceLevelEvaluationLog.model_validate(data)
         assert log.evaluation_result_id is None
         assert log.evaluation.score == 1.0
         assert log.evaluation.is_correct is True
-        assert log.input.reference == ['expected answer']
-        assert log.output.raw == ['expected answer']
 
 
 def test_reasoning_traces_none_does_not_break_conversion(monkeypatch):
-    """Patch the upstream extractor to return None and ensure conversion works.
-
-    Mirrors the bug class fixed by the original 9900ae6 patch — we want a
-    regression test so the next refactor cannot reintroduce the failure.
-    """
+    _require_helm()
     from every_eval_ever.converters.helm import instance_level_adapter as mod
 
     monkeypatch.setattr(mod, 'extract_all_reasonings', lambda state: None)
 
-    adapter = _make_helm_adapter()
+    adapter = HELMAdapter()
     with tempfile.TemporaryDirectory() as tmpdir:
         metadata_args = {
             'source_organization_name': 'TestOrg',
@@ -577,8 +556,7 @@ def test_reasoning_traces_none_does_not_break_conversion(monkeypatch):
         )
         assert instance_logs
         for log in instance_logs:
-            # ``reasoning_trace`` is Optional[list[str]]; with extractor
-            # returning None we either get None or an empty list — both
-            # satisfy the schema. The key invariant is "no crash".
             trace = log.output.reasoning_trace
-            assert trace is None or trace == [] or all(isinstance(t, str) for t in trace)
+            assert trace is None or trace == [] or all(
+                isinstance(t, str) for t in trace
+            )

From 991b7cb8a3bff764b7ef4c61a6678e1abddf6dfc Mon Sep 17 00:00:00 2001
From: joncrall <erotemic@gmail.com>
Date: Wed, 13 May 2026 18:38:42 -0400
Subject: [PATCH 4/8] Document HELM instance metric invariants

---
 every_eval_ever/converters/helm/adapter.py    | 18 +++++++++++
 .../converters/helm/instance_level_adapter.py | 22 ++++++++++++++
 tests/test_helm_adapter.py                    |  2 ++
 tests/test_helm_instance_level_adapter.py     | 30 +++++++++++++++++++
 4 files changed, 72 insertions(+)

diff --git a/every_eval_ever/converters/helm/adapter.py b/every_eval_ever/converters/helm/adapter.py
index 5b9727f76..0fa74a00b 100644
--- a/every_eval_ever/converters/helm/adapter.py
+++ b/every_eval_ever/converters/helm/adapter.py
@@ -193,6 +193,7 @@ def _load_file_if_exists(self, dir_path, file_name) -> Any:
         return None
 
     def _load_evaluation_run_logfiles(self, dir_path) -> Dict:
+        """Load the HELM files needed for aggregate and detail conversion."""
         scenario_state_dict = self._load_file_if_exists(
             dir_path, self.SCENARIO_STATE_FILE
         )
@@ -320,6 +321,7 @@ def _extract_evaluation_time(
     def _extract_dataset_name(
         self, run_spec_name: str, scenario_name: str | None
     ) -> str:
+        """Prefer scenario metadata, falling back to HELM run-spec names."""
         if scenario_name:
             return scenario_name
 
@@ -335,6 +337,12 @@ def _extract_dataset_name(
         return run_spec_name.split(':')[0]
 
     def _extract_metric_names(self, run_spec: RunSpec) -> List[str]:
+        """Return metric names declared by the HELM run spec.
+
+        Kept for callers that need the run-spec metric declaration. The
+        main aggregate conversion below uses ``stats.json`` so aggregate
+        IDs cover all numeric detail rows emitted from per-instance stats.
+        """
         metric_names = []
         for metric_spec in run_spec.metric_specs:
             names = metric_spec.args.get('names')
@@ -348,6 +356,13 @@ def _extract_metric_names(self, run_spec: RunSpec) -> List[str]:
     def _transform_single(
         self, raw_data: Dict, metadata_args: Dict[str, Any]
     ) -> Tuple[EvaluationLog, List[InstanceLevelEvaluationLog]]:
+        """Convert one HELM run into aggregate JSON plus detail JSONL.
+
+        The aggregate ``evaluation_result_id`` values are generated from
+        ``stats.json`` with the same helper used by the instance converter
+        so every metric-specific detail row can join back to an aggregate
+        result.
+        """
         run_spec = from_dict(data_class=RunSpec, data=raw_data['run_spec_dict'])
         # cast=[str] coerces int instance IDs to str; newer HELM versions
         # (e.g. long-context suite) store instance.id as int in the JSON.
@@ -412,6 +427,9 @@ def _transform_single(
         seen_evaluation_result_ids: set[str] = set()
 
         for stat in stats_raw:
+            # The ID helper mirrors the instance-level converter. This is the
+            # key invariant: detail rows should never introduce metric IDs that
+            # are absent from aggregate evaluation_results.
             metric_name = getattr(getattr(stat, 'name', None), 'name', None)
             score = _score_from_stat(stat)
             if metric_name is None or score is None:
diff --git a/every_eval_ever/converters/helm/instance_level_adapter.py b/every_eval_ever/converters/helm/instance_level_adapter.py
index 391bb6c2e..199c4009a 100644
--- a/every_eval_ever/converters/helm/instance_level_adapter.py
+++ b/every_eval_ever/converters/helm/instance_level_adapter.py
@@ -36,6 +36,12 @@ def _require_helm_dependencies() -> None:
 
 
 def _score_from_stat(stat) -> float | None:
+    """Return a scalar HELM stat value, or None for empty/bad stats.
+
+    HELM usually provides ``mean``; some stats only have ``sum`` and
+    ``count``. Returning None lets callers skip non-valued bookkeeping
+    rows such as zero-count training cost stats.
+    """
     value = getattr(stat, 'mean', None)
     if value is None:
         count = getattr(stat, 'count', None)
@@ -54,6 +60,7 @@ def _score_from_stat(stat) -> float | None:
 
 
 def _stat_name_part(value) -> str | None:
+    """Normalize HELM split/perturbation labels for stable IDs."""
     if value is None:
         return None
     if isinstance(value, str):
@@ -68,6 +75,11 @@ def _evaluation_result_id(
     split=None,
     perturbation=None,
 ) -> str | None:
+    """Build the join key shared by aggregate and instance rows.
+
+    Split and perturbation labels are included so two HELM stats with the
+    same metric name do not collide in ``evaluation_result_id``.
+    """
     if metric_name is None:
         return None
     parts = [metric_name]
@@ -135,6 +147,7 @@ def __init__(
         self.path = f'{evaluation_dir}/{evaulation_id}.{format}'
 
     def _save_json(self, items: List[InstanceLevelEvaluationLog]):
+        """Write one validated instance-level log per JSONL line."""
         eval_dir_path = Path(self.evaluation_dir)
         eval_dir_path.mkdir(parents=True, exist_ok=True)
         path = Path(self.path)
@@ -157,6 +170,12 @@ def convert_instance_level_logs(
         request_states: List[RequestState],
         per_instance_stats_list: List,
     ) -> Tuple[str, int]:
+        """Convert HELM request states into per-(sample, metric) rows.
+
+        When HELM per-instance stats are present, each numeric stat gets
+        its own detail row. If stats are absent, keep the legacy one-row
+        exact-match fallback so older or partial logs still convert.
+        """
         instance_level_logs: List[InstanceLevelEvaluationLog] = []
         for state in request_states:
             inst_stats = next(
@@ -199,6 +218,9 @@ def convert_instance_level_logs(
                 )
                 metric_stats = [None]
 
+            # Token usage is copied to every row for the same sample. This is
+            # intentionally denormalized so each detail row is independently
+            # useful when filtered by metric.
             token_usage = None
             if inst_stats:
                 p_tokens = next(
diff --git a/tests/test_helm_adapter.py b/tests/test_helm_adapter.py
index a3edd3062..d3fe08225 100644
--- a/tests/test_helm_adapter.py
+++ b/tests/test_helm_adapter.py
@@ -17,6 +17,7 @@
 
 
 def _load_eval(adapter, filepath, metadata_args):
+    """Run the HELM aggregate adapter against one fixture directory."""
     eval_dirpath = Path(filepath)
 
     with tempfile.TemporaryDirectory() as tmpdir:
@@ -40,6 +41,7 @@ def _load_eval(adapter, filepath, metadata_args):
 
 
 def _assert_unique_evaluation_result_ids(converted_eval):
+    """Aggregate result IDs must be stable join targets for sample rows."""
     result_ids = [
         result.evaluation_result_id
         for result in converted_eval.evaluation_results
diff --git a/tests/test_helm_instance_level_adapter.py b/tests/test_helm_instance_level_adapter.py
index 1685a656d..c6e1ef4c4 100644
--- a/tests/test_helm_instance_level_adapter.py
+++ b/tests/test_helm_instance_level_adapter.py
@@ -22,6 +22,7 @@
 
 
 def _require_helm():
+    """Skip HELM fixture tests when the optional converter deps are absent."""
     import_error = getattr(helm_adapter_module, '_HELM_IMPORT_ERROR', None)
     if import_error is not None:
         pytest.skip(
@@ -31,6 +32,7 @@ def _require_helm():
 
 
 def _load_instance_level_data(adapter, filepath, metadata_args):
+    """Run the HELM adapter and read back the generated JSONL detail rows."""
     eval_dirpath = Path(filepath)
     converted_eval_list = adapter.transform_from_directory(
         eval_dirpath,
@@ -60,6 +62,7 @@ def _load_instance_level_data(adapter, filepath, metadata_args):
 def _by_sample_and_metric(
     instance_logs,
 ) -> dict[tuple[str, str | None], InstanceLevelEvaluationLog]:
+    """Index detail rows by the two fields that should be unique together."""
     return {
         (log.sample_id, log.evaluation_result_id): log
         for log in instance_logs
@@ -67,12 +70,14 @@ def _by_sample_and_metric(
 
 
 def _metric_name_from_result_id(result_id: str | None) -> str | None:
+    """Strip split/perturbation suffixes from deterministic result IDs."""
     if result_id is None:
         return None
     return result_id.split(':', 1)[0]
 
 
 def _json_score_from_stat(stat: dict) -> float | None:
+    """Mirror converter score extraction for raw JSON fixtures."""
     value = stat.get('mean')
     if value is None:
         count = stat.get('count')
@@ -91,6 +96,7 @@ def _json_score_from_stat(stat: dict) -> float | None:
 
 
 def _expected_numeric_instance_stat_rows(filepath):
+    """Count fixture stats that should become instance-level rows."""
     per_instance_path = Path(filepath) / 'per_instance_stats.json'
     per_instance_stats = json.loads(per_instance_path.read_text())
     return sum(
@@ -119,6 +125,8 @@ def test_mmlu_instance_level():
             metadata_args,
         )
 
+        # The converter now emits many rows per sample. Count distinct samples
+        # separately from rows so this test stays focused on sample coverage.
         sample_ids = sorted({log.sample_id for log in instance_logs})
         assert len(sample_ids) == 10
         em_rows = [
@@ -129,6 +137,7 @@ def test_mmlu_instance_level():
         ]
         assert len(em_rows) == 10
 
+        # Pick a specific metric row instead of relying on JSONL order.
         log = _by_sample_and_metric(instance_logs)[
             ('id147', 'exact_match:test')
         ]
@@ -278,6 +287,8 @@ def test_per_sample_per_metric_rows_are_emitted():
             log for log in instance_logs if log.sample_id == 'id147'
         ]
         metric_ids = sorted(log.evaluation_result_id for log in rows_for_id147)
+        # This guards the core PR behavior: correctness and bookkeeping stats
+        # both survive as separate rows for the same sample.
         assert 'exact_match:test' in metric_ids
         assert 'num_prompt_tokens:test' in metric_ids
         assert all(log.evaluation_result_id is not None for log in rows_for_id147)
@@ -300,6 +311,8 @@ def test_is_correct_only_claimed_for_correctness_metrics():
             metadata_args,
         )
 
+        # Positive bookkeeping values are not correctness claims. A token
+        # count or runtime can be > 0 without the answer being correct.
         bookkeeping_names = {
             'num_references',
             'num_prompt_tokens',
@@ -332,6 +345,8 @@ def test_is_correct_only_claimed_for_correctness_metrics():
                 'tests/data/helm/narrative_qa:model=openai_gpt2',
                 metadata_args2,
             )
+            # Graded generation metrics also should not be coerced into a
+            # binary correctness label just because their scores are positive.
             graded = [
                 log
                 for log in narr_logs
@@ -362,6 +377,8 @@ def test_is_correct_is_true_for_correct_exact_match_rows():
             'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2',
             metadata_args,
         )
+        # Correctness metrics are the exception: positive exact-match rows
+        # should still carry is_correct=True.
         positive_em_rows = [
             log
             for log in instance_logs
@@ -393,6 +410,7 @@ def test_is_correct_for_metric_helper():
 
 
 def test_score_from_stat_helper_edge_cases():
+    # Empty or malformed HELM stats should be skipped, not crash conversion.
     assert _score_from_stat(SimpleNamespace(mean=0.25, sum=10, count=2)) == 0.25
     assert _score_from_stat(SimpleNamespace(mean=None, sum=3, count=2)) == 1.5
     assert _score_from_stat(SimpleNamespace(mean=None, sum=0, count=0)) is None
@@ -401,6 +419,8 @@ def test_score_from_stat_helper_edge_cases():
 
 
 def test_evaluation_result_id_helper_disambiguates_split_and_perturbation():
+    # Split and perturbation suffixes prevent same-named HELM stats from
+    # colliding when they are used as join keys.
     assert _evaluation_result_id('exact_match') == 'exact_match'
     assert _evaluation_result_id('exact_match', 'test') == 'exact_match:test'
     assert (
@@ -428,6 +448,8 @@ def test_total_rows_matches_numeric_per_instance_stats():
             adapter, fixture, metadata_args
         )
 
+        # Count expected rows from the fixture itself so duplication or
+        # accidental filtering changes are caught precisely.
         expected_rows = _expected_numeric_instance_stat_rows(fixture)
         assert converted_eval.detailed_evaluation_results.total_rows == expected_rows
         assert len(instance_logs) == expected_rows
@@ -453,6 +475,8 @@ def test_instance_evaluation_result_ids_join_to_aggregate_results():
             metadata_args,
         )
 
+        # This is the most important schema invariant: every metric-specific
+        # detail row should be joinable to one aggregate evaluation result.
         aggregate_ids = {
             result.evaluation_result_id
             for result in converted_eval.evaluation_results
@@ -485,6 +509,8 @@ def test_aggregate_evaluation_result_ids_are_unique_and_non_null():
             metadata_args,
         )
 
+        # Aggregate IDs are the target side of the join, so they must be
+        # present and unique.
         result_ids = [
             result.evaluation_result_id
             for result in converted_eval.evaluation_results
@@ -497,6 +523,8 @@ def test_aggregate_evaluation_result_ids_are_unique_and_non_null():
 
 def test_missing_inst_stats_uses_legacy_exact_match_fallback():
     _require_helm()
+    # Some old or partial HELM logs may lack per-instance stats. The adapter
+    # should still emit the legacy one-row exact-match fallback.
     completion = SimpleNamespace(text='answer')
     state = SimpleNamespace(
         instance=SimpleNamespace(
@@ -539,6 +567,8 @@ def test_reasoning_traces_none_does_not_break_conversion(monkeypatch):
     _require_helm()
     from every_eval_ever.converters.helm import instance_level_adapter as mod
 
+    # HELM reasoning extraction may legitimately return None. The converter
+    # normalizes that case instead of passing an invalid value to the schema.
     monkeypatch.setattr(mod, 'extract_all_reasonings', lambda state: None)
 
     adapter = HELMAdapter()

From 1b34a2475885fcc341badd91b233bcfafd0a3e37 Mon Sep 17 00:00:00 2001
From: joncrall <erotemic@gmail.com>
Date: Wed, 13 May 2026 19:04:42 -0400
Subject: [PATCH 5/8] Restrict HELM metric rows to core metrics

---
 every_eval_ever/converters/helm/adapter.py    |  22 ++--
 .../converters/helm/instance_level_adapter.py |  24 +++--
 every_eval_ever/converters/helm/metrics.py    | 101 ++++++++++++++++++
 tests/test_helm_instance_level_adapter.py     |  97 +++++++++--------
 4 files changed, 185 insertions(+), 59 deletions(-)
 create mode 100644 every_eval_ever/converters/helm/metrics.py

diff --git a/every_eval_ever/converters/helm/adapter.py b/every_eval_ever/converters/helm/adapter.py
index 0fa74a00b..484480829 100644
--- a/every_eval_ever/converters/helm/adapter.py
+++ b/every_eval_ever/converters/helm/adapter.py
@@ -45,6 +45,7 @@
     SupportedLibrary,
 )
 from every_eval_ever.converters.common.utils import sha256_file
+from every_eval_ever.converters.helm.metrics import is_core_metric
 from every_eval_ever.converters.helm.instance_level_adapter import (
     HELMInstanceLevelDataAdapter,
     _evaluation_result_id,
@@ -341,7 +342,7 @@ def _extract_metric_names(self, run_spec: RunSpec) -> List[str]:
 
         Kept for callers that need the run-spec metric declaration. The
         main aggregate conversion below uses ``stats.json`` so aggregate
-        IDs cover all numeric detail rows emitted from per-instance stats.
+        IDs cover all core metric rows emitted from per-instance stats.
         """
         metric_names = []
         for metric_spec in run_spec.metric_specs:
@@ -359,9 +360,9 @@ def _transform_single(
         """Convert one HELM run into aggregate JSON plus detail JSONL.
 
         The aggregate ``evaluation_result_id`` values are generated from
-        ``stats.json`` with the same helper used by the instance converter
-        so every metric-specific detail row can join back to an aggregate
-        result.
+        core metrics in ``stats.json`` with the same helper used by the
+        instance converter so every metric-specific detail row can join
+        back to an aggregate result.
         """
         run_spec = from_dict(data_class=RunSpec, data=raw_data['run_spec_dict'])
         # cast=[str] coerces int instance IDs to str; newer HELM versions
@@ -419,10 +420,13 @@ def _transform_single(
 
         evaluation_id = f'{source_data.dataset_name}/{model_info.id.replace("/", "_")}/{evaluation_timestamp}'
 
-        # Build aggregate results from the numeric HELM stats themselves, not
+        # Build aggregate results from core HELM stats themselves, not
         # only from run_spec.metric_specs. The instance-level converter emits
-        # one row per numeric per-instance stat, so aggregate IDs must cover
-        # the same stat namespace for detailed rows to be joinable.
+        # one row per core per-instance stat, so aggregate IDs must cover
+        # the same core namespace for detailed rows to be joinable.
+        # TODO: Consider promoting bookkeeping telemetry into structured
+        # fields such as token_usage, performance, metadata, or
+        # additional_details in a separate follow-up.
         evaluation_results: List[EvaluationResult] = []
         seen_evaluation_result_ids: set[str] = set()
 
@@ -431,6 +435,8 @@ def _transform_single(
             # key invariant: detail rows should never introduce metric IDs that
             # are absent from aggregate evaluation_results.
             metric_name = getattr(getattr(stat, 'name', None), 'name', None)
+            if not is_core_metric(metric_name):
+                continue
             score = _score_from_stat(stat)
             if metric_name is None or score is None:
                 continue
@@ -451,7 +457,7 @@ def _transform_single(
                 lower_is_better=False,  # TODO schema.json check
                 score_type=ScoreType.continuous,
                 min_score=0,
-                max_score=max(1.0, score),
+                max_score=1.0,
             )
 
             split = getattr(stat.name, 'split', None)
diff --git a/every_eval_ever/converters/helm/instance_level_adapter.py b/every_eval_ever/converters/helm/instance_level_adapter.py
index 199c4009a..f73383aa5 100644
--- a/every_eval_ever/converters/helm/instance_level_adapter.py
+++ b/every_eval_ever/converters/helm/instance_level_adapter.py
@@ -22,6 +22,7 @@ def _require_helm_dependencies() -> None:
 
 from every_eval_ever.converters import SCHEMA_VERSION
 from every_eval_ever.converters.common.utils import sha256_string
+from every_eval_ever.converters.helm.metrics import is_core_metric
 from every_eval_ever.converters.helm.utils import extract_all_reasonings
 from every_eval_ever.instance_level_types import (
     AnswerAttributionItem,
@@ -172,9 +173,9 @@ def convert_instance_level_logs(
     ) -> Tuple[str, int]:
         """Convert HELM request states into per-(sample, metric) rows.
 
-        When HELM per-instance stats are present, each numeric stat gets
-        its own detail row. If stats are absent, keep the legacy one-row
-        exact-match fallback so older or partial logs still convert.
+        When HELM per-instance stats are present, each core metric gets
+        its own detail row. If core metrics are absent, keep the legacy
+        one-row exact-match fallback so older or partial logs still convert.
         """
         instance_level_logs: List[InstanceLevelEvaluationLog] = []
         for state in request_states:
@@ -206,7 +207,14 @@ def convert_instance_level_logs(
                 trace for trace in reasoning_traces if isinstance(trace, str)
             ]
 
-            metric_stats = list(inst_stats.stats) if inst_stats else []
+            all_stats = list(inst_stats.stats) if inst_stats else []
+            metric_stats = [
+                stat
+                for stat in all_stats
+                if is_core_metric(
+                    getattr(getattr(stat, 'name', None), 'name', None)
+                )
+            ]
             if not metric_stats:
                 correct_completions = sum(
                     1 for c in completions if c.strip() in correct_refs
@@ -218,9 +226,13 @@ def convert_instance_level_logs(
                 )
                 metric_stats = [None]
 
+            # Scope control: only core HELM metrics become metric rows.
+            # TODO: Consider preserving additional bookkeeping telemetry in
+            # token_usage, performance.additional_details, or metadata.
+
             # Token usage is copied to every row for the same sample. This is
-            # intentionally denormalized so each detail row is independently
-            # useful when filtered by metric.
+            # intentionally denormalized so each core metric row is
+            # independently useful when filtered by metric.
             token_usage = None
             if inst_stats:
                 p_tokens = next(
diff --git a/every_eval_ever/converters/helm/metrics.py b/every_eval_ever/converters/helm/metrics.py
new file mode 100644
index 000000000..7b9956d39
--- /dev/null
+++ b/every_eval_ever/converters/helm/metrics.py
@@ -0,0 +1,101 @@
+"""HELM metric classification helpers.
+
+Only core metrics are promoted to EEE aggregate/detail metric rows.
+Bookkeeping stats are intentionally kept out of ``evaluation_result_id``
+rows; selected telemetry can be mapped to structured fields such as
+``token_usage`` or ``performance``.
+"""
+
+from __future__ import annotations
+
+from typing import Optional
+
+
+class METRIC_PREFIXES:
+    """Registry of HELM metric prefixes used by the converter."""
+
+    CORE_PREFIXES: tuple[str, ...] = (
+        'exact_match',
+        'quasi_exact_match',
+        'prefix_exact_match',
+        'quasi_prefix_exact_match',
+        'classification_micro_f1',
+        'classification_macro_f1',
+        'f1_score',
+        'rouge_l',
+        'bleu_',
+        'ifeval_strict_accuracy',
+        'wildbench_score',
+        'wildbench_score_rescaled',
+        'omni_math_accuracy',
+        'chain_of_thought_correctness',
+        'math_equiv',
+        'math_equiv_chain_of_thought',
+        'safety_score',
+        'safety_gpt_score',
+        'safety_llama_score',
+        'air_score',
+        'air_category_',
+    )
+
+    BOOKKEEPING_PREFIXES: tuple[str, ...] = (
+        # token/size/runtime/resource accounting
+        'num_',
+        'training_',
+        'inference_',
+        'batch_size',
+        'max_prob',
+        'logprob',
+        'num_perplexity_tokens',
+        'num_bytes',
+        'perplexity',
+        'bits_per_byte',
+        'logprob_per_byte',
+        # decoding/stopping bookkeeping
+        'finish_reason_',
+        'prompt_truncated',
+        # calibration/fitting plumbing
+        'ece_',
+        'platt_',
+        'selective_',
+        # meta/dataset sizing
+        'num_instances',
+        'num_train_',
+        'num_references',
+    )
+
+
+def classify_metric(metric_name: Optional[str]) -> tuple[str, str | None]:
+    """Classify a HELM metric as core, bookkeeping, or untracked."""
+    if not metric_name:
+        return ('untracked', None)
+    for prefix in METRIC_PREFIXES.CORE_PREFIXES:
+        if metric_name.startswith(prefix):
+            return ('core', prefix)
+    for prefix in METRIC_PREFIXES.BOOKKEEPING_PREFIXES:
+        if metric_name.startswith(prefix):
+            return ('bookkeeping', prefix)
+    return ('untracked', None)
+
+
+def is_core_metric(metric_name: Optional[str]) -> bool:
+    """Return True when a HELM stat should become an EEE metric row."""
+    metric_class, _ = classify_metric(metric_name)
+    return metric_class == 'core'
+
+
+def metric_family(metric_name: Optional[str]) -> str:
+    """Return a lightweight metric family name for summaries/debugging."""
+    if not metric_name:
+        return '?'
+    if metric_name.startswith('air_'):
+        return 'air'
+    if metric_name.startswith('bias_metric:'):
+        return 'bias_metric'
+    if metric_name.startswith('safety_'):
+        return 'safety'
+    if metric_name.startswith('bbq_'):
+        return 'bbq'
+    if '@' in metric_name:
+        return metric_name.split('@', 1)[0]
+    return metric_name.split('_', 1)[0].split(':', 1)[0]
diff --git a/tests/test_helm_instance_level_adapter.py b/tests/test_helm_instance_level_adapter.py
index c6e1ef4c4..98e21fcc3 100644
--- a/tests/test_helm_instance_level_adapter.py
+++ b/tests/test_helm_instance_level_adapter.py
@@ -7,6 +7,7 @@
 
 from every_eval_ever.converters.helm import adapter as helm_adapter_module
 from every_eval_ever.converters.helm.adapter import HELMAdapter
+from every_eval_ever.converters.helm.metrics import is_core_metric
 from every_eval_ever.converters.helm.instance_level_adapter import (
     HELMInstanceLevelDataAdapter,
     _BINARY_CORRECTNESS_METRIC_NAMES,
@@ -95,15 +96,16 @@ def _json_score_from_stat(stat: dict) -> float | None:
         return None
 
 
-def _expected_numeric_instance_stat_rows(filepath):
-    """Count fixture stats that should become instance-level rows."""
+def _expected_core_instance_stat_rows(filepath):
+    """Count core fixture stats that should become detail rows."""
     per_instance_path = Path(filepath) / 'per_instance_stats.json'
     per_instance_stats = json.loads(per_instance_path.read_text())
     return sum(
         1
         for item in per_instance_stats
         for stat in item.get('stats', [])
-        if _json_score_from_stat(stat) is not None
+        if is_core_metric(stat.get('name', {}).get('name'))
+        and _json_score_from_stat(stat) is not None
     )
 
 
@@ -268,7 +270,7 @@ def test_narrativeqa_instance_level():
         assert log.answer_attribution[0].extraction_method == 'exact_match'
 
 
-def test_per_sample_per_metric_rows_are_emitted():
+def test_per_sample_core_metric_rows_are_emitted():
     _require_helm()
     adapter = HELMAdapter()
     with tempfile.TemporaryDirectory() as tmpdir:
@@ -287,15 +289,17 @@ def test_per_sample_per_metric_rows_are_emitted():
             log for log in instance_logs if log.sample_id == 'id147'
         ]
         metric_ids = sorted(log.evaluation_result_id for log in rows_for_id147)
-        # This guards the core PR behavior: correctness and bookkeeping stats
-        # both survive as separate rows for the same sample.
+        # This guards the core PR behavior: HELM evaluation metrics survive
+        # as separate rows, while bookkeeping stats remain out-of-band.
         assert 'exact_match:test' in metric_ids
-        assert 'num_prompt_tokens:test' in metric_ids
+        assert 'quasi_exact_match:test' in metric_ids
+        assert 'num_prompt_tokens:test' not in metric_ids
+        assert 'inference_runtime:test' not in metric_ids
         assert all(log.evaluation_result_id is not None for log in rows_for_id147)
         assert len(metric_ids) == len(set(metric_ids))
 
 
-def test_is_correct_only_claimed_for_correctness_metrics():
+def test_bookkeeping_stats_are_not_emitted_as_metric_rows():
     _require_helm()
     adapter = HELMAdapter()
     with tempfile.TemporaryDirectory() as tmpdir:
@@ -312,7 +316,8 @@ def test_is_correct_only_claimed_for_correctness_metrics():
         )
 
         # Positive bookkeeping values are not correctness claims. A token
-        # count or runtime can be > 0 without the answer being correct.
+        # count or runtime can be > 0 without the answer being correct, so
+        # these stats should not become metric rows at all.
         bookkeeping_names = {
             'num_references',
             'num_prompt_tokens',
@@ -325,41 +330,43 @@ def test_is_correct_only_claimed_for_correctness_metrics():
             'num_bytes',
             'batch_size',
         }
-        bookkeeping = [
-            log
+        emitted_metric_names = {
+            _metric_name_from_result_id(log.evaluation_result_id)
             for log in instance_logs
+        }
+        assert emitted_metric_names.isdisjoint(bookkeeping_names)
+
+
+def test_graded_core_metrics_are_not_binary_correctness():
+    _require_helm()
+    adapter = HELMAdapter()
+    with tempfile.TemporaryDirectory() as tmpdir2:
+        metadata_args2 = {
+            'source_organization_name': 'TestOrg',
+            'evaluator_relationship': EvaluatorRelationship.first_party,
+            'parent_eval_output_dir': tmpdir2,
+            'file_uuid': 'test_correctness_graded',
+        }
+        _, narr_logs = _load_instance_level_data(
+            adapter,
+            'tests/data/helm/narrative_qa:model=openai_gpt2',
+            metadata_args2,
+        )
+        # Graded generation metrics also should not be coerced into a
+        # binary correctness label just because their scores are positive.
+        graded = [
+            log
+            for log in narr_logs
             if _metric_name_from_result_id(log.evaluation_result_id)
-            in bookkeeping_names
+            in {'rouge_l', 'f1_score', 'bleu_1', 'bleu_4'}
         ]
-        assert bookkeeping, 'expected bookkeeping rows to be emitted'
+        assert graded, 'expected graded score rows in narrative_qa fixture'
         assert all(
-            log.evaluation.is_correct is False for log in bookkeeping
-        ), 'bookkeeping metrics must not claim correctness'
-
-        with tempfile.TemporaryDirectory() as tmpdir2:
-            metadata_args2 = dict(metadata_args)
-            metadata_args2['parent_eval_output_dir'] = tmpdir2
-            metadata_args2['file_uuid'] = 'test_correctness_graded'
-            _, narr_logs = _load_instance_level_data(
-                adapter,
-                'tests/data/helm/narrative_qa:model=openai_gpt2',
-                metadata_args2,
-            )
-            # Graded generation metrics also should not be coerced into a
-            # binary correctness label just because their scores are positive.
-            graded = [
-                log
-                for log in narr_logs
-                if _metric_name_from_result_id(log.evaluation_result_id)
-                in {'rouge_l', 'f1_score', 'bleu_1', 'bleu_4'}
-            ]
-            assert graded, 'expected graded score rows in narrative_qa fixture'
-            assert all(
-                log.evaluation.is_correct is False for log in graded
-            ), (
-                'graded metrics (rouge_l/f1_score/bleu_*) must not be '
-                'treated as binary correctness'
-            )
+            log.evaluation.is_correct is False for log in graded
+        ), (
+            'graded metrics (rouge_l/f1_score/bleu_*) must not be '
+            'treated as binary correctness'
+        )
 
 
 def test_is_correct_is_true_for_correct_exact_match_rows():
@@ -433,7 +440,7 @@ def test_evaluation_result_id_helper_disambiguates_split_and_perturbation():
     )
 
 
-def test_total_rows_matches_numeric_per_instance_stats():
+def test_total_rows_matches_core_per_instance_stats():
     _require_helm()
     fixture = 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2'
     adapter = HELMAdapter()
@@ -448,9 +455,9 @@ def test_total_rows_matches_numeric_per_instance_stats():
             adapter, fixture, metadata_args
         )
 
-        # Count expected rows from the fixture itself so duplication or
-        # accidental filtering changes are caught precisely.
-        expected_rows = _expected_numeric_instance_stat_rows(fixture)
+        # Count expected core metric rows from the fixture itself so
+        # duplication or accidental filtering changes are caught precisely.
+        expected_rows = _expected_core_instance_stat_rows(fixture)
         assert converted_eval.detailed_evaluation_results.total_rows == expected_rows
         assert len(instance_logs) == expected_rows
         assert len({
@@ -518,7 +525,7 @@ def test_aggregate_evaluation_result_ids_are_unique_and_non_null():
         assert all(result_ids)
         assert len(result_ids) == len(set(result_ids))
         assert 'exact_match:test' in result_ids
-        assert 'num_prompt_tokens:test' in result_ids
+        assert 'num_prompt_tokens:test' not in result_ids
 
 
 def test_missing_inst_stats_uses_legacy_exact_match_fallback():

From e86c5f0a10fe468afb42a7dad4a34999d6d6b7f4 Mon Sep 17 00:00:00 2001
From: joncrall <erotemic@gmail.com>
Date: Wed, 13 May 2026 19:47:58 -0400
Subject: [PATCH 6/8] Simplify HELM core metric filter

---
 every_eval_ever/converters/helm/metrics.py | 114 +++++----------------
 1 file changed, 23 insertions(+), 91 deletions(-)

diff --git a/every_eval_ever/converters/helm/metrics.py b/every_eval_ever/converters/helm/metrics.py
index 7b9956d39..10a509697 100644
--- a/every_eval_ever/converters/helm/metrics.py
+++ b/every_eval_ever/converters/helm/metrics.py
@@ -1,101 +1,33 @@
-"""HELM metric classification helpers.
-
-Only core metrics are promoted to EEE aggregate/detail metric rows.
-Bookkeeping stats are intentionally kept out of ``evaluation_result_id``
-rows; selected telemetry can be mapped to structured fields such as
-``token_usage`` or ``performance``.
-"""
+"""HELM metric filtering helpers."""
 
 from __future__ import annotations
 
 from typing import Optional
 
 
-class METRIC_PREFIXES:
-    """Registry of HELM metric prefixes used by the converter."""
-
-    CORE_PREFIXES: tuple[str, ...] = (
-        'exact_match',
-        'quasi_exact_match',
-        'prefix_exact_match',
-        'quasi_prefix_exact_match',
-        'classification_micro_f1',
-        'classification_macro_f1',
-        'f1_score',
-        'rouge_l',
-        'bleu_',
-        'ifeval_strict_accuracy',
-        'wildbench_score',
-        'wildbench_score_rescaled',
-        'omni_math_accuracy',
-        'chain_of_thought_correctness',
-        'math_equiv',
-        'math_equiv_chain_of_thought',
-        'safety_score',
-        'safety_gpt_score',
-        'safety_llama_score',
-        'air_score',
-        'air_category_',
-    )
-
-    BOOKKEEPING_PREFIXES: tuple[str, ...] = (
-        # token/size/runtime/resource accounting
-        'num_',
-        'training_',
-        'inference_',
-        'batch_size',
-        'max_prob',
-        'logprob',
-        'num_perplexity_tokens',
-        'num_bytes',
-        'perplexity',
-        'bits_per_byte',
-        'logprob_per_byte',
-        # decoding/stopping bookkeeping
-        'finish_reason_',
-        'prompt_truncated',
-        # calibration/fitting plumbing
-        'ece_',
-        'platt_',
-        'selective_',
-        # meta/dataset sizing
-        'num_instances',
-        'num_train_',
-        'num_references',
-    )
-
-
-def classify_metric(metric_name: Optional[str]) -> tuple[str, str | None]:
-    """Classify a HELM metric as core, bookkeeping, or untracked."""
-    if not metric_name:
-        return ('untracked', None)
-    for prefix in METRIC_PREFIXES.CORE_PREFIXES:
-        if metric_name.startswith(prefix):
-            return ('core', prefix)
-    for prefix in METRIC_PREFIXES.BOOKKEEPING_PREFIXES:
-        if metric_name.startswith(prefix):
-            return ('bookkeeping', prefix)
-    return ('untracked', None)
+# HELM emits both benchmark metrics and bookkeeping telemetry in stats.json /
+# per_instance_stats.json. In this PR, only benchmark-quality metrics become
+# EEE aggregate/detail metric rows. Bookkeeping can be mapped to token_usage,
+# performance, metadata, or additional_details in a future follow-up.
+CORE_METRIC_PREFIXES: tuple[str, ...] = (
+    'exact_match',
+    'quasi_exact_match',
+    'prefix_exact_match',
+    'quasi_prefix_exact_match',
+    'classification_micro_f1',
+    'classification_macro_f1',
+    'f1_score',
+    'rouge_l',
+    'bleu_',
+    'ifeval_strict_accuracy',
+    'chain_of_thought_correctness',
+    'math_equiv',
+    'math_equiv_chain_of_thought',
+)
 
 
 def is_core_metric(metric_name: Optional[str]) -> bool:
     """Return True when a HELM stat should become an EEE metric row."""
-    metric_class, _ = classify_metric(metric_name)
-    return metric_class == 'core'
-
-
-def metric_family(metric_name: Optional[str]) -> str:
-    """Return a lightweight metric family name for summaries/debugging."""
-    if not metric_name:
-        return '?'
-    if metric_name.startswith('air_'):
-        return 'air'
-    if metric_name.startswith('bias_metric:'):
-        return 'bias_metric'
-    if metric_name.startswith('safety_'):
-        return 'safety'
-    if metric_name.startswith('bbq_'):
-        return 'bbq'
-    if '@' in metric_name:
-        return metric_name.split('@', 1)[0]
-    return metric_name.split('_', 1)[0].split(':', 1)[0]
+    return bool(metric_name) and any(
+        metric_name.startswith(prefix) for prefix in CORE_METRIC_PREFIXES
+    )

From e7de0290bfbc12d5f1c9170dd7c76e8ca640e770 Mon Sep 17 00:00:00 2001
From: joncrall <erotemic@gmail.com>
Date: Wed, 13 May 2026 19:53:34 -0400
Subject: [PATCH 7/8] Clean up HELM core metric conversion

---
 every_eval_ever/converters/helm/adapter.py    | 40 ++++++-------------
 .../converters/helm/instance_level_adapter.py | 21 +++++-----
 tests/test_helm_adapter.py                    | 19 ++++++---
 3 files changed, 36 insertions(+), 44 deletions(-)

diff --git a/every_eval_ever/converters/helm/adapter.py b/every_eval_ever/converters/helm/adapter.py
index 484480829..e9d833d1d 100644
--- a/every_eval_ever/converters/helm/adapter.py
+++ b/every_eval_ever/converters/helm/adapter.py
@@ -3,7 +3,7 @@
 import os
 import uuid
 from pathlib import Path
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List
 
 _HELM_IMPORT_ERROR: Exception | None = None
 try:
@@ -70,7 +70,6 @@
     SourceType,
     Uncertainty,
 )
-from every_eval_ever.instance_level_types import InstanceLevelEvaluationLog
 
 
 def _require_helm_dependencies() -> None:
@@ -220,7 +219,6 @@ def transform_from_directory(
         Transforms HELM results into one aggregate EvaluationLog and one
         instance-level JSONL file containing all samples.
         """
-        # all_instance_logs: List[InstanceLevelEvaluationLog] = []
         aggregate_logs: List[EvaluationLog] = []
 
         file_uuids = metadata_args.get('file_uuids')
@@ -264,10 +262,6 @@ def transform_from_directory(
                     aggregate_logs.append(agg)
                     converted_idx += 1
 
-        # # Write all consolidated instance logs to JSONL
-        # with open(output_path, 'w', encoding='utf-8') as f:
-        #     for log in all_instance_logs:
-        #         f.write(json.dumps(log.model_dump(), ensure_ascii=False) + '\n')
 
         return aggregate_logs
 
@@ -337,26 +331,9 @@ def _extract_dataset_name(
 
         return run_spec_name.split(':')[0]
 
-    def _extract_metric_names(self, run_spec: RunSpec) -> List[str]:
-        """Return metric names declared by the HELM run spec.
-
-        Kept for callers that need the run-spec metric declaration. The
-        main aggregate conversion below uses ``stats.json`` so aggregate
-        IDs cover all core metric rows emitted from per-instance stats.
-        """
-        metric_names = []
-        for metric_spec in run_spec.metric_specs:
-            names = metric_spec.args.get('names')
-            if names:
-                metric_names.extend(names)
-            else:
-                metric_names.append(metric_spec.class_name.split('.')[-1])
-
-        return metric_names
-
     def _transform_single(
         self, raw_data: Dict, metadata_args: Dict[str, Any]
-    ) -> Tuple[EvaluationLog, List[InstanceLevelEvaluationLog]]:
+    ) -> EvaluationLog:
         """Convert one HELM run into aggregate JSON plus detail JSONL.
 
         The aggregate ``evaluation_result_id`` values are generated from
@@ -441,6 +418,8 @@ def _transform_single(
             if metric_name is None or score is None:
                 continue
 
+            stat_count = getattr(stat, 'count', None)
+
             evaluation_result_id = _evaluation_result_id(
                 metric_name,
                 getattr(stat.name, 'split', None),
@@ -482,8 +461,15 @@ def _transform_single(
                         score=score,
                         uncertainty=Uncertainty(
                             standard_deviation=getattr(stat, 'stddev', None),
-                            num_samples=adapter_spec.max_eval_instances
-                            or len(request_states),
+                            # Split-specific HELM stats may cover fewer
+                            # examples than the full run, so use the stat's
+                            # own count when it is available.
+                            num_samples=(
+                                stat_count
+                                if stat_count is not None
+                                else adapter_spec.max_eval_instances
+                                or len(request_states)
+                            ),
                         ),
                         details={
                             'count': str(getattr(stat, 'count', '')),
diff --git a/every_eval_ever/converters/helm/instance_level_adapter.py b/every_eval_ever/converters/helm/instance_level_adapter.py
index f73383aa5..ef1ae868a 100644
--- a/every_eval_ever/converters/helm/instance_level_adapter.py
+++ b/every_eval_ever/converters/helm/instance_level_adapter.py
@@ -40,8 +40,8 @@ def _score_from_stat(stat) -> float | None:
     """Return a scalar HELM stat value, or None for empty/bad stats.
 
     HELM usually provides ``mean``; some stats only have ``sum`` and
-    ``count``. Returning None lets callers skip non-valued bookkeeping
-    rows such as zero-count training cost stats.
+    ``count``. Returning None lets callers skip stat rows that do not
+    contain a usable scalar value.
     """
     value = getattr(stat, 'mean', None)
     if value is None:
@@ -94,12 +94,9 @@ def _evaluation_result_id(
 
 
 # Metric names whose per-instance score is a correctness signal in [0, 1]
-# where ``score > 0`` reasonably maps to ``is_correct=True``. Anything not
-# in this allowlist (token counts, runtime, finish-reason flags, logprobs,
-# etc.) gets ``is_correct=False`` because we have no correctness claim
-# from a bookkeeping/resource metric. Keep this list tight and named after
-# the actual HELM stat names — broaden only for verified correctness
-# semantics.
+# where ``score > 0`` reasonably maps to ``is_correct=True``. Keep this list
+# tight: graded core metrics such as rouge/bleu/f1 should stay out of it
+# because a positive score is not the same as a binary correctness claim.
 _BINARY_CORRECTNESS_METRIC_NAMES: frozenset[str] = frozenset({
     'exact_match',
     'quasi_exact_match',
@@ -121,9 +118,8 @@ def _is_correct_for_metric(metric_name: str | None, score: float) -> bool:
 
     For correctness metrics in the allowlist, the HELM convention is that
     score==1.0 means correct and 0.0 means wrong, so any positive score
-    rounds up to "correct". For anything else (bookkeeping / resource
-    stats, or graded metrics like rouge_l/bleu where >0 is not a correctness
-    signal) we deliberately do not claim correctness.
+    rounds up to "correct". For graded metrics like rouge_l/bleu/f1 where
+    >0 is not a correctness signal, we deliberately do not claim correctness.
     """
     if metric_name is None:
         return False
@@ -216,6 +212,9 @@ def convert_instance_level_logs(
                 )
             ]
             if not metric_stats:
+                # Preserve the legacy exact-match proxy instead of dropping
+                # samples that have no per-instance stats or no recognized
+                # core metric rows.
                 correct_completions = sum(
                     1 for c in completions if c.strip() in correct_refs
                 )
diff --git a/tests/test_helm_adapter.py b/tests/test_helm_adapter.py
index d3fe08225..5d2bbb9a0 100644
--- a/tests/test_helm_adapter.py
+++ b/tests/test_helm_adapter.py
@@ -1,12 +1,9 @@
 import pytest
 
-pytest.importorskip(
-    'helm', reason='crfm-helm not installed; install with: uv sync --extra helm'
-)
-
 import tempfile
 from pathlib import Path
 
+from every_eval_ever.converters.helm import adapter as helm_adapter_module
 from every_eval_ever.converters.helm.adapter import HELMAdapter
 from every_eval_ever.eval_types import (
     EvaluationLog,
@@ -16,6 +13,16 @@
 )
 
 
+pytestmark = pytest.mark.skipif(
+    helm_adapter_module._HELM_IMPORT_ERROR is not None,
+    reason=(
+        'HELM converter dependencies are missing: '
+        f'{helm_adapter_module._HELM_IMPORT_ERROR!r}. '
+        'Install with: uv sync --extra helm'
+    ),
+)
+
+
 def _load_eval(adapter, filepath, metadata_args):
     """Run the HELM aggregate adapter against one fixture directory."""
     eval_dirpath = Path(filepath)
@@ -133,7 +140,7 @@ def test_hellswag_eval():
 
     assert converted_eval.detailed_evaluation_results is not None
     assert converted_eval.detailed_evaluation_results.format is not None
-    # Per-(sample, metric): >= sample count, not equal to it.
+    # Per-(sample, core metric): >= sample count, not equal to it.
     assert converted_eval.detailed_evaluation_results.total_rows >= 10
 
 
@@ -172,7 +179,7 @@ def test_narrativeqa_eval():
 
     assert converted_eval.detailed_evaluation_results is not None
     assert converted_eval.detailed_evaluation_results.format is not None
-    # Per-(sample, metric): >= sample count, not equal to it.
+    # Per-(sample, core metric): >= sample count, not equal to it.
     assert converted_eval.detailed_evaluation_results.total_rows >= 5
 
 

From 43011f5c566696131af707aa506d446650633d25 Mon Sep 17 00:00:00 2001
From: joncrall <erotemic@gmail.com>
Date: Wed, 13 May 2026 19:58:08 -0400
Subject: [PATCH 8/8] Clean up HELM converter typing

---
 every_eval_ever/converters/helm/adapter.py    | 36 ++++++++++++-------
 .../converters/helm/instance_level_adapter.py | 13 ++++---
 2 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/every_eval_ever/converters/helm/adapter.py b/every_eval_ever/converters/helm/adapter.py
index e9d833d1d..d57d7f30f 100644
--- a/every_eval_ever/converters/helm/adapter.py
+++ b/every_eval_ever/converters/helm/adapter.py
@@ -3,7 +3,7 @@
 import os
 import uuid
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Union, cast
 
 _HELM_IMPORT_ERROR: Exception | None = None
 try:
@@ -29,14 +29,18 @@
     Exception
 ) as ex:  # pragma: no cover - exercised only when optional deps missing
     _HELM_IMPORT_ERROR = ex
-    DaciteConfig = from_dict = None  # type: ignore[assignment]
-    PerInstanceStats = AdapterSpec = RequestState = ScenarioState = Stat = (
-        RunSpec
-    ) = Any  # type: ignore[assignment]
-    get_model_deployment = register_builtin_configs_from_helm_package = (
-        from_json
-    ) = None  # type: ignore[assignment]
-    ModelDeploymentNotFoundError = Exception  # type: ignore[assignment]
+    DaciteConfig = cast(Any, None)
+    from_dict = cast(Any, None)
+    PerInstanceStats = cast(Any, None)
+    AdapterSpec = cast(Any, None)
+    RequestState = cast(Any, None)
+    ScenarioState = cast(Any, None)
+    Stat = cast(Any, None)
+    RunSpec = cast(Any, None)
+    get_model_deployment = cast(Any, None)
+    register_builtin_configs_from_helm_package = cast(Any, None)
+    from_json = cast(Any, None)
+    ModelDeploymentNotFoundError = cast(Any, Exception)
 
 from every_eval_ever.converters import SCHEMA_VERSION
 from every_eval_ever.converters.common.adapter import (
@@ -108,6 +112,7 @@ def metadata(self) -> AdapterMetadata:
         return AdapterMetadata(
             name='HELMAdapter',
             version='0.0.1',
+            supported_library_versions=['helm'],
             description='HELM adapter with dynamic metrics and unified JSONL instance logging',
         )
 
@@ -131,7 +136,8 @@ def _split_model_id(self, model_id: str | None) -> tuple[str, str]:
         if not model_id:
             return ('unknown', 'unknown')
         if '/' in model_id:
-            return tuple(model_id.split('/', 1))
+            developer, name = model_id.split('/', 1)
+            return (developer, name)
         return ('unknown', model_id)
 
     def _extract_model_info(self, adapter_spec: AdapterSpec) -> ModelInfo:
@@ -213,13 +219,18 @@ def _load_evaluation_run_logfiles(self, dir_path) -> Dict:
         }
 
     def transform_from_directory(
-        self, dir_path: str, output_path: str, metadata_args: Dict[str, Any]
-    ):
+        self,
+        dir_path: str | Path,
+        metadata_args: Dict[str, Any] | None = None,
+        output_path: str | None = None,
+    ) -> List[EvaluationLog]:
         """
         Transforms HELM results into one aggregate EvaluationLog and one
         instance-level JSONL file containing all samples.
         """
         aggregate_logs: List[EvaluationLog] = []
+        metadata_args = metadata_args or {}
+        dir_path = str(dir_path)
 
         file_uuids = metadata_args.get('file_uuids')
 
@@ -262,7 +273,6 @@ def transform_from_directory(
                     aggregate_logs.append(agg)
                     converted_idx += 1
 
-
         return aggregate_logs
 
     def _extract_generation_args(
diff --git a/every_eval_ever/converters/helm/instance_level_adapter.py b/every_eval_ever/converters/helm/instance_level_adapter.py
index ef1ae868a..bd0442a89 100644
--- a/every_eval_ever/converters/helm/instance_level_adapter.py
+++ b/every_eval_ever/converters/helm/instance_level_adapter.py
@@ -1,6 +1,6 @@
 import json
 from pathlib import Path
-from typing import Any, List, Tuple
+from typing import Any, List, Tuple, cast
 
 _HELM_IMPORT_ERROR: Exception | None = None
 try:
@@ -9,7 +9,7 @@
     Exception
 ) as ex:  # pragma: no cover - exercised only when optional deps missing
     _HELM_IMPORT_ERROR = ex
-    RequestState = Any  # type: ignore[assignment]
+    RequestState = cast(Any, None)
 
 
 def _require_helm_dependencies() -> None:
@@ -335,9 +335,12 @@ def convert_instance_level_logs(
                         ),
                         token_usage=token_usage,
                         performance=Performance(
-                            generation_time_ms=state.result.request_time * 1000
-                            if state.result.request_time
-                            else None
+                            generation_time_ms=(
+                                state.result.request_time * 1000
+                                if state.result
+                                and state.result.request_time
+                                else None
+                            )
                         ),
                     )
                 )