From 4353b80019b0e188b6f2f6b35cf1a51dfdd4923b Mon Sep 17 00:00:00 2001 From: agent Date: Sat, 25 Apr 2026 16:37:54 +0000 Subject: [PATCH 1/8] Fix HELM EEE instance metric rows --- .../converters/helm/instance_level_adapter.py | 200 ++++++++++----- tests/test_helm_adapter.py | 11 +- tests/test_helm_instance_level_adapter.py | 241 +++++++++++++++++- 3 files changed, 376 insertions(+), 76 deletions(-) diff --git a/every_eval_ever/converters/helm/instance_level_adapter.py b/every_eval_ever/converters/helm/instance_level_adapter.py index 037237ecc..e675c7ff7 100644 --- a/every_eval_ever/converters/helm/instance_level_adapter.py +++ b/every_eval_ever/converters/helm/instance_level_adapter.py @@ -35,6 +35,60 @@ def _require_helm_dependencies() -> None: ) +def _score_from_stat(stat) -> float | None: + value = getattr(stat, 'mean', None) + if value is None: + count = getattr(stat, 'count', None) + total = getattr(stat, 'sum', None) + if count: + value = total / count + if value is None: + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +# Metric names whose per-instance score is a correctness signal in [0, 1] +# where ``score > 0`` reasonably maps to ``is_correct=True``. Anything not +# in this allowlist (token counts, runtime, finish-reason flags, logprobs, +# etc.) gets ``is_correct=False`` because we have no correctness claim +# from a bookkeeping/resource metric. Keep this list tight and named after +# the actual HELM stat names — broaden only for verified correctness +# semantics. +_BINARY_CORRECTNESS_METRIC_NAMES: frozenset[str] = frozenset({ + 'exact_match', + 'quasi_exact_match', + 'prefix_exact_match', + 'quasi_prefix_exact_match', + 'exact_match@5', + 'quasi_exact_match@5', + 'prefix_exact_match@5', + 'quasi_prefix_exact_match@5', + 'ifeval_strict_accuracy', + 'chain_of_thought_correctness', + 'math_equiv', + 'math_equiv_chain_of_thought', +}) + + +def _is_correct_for_metric(metric_name: str | None, score: float) -> bool: + """Decide ``is_correct`` honestly per metric name. + + For correctness metrics in the allowlist, the HELM convention is that + score==1.0 means correct and 0.0 means wrong, so any positive score + rounds up to "correct". For anything else (bookkeeping / resource + stats, or graded metrics like rouge_l/bleu where >0 is not a correctness + signal) we deliberately do not claim correctness. + """ + if metric_name is None: + return False + if metric_name in _BINARY_CORRECTNESS_METRIC_NAMES: + return score > 0 + return False + + class HELMInstanceLevelDataAdapter: def __init__( self, @@ -97,27 +151,23 @@ def convert_instance_level_logs( reasoning_traces = extract_all_reasonings(state) if isinstance(reasoning_traces, str): reasoning_traces = [reasoning_traces] + if reasoning_traces is None: + reasoning_traces = [] + reasoning_traces = [ + trace for trace in reasoning_traces if isinstance(trace, str) + ] - is_correct = False - score = 0.0 - if inst_stats: - em_stat = next( - ( - s - for s in inst_stats.stats - if s.name.name == 'exact_match' - ), - None, + metric_stats = list(inst_stats.stats) if inst_stats else [] + if not metric_stats: + correct_completions = sum( + 1 for c in completions if c.strip() in correct_refs ) - if em_stat: - score = em_stat.mean - is_correct = em_stat.mean > 0 - else: # TODO check for more specific tasks - correct_completions = sum( - 1 for c in completions if c.strip() in correct_refs - ) - score = correct_completions / len(completions) - is_correct = score > 0 + fallback_score = ( + correct_completions / len(completions) + if completions + else 0.0 + ) + metric_stats = [None] token_usage = None if inst_stats: @@ -155,56 +205,72 @@ def convert_instance_level_logs( total_tokens=int(p_tokens + c_tokens), ) - instance_level_logs.append( - InstanceLevelEvaluationLog( - schema_version=SCHEMA_VERSION, - evaluation_id=self.evaluation_id, - model_id=model_id, - evaluation_name=evaluation_name, - sample_id=str(state.instance.id), - sample_hash=sha256_string( - state.request.prompt + (correct_refs[0] if correct_refs else '') - ), # TODO use all references - interaction_type=InteractionType.single_turn, - input=Input( - raw=state.request.prompt, - reference=correct_refs if correct_refs else [], - choices=( - list(state.output_mapping.values()) - if state.output_mapping - else [ - ref.output.text - for ref in state.instance.references - ] + for stat in metric_stats: + if stat is None: + metric_name = None + score = fallback_score + # Fallback path: ``score`` here is an exact-match + # proxy from completion-vs-reference matching, so + # the correctness claim is honest in the same sense + # as the legacy single-row behavior. + is_correct = score > 0 + else: + metric_name = getattr(getattr(stat, 'name', None), 'name', None) + score = _score_from_stat(stat) + if score is None: + continue + is_correct = _is_correct_for_metric(metric_name, score) + instance_level_logs.append( + InstanceLevelEvaluationLog( + schema_version=SCHEMA_VERSION, + evaluation_id=self.evaluation_id, + model_id=model_id, + evaluation_name=evaluation_name, + evaluation_result_id=metric_name, + sample_id=str(state.instance.id), + sample_hash=sha256_string( + state.request.prompt + (correct_refs[0] if correct_refs else '') + ), # TODO use all references + interaction_type=InteractionType.single_turn, + input=Input( + raw=state.request.prompt, + reference=correct_refs if correct_refs else [], + choices=( + list(state.output_mapping.values()) + if state.output_mapping + else [ + ref.output.text + for ref in state.instance.references + ] + ), ), - ), - output=Output( - raw=completions, reasoning_trace=reasoning_traces - ), - answer_attribution=[ - AnswerAttributionItem( - turn_idx=0, - source='output.raw', - extracted_value=state.result.completions[ - 0 - ].text.strip() - if state.result and state.result.completions - else '', - extraction_method='exact_match', - is_terminal=True, - ) - ], - evaluation=Evaluation( - score=float(score), is_correct=is_correct - ), - token_usage=token_usage, - performance=Performance( - generation_time_ms=state.result.request_time * 1000 - if state.result.request_time - else None - ), + output=Output( + raw=completions, reasoning_trace=reasoning_traces + ), + answer_attribution=[ + AnswerAttributionItem( + turn_idx=0, + source='output.raw', + extracted_value=state.result.completions[ + 0 + ].text.strip() + if state.result and state.result.completions + else '', + extraction_method='exact_match', + is_terminal=True, + ) + ], + evaluation=Evaluation( + score=float(score), is_correct=is_correct + ), + token_usage=token_usage, + performance=Performance( + generation_time_ms=state.result.request_time * 1000 + if state.result.request_time + else None + ), + ) ) - ) self._save_json(instance_level_logs) return self.path, len(instance_level_logs) diff --git a/tests/test_helm_adapter.py b/tests/test_helm_adapter.py index e08f48dcc..33e279477 100644 --- a/tests/test_helm_adapter.py +++ b/tests/test_helm_adapter.py @@ -76,7 +76,10 @@ def test_mmlu_eval(): assert converted_eval.detailed_evaluation_results is not None assert converted_eval.detailed_evaluation_results.format is not None - assert converted_eval.detailed_evaluation_results.total_rows == 10 + # Per-(sample, metric) emission: each of the 10 samples produces one + # row per non-empty stat, so total_rows is much larger than the + # legacy "one row per sample" count. + assert converted_eval.detailed_evaluation_results.total_rows >= 10 def test_hellswag_eval(): @@ -117,7 +120,8 @@ def test_hellswag_eval(): assert converted_eval.detailed_evaluation_results is not None assert converted_eval.detailed_evaluation_results.format is not None - assert converted_eval.detailed_evaluation_results.total_rows == 10 + # Per-(sample, metric): >= sample count, not equal to it. + assert converted_eval.detailed_evaluation_results.total_rows >= 10 def test_narrativeqa_eval(): @@ -154,7 +158,8 @@ def test_narrativeqa_eval(): assert converted_eval.detailed_evaluation_results is not None assert converted_eval.detailed_evaluation_results.format is not None - assert converted_eval.detailed_evaluation_results.total_rows == 5 + # Per-(sample, metric): >= sample count, not equal to it. + assert converted_eval.detailed_evaluation_results.total_rows >= 5 def test_missing_model_deployment_falls_back_to_model(): diff --git a/tests/test_helm_instance_level_adapter.py b/tests/test_helm_instance_level_adapter.py index 4ee46c6c2..46b69f906 100644 --- a/tests/test_helm_instance_level_adapter.py +++ b/tests/test_helm_instance_level_adapter.py @@ -9,6 +9,10 @@ from pathlib import Path from every_eval_ever.converters.helm.adapter import HELMAdapter +from every_eval_ever.converters.helm.instance_level_adapter import ( + _BINARY_CORRECTNESS_METRIC_NAMES, + _is_correct_for_metric, +) from every_eval_ever.eval_types import EvaluatorRelationship from every_eval_ever.instance_level_types import ( InstanceLevelEvaluationLog, @@ -43,6 +47,15 @@ def _load_instance_level_data(adapter, filepath, metadata_args): return converted_eval, instance_logs +def _by_sample_and_metric( + instance_logs, +) -> dict[tuple[str, str | None], InstanceLevelEvaluationLog]: + return { + (log.sample_id, log.evaluation_result_id): log + for log in instance_logs + } + + def test_mmlu_instance_level(): adapter = HELMAdapter() @@ -60,9 +73,20 @@ def test_mmlu_instance_level(): metadata_args, ) - assert len(instance_logs) == 10 - log = instance_logs[0] + # Per-(sample, metric) emission: 10 samples * many stats per + # sample. Confirm by sample_id distinct count, not row count. + sample_ids = sorted({log.sample_id for log in instance_logs}) + assert len(sample_ids) == 10 + # The MMLU fixture has the standard exact-match family for + # multiple_choice_joint; every sample must have an exact_match row. + em_rows = [ + log + for log in instance_logs + if log.evaluation_result_id == 'exact_match' + ] + assert len(em_rows) == 10 + log = _by_sample_and_metric(instance_logs)[('id147', 'exact_match')] assert log.schema_version == '0.2.2' assert log.evaluation_id == 'test_mmlu_samples' assert log.model_id == 'openai/gpt2' @@ -109,8 +133,16 @@ def test_hellaswag_instance_level(): metadata_args, ) - assert len(instance_logs) == 10 - log = instance_logs[0] + sample_ids = sorted({log.sample_id for log in instance_logs}) + assert len(sample_ids) == 10 + + em_rows = [ + log + for log in instance_logs + if log.evaluation_result_id == 'exact_match' + ] + assert len(em_rows) == 10 + log = em_rows[0] assert log.schema_version == '0.2.2' assert log.model_id == 'eleutherai/pythia-1b-v0' @@ -145,8 +177,16 @@ def test_narrativeqa_instance_level(): metadata_args, ) - assert len(instance_logs) == 5 - log = instance_logs[0] + sample_ids = sorted({log.sample_id for log in instance_logs}) + assert len(sample_ids) == 5 + + em_rows = [ + log + for log in instance_logs + if log.evaluation_result_id == 'exact_match' + ] + assert len(em_rows) == 5 + log = em_rows[0] assert log.schema_version == '0.2.2' assert log.model_id == 'openai/gpt2' @@ -166,3 +206,192 @@ def test_narrativeqa_instance_level(): assert len(log.answer_attribution) == 1 assert log.answer_attribution[0].extraction_method == 'exact_match' + + +# --------------------------------------------------------------------------- +# Per-(sample, metric) row emission and correctness semantics +# --------------------------------------------------------------------------- + + +def test_per_sample_per_metric_rows_are_emitted(): + """Every per-instance HELM stat with a numeric mean becomes its own row. + + The fixture's first sample has 27 declared stats but two + (``training_co2_cost`` and ``training_energy_cost``) carry no value; + we expect 25 rows for that sample. + """ + adapter = HELMAdapter() + with tempfile.TemporaryDirectory() as tmpdir: + metadata_args = { + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': EvaluatorRelationship.first_party, + 'parent_eval_output_dir': tmpdir, + 'file_uuid': 'test_grain', + } + _, instance_logs = _load_instance_level_data( + adapter, + 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2', + metadata_args, + ) + rows_for_id147 = [log for log in instance_logs if log.sample_id == 'id147'] + metric_ids = sorted(log.evaluation_result_id for log in rows_for_id147) + assert 'exact_match' in metric_ids + assert 'num_prompt_tokens' in metric_ids + # No row should have a None evaluation_result_id when stats exist. + assert all(log.evaluation_result_id is not None for log in rows_for_id147) + # Distinct metric ids — no duplicates within a sample. + assert len(metric_ids) == len(set(metric_ids)) + + +def test_is_correct_only_claimed_for_correctness_metrics(): + """``is_correct=True`` should never appear on bookkeeping/resource rows. + + The MMLU fixture has positive-valued bookkeeping rows + (``num_references=4``, ``num_prompt_tokens=333``, ``inference_runtime>0``, + ``max_prob=1``). Earlier patches set ``is_correct = score > 0`` for + every row, which overclaimed correctness on these. The fix gates + ``is_correct`` on a tight allowlist of correctness metric names. + """ + adapter = HELMAdapter() + with tempfile.TemporaryDirectory() as tmpdir: + metadata_args = { + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': EvaluatorRelationship.first_party, + 'parent_eval_output_dir': tmpdir, + 'file_uuid': 'test_correctness', + } + _, instance_logs = _load_instance_level_data( + adapter, + 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2', + metadata_args, + ) + + bookkeeping = [ + log + for log in instance_logs + if log.evaluation_result_id + in { + 'num_references', + 'num_prompt_tokens', + 'num_completion_tokens', + 'num_output_tokens', + 'inference_runtime', + 'max_prob', + 'finish_reason_unknown', + 'logprob', + 'num_bytes', + 'batch_size', + } + ] + assert bookkeeping, 'expected bookkeeping rows to be emitted' + assert all( + log.evaluation.is_correct is False for log in bookkeeping + ), 'bookkeeping metrics must not claim correctness' + + # narrative_qa has graded scores (rouge_l/f1/bleu_*) which are not + # binary correctness either; check at least one fixture has them + # and that they don't overclaim correctness. + with tempfile.TemporaryDirectory() as tmpdir2: + metadata_args2 = dict(metadata_args) + metadata_args2['parent_eval_output_dir'] = tmpdir2 + metadata_args2['file_uuid'] = 'test_correctness_graded' + _, narr_logs = _load_instance_level_data( + adapter, + 'tests/data/helm/narrative_qa:model=openai_gpt2', + metadata_args2, + ) + graded = [ + log + for log in narr_logs + if log.evaluation_result_id + in {'rouge_l', 'f1_score', 'bleu_1', 'bleu_4'} + ] + assert graded, 'expected graded score rows in narrative_qa fixture' + assert all( + log.evaluation.is_correct is False for log in graded + ), ( + 'graded metrics (rouge_l/f1_score/bleu_*) must not be ' + 'treated as binary correctness' + ) + + +def test_is_correct_is_true_for_correct_exact_match_rows(): + """When a correctness metric scores positive, is_correct must be True. + + Run the converter and pick any exact_match row whose score happens to + be 1.0. The MMLU fixture contains at least one correct id (id7). + """ + adapter = HELMAdapter() + with tempfile.TemporaryDirectory() as tmpdir: + metadata_args = { + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': EvaluatorRelationship.first_party, + 'parent_eval_output_dir': tmpdir, + 'file_uuid': 'test_exact_match_true', + } + _, instance_logs = _load_instance_level_data( + adapter, + 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2', + metadata_args, + ) + positive_em_rows = [ + log + for log in instance_logs + if log.evaluation_result_id == 'exact_match' + and log.evaluation.score > 0 + ] + for log in positive_em_rows: + assert log.evaluation.is_correct is True + + +def test_is_correct_for_metric_helper(): + """Pure-function check on the correctness allowlist.""" + # Every allowlist member, score>0 ⇒ True; score==0 ⇒ False. + for name in _BINARY_CORRECTNESS_METRIC_NAMES: + assert _is_correct_for_metric(name, 1.0) is True + assert _is_correct_for_metric(name, 0.0) is False + # Non-allowlisted metrics never claim correctness, regardless of score. + for name in ( + 'num_prompt_tokens', + 'num_references', + 'inference_runtime', + 'rouge_l', + 'f1_score', + 'bleu_1', + 'logprob', + None, + ): + assert _is_correct_for_metric(name, 1.0) is False + assert _is_correct_for_metric(name, 0.0) is False + + +def test_reasoning_traces_none_does_not_break_conversion(monkeypatch): + """Patch the upstream extractor to return None and ensure conversion works. + + Mirrors the bug class fixed by the original 9900ae6 patch — we want a + regression test so the next refactor cannot reintroduce the failure. + """ + from every_eval_ever.converters.helm import instance_level_adapter as mod + + monkeypatch.setattr(mod, 'extract_all_reasonings', lambda state: None) + + adapter = HELMAdapter() + with tempfile.TemporaryDirectory() as tmpdir: + metadata_args = { + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': EvaluatorRelationship.first_party, + 'parent_eval_output_dir': tmpdir, + 'file_uuid': 'test_reasoning_none', + } + _, instance_logs = _load_instance_level_data( + adapter, + 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2', + metadata_args, + ) + assert instance_logs + for log in instance_logs: + # ``reasoning_trace`` is Optional[list[str]]; with extractor + # returning None we either get None or an empty list — both + # satisfy the schema. The key invariant is "no crash". + trace = log.output.reasoning_trace + assert trace is None or trace == [] or all(isinstance(t, str) for t in trace) From f62387a1aa1a71d6f83798e0496655a664c1aa8b Mon Sep 17 00:00:00 2001 From: joncrall Date: Wed, 13 May 2026 16:46:05 -0400 Subject: [PATCH 2/8] Add more tests --- tests/test_helm_instance_level_adapter.py | 209 ++++++++++++++++++++-- 1 file changed, 198 insertions(+), 11 deletions(-) diff --git a/tests/test_helm_instance_level_adapter.py b/tests/test_helm_instance_level_adapter.py index 46b69f906..4be723e3a 100644 --- a/tests/test_helm_instance_level_adapter.py +++ b/tests/test_helm_instance_level_adapter.py @@ -1,17 +1,16 @@ import pytest -pytest.importorskip( - 'helm', reason='crfm-helm not installed; install with: uv sync --extra helm' -) - import json import tempfile from pathlib import Path +from types import SimpleNamespace from every_eval_ever.converters.helm.adapter import HELMAdapter from every_eval_ever.converters.helm.instance_level_adapter import ( + HELMInstanceLevelDataAdapter, _BINARY_CORRECTNESS_METRIC_NAMES, _is_correct_for_metric, + _score_from_stat, ) from every_eval_ever.eval_types import EvaluatorRelationship from every_eval_ever.instance_level_types import ( @@ -20,6 +19,17 @@ ) +def _make_helm_adapter(): + from every_eval_ever.converters.helm import adapter as helm_adapter_mod + + if helm_adapter_mod._HELM_IMPORT_ERROR is not None: + pytest.skip( + 'HELM converter dependencies are missing; install with: ' + 'uv sync --extra helm (or pip install every_eval_ever[helm])' + ) + return HELMAdapter() + + def _load_instance_level_data(adapter, filepath, metadata_args): eval_dirpath = Path(filepath) converted_eval_list = adapter.transform_from_directory( @@ -57,7 +67,7 @@ def _by_sample_and_metric( def test_mmlu_instance_level(): - adapter = HELMAdapter() + adapter = _make_helm_adapter() with tempfile.TemporaryDirectory() as tmpdir: metadata_args = { @@ -117,7 +127,7 @@ def test_mmlu_instance_level(): def test_hellaswag_instance_level(): - adapter = HELMAdapter() + adapter = _make_helm_adapter() with tempfile.TemporaryDirectory() as tmpdir: metadata_args = { @@ -161,7 +171,7 @@ def test_hellaswag_instance_level(): def test_narrativeqa_instance_level(): - adapter = HELMAdapter() + adapter = _make_helm_adapter() with tempfile.TemporaryDirectory() as tmpdir: metadata_args = { @@ -220,7 +230,7 @@ def test_per_sample_per_metric_rows_are_emitted(): (``training_co2_cost`` and ``training_energy_cost``) carry no value; we expect 25 rows for that sample. """ - adapter = HELMAdapter() + adapter = _make_helm_adapter() with tempfile.TemporaryDirectory() as tmpdir: metadata_args = { 'source_organization_name': 'TestOrg', @@ -252,7 +262,7 @@ def test_is_correct_only_claimed_for_correctness_metrics(): every row, which overclaimed correctness on these. The fix gates ``is_correct`` on a tight allowlist of correctness metric names. """ - adapter = HELMAdapter() + adapter = _make_helm_adapter() with tempfile.TemporaryDirectory() as tmpdir: metadata_args = { 'source_organization_name': 'TestOrg', @@ -321,7 +331,7 @@ def test_is_correct_is_true_for_correct_exact_match_rows(): Run the converter and pick any exact_match row whose score happens to be 1.0. The MMLU fixture contains at least one correct id (id7). """ - adapter = HELMAdapter() + adapter = _make_helm_adapter() with tempfile.TemporaryDirectory() as tmpdir: metadata_args = { 'source_organization_name': 'TestOrg', @@ -365,6 +375,183 @@ def test_is_correct_for_metric_helper(): assert _is_correct_for_metric(name, 0.0) is False +def _score_from_stat_dict(stat_dict): + """Mirror the converter's numeric-score policy for fixture assertions.""" + value = stat_dict.get('mean') + if value is None: + count = stat_dict.get('count') + total = stat_dict.get('sum') + if count and total is not None: + value = total / count + if value is None: + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def _expected_numeric_stat_rows(per_instance_stats_path): + """Count rows the per-(sample, numeric stat) policy should emit.""" + per_instance_stats = json.loads(Path(per_instance_stats_path).read_text()) + return sum( + 1 + for inst_stats in per_instance_stats + for stat in inst_stats.get('stats', []) + if _score_from_stat_dict(stat) is not None + ) + + +def test_total_rows_matches_numeric_per_instance_stats(): + """The row count should be exact, not just >= the sample count.""" + adapter = _make_helm_adapter() + fixture_path = Path( + 'tests/data/helm/' + 'mmlu:subject=philosophy,method=multiple_choice_joint,' + 'model=openai_gpt2' + ) + with tempfile.TemporaryDirectory() as tmpdir: + metadata_args = { + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': EvaluatorRelationship.first_party, + 'parent_eval_output_dir': tmpdir, + 'file_uuid': 'test_exact_total_rows', + } + converted_eval, instance_logs = _load_instance_level_data( + adapter, + fixture_path, + metadata_args, + ) + + expected_rows = _expected_numeric_stat_rows( + fixture_path / 'per_instance_stats.json' + ) + assert converted_eval.detailed_evaluation_results.total_rows == len( + instance_logs + ) + assert len(instance_logs) == expected_rows + + sample_metric_keys = { + (log.sample_id, log.evaluation_result_id) + for log in instance_logs + } + assert len(sample_metric_keys) == len(instance_logs) + + +def test_instance_evaluation_result_ids_join_to_aggregate_results(): + """Every non-null instance result id should join to an aggregate row.""" + adapter = _make_helm_adapter() + with tempfile.TemporaryDirectory() as tmpdir: + metadata_args = { + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': EvaluatorRelationship.first_party, + 'parent_eval_output_dir': tmpdir, + 'file_uuid': 'test_join_keys', + } + converted_eval, instance_logs = _load_instance_level_data( + adapter, + 'tests/data/helm/mmlu:subject=philosophy,' + 'method=multiple_choice_joint,model=openai_gpt2', + metadata_args, + ) + + aggregate_ids = { + result.evaluation_result_id + for result in converted_eval.evaluation_results + if result.evaluation_result_id is not None + } + detail_ids = { + log.evaluation_result_id + for log in instance_logs + if log.evaluation_result_id is not None + } + + assert aggregate_ids, ( + 'Aggregate HELM results must set evaluation_result_id when ' + 'instance rows use evaluation_result_id as a join key.' + ) + assert detail_ids <= aggregate_ids + + +def test_score_from_stat_helper_edge_cases(): + """Cover scalar extraction paths and malformed empty stats.""" + assert _score_from_stat( + SimpleNamespace(mean=0.25, sum=10, count=2) + ) == 0.25 + assert _score_from_stat( + SimpleNamespace(mean=None, sum=3, count=2) + ) == 1.5 + assert _score_from_stat( + SimpleNamespace(mean=None, sum=0, count=0) + ) is None + assert _score_from_stat( + SimpleNamespace(mean='not-a-number', sum=0, count=1) + ) is None + assert _score_from_stat( + SimpleNamespace(mean=None, sum=None, count=1) + ) is None + + +def test_missing_inst_stats_uses_legacy_exact_match_fallback(monkeypatch): + """No per-instance stats should still produce one legacy EM row.""" + correct_reference = SimpleNamespace( + output=SimpleNamespace(text='expected answer'), + tags=['correct'], + ) + distractor_reference = SimpleNamespace( + output=SimpleNamespace(text='distractor'), + tags=[], + ) + state = SimpleNamespace( + instance=SimpleNamespace( + id='sample-1', + references=[correct_reference, distractor_reference], + ), + request=SimpleNamespace(prompt='Question?'), + result=SimpleNamespace( + completions=[SimpleNamespace(text='expected answer')], + request_time=0.25, + ), + output_mapping=None, + ) + + from every_eval_ever.converters.helm import instance_level_adapter as mod + + # This test only needs the converter logic and a synthetic state object; + # bypass the optional HELM dependency guard so it can run in core installs. + monkeypatch.setattr(mod, '_HELM_IMPORT_ERROR', None) + monkeypatch.setattr(mod, 'extract_all_reasonings', lambda state: None) + + with tempfile.TemporaryDirectory() as tmpdir: + path, total_rows = HELMInstanceLevelDataAdapter( + 'fallback_eval', + 'jsonl', + 'sha256', + tmpdir, + ).convert_instance_level_logs( + 'synthetic_eval', + 'synthetic/model', + [state], + [], + ) + + assert total_rows == 1 + with Path(path).open('r', encoding='utf-8') as file: + rows = [ + InstanceLevelEvaluationLog.model_validate(json.loads(line)) + for line in file + if line.strip() + ] + + assert len(rows) == 1 + log = rows[0] + assert log.evaluation_result_id is None + assert log.evaluation.score == 1.0 + assert log.evaluation.is_correct is True + assert log.input.reference == ['expected answer'] + assert log.output.raw == ['expected answer'] + + def test_reasoning_traces_none_does_not_break_conversion(monkeypatch): """Patch the upstream extractor to return None and ensure conversion works. @@ -375,7 +562,7 @@ def test_reasoning_traces_none_does_not_break_conversion(monkeypatch): monkeypatch.setattr(mod, 'extract_all_reasonings', lambda state: None) - adapter = HELMAdapter() + adapter = _make_helm_adapter() with tempfile.TemporaryDirectory() as tmpdir: metadata_args = { 'source_organization_name': 'TestOrg', From a4f2bb634ba0191c2ef1f0ae90ef17b99db001fb Mon Sep 17 00:00:00 2001 From: joncrall Date: Wed, 13 May 2026 18:23:12 -0400 Subject: [PATCH 3/8] Fix HELM per-metric instance result joins --- every_eval_ever/converters/helm/adapter.py | 139 ++++--- .../converters/helm/instance_level_adapter.py | 43 +- tests/test_helm_adapter.py | 12 + tests/test_helm_instance_level_adapter.py | 374 +++++++++--------- 4 files changed, 306 insertions(+), 262 deletions(-) diff --git a/every_eval_ever/converters/helm/adapter.py b/every_eval_ever/converters/helm/adapter.py index e43eb01b9..5b9727f76 100644 --- a/every_eval_ever/converters/helm/adapter.py +++ b/every_eval_ever/converters/helm/adapter.py @@ -47,6 +47,8 @@ from every_eval_ever.converters.common.utils import sha256_file from every_eval_ever.converters.helm.instance_level_adapter import ( HELMInstanceLevelDataAdapter, + _evaluation_result_id, + _score_from_stat, ) from every_eval_ever.converters.helm.utils import extract_reasoning from every_eval_ever.eval_types import ( @@ -402,80 +404,95 @@ def _transform_single( evaluation_id = f'{source_data.dataset_name}/{model_info.id.replace("/", "_")}/{evaluation_timestamp}' - metric_names = self._extract_metric_names(run_spec) - + # Build aggregate results from the numeric HELM stats themselves, not + # only from run_spec.metric_specs. The instance-level converter emits + # one row per numeric per-instance stat, so aggregate IDs must cover + # the same stat namespace for detailed rows to be joinable. evaluation_results: List[EvaluationResult] = [] + seen_evaluation_result_ids: set[str] = set() + + for stat in stats_raw: + metric_name = getattr(getattr(stat, 'name', None), 'name', None) + score = _score_from_stat(stat) + if metric_name is None or score is None: + continue + + evaluation_result_id = _evaluation_result_id( + metric_name, + getattr(stat.name, 'split', None), + getattr(stat.name, 'perturbation', None), + ) + if evaluation_result_id is None: + continue + if evaluation_result_id in seen_evaluation_result_ids: + continue + seen_evaluation_result_ids.add(evaluation_result_id) - for metric_name in set(metric_names): metric_config = MetricConfig( evaluation_description=metric_name, lower_is_better=False, # TODO schema.json check score_type=ScoreType.continuous, min_score=0, - max_score=1, + max_score=max(1.0, score), ) - matching_stats = [ - s - for s in stats_raw - if s.name.name == metric_name and not s.name.perturbation - ] - - for stat in matching_stats: - evaluation_name = ( - f'{metric_name} on {source_data.dataset_name}' - if not stat.name.split - else f'{metric_name} {stat.name.split} on {source_data.dataset_name}' - ) + split = getattr(stat.name, 'split', None) + perturbation = getattr(stat.name, 'perturbation', None) + name_parts = [metric_name] + if split: + name_parts.append(str(split)) + if perturbation: + name_parts.append(str(perturbation)) + evaluation_name = ( + f'{" ".join(name_parts)} on {source_data.dataset_name}' + ) - evaluation_results.append( - EvaluationResult( - evaluation_name=evaluation_name, - source_data=source_data, - evaluation_timestamp=evaluation_timestamp, - metric_config=metric_config, - score_details=ScoreDetails( - score=stat.mean - or (stat.sum / stat.count if stat.count else 0.0), - uncertainty=Uncertainty( - standard_deviation=stat.stddev, - num_samples=adapter_spec.max_eval_instances - or len(request_states), - ), - details={ - 'count': str(stat.count), - 'split': str(stat.name.split) - if stat.name.split - else '', - 'perturbation': str(stat.name.perturbation) - if stat.name.perturbation - else '', - }, + evaluation_results.append( + EvaluationResult( + evaluation_result_id=evaluation_result_id, + evaluation_name=evaluation_name, + source_data=source_data, + evaluation_timestamp=evaluation_timestamp, + metric_config=metric_config, + score_details=ScoreDetails( + score=score, + uncertainty=Uncertainty( + standard_deviation=getattr(stat, 'stddev', None), + num_samples=adapter_spec.max_eval_instances + or len(request_states), ), - generation_config=GenerationConfig( - generation_args=self._extract_generation_args( - adapter_spec=adapter_spec, - request_state=request_states[0], - ), - additional_details={ - 'stop_sequences': json.dumps( - request_states[0].request.stop_sequences - ) - if request_states[0].request.stop_sequences - else '[]', - 'presence_penalty': str( - request_states[0].request.presence_penalty - ), - 'frequency_penalty': str( - request_states[0].request.frequency_penalty - ), - 'num_completions': str( - request_states[0].request.num_completions - ), - }, + details={ + 'count': str(getattr(stat, 'count', '')), + 'split': str(split) if split else '', + 'perturbation': str(perturbation) + if perturbation + else '', + }, + ), + generation_config=GenerationConfig( + generation_args=self._extract_generation_args( + adapter_spec=adapter_spec, + request_state=request_states[0], ), - ) + additional_details={ + 'stop_sequences': json.dumps( + request_states[0].request.stop_sequences + ) + if request_states[0].request.stop_sequences + else '[]', + 'presence_penalty': str( + request_states[0].request.presence_penalty + ), + 'frequency_penalty': str( + request_states[0].request.frequency_penalty + ), + 'num_completions': str( + request_states[0].request.num_completions + ), + }, + ), ) + ) if request_states: parent_eval_output_dir = metadata_args.get('parent_eval_output_dir') diff --git a/every_eval_ever/converters/helm/instance_level_adapter.py b/every_eval_ever/converters/helm/instance_level_adapter.py index e675c7ff7..391bb6c2e 100644 --- a/every_eval_ever/converters/helm/instance_level_adapter.py +++ b/every_eval_ever/converters/helm/instance_level_adapter.py @@ -41,7 +41,10 @@ def _score_from_stat(stat) -> float | None: count = getattr(stat, 'count', None) total = getattr(stat, 'sum', None) if count: - value = total / count + try: + value = total / count + except (TypeError, ValueError, ZeroDivisionError): + return None if value is None: return None try: @@ -50,6 +53,33 @@ def _score_from_stat(stat) -> float | None: return None +def _stat_name_part(value) -> str | None: + if value is None: + return None + if isinstance(value, str): + return value or None + if isinstance(value, dict): + return value.get('name') or str(value) + return getattr(value, 'name', None) or str(value) + + +def _evaluation_result_id( + metric_name: str | None, + split=None, + perturbation=None, +) -> str | None: + if metric_name is None: + return None + parts = [metric_name] + split_part = _stat_name_part(split) + perturbation_part = _stat_name_part(perturbation) + if split_part: + parts.append(split_part) + if perturbation_part: + parts.append(perturbation_part) + return ':'.join(parts) + + # Metric names whose per-instance score is a correctness signal in [0, 1] # where ``score > 0`` reasonably maps to ``is_correct=True``. Anything not # in this allowlist (token counts, runtime, finish-reason flags, logprobs, @@ -208,6 +238,7 @@ def convert_instance_level_logs( for stat in metric_stats: if stat is None: metric_name = None + evaluation_result_id = None score = fallback_score # Fallback path: ``score`` here is an exact-match # proxy from completion-vs-reference matching, so @@ -215,7 +246,13 @@ def convert_instance_level_logs( # as the legacy single-row behavior. is_correct = score > 0 else: - metric_name = getattr(getattr(stat, 'name', None), 'name', None) + stat_name = getattr(stat, 'name', None) + metric_name = getattr(stat_name, 'name', None) + evaluation_result_id = _evaluation_result_id( + metric_name, + getattr(stat_name, 'split', None), + getattr(stat_name, 'perturbation', None), + ) score = _score_from_stat(stat) if score is None: continue @@ -226,7 +263,7 @@ def convert_instance_level_logs( evaluation_id=self.evaluation_id, model_id=model_id, evaluation_name=evaluation_name, - evaluation_result_id=metric_name, + evaluation_result_id=evaluation_result_id, sample_id=str(state.instance.id), sample_hash=sha256_string( state.request.prompt + (correct_refs[0] if correct_refs else '') diff --git a/tests/test_helm_adapter.py b/tests/test_helm_adapter.py index 33e279477..a3edd3062 100644 --- a/tests/test_helm_adapter.py +++ b/tests/test_helm_adapter.py @@ -39,6 +39,15 @@ def _load_eval(adapter, filepath, metadata_args): return converted_eval +def _assert_unique_evaluation_result_ids(converted_eval): + result_ids = [ + result.evaluation_result_id + for result in converted_eval.evaluation_results + ] + assert all(result_ids) + assert len(result_ids) == len(set(result_ids)) + + def test_mmlu_eval(): adapter = HELMAdapter() metadata_args = { @@ -73,6 +82,7 @@ def test_mmlu_eval(): assert len(results) > 0 assert any('mmlu' in r.evaluation_name.lower() for r in results) assert all(r.metric_config is not None for r in results) + _assert_unique_evaluation_result_ids(converted_eval) assert converted_eval.detailed_evaluation_results is not None assert converted_eval.detailed_evaluation_results.format is not None @@ -117,6 +127,7 @@ def test_hellswag_eval(): assert len(results) > 0 assert results[0].score_details.score is not None assert any('hellaswag' in r.evaluation_name.lower() for r in results) + _assert_unique_evaluation_result_ids(converted_eval) assert converted_eval.detailed_evaluation_results is not None assert converted_eval.detailed_evaluation_results.format is not None @@ -155,6 +166,7 @@ def test_narrativeqa_eval(): assert len(results) > 0 assert any('narrativeqa' in r.evaluation_name.lower() for r in results) assert all(r.metric_config is not None for r in results) + _assert_unique_evaluation_result_ids(converted_eval) assert converted_eval.detailed_evaluation_results is not None assert converted_eval.detailed_evaluation_results.format is not None diff --git a/tests/test_helm_instance_level_adapter.py b/tests/test_helm_instance_level_adapter.py index 4be723e3a..1685a656d 100644 --- a/tests/test_helm_instance_level_adapter.py +++ b/tests/test_helm_instance_level_adapter.py @@ -1,14 +1,16 @@ -import pytest - import json import tempfile from pathlib import Path from types import SimpleNamespace +import pytest + +from every_eval_ever.converters.helm import adapter as helm_adapter_module from every_eval_ever.converters.helm.adapter import HELMAdapter from every_eval_ever.converters.helm.instance_level_adapter import ( HELMInstanceLevelDataAdapter, _BINARY_CORRECTNESS_METRIC_NAMES, + _evaluation_result_id, _is_correct_for_metric, _score_from_stat, ) @@ -19,15 +21,13 @@ ) -def _make_helm_adapter(): - from every_eval_ever.converters.helm import adapter as helm_adapter_mod - - if helm_adapter_mod._HELM_IMPORT_ERROR is not None: +def _require_helm(): + import_error = getattr(helm_adapter_module, '_HELM_IMPORT_ERROR', None) + if import_error is not None: pytest.skip( - 'HELM converter dependencies are missing; install with: ' - 'uv sync --extra helm (or pip install every_eval_ever[helm])' + 'HELM converter dependencies are missing: ' + f'{import_error!r}. Install with: uv sync --extra helm' ) - return HELMAdapter() def _load_instance_level_data(adapter, filepath, metadata_args): @@ -66,8 +66,44 @@ def _by_sample_and_metric( } +def _metric_name_from_result_id(result_id: str | None) -> str | None: + if result_id is None: + return None + return result_id.split(':', 1)[0] + + +def _json_score_from_stat(stat: dict) -> float | None: + value = stat.get('mean') + if value is None: + count = stat.get('count') + total = stat.get('sum') + if count: + try: + value = total / count + except (TypeError, ValueError, ZeroDivisionError): + return None + if value is None: + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def _expected_numeric_instance_stat_rows(filepath): + per_instance_path = Path(filepath) / 'per_instance_stats.json' + per_instance_stats = json.loads(per_instance_path.read_text()) + return sum( + 1 + for item in per_instance_stats + for stat in item.get('stats', []) + if _json_score_from_stat(stat) is not None + ) + + def test_mmlu_instance_level(): - adapter = _make_helm_adapter() + _require_helm() + adapter = HELMAdapter() with tempfile.TemporaryDirectory() as tmpdir: metadata_args = { @@ -83,20 +119,19 @@ def test_mmlu_instance_level(): metadata_args, ) - # Per-(sample, metric) emission: 10 samples * many stats per - # sample. Confirm by sample_id distinct count, not row count. sample_ids = sorted({log.sample_id for log in instance_logs}) assert len(sample_ids) == 10 - # The MMLU fixture has the standard exact-match family for - # multiple_choice_joint; every sample must have an exact_match row. em_rows = [ log for log in instance_logs - if log.evaluation_result_id == 'exact_match' + if _metric_name_from_result_id(log.evaluation_result_id) + == 'exact_match' ] assert len(em_rows) == 10 - log = _by_sample_and_metric(instance_logs)[('id147', 'exact_match')] + log = _by_sample_and_metric(instance_logs)[ + ('id147', 'exact_match:test') + ] assert log.schema_version == '0.2.2' assert log.evaluation_id == 'test_mmlu_samples' assert log.model_id == 'openai/gpt2' @@ -125,9 +160,12 @@ def test_mmlu_instance_level(): assert log.token_usage.output_tokens > 0 assert log.token_usage.total_tokens > 0 + assert converted_eval.evaluation_results + def test_hellaswag_instance_level(): - adapter = _make_helm_adapter() + _require_helm() + adapter = HELMAdapter() with tempfile.TemporaryDirectory() as tmpdir: metadata_args = { @@ -137,7 +175,7 @@ def test_hellaswag_instance_level(): 'file_uuid': 'test_hellaswag', } - converted_eval, instance_logs = _load_instance_level_data( + _, instance_logs = _load_instance_level_data( adapter, 'tests/data/helm/commonsense:dataset=hellaswag,method=multiple_choice_joint,model=eleutherai_pythia-1b-v0', metadata_args, @@ -149,7 +187,8 @@ def test_hellaswag_instance_level(): em_rows = [ log for log in instance_logs - if log.evaluation_result_id == 'exact_match' + if _metric_name_from_result_id(log.evaluation_result_id) + == 'exact_match' ] assert len(em_rows) == 10 log = em_rows[0] @@ -171,7 +210,8 @@ def test_hellaswag_instance_level(): def test_narrativeqa_instance_level(): - adapter = _make_helm_adapter() + _require_helm() + adapter = HELMAdapter() with tempfile.TemporaryDirectory() as tmpdir: metadata_args = { @@ -181,7 +221,7 @@ def test_narrativeqa_instance_level(): 'file_uuid': 'test_narrativeqa', } - converted_eval, instance_logs = _load_instance_level_data( + _, instance_logs = _load_instance_level_data( adapter, 'tests/data/helm/narrative_qa:model=openai_gpt2', metadata_args, @@ -193,7 +233,8 @@ def test_narrativeqa_instance_level(): em_rows = [ log for log in instance_logs - if log.evaluation_result_id == 'exact_match' + if _metric_name_from_result_id(log.evaluation_result_id) + == 'exact_match' ] assert len(em_rows) == 5 log = em_rows[0] @@ -218,19 +259,9 @@ def test_narrativeqa_instance_level(): assert log.answer_attribution[0].extraction_method == 'exact_match' -# --------------------------------------------------------------------------- -# Per-(sample, metric) row emission and correctness semantics -# --------------------------------------------------------------------------- - - def test_per_sample_per_metric_rows_are_emitted(): - """Every per-instance HELM stat with a numeric mean becomes its own row. - - The fixture's first sample has 27 declared stats but two - (``training_co2_cost`` and ``training_energy_cost``) carry no value; - we expect 25 rows for that sample. - """ - adapter = _make_helm_adapter() + _require_helm() + adapter = HELMAdapter() with tempfile.TemporaryDirectory() as tmpdir: metadata_args = { 'source_organization_name': 'TestOrg', @@ -243,26 +274,19 @@ def test_per_sample_per_metric_rows_are_emitted(): 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2', metadata_args, ) - rows_for_id147 = [log for log in instance_logs if log.sample_id == 'id147'] + rows_for_id147 = [ + log for log in instance_logs if log.sample_id == 'id147' + ] metric_ids = sorted(log.evaluation_result_id for log in rows_for_id147) - assert 'exact_match' in metric_ids - assert 'num_prompt_tokens' in metric_ids - # No row should have a None evaluation_result_id when stats exist. + assert 'exact_match:test' in metric_ids + assert 'num_prompt_tokens:test' in metric_ids assert all(log.evaluation_result_id is not None for log in rows_for_id147) - # Distinct metric ids — no duplicates within a sample. assert len(metric_ids) == len(set(metric_ids)) def test_is_correct_only_claimed_for_correctness_metrics(): - """``is_correct=True`` should never appear on bookkeeping/resource rows. - - The MMLU fixture has positive-valued bookkeeping rows - (``num_references=4``, ``num_prompt_tokens=333``, ``inference_runtime>0``, - ``max_prob=1``). Earlier patches set ``is_correct = score > 0`` for - every row, which overclaimed correctness on these. The fix gates - ``is_correct`` on a tight allowlist of correctness metric names. - """ - adapter = _make_helm_adapter() + _require_helm() + adapter = HELMAdapter() with tempfile.TemporaryDirectory() as tmpdir: metadata_args = { 'source_organization_name': 'TestOrg', @@ -276,31 +300,29 @@ def test_is_correct_only_claimed_for_correctness_metrics(): metadata_args, ) + bookkeeping_names = { + 'num_references', + 'num_prompt_tokens', + 'num_completion_tokens', + 'num_output_tokens', + 'inference_runtime', + 'max_prob', + 'finish_reason_unknown', + 'logprob', + 'num_bytes', + 'batch_size', + } bookkeeping = [ log for log in instance_logs - if log.evaluation_result_id - in { - 'num_references', - 'num_prompt_tokens', - 'num_completion_tokens', - 'num_output_tokens', - 'inference_runtime', - 'max_prob', - 'finish_reason_unknown', - 'logprob', - 'num_bytes', - 'batch_size', - } + if _metric_name_from_result_id(log.evaluation_result_id) + in bookkeeping_names ] assert bookkeeping, 'expected bookkeeping rows to be emitted' assert all( log.evaluation.is_correct is False for log in bookkeeping ), 'bookkeeping metrics must not claim correctness' - # narrative_qa has graded scores (rouge_l/f1/bleu_*) which are not - # binary correctness either; check at least one fixture has them - # and that they don't overclaim correctness. with tempfile.TemporaryDirectory() as tmpdir2: metadata_args2 = dict(metadata_args) metadata_args2['parent_eval_output_dir'] = tmpdir2 @@ -313,7 +335,7 @@ def test_is_correct_only_claimed_for_correctness_metrics(): graded = [ log for log in narr_logs - if log.evaluation_result_id + if _metric_name_from_result_id(log.evaluation_result_id) in {'rouge_l', 'f1_score', 'bleu_1', 'bleu_4'} ] assert graded, 'expected graded score rows in narrative_qa fixture' @@ -326,12 +348,8 @@ def test_is_correct_only_claimed_for_correctness_metrics(): def test_is_correct_is_true_for_correct_exact_match_rows(): - """When a correctness metric scores positive, is_correct must be True. - - Run the converter and pick any exact_match row whose score happens to - be 1.0. The MMLU fixture contains at least one correct id (id7). - """ - adapter = _make_helm_adapter() + _require_helm() + adapter = HELMAdapter() with tempfile.TemporaryDirectory() as tmpdir: metadata_args = { 'source_organization_name': 'TestOrg', @@ -347,20 +365,19 @@ def test_is_correct_is_true_for_correct_exact_match_rows(): positive_em_rows = [ log for log in instance_logs - if log.evaluation_result_id == 'exact_match' + if _metric_name_from_result_id(log.evaluation_result_id) + == 'exact_match' and log.evaluation.score > 0 ] + assert positive_em_rows for log in positive_em_rows: assert log.evaluation.is_correct is True def test_is_correct_for_metric_helper(): - """Pure-function check on the correctness allowlist.""" - # Every allowlist member, score>0 ⇒ True; score==0 ⇒ False. for name in _BINARY_CORRECTNESS_METRIC_NAMES: assert _is_correct_for_metric(name, 1.0) is True assert _is_correct_for_metric(name, 0.0) is False - # Non-allowlisted metrics never claim correctness, regardless of score. for name in ( 'num_prompt_tokens', 'num_references', @@ -375,41 +392,31 @@ def test_is_correct_for_metric_helper(): assert _is_correct_for_metric(name, 0.0) is False -def _score_from_stat_dict(stat_dict): - """Mirror the converter's numeric-score policy for fixture assertions.""" - value = stat_dict.get('mean') - if value is None: - count = stat_dict.get('count') - total = stat_dict.get('sum') - if count and total is not None: - value = total / count - if value is None: - return None - try: - return float(value) - except (TypeError, ValueError): - return None - - -def _expected_numeric_stat_rows(per_instance_stats_path): - """Count rows the per-(sample, numeric stat) policy should emit.""" - per_instance_stats = json.loads(Path(per_instance_stats_path).read_text()) - return sum( - 1 - for inst_stats in per_instance_stats - for stat in inst_stats.get('stats', []) - if _score_from_stat_dict(stat) is not None +def test_score_from_stat_helper_edge_cases(): + assert _score_from_stat(SimpleNamespace(mean=0.25, sum=10, count=2)) == 0.25 + assert _score_from_stat(SimpleNamespace(mean=None, sum=3, count=2)) == 1.5 + assert _score_from_stat(SimpleNamespace(mean=None, sum=0, count=0)) is None + assert _score_from_stat(SimpleNamespace(mean=None, sum=None, count=1)) is None + assert _score_from_stat(SimpleNamespace(mean='bad', sum=1, count=1)) is None + + +def test_evaluation_result_id_helper_disambiguates_split_and_perturbation(): + assert _evaluation_result_id('exact_match') == 'exact_match' + assert _evaluation_result_id('exact_match', 'test') == 'exact_match:test' + assert ( + _evaluation_result_id( + 'exact_match', + 'test', + SimpleNamespace(name='robustness'), + ) + == 'exact_match:test:robustness' ) def test_total_rows_matches_numeric_per_instance_stats(): - """The row count should be exact, not just >= the sample count.""" - adapter = _make_helm_adapter() - fixture_path = Path( - 'tests/data/helm/' - 'mmlu:subject=philosophy,method=multiple_choice_joint,' - 'model=openai_gpt2' - ) + _require_helm() + fixture = 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2' + adapter = HELMAdapter() with tempfile.TemporaryDirectory() as tmpdir: metadata_args = { 'source_organization_name': 'TestOrg', @@ -418,29 +425,21 @@ def test_total_rows_matches_numeric_per_instance_stats(): 'file_uuid': 'test_exact_total_rows', } converted_eval, instance_logs = _load_instance_level_data( - adapter, - fixture_path, - metadata_args, + adapter, fixture, metadata_args ) - expected_rows = _expected_numeric_stat_rows( - fixture_path / 'per_instance_stats.json' - ) - assert converted_eval.detailed_evaluation_results.total_rows == len( - instance_logs - ) + expected_rows = _expected_numeric_instance_stat_rows(fixture) + assert converted_eval.detailed_evaluation_results.total_rows == expected_rows assert len(instance_logs) == expected_rows - - sample_metric_keys = { + assert len({ (log.sample_id, log.evaluation_result_id) for log in instance_logs - } - assert len(sample_metric_keys) == len(instance_logs) + }) == len(instance_logs) def test_instance_evaluation_result_ids_join_to_aggregate_results(): - """Every non-null instance result id should join to an aggregate row.""" - adapter = _make_helm_adapter() + _require_helm() + adapter = HELMAdapter() with tempfile.TemporaryDirectory() as tmpdir: metadata_args = { 'source_organization_name': 'TestOrg', @@ -450,8 +449,7 @@ def test_instance_evaluation_result_ids_join_to_aggregate_results(): } converted_eval, instance_logs = _load_instance_level_data( adapter, - 'tests/data/helm/mmlu:subject=philosophy,' - 'method=multiple_choice_joint,model=openai_gpt2', + 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2', metadata_args, ) @@ -466,103 +464,84 @@ def test_instance_evaluation_result_ids_join_to_aggregate_results(): if log.evaluation_result_id is not None } - assert aggregate_ids, ( - 'Aggregate HELM results must set evaluation_result_id when ' - 'instance rows use evaluation_result_id as a join key.' - ) + assert aggregate_ids + assert detail_ids assert detail_ids <= aggregate_ids -def test_score_from_stat_helper_edge_cases(): - """Cover scalar extraction paths and malformed empty stats.""" - assert _score_from_stat( - SimpleNamespace(mean=0.25, sum=10, count=2) - ) == 0.25 - assert _score_from_stat( - SimpleNamespace(mean=None, sum=3, count=2) - ) == 1.5 - assert _score_from_stat( - SimpleNamespace(mean=None, sum=0, count=0) - ) is None - assert _score_from_stat( - SimpleNamespace(mean='not-a-number', sum=0, count=1) - ) is None - assert _score_from_stat( - SimpleNamespace(mean=None, sum=None, count=1) - ) is None - - -def test_missing_inst_stats_uses_legacy_exact_match_fallback(monkeypatch): - """No per-instance stats should still produce one legacy EM row.""" - correct_reference = SimpleNamespace( - output=SimpleNamespace(text='expected answer'), - tags=['correct'], - ) - distractor_reference = SimpleNamespace( - output=SimpleNamespace(text='distractor'), - tags=[], - ) +def test_aggregate_evaluation_result_ids_are_unique_and_non_null(): + _require_helm() + adapter = HELMAdapter() + with tempfile.TemporaryDirectory() as tmpdir: + metadata_args = { + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': EvaluatorRelationship.first_party, + 'parent_eval_output_dir': tmpdir, + 'file_uuid': 'test_aggregate_ids', + } + converted_eval, _ = _load_instance_level_data( + adapter, + 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2', + metadata_args, + ) + + result_ids = [ + result.evaluation_result_id + for result in converted_eval.evaluation_results + ] + assert all(result_ids) + assert len(result_ids) == len(set(result_ids)) + assert 'exact_match:test' in result_ids + assert 'num_prompt_tokens:test' in result_ids + + +def test_missing_inst_stats_uses_legacy_exact_match_fallback(): + _require_helm() + completion = SimpleNamespace(text='answer') state = SimpleNamespace( instance=SimpleNamespace( - id='sample-1', - references=[correct_reference, distractor_reference], - ), - request=SimpleNamespace(prompt='Question?'), - result=SimpleNamespace( - completions=[SimpleNamespace(text='expected answer')], - request_time=0.25, + id='sample0', + references=[ + SimpleNamespace( + output=SimpleNamespace(text='answer'), + tags=['correct'], + ) + ], ), - output_mapping=None, + result=SimpleNamespace(completions=[completion], request_time=0.1), + request=SimpleNamespace(prompt='question'), + output_mapping={}, ) - from every_eval_ever.converters.helm import instance_level_adapter as mod - - # This test only needs the converter logic and a synthetic state object; - # bypass the optional HELM dependency guard so it can run in core installs. - monkeypatch.setattr(mod, '_HELM_IMPORT_ERROR', None) - monkeypatch.setattr(mod, 'extract_all_reasonings', lambda state: None) - with tempfile.TemporaryDirectory() as tmpdir: - path, total_rows = HELMInstanceLevelDataAdapter( - 'fallback_eval', + adapter = HELMInstanceLevelDataAdapter( + 'fallback_samples', 'jsonl', 'sha256', tmpdir, - ).convert_instance_level_logs( - 'synthetic_eval', - 'synthetic/model', + ) + path, count = adapter.convert_instance_level_logs( + 'tiny', + 'dev/model', [state], [], ) - assert total_rows == 1 - with Path(path).open('r', encoding='utf-8') as file: - rows = [ - InstanceLevelEvaluationLog.model_validate(json.loads(line)) - for line in file - if line.strip() - ] - - assert len(rows) == 1 - log = rows[0] + assert count == 1 + data = json.loads(Path(path).read_text().strip()) + log = InstanceLevelEvaluationLog.model_validate(data) assert log.evaluation_result_id is None assert log.evaluation.score == 1.0 assert log.evaluation.is_correct is True - assert log.input.reference == ['expected answer'] - assert log.output.raw == ['expected answer'] def test_reasoning_traces_none_does_not_break_conversion(monkeypatch): - """Patch the upstream extractor to return None and ensure conversion works. - - Mirrors the bug class fixed by the original 9900ae6 patch — we want a - regression test so the next refactor cannot reintroduce the failure. - """ + _require_helm() from every_eval_ever.converters.helm import instance_level_adapter as mod monkeypatch.setattr(mod, 'extract_all_reasonings', lambda state: None) - adapter = _make_helm_adapter() + adapter = HELMAdapter() with tempfile.TemporaryDirectory() as tmpdir: metadata_args = { 'source_organization_name': 'TestOrg', @@ -577,8 +556,7 @@ def test_reasoning_traces_none_does_not_break_conversion(monkeypatch): ) assert instance_logs for log in instance_logs: - # ``reasoning_trace`` is Optional[list[str]]; with extractor - # returning None we either get None or an empty list — both - # satisfy the schema. The key invariant is "no crash". trace = log.output.reasoning_trace - assert trace is None or trace == [] or all(isinstance(t, str) for t in trace) + assert trace is None or trace == [] or all( + isinstance(t, str) for t in trace + ) From 991b7cb8a3bff764b7ef4c61a6678e1abddf6dfc Mon Sep 17 00:00:00 2001 From: joncrall Date: Wed, 13 May 2026 18:38:42 -0400 Subject: [PATCH 4/8] Document HELM instance metric invariants --- every_eval_ever/converters/helm/adapter.py | 18 +++++++++++ .../converters/helm/instance_level_adapter.py | 22 ++++++++++++++ tests/test_helm_adapter.py | 2 ++ tests/test_helm_instance_level_adapter.py | 30 +++++++++++++++++++ 4 files changed, 72 insertions(+) diff --git a/every_eval_ever/converters/helm/adapter.py b/every_eval_ever/converters/helm/adapter.py index 5b9727f76..0fa74a00b 100644 --- a/every_eval_ever/converters/helm/adapter.py +++ b/every_eval_ever/converters/helm/adapter.py @@ -193,6 +193,7 @@ def _load_file_if_exists(self, dir_path, file_name) -> Any: return None def _load_evaluation_run_logfiles(self, dir_path) -> Dict: + """Load the HELM files needed for aggregate and detail conversion.""" scenario_state_dict = self._load_file_if_exists( dir_path, self.SCENARIO_STATE_FILE ) @@ -320,6 +321,7 @@ def _extract_evaluation_time( def _extract_dataset_name( self, run_spec_name: str, scenario_name: str | None ) -> str: + """Prefer scenario metadata, falling back to HELM run-spec names.""" if scenario_name: return scenario_name @@ -335,6 +337,12 @@ def _extract_dataset_name( return run_spec_name.split(':')[0] def _extract_metric_names(self, run_spec: RunSpec) -> List[str]: + """Return metric names declared by the HELM run spec. + + Kept for callers that need the run-spec metric declaration. The + main aggregate conversion below uses ``stats.json`` so aggregate + IDs cover all numeric detail rows emitted from per-instance stats. + """ metric_names = [] for metric_spec in run_spec.metric_specs: names = metric_spec.args.get('names') @@ -348,6 +356,13 @@ def _extract_metric_names(self, run_spec: RunSpec) -> List[str]: def _transform_single( self, raw_data: Dict, metadata_args: Dict[str, Any] ) -> Tuple[EvaluationLog, List[InstanceLevelEvaluationLog]]: + """Convert one HELM run into aggregate JSON plus detail JSONL. + + The aggregate ``evaluation_result_id`` values are generated from + ``stats.json`` with the same helper used by the instance converter + so every metric-specific detail row can join back to an aggregate + result. + """ run_spec = from_dict(data_class=RunSpec, data=raw_data['run_spec_dict']) # cast=[str] coerces int instance IDs to str; newer HELM versions # (e.g. long-context suite) store instance.id as int in the JSON. @@ -412,6 +427,9 @@ def _transform_single( seen_evaluation_result_ids: set[str] = set() for stat in stats_raw: + # The ID helper mirrors the instance-level converter. This is the + # key invariant: detail rows should never introduce metric IDs that + # are absent from aggregate evaluation_results. metric_name = getattr(getattr(stat, 'name', None), 'name', None) score = _score_from_stat(stat) if metric_name is None or score is None: diff --git a/every_eval_ever/converters/helm/instance_level_adapter.py b/every_eval_ever/converters/helm/instance_level_adapter.py index 391bb6c2e..199c4009a 100644 --- a/every_eval_ever/converters/helm/instance_level_adapter.py +++ b/every_eval_ever/converters/helm/instance_level_adapter.py @@ -36,6 +36,12 @@ def _require_helm_dependencies() -> None: def _score_from_stat(stat) -> float | None: + """Return a scalar HELM stat value, or None for empty/bad stats. + + HELM usually provides ``mean``; some stats only have ``sum`` and + ``count``. Returning None lets callers skip non-valued bookkeeping + rows such as zero-count training cost stats. + """ value = getattr(stat, 'mean', None) if value is None: count = getattr(stat, 'count', None) @@ -54,6 +60,7 @@ def _score_from_stat(stat) -> float | None: def _stat_name_part(value) -> str | None: + """Normalize HELM split/perturbation labels for stable IDs.""" if value is None: return None if isinstance(value, str): @@ -68,6 +75,11 @@ def _evaluation_result_id( split=None, perturbation=None, ) -> str | None: + """Build the join key shared by aggregate and instance rows. + + Split and perturbation labels are included so two HELM stats with the + same metric name do not collide in ``evaluation_result_id``. + """ if metric_name is None: return None parts = [metric_name] @@ -135,6 +147,7 @@ def __init__( self.path = f'{evaluation_dir}/{evaulation_id}.{format}' def _save_json(self, items: List[InstanceLevelEvaluationLog]): + """Write one validated instance-level log per JSONL line.""" eval_dir_path = Path(self.evaluation_dir) eval_dir_path.mkdir(parents=True, exist_ok=True) path = Path(self.path) @@ -157,6 +170,12 @@ def convert_instance_level_logs( request_states: List[RequestState], per_instance_stats_list: List, ) -> Tuple[str, int]: + """Convert HELM request states into per-(sample, metric) rows. + + When HELM per-instance stats are present, each numeric stat gets + its own detail row. If stats are absent, keep the legacy one-row + exact-match fallback so older or partial logs still convert. + """ instance_level_logs: List[InstanceLevelEvaluationLog] = [] for state in request_states: inst_stats = next( @@ -199,6 +218,9 @@ def convert_instance_level_logs( ) metric_stats = [None] + # Token usage is copied to every row for the same sample. This is + # intentionally denormalized so each detail row is independently + # useful when filtered by metric. token_usage = None if inst_stats: p_tokens = next( diff --git a/tests/test_helm_adapter.py b/tests/test_helm_adapter.py index a3edd3062..d3fe08225 100644 --- a/tests/test_helm_adapter.py +++ b/tests/test_helm_adapter.py @@ -17,6 +17,7 @@ def _load_eval(adapter, filepath, metadata_args): + """Run the HELM aggregate adapter against one fixture directory.""" eval_dirpath = Path(filepath) with tempfile.TemporaryDirectory() as tmpdir: @@ -40,6 +41,7 @@ def _load_eval(adapter, filepath, metadata_args): def _assert_unique_evaluation_result_ids(converted_eval): + """Aggregate result IDs must be stable join targets for sample rows.""" result_ids = [ result.evaluation_result_id for result in converted_eval.evaluation_results diff --git a/tests/test_helm_instance_level_adapter.py b/tests/test_helm_instance_level_adapter.py index 1685a656d..c6e1ef4c4 100644 --- a/tests/test_helm_instance_level_adapter.py +++ b/tests/test_helm_instance_level_adapter.py @@ -22,6 +22,7 @@ def _require_helm(): + """Skip HELM fixture tests when the optional converter deps are absent.""" import_error = getattr(helm_adapter_module, '_HELM_IMPORT_ERROR', None) if import_error is not None: pytest.skip( @@ -31,6 +32,7 @@ def _require_helm(): def _load_instance_level_data(adapter, filepath, metadata_args): + """Run the HELM adapter and read back the generated JSONL detail rows.""" eval_dirpath = Path(filepath) converted_eval_list = adapter.transform_from_directory( eval_dirpath, @@ -60,6 +62,7 @@ def _load_instance_level_data(adapter, filepath, metadata_args): def _by_sample_and_metric( instance_logs, ) -> dict[tuple[str, str | None], InstanceLevelEvaluationLog]: + """Index detail rows by the two fields that should be unique together.""" return { (log.sample_id, log.evaluation_result_id): log for log in instance_logs @@ -67,12 +70,14 @@ def _by_sample_and_metric( def _metric_name_from_result_id(result_id: str | None) -> str | None: + """Strip split/perturbation suffixes from deterministic result IDs.""" if result_id is None: return None return result_id.split(':', 1)[0] def _json_score_from_stat(stat: dict) -> float | None: + """Mirror converter score extraction for raw JSON fixtures.""" value = stat.get('mean') if value is None: count = stat.get('count') @@ -91,6 +96,7 @@ def _json_score_from_stat(stat: dict) -> float | None: def _expected_numeric_instance_stat_rows(filepath): + """Count fixture stats that should become instance-level rows.""" per_instance_path = Path(filepath) / 'per_instance_stats.json' per_instance_stats = json.loads(per_instance_path.read_text()) return sum( @@ -119,6 +125,8 @@ def test_mmlu_instance_level(): metadata_args, ) + # The converter now emits many rows per sample. Count distinct samples + # separately from rows so this test stays focused on sample coverage. sample_ids = sorted({log.sample_id for log in instance_logs}) assert len(sample_ids) == 10 em_rows = [ @@ -129,6 +137,7 @@ def test_mmlu_instance_level(): ] assert len(em_rows) == 10 + # Pick a specific metric row instead of relying on JSONL order. log = _by_sample_and_metric(instance_logs)[ ('id147', 'exact_match:test') ] @@ -278,6 +287,8 @@ def test_per_sample_per_metric_rows_are_emitted(): log for log in instance_logs if log.sample_id == 'id147' ] metric_ids = sorted(log.evaluation_result_id for log in rows_for_id147) + # This guards the core PR behavior: correctness and bookkeeping stats + # both survive as separate rows for the same sample. assert 'exact_match:test' in metric_ids assert 'num_prompt_tokens:test' in metric_ids assert all(log.evaluation_result_id is not None for log in rows_for_id147) @@ -300,6 +311,8 @@ def test_is_correct_only_claimed_for_correctness_metrics(): metadata_args, ) + # Positive bookkeeping values are not correctness claims. A token + # count or runtime can be > 0 without the answer being correct. bookkeeping_names = { 'num_references', 'num_prompt_tokens', @@ -332,6 +345,8 @@ def test_is_correct_only_claimed_for_correctness_metrics(): 'tests/data/helm/narrative_qa:model=openai_gpt2', metadata_args2, ) + # Graded generation metrics also should not be coerced into a + # binary correctness label just because their scores are positive. graded = [ log for log in narr_logs @@ -362,6 +377,8 @@ def test_is_correct_is_true_for_correct_exact_match_rows(): 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2', metadata_args, ) + # Correctness metrics are the exception: positive exact-match rows + # should still carry is_correct=True. positive_em_rows = [ log for log in instance_logs @@ -393,6 +410,7 @@ def test_is_correct_for_metric_helper(): def test_score_from_stat_helper_edge_cases(): + # Empty or malformed HELM stats should be skipped, not crash conversion. assert _score_from_stat(SimpleNamespace(mean=0.25, sum=10, count=2)) == 0.25 assert _score_from_stat(SimpleNamespace(mean=None, sum=3, count=2)) == 1.5 assert _score_from_stat(SimpleNamespace(mean=None, sum=0, count=0)) is None @@ -401,6 +419,8 @@ def test_score_from_stat_helper_edge_cases(): def test_evaluation_result_id_helper_disambiguates_split_and_perturbation(): + # Split and perturbation suffixes prevent same-named HELM stats from + # colliding when they are used as join keys. assert _evaluation_result_id('exact_match') == 'exact_match' assert _evaluation_result_id('exact_match', 'test') == 'exact_match:test' assert ( @@ -428,6 +448,8 @@ def test_total_rows_matches_numeric_per_instance_stats(): adapter, fixture, metadata_args ) + # Count expected rows from the fixture itself so duplication or + # accidental filtering changes are caught precisely. expected_rows = _expected_numeric_instance_stat_rows(fixture) assert converted_eval.detailed_evaluation_results.total_rows == expected_rows assert len(instance_logs) == expected_rows @@ -453,6 +475,8 @@ def test_instance_evaluation_result_ids_join_to_aggregate_results(): metadata_args, ) + # This is the most important schema invariant: every metric-specific + # detail row should be joinable to one aggregate evaluation result. aggregate_ids = { result.evaluation_result_id for result in converted_eval.evaluation_results @@ -485,6 +509,8 @@ def test_aggregate_evaluation_result_ids_are_unique_and_non_null(): metadata_args, ) + # Aggregate IDs are the target side of the join, so they must be + # present and unique. result_ids = [ result.evaluation_result_id for result in converted_eval.evaluation_results @@ -497,6 +523,8 @@ def test_aggregate_evaluation_result_ids_are_unique_and_non_null(): def test_missing_inst_stats_uses_legacy_exact_match_fallback(): _require_helm() + # Some old or partial HELM logs may lack per-instance stats. The adapter + # should still emit the legacy one-row exact-match fallback. completion = SimpleNamespace(text='answer') state = SimpleNamespace( instance=SimpleNamespace( @@ -539,6 +567,8 @@ def test_reasoning_traces_none_does_not_break_conversion(monkeypatch): _require_helm() from every_eval_ever.converters.helm import instance_level_adapter as mod + # HELM reasoning extraction may legitimately return None. The converter + # normalizes that case instead of passing an invalid value to the schema. monkeypatch.setattr(mod, 'extract_all_reasonings', lambda state: None) adapter = HELMAdapter() From 1b34a2475885fcc341badd91b233bcfafd0a3e37 Mon Sep 17 00:00:00 2001 From: joncrall Date: Wed, 13 May 2026 19:04:42 -0400 Subject: [PATCH 5/8] Restrict HELM metric rows to core metrics --- every_eval_ever/converters/helm/adapter.py | 22 ++-- .../converters/helm/instance_level_adapter.py | 24 +++-- every_eval_ever/converters/helm/metrics.py | 101 ++++++++++++++++++ tests/test_helm_instance_level_adapter.py | 97 +++++++++-------- 4 files changed, 185 insertions(+), 59 deletions(-) create mode 100644 every_eval_ever/converters/helm/metrics.py diff --git a/every_eval_ever/converters/helm/adapter.py b/every_eval_ever/converters/helm/adapter.py index 0fa74a00b..484480829 100644 --- a/every_eval_ever/converters/helm/adapter.py +++ b/every_eval_ever/converters/helm/adapter.py @@ -45,6 +45,7 @@ SupportedLibrary, ) from every_eval_ever.converters.common.utils import sha256_file +from every_eval_ever.converters.helm.metrics import is_core_metric from every_eval_ever.converters.helm.instance_level_adapter import ( HELMInstanceLevelDataAdapter, _evaluation_result_id, @@ -341,7 +342,7 @@ def _extract_metric_names(self, run_spec: RunSpec) -> List[str]: Kept for callers that need the run-spec metric declaration. The main aggregate conversion below uses ``stats.json`` so aggregate - IDs cover all numeric detail rows emitted from per-instance stats. + IDs cover all core metric rows emitted from per-instance stats. """ metric_names = [] for metric_spec in run_spec.metric_specs: @@ -359,9 +360,9 @@ def _transform_single( """Convert one HELM run into aggregate JSON plus detail JSONL. The aggregate ``evaluation_result_id`` values are generated from - ``stats.json`` with the same helper used by the instance converter - so every metric-specific detail row can join back to an aggregate - result. + core metrics in ``stats.json`` with the same helper used by the + instance converter so every metric-specific detail row can join + back to an aggregate result. """ run_spec = from_dict(data_class=RunSpec, data=raw_data['run_spec_dict']) # cast=[str] coerces int instance IDs to str; newer HELM versions @@ -419,10 +420,13 @@ def _transform_single( evaluation_id = f'{source_data.dataset_name}/{model_info.id.replace("/", "_")}/{evaluation_timestamp}' - # Build aggregate results from the numeric HELM stats themselves, not + # Build aggregate results from core HELM stats themselves, not # only from run_spec.metric_specs. The instance-level converter emits - # one row per numeric per-instance stat, so aggregate IDs must cover - # the same stat namespace for detailed rows to be joinable. + # one row per core per-instance stat, so aggregate IDs must cover + # the same core namespace for detailed rows to be joinable. + # TODO: Consider promoting bookkeeping telemetry into structured + # fields such as token_usage, performance, metadata, or + # additional_details in a separate follow-up. evaluation_results: List[EvaluationResult] = [] seen_evaluation_result_ids: set[str] = set() @@ -431,6 +435,8 @@ def _transform_single( # key invariant: detail rows should never introduce metric IDs that # are absent from aggregate evaluation_results. metric_name = getattr(getattr(stat, 'name', None), 'name', None) + if not is_core_metric(metric_name): + continue score = _score_from_stat(stat) if metric_name is None or score is None: continue @@ -451,7 +457,7 @@ def _transform_single( lower_is_better=False, # TODO schema.json check score_type=ScoreType.continuous, min_score=0, - max_score=max(1.0, score), + max_score=1.0, ) split = getattr(stat.name, 'split', None) diff --git a/every_eval_ever/converters/helm/instance_level_adapter.py b/every_eval_ever/converters/helm/instance_level_adapter.py index 199c4009a..f73383aa5 100644 --- a/every_eval_ever/converters/helm/instance_level_adapter.py +++ b/every_eval_ever/converters/helm/instance_level_adapter.py @@ -22,6 +22,7 @@ def _require_helm_dependencies() -> None: from every_eval_ever.converters import SCHEMA_VERSION from every_eval_ever.converters.common.utils import sha256_string +from every_eval_ever.converters.helm.metrics import is_core_metric from every_eval_ever.converters.helm.utils import extract_all_reasonings from every_eval_ever.instance_level_types import ( AnswerAttributionItem, @@ -172,9 +173,9 @@ def convert_instance_level_logs( ) -> Tuple[str, int]: """Convert HELM request states into per-(sample, metric) rows. - When HELM per-instance stats are present, each numeric stat gets - its own detail row. If stats are absent, keep the legacy one-row - exact-match fallback so older or partial logs still convert. + When HELM per-instance stats are present, each core metric gets + its own detail row. If core metrics are absent, keep the legacy + one-row exact-match fallback so older or partial logs still convert. """ instance_level_logs: List[InstanceLevelEvaluationLog] = [] for state in request_states: @@ -206,7 +207,14 @@ def convert_instance_level_logs( trace for trace in reasoning_traces if isinstance(trace, str) ] - metric_stats = list(inst_stats.stats) if inst_stats else [] + all_stats = list(inst_stats.stats) if inst_stats else [] + metric_stats = [ + stat + for stat in all_stats + if is_core_metric( + getattr(getattr(stat, 'name', None), 'name', None) + ) + ] if not metric_stats: correct_completions = sum( 1 for c in completions if c.strip() in correct_refs @@ -218,9 +226,13 @@ def convert_instance_level_logs( ) metric_stats = [None] + # Scope control: only core HELM metrics become metric rows. + # TODO: Consider preserving additional bookkeeping telemetry in + # token_usage, performance.additional_details, or metadata. + # Token usage is copied to every row for the same sample. This is - # intentionally denormalized so each detail row is independently - # useful when filtered by metric. + # intentionally denormalized so each core metric row is + # independently useful when filtered by metric. token_usage = None if inst_stats: p_tokens = next( diff --git a/every_eval_ever/converters/helm/metrics.py b/every_eval_ever/converters/helm/metrics.py new file mode 100644 index 000000000..7b9956d39 --- /dev/null +++ b/every_eval_ever/converters/helm/metrics.py @@ -0,0 +1,101 @@ +"""HELM metric classification helpers. + +Only core metrics are promoted to EEE aggregate/detail metric rows. +Bookkeeping stats are intentionally kept out of ``evaluation_result_id`` +rows; selected telemetry can be mapped to structured fields such as +``token_usage`` or ``performance``. +""" + +from __future__ import annotations + +from typing import Optional + + +class METRIC_PREFIXES: + """Registry of HELM metric prefixes used by the converter.""" + + CORE_PREFIXES: tuple[str, ...] = ( + 'exact_match', + 'quasi_exact_match', + 'prefix_exact_match', + 'quasi_prefix_exact_match', + 'classification_micro_f1', + 'classification_macro_f1', + 'f1_score', + 'rouge_l', + 'bleu_', + 'ifeval_strict_accuracy', + 'wildbench_score', + 'wildbench_score_rescaled', + 'omni_math_accuracy', + 'chain_of_thought_correctness', + 'math_equiv', + 'math_equiv_chain_of_thought', + 'safety_score', + 'safety_gpt_score', + 'safety_llama_score', + 'air_score', + 'air_category_', + ) + + BOOKKEEPING_PREFIXES: tuple[str, ...] = ( + # token/size/runtime/resource accounting + 'num_', + 'training_', + 'inference_', + 'batch_size', + 'max_prob', + 'logprob', + 'num_perplexity_tokens', + 'num_bytes', + 'perplexity', + 'bits_per_byte', + 'logprob_per_byte', + # decoding/stopping bookkeeping + 'finish_reason_', + 'prompt_truncated', + # calibration/fitting plumbing + 'ece_', + 'platt_', + 'selective_', + # meta/dataset sizing + 'num_instances', + 'num_train_', + 'num_references', + ) + + +def classify_metric(metric_name: Optional[str]) -> tuple[str, str | None]: + """Classify a HELM metric as core, bookkeeping, or untracked.""" + if not metric_name: + return ('untracked', None) + for prefix in METRIC_PREFIXES.CORE_PREFIXES: + if metric_name.startswith(prefix): + return ('core', prefix) + for prefix in METRIC_PREFIXES.BOOKKEEPING_PREFIXES: + if metric_name.startswith(prefix): + return ('bookkeeping', prefix) + return ('untracked', None) + + +def is_core_metric(metric_name: Optional[str]) -> bool: + """Return True when a HELM stat should become an EEE metric row.""" + metric_class, _ = classify_metric(metric_name) + return metric_class == 'core' + + +def metric_family(metric_name: Optional[str]) -> str: + """Return a lightweight metric family name for summaries/debugging.""" + if not metric_name: + return '?' + if metric_name.startswith('air_'): + return 'air' + if metric_name.startswith('bias_metric:'): + return 'bias_metric' + if metric_name.startswith('safety_'): + return 'safety' + if metric_name.startswith('bbq_'): + return 'bbq' + if '@' in metric_name: + return metric_name.split('@', 1)[0] + return metric_name.split('_', 1)[0].split(':', 1)[0] diff --git a/tests/test_helm_instance_level_adapter.py b/tests/test_helm_instance_level_adapter.py index c6e1ef4c4..98e21fcc3 100644 --- a/tests/test_helm_instance_level_adapter.py +++ b/tests/test_helm_instance_level_adapter.py @@ -7,6 +7,7 @@ from every_eval_ever.converters.helm import adapter as helm_adapter_module from every_eval_ever.converters.helm.adapter import HELMAdapter +from every_eval_ever.converters.helm.metrics import is_core_metric from every_eval_ever.converters.helm.instance_level_adapter import ( HELMInstanceLevelDataAdapter, _BINARY_CORRECTNESS_METRIC_NAMES, @@ -95,15 +96,16 @@ def _json_score_from_stat(stat: dict) -> float | None: return None -def _expected_numeric_instance_stat_rows(filepath): - """Count fixture stats that should become instance-level rows.""" +def _expected_core_instance_stat_rows(filepath): + """Count core fixture stats that should become detail rows.""" per_instance_path = Path(filepath) / 'per_instance_stats.json' per_instance_stats = json.loads(per_instance_path.read_text()) return sum( 1 for item in per_instance_stats for stat in item.get('stats', []) - if _json_score_from_stat(stat) is not None + if is_core_metric(stat.get('name', {}).get('name')) + and _json_score_from_stat(stat) is not None ) @@ -268,7 +270,7 @@ def test_narrativeqa_instance_level(): assert log.answer_attribution[0].extraction_method == 'exact_match' -def test_per_sample_per_metric_rows_are_emitted(): +def test_per_sample_core_metric_rows_are_emitted(): _require_helm() adapter = HELMAdapter() with tempfile.TemporaryDirectory() as tmpdir: @@ -287,15 +289,17 @@ def test_per_sample_per_metric_rows_are_emitted(): log for log in instance_logs if log.sample_id == 'id147' ] metric_ids = sorted(log.evaluation_result_id for log in rows_for_id147) - # This guards the core PR behavior: correctness and bookkeeping stats - # both survive as separate rows for the same sample. + # This guards the core PR behavior: HELM evaluation metrics survive + # as separate rows, while bookkeeping stats remain out-of-band. assert 'exact_match:test' in metric_ids - assert 'num_prompt_tokens:test' in metric_ids + assert 'quasi_exact_match:test' in metric_ids + assert 'num_prompt_tokens:test' not in metric_ids + assert 'inference_runtime:test' not in metric_ids assert all(log.evaluation_result_id is not None for log in rows_for_id147) assert len(metric_ids) == len(set(metric_ids)) -def test_is_correct_only_claimed_for_correctness_metrics(): +def test_bookkeeping_stats_are_not_emitted_as_metric_rows(): _require_helm() adapter = HELMAdapter() with tempfile.TemporaryDirectory() as tmpdir: @@ -312,7 +316,8 @@ def test_is_correct_only_claimed_for_correctness_metrics(): ) # Positive bookkeeping values are not correctness claims. A token - # count or runtime can be > 0 without the answer being correct. + # count or runtime can be > 0 without the answer being correct, so + # these stats should not become metric rows at all. bookkeeping_names = { 'num_references', 'num_prompt_tokens', @@ -325,41 +330,43 @@ def test_is_correct_only_claimed_for_correctness_metrics(): 'num_bytes', 'batch_size', } - bookkeeping = [ - log + emitted_metric_names = { + _metric_name_from_result_id(log.evaluation_result_id) for log in instance_logs + } + assert emitted_metric_names.isdisjoint(bookkeeping_names) + + +def test_graded_core_metrics_are_not_binary_correctness(): + _require_helm() + adapter = HELMAdapter() + with tempfile.TemporaryDirectory() as tmpdir2: + metadata_args2 = { + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': EvaluatorRelationship.first_party, + 'parent_eval_output_dir': tmpdir2, + 'file_uuid': 'test_correctness_graded', + } + _, narr_logs = _load_instance_level_data( + adapter, + 'tests/data/helm/narrative_qa:model=openai_gpt2', + metadata_args2, + ) + # Graded generation metrics also should not be coerced into a + # binary correctness label just because their scores are positive. + graded = [ + log + for log in narr_logs if _metric_name_from_result_id(log.evaluation_result_id) - in bookkeeping_names + in {'rouge_l', 'f1_score', 'bleu_1', 'bleu_4'} ] - assert bookkeeping, 'expected bookkeeping rows to be emitted' + assert graded, 'expected graded score rows in narrative_qa fixture' assert all( - log.evaluation.is_correct is False for log in bookkeeping - ), 'bookkeeping metrics must not claim correctness' - - with tempfile.TemporaryDirectory() as tmpdir2: - metadata_args2 = dict(metadata_args) - metadata_args2['parent_eval_output_dir'] = tmpdir2 - metadata_args2['file_uuid'] = 'test_correctness_graded' - _, narr_logs = _load_instance_level_data( - adapter, - 'tests/data/helm/narrative_qa:model=openai_gpt2', - metadata_args2, - ) - # Graded generation metrics also should not be coerced into a - # binary correctness label just because their scores are positive. - graded = [ - log - for log in narr_logs - if _metric_name_from_result_id(log.evaluation_result_id) - in {'rouge_l', 'f1_score', 'bleu_1', 'bleu_4'} - ] - assert graded, 'expected graded score rows in narrative_qa fixture' - assert all( - log.evaluation.is_correct is False for log in graded - ), ( - 'graded metrics (rouge_l/f1_score/bleu_*) must not be ' - 'treated as binary correctness' - ) + log.evaluation.is_correct is False for log in graded + ), ( + 'graded metrics (rouge_l/f1_score/bleu_*) must not be ' + 'treated as binary correctness' + ) def test_is_correct_is_true_for_correct_exact_match_rows(): @@ -433,7 +440,7 @@ def test_evaluation_result_id_helper_disambiguates_split_and_perturbation(): ) -def test_total_rows_matches_numeric_per_instance_stats(): +def test_total_rows_matches_core_per_instance_stats(): _require_helm() fixture = 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2' adapter = HELMAdapter() @@ -448,9 +455,9 @@ def test_total_rows_matches_numeric_per_instance_stats(): adapter, fixture, metadata_args ) - # Count expected rows from the fixture itself so duplication or - # accidental filtering changes are caught precisely. - expected_rows = _expected_numeric_instance_stat_rows(fixture) + # Count expected core metric rows from the fixture itself so + # duplication or accidental filtering changes are caught precisely. + expected_rows = _expected_core_instance_stat_rows(fixture) assert converted_eval.detailed_evaluation_results.total_rows == expected_rows assert len(instance_logs) == expected_rows assert len({ @@ -518,7 +525,7 @@ def test_aggregate_evaluation_result_ids_are_unique_and_non_null(): assert all(result_ids) assert len(result_ids) == len(set(result_ids)) assert 'exact_match:test' in result_ids - assert 'num_prompt_tokens:test' in result_ids + assert 'num_prompt_tokens:test' not in result_ids def test_missing_inst_stats_uses_legacy_exact_match_fallback(): From e86c5f0a10fe468afb42a7dad4a34999d6d6b7f4 Mon Sep 17 00:00:00 2001 From: joncrall Date: Wed, 13 May 2026 19:47:58 -0400 Subject: [PATCH 6/8] Simplify HELM core metric filter --- every_eval_ever/converters/helm/metrics.py | 114 +++++---------------- 1 file changed, 23 insertions(+), 91 deletions(-) diff --git a/every_eval_ever/converters/helm/metrics.py b/every_eval_ever/converters/helm/metrics.py index 7b9956d39..10a509697 100644 --- a/every_eval_ever/converters/helm/metrics.py +++ b/every_eval_ever/converters/helm/metrics.py @@ -1,101 +1,33 @@ -"""HELM metric classification helpers. - -Only core metrics are promoted to EEE aggregate/detail metric rows. -Bookkeeping stats are intentionally kept out of ``evaluation_result_id`` -rows; selected telemetry can be mapped to structured fields such as -``token_usage`` or ``performance``. -""" +"""HELM metric filtering helpers.""" from __future__ import annotations from typing import Optional -class METRIC_PREFIXES: - """Registry of HELM metric prefixes used by the converter.""" - - CORE_PREFIXES: tuple[str, ...] = ( - 'exact_match', - 'quasi_exact_match', - 'prefix_exact_match', - 'quasi_prefix_exact_match', - 'classification_micro_f1', - 'classification_macro_f1', - 'f1_score', - 'rouge_l', - 'bleu_', - 'ifeval_strict_accuracy', - 'wildbench_score', - 'wildbench_score_rescaled', - 'omni_math_accuracy', - 'chain_of_thought_correctness', - 'math_equiv', - 'math_equiv_chain_of_thought', - 'safety_score', - 'safety_gpt_score', - 'safety_llama_score', - 'air_score', - 'air_category_', - ) - - BOOKKEEPING_PREFIXES: tuple[str, ...] = ( - # token/size/runtime/resource accounting - 'num_', - 'training_', - 'inference_', - 'batch_size', - 'max_prob', - 'logprob', - 'num_perplexity_tokens', - 'num_bytes', - 'perplexity', - 'bits_per_byte', - 'logprob_per_byte', - # decoding/stopping bookkeeping - 'finish_reason_', - 'prompt_truncated', - # calibration/fitting plumbing - 'ece_', - 'platt_', - 'selective_', - # meta/dataset sizing - 'num_instances', - 'num_train_', - 'num_references', - ) - - -def classify_metric(metric_name: Optional[str]) -> tuple[str, str | None]: - """Classify a HELM metric as core, bookkeeping, or untracked.""" - if not metric_name: - return ('untracked', None) - for prefix in METRIC_PREFIXES.CORE_PREFIXES: - if metric_name.startswith(prefix): - return ('core', prefix) - for prefix in METRIC_PREFIXES.BOOKKEEPING_PREFIXES: - if metric_name.startswith(prefix): - return ('bookkeeping', prefix) - return ('untracked', None) +# HELM emits both benchmark metrics and bookkeeping telemetry in stats.json / +# per_instance_stats.json. In this PR, only benchmark-quality metrics become +# EEE aggregate/detail metric rows. Bookkeeping can be mapped to token_usage, +# performance, metadata, or additional_details in a future follow-up. +CORE_METRIC_PREFIXES: tuple[str, ...] = ( + 'exact_match', + 'quasi_exact_match', + 'prefix_exact_match', + 'quasi_prefix_exact_match', + 'classification_micro_f1', + 'classification_macro_f1', + 'f1_score', + 'rouge_l', + 'bleu_', + 'ifeval_strict_accuracy', + 'chain_of_thought_correctness', + 'math_equiv', + 'math_equiv_chain_of_thought', +) def is_core_metric(metric_name: Optional[str]) -> bool: """Return True when a HELM stat should become an EEE metric row.""" - metric_class, _ = classify_metric(metric_name) - return metric_class == 'core' - - -def metric_family(metric_name: Optional[str]) -> str: - """Return a lightweight metric family name for summaries/debugging.""" - if not metric_name: - return '?' - if metric_name.startswith('air_'): - return 'air' - if metric_name.startswith('bias_metric:'): - return 'bias_metric' - if metric_name.startswith('safety_'): - return 'safety' - if metric_name.startswith('bbq_'): - return 'bbq' - if '@' in metric_name: - return metric_name.split('@', 1)[0] - return metric_name.split('_', 1)[0].split(':', 1)[0] + return bool(metric_name) and any( + metric_name.startswith(prefix) for prefix in CORE_METRIC_PREFIXES + ) From e7de0290bfbc12d5f1c9170dd7c76e8ca640e770 Mon Sep 17 00:00:00 2001 From: joncrall Date: Wed, 13 May 2026 19:53:34 -0400 Subject: [PATCH 7/8] Clean up HELM core metric conversion --- every_eval_ever/converters/helm/adapter.py | 40 ++++++------------- .../converters/helm/instance_level_adapter.py | 21 +++++----- tests/test_helm_adapter.py | 19 ++++++--- 3 files changed, 36 insertions(+), 44 deletions(-) diff --git a/every_eval_ever/converters/helm/adapter.py b/every_eval_ever/converters/helm/adapter.py index 484480829..e9d833d1d 100644 --- a/every_eval_ever/converters/helm/adapter.py +++ b/every_eval_ever/converters/helm/adapter.py @@ -3,7 +3,7 @@ import os import uuid from pathlib import Path -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List _HELM_IMPORT_ERROR: Exception | None = None try: @@ -70,7 +70,6 @@ SourceType, Uncertainty, ) -from every_eval_ever.instance_level_types import InstanceLevelEvaluationLog def _require_helm_dependencies() -> None: @@ -220,7 +219,6 @@ def transform_from_directory( Transforms HELM results into one aggregate EvaluationLog and one instance-level JSONL file containing all samples. """ - # all_instance_logs: List[InstanceLevelEvaluationLog] = [] aggregate_logs: List[EvaluationLog] = [] file_uuids = metadata_args.get('file_uuids') @@ -264,10 +262,6 @@ def transform_from_directory( aggregate_logs.append(agg) converted_idx += 1 - # # Write all consolidated instance logs to JSONL - # with open(output_path, 'w', encoding='utf-8') as f: - # for log in all_instance_logs: - # f.write(json.dumps(log.model_dump(), ensure_ascii=False) + '\n') return aggregate_logs @@ -337,26 +331,9 @@ def _extract_dataset_name( return run_spec_name.split(':')[0] - def _extract_metric_names(self, run_spec: RunSpec) -> List[str]: - """Return metric names declared by the HELM run spec. - - Kept for callers that need the run-spec metric declaration. The - main aggregate conversion below uses ``stats.json`` so aggregate - IDs cover all core metric rows emitted from per-instance stats. - """ - metric_names = [] - for metric_spec in run_spec.metric_specs: - names = metric_spec.args.get('names') - if names: - metric_names.extend(names) - else: - metric_names.append(metric_spec.class_name.split('.')[-1]) - - return metric_names - def _transform_single( self, raw_data: Dict, metadata_args: Dict[str, Any] - ) -> Tuple[EvaluationLog, List[InstanceLevelEvaluationLog]]: + ) -> EvaluationLog: """Convert one HELM run into aggregate JSON plus detail JSONL. The aggregate ``evaluation_result_id`` values are generated from @@ -441,6 +418,8 @@ def _transform_single( if metric_name is None or score is None: continue + stat_count = getattr(stat, 'count', None) + evaluation_result_id = _evaluation_result_id( metric_name, getattr(stat.name, 'split', None), @@ -482,8 +461,15 @@ def _transform_single( score=score, uncertainty=Uncertainty( standard_deviation=getattr(stat, 'stddev', None), - num_samples=adapter_spec.max_eval_instances - or len(request_states), + # Split-specific HELM stats may cover fewer + # examples than the full run, so use the stat's + # own count when it is available. + num_samples=( + stat_count + if stat_count is not None + else adapter_spec.max_eval_instances + or len(request_states) + ), ), details={ 'count': str(getattr(stat, 'count', '')), diff --git a/every_eval_ever/converters/helm/instance_level_adapter.py b/every_eval_ever/converters/helm/instance_level_adapter.py index f73383aa5..ef1ae868a 100644 --- a/every_eval_ever/converters/helm/instance_level_adapter.py +++ b/every_eval_ever/converters/helm/instance_level_adapter.py @@ -40,8 +40,8 @@ def _score_from_stat(stat) -> float | None: """Return a scalar HELM stat value, or None for empty/bad stats. HELM usually provides ``mean``; some stats only have ``sum`` and - ``count``. Returning None lets callers skip non-valued bookkeeping - rows such as zero-count training cost stats. + ``count``. Returning None lets callers skip stat rows that do not + contain a usable scalar value. """ value = getattr(stat, 'mean', None) if value is None: @@ -94,12 +94,9 @@ def _evaluation_result_id( # Metric names whose per-instance score is a correctness signal in [0, 1] -# where ``score > 0`` reasonably maps to ``is_correct=True``. Anything not -# in this allowlist (token counts, runtime, finish-reason flags, logprobs, -# etc.) gets ``is_correct=False`` because we have no correctness claim -# from a bookkeeping/resource metric. Keep this list tight and named after -# the actual HELM stat names — broaden only for verified correctness -# semantics. +# where ``score > 0`` reasonably maps to ``is_correct=True``. Keep this list +# tight: graded core metrics such as rouge/bleu/f1 should stay out of it +# because a positive score is not the same as a binary correctness claim. _BINARY_CORRECTNESS_METRIC_NAMES: frozenset[str] = frozenset({ 'exact_match', 'quasi_exact_match', @@ -121,9 +118,8 @@ def _is_correct_for_metric(metric_name: str | None, score: float) -> bool: For correctness metrics in the allowlist, the HELM convention is that score==1.0 means correct and 0.0 means wrong, so any positive score - rounds up to "correct". For anything else (bookkeeping / resource - stats, or graded metrics like rouge_l/bleu where >0 is not a correctness - signal) we deliberately do not claim correctness. + rounds up to "correct". For graded metrics like rouge_l/bleu/f1 where + >0 is not a correctness signal, we deliberately do not claim correctness. """ if metric_name is None: return False @@ -216,6 +212,9 @@ def convert_instance_level_logs( ) ] if not metric_stats: + # Preserve the legacy exact-match proxy instead of dropping + # samples that have no per-instance stats or no recognized + # core metric rows. correct_completions = sum( 1 for c in completions if c.strip() in correct_refs ) diff --git a/tests/test_helm_adapter.py b/tests/test_helm_adapter.py index d3fe08225..5d2bbb9a0 100644 --- a/tests/test_helm_adapter.py +++ b/tests/test_helm_adapter.py @@ -1,12 +1,9 @@ import pytest -pytest.importorskip( - 'helm', reason='crfm-helm not installed; install with: uv sync --extra helm' -) - import tempfile from pathlib import Path +from every_eval_ever.converters.helm import adapter as helm_adapter_module from every_eval_ever.converters.helm.adapter import HELMAdapter from every_eval_ever.eval_types import ( EvaluationLog, @@ -16,6 +13,16 @@ ) +pytestmark = pytest.mark.skipif( + helm_adapter_module._HELM_IMPORT_ERROR is not None, + reason=( + 'HELM converter dependencies are missing: ' + f'{helm_adapter_module._HELM_IMPORT_ERROR!r}. ' + 'Install with: uv sync --extra helm' + ), +) + + def _load_eval(adapter, filepath, metadata_args): """Run the HELM aggregate adapter against one fixture directory.""" eval_dirpath = Path(filepath) @@ -133,7 +140,7 @@ def test_hellswag_eval(): assert converted_eval.detailed_evaluation_results is not None assert converted_eval.detailed_evaluation_results.format is not None - # Per-(sample, metric): >= sample count, not equal to it. + # Per-(sample, core metric): >= sample count, not equal to it. assert converted_eval.detailed_evaluation_results.total_rows >= 10 @@ -172,7 +179,7 @@ def test_narrativeqa_eval(): assert converted_eval.detailed_evaluation_results is not None assert converted_eval.detailed_evaluation_results.format is not None - # Per-(sample, metric): >= sample count, not equal to it. + # Per-(sample, core metric): >= sample count, not equal to it. assert converted_eval.detailed_evaluation_results.total_rows >= 5 From 43011f5c566696131af707aa506d446650633d25 Mon Sep 17 00:00:00 2001 From: joncrall Date: Wed, 13 May 2026 19:58:08 -0400 Subject: [PATCH 8/8] Clean up HELM converter typing --- every_eval_ever/converters/helm/adapter.py | 36 ++++++++++++------- .../converters/helm/instance_level_adapter.py | 13 ++++--- 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/every_eval_ever/converters/helm/adapter.py b/every_eval_ever/converters/helm/adapter.py index e9d833d1d..d57d7f30f 100644 --- a/every_eval_ever/converters/helm/adapter.py +++ b/every_eval_ever/converters/helm/adapter.py @@ -3,7 +3,7 @@ import os import uuid from pathlib import Path -from typing import Any, Dict, List +from typing import Any, Dict, List, Union, cast _HELM_IMPORT_ERROR: Exception | None = None try: @@ -29,14 +29,18 @@ Exception ) as ex: # pragma: no cover - exercised only when optional deps missing _HELM_IMPORT_ERROR = ex - DaciteConfig = from_dict = None # type: ignore[assignment] - PerInstanceStats = AdapterSpec = RequestState = ScenarioState = Stat = ( - RunSpec - ) = Any # type: ignore[assignment] - get_model_deployment = register_builtin_configs_from_helm_package = ( - from_json - ) = None # type: ignore[assignment] - ModelDeploymentNotFoundError = Exception # type: ignore[assignment] + DaciteConfig = cast(Any, None) + from_dict = cast(Any, None) + PerInstanceStats = cast(Any, None) + AdapterSpec = cast(Any, None) + RequestState = cast(Any, None) + ScenarioState = cast(Any, None) + Stat = cast(Any, None) + RunSpec = cast(Any, None) + get_model_deployment = cast(Any, None) + register_builtin_configs_from_helm_package = cast(Any, None) + from_json = cast(Any, None) + ModelDeploymentNotFoundError = cast(Any, Exception) from every_eval_ever.converters import SCHEMA_VERSION from every_eval_ever.converters.common.adapter import ( @@ -108,6 +112,7 @@ def metadata(self) -> AdapterMetadata: return AdapterMetadata( name='HELMAdapter', version='0.0.1', + supported_library_versions=['helm'], description='HELM adapter with dynamic metrics and unified JSONL instance logging', ) @@ -131,7 +136,8 @@ def _split_model_id(self, model_id: str | None) -> tuple[str, str]: if not model_id: return ('unknown', 'unknown') if '/' in model_id: - return tuple(model_id.split('/', 1)) + developer, name = model_id.split('/', 1) + return (developer, name) return ('unknown', model_id) def _extract_model_info(self, adapter_spec: AdapterSpec) -> ModelInfo: @@ -213,13 +219,18 @@ def _load_evaluation_run_logfiles(self, dir_path) -> Dict: } def transform_from_directory( - self, dir_path: str, output_path: str, metadata_args: Dict[str, Any] - ): + self, + dir_path: str | Path, + metadata_args: Dict[str, Any] | None = None, + output_path: str | None = None, + ) -> List[EvaluationLog]: """ Transforms HELM results into one aggregate EvaluationLog and one instance-level JSONL file containing all samples. """ aggregate_logs: List[EvaluationLog] = [] + metadata_args = metadata_args or {} + dir_path = str(dir_path) file_uuids = metadata_args.get('file_uuids') @@ -262,7 +273,6 @@ def transform_from_directory( aggregate_logs.append(agg) converted_idx += 1 - return aggregate_logs def _extract_generation_args( diff --git a/every_eval_ever/converters/helm/instance_level_adapter.py b/every_eval_ever/converters/helm/instance_level_adapter.py index ef1ae868a..bd0442a89 100644 --- a/every_eval_ever/converters/helm/instance_level_adapter.py +++ b/every_eval_ever/converters/helm/instance_level_adapter.py @@ -1,6 +1,6 @@ import json from pathlib import Path -from typing import Any, List, Tuple +from typing import Any, List, Tuple, cast _HELM_IMPORT_ERROR: Exception | None = None try: @@ -9,7 +9,7 @@ Exception ) as ex: # pragma: no cover - exercised only when optional deps missing _HELM_IMPORT_ERROR = ex - RequestState = Any # type: ignore[assignment] + RequestState = cast(Any, None) def _require_helm_dependencies() -> None: @@ -335,9 +335,12 @@ def convert_instance_level_logs( ), token_usage=token_usage, performance=Performance( - generation_time_ms=state.result.request_time * 1000 - if state.result.request_time - else None + generation_time_ms=( + state.result.request_time * 1000 + if state.result + and state.result.request_time + else None + ) ), ) )