From a56908b666b44c68db18a226b3bc0d1d55bbc0d0 Mon Sep 17 00:00:00 2001 From: joncrall Date: Thu, 7 May 2026 21:58:37 -0400 Subject: [PATCH 1/4] io: add transparent compression helpers for EEE result files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New module ``every_eval_ever/io.py`` provides a single source of truth for opening EEE result files (``.json`` aggregates and ``_samples.jsonl`` per-instance samples) regardless of compression. Recognized codecs match the HuggingFace Hub's documented set (https://huggingface.co/docs/hub/en/datasets-adding#file-formats): .gz, .zst, .bz2, .xz, .lz4 ``.zip`` is intentionally excluded — it is an archive container, not a stream codec, and would conflict with the duplicate-variant rule. Helpers: * ``open_eee_text(path, mode)`` — text I/O with codec auto-detection * ``is_eee_result(path)`` — 'aggregate' / 'samples' / None * ``eee_uuid_stem(path)`` — strip kind+codec suffixes * ``detect_compression(path)`` — name of the trailing codec * ``add_compression_suffix(path, cs)`` — synthesize the on-disk filename * ``iter_eee_results(roots)`` — recursive discovery of EEE files * ``find_duplicate_variants(paths)`` — enforces 'one variant per (folder, uuid, kind)' rule ``zstandard`` and ``lz4`` are optional runtime dependencies; the open helper raises ``ImportError`` with an actionable message pointing at the right ``[zst]`` / ``[lz4]`` extra. The stdlib codecs (.gz, .bz2, .xz) work out of the box. This commit only adds the module + unit tests. Subsequent commits wire the rest of the package (validate, check_duplicate_entries, converters, CLI flags) through these helpers. 53 passed, 2 skipped (lz4-codec tests skip when ``lz4`` is not installed; same for ``zstandard`` if absent). --- every_eval_ever/io.py | 296 ++++++++++++++++++++++++++++++++++++++++++ tests/test_io.py | 220 +++++++++++++++++++++++++++++++ 2 files changed, 516 insertions(+) create mode 100644 every_eval_ever/io.py create mode 100644 tests/test_io.py diff --git a/every_eval_ever/io.py b/every_eval_ever/io.py new file mode 100644 index 000000000..351dc0aa0 --- /dev/null +++ b/every_eval_ever/io.py @@ -0,0 +1,296 @@ +"""I/O helpers for EEE result files with transparent compression. + +EEE result files come in two kinds: + +- aggregate: ``.json`` +- per-instance samples: ``_samples.jsonl`` + +This module recognizes both kinds *and* their compressed forms, using the +codec set the HuggingFace Hub already auto-decompresses +(https://huggingface.co/docs/hub/en/datasets-adding#file-formats): + + ``.gz``, ``.zst``, ``.bz2``, ``.xz``, ``.lz4`` + +``.zip`` is intentionally not supported as a stream codec — it is an +archive container, and conflicts with the duplicate-variant rule below. + +All EEE-side tooling (validation, discovery, manifest generation, +viewer-parquet generation, conversion writers) should route file open and +file-suffix recognition through this module so the compressed forms behave +as transparent equivalents of the plain forms. + +Duplicate-variant rule +---------------------- + +For any ``(folder, uuid, kind)`` triple, at most one physical variant may +be present. ``find_duplicate_variants`` enumerates violations; the +validator surfaces them as ``duplicate_variant`` errors. + +Backwards compatibility +----------------------- + +The default ``compression='none'`` everywhere preserves existing +behaviour — uncompressed plain ``.json`` and ``_samples.jsonl`` are still +the default written and read forms. Compression is opt-in. +""" + +from __future__ import annotations + +import io as _io +from collections import defaultdict +from collections.abc import Iterable, Iterator +from pathlib import Path +from typing import TextIO + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +#: No compression — write/read plain UTF-8 text. +COMPRESSION_NONE = 'none' +COMPRESSION_GZ = 'gz' +COMPRESSION_ZST = 'zst' +COMPRESSION_BZ2 = 'bz2' +COMPRESSION_XZ = 'xz' +COMPRESSION_LZ4 = 'lz4' + +#: All accepted compression names. ``COMPRESSION_NONE`` first so it is the +#: documented default in CLI help. +COMPRESSION_CHOICES: tuple[str, ...] = ( + COMPRESSION_NONE, + COMPRESSION_GZ, + COMPRESSION_ZST, + COMPRESSION_BZ2, + COMPRESSION_XZ, + COMPRESSION_LZ4, +) + +# Map filesystem suffix (lowercase, leading dot) to compression name. +_COMPRESSION_BY_SUFFIX: dict[str, str] = { + '.gz': COMPRESSION_GZ, + '.zst': COMPRESSION_ZST, + '.bz2': COMPRESSION_BZ2, + '.xz': COMPRESSION_XZ, + '.lz4': COMPRESSION_LZ4, +} + +# Map compression name to its filesystem suffix (with leading dot). +_SUFFIX_BY_COMPRESSION: dict[str, str] = { + name: suf for suf, name in _COMPRESSION_BY_SUFFIX.items() +} + +# Canonical kind -> base suffix (the part *before* any compression suffix). +_BASE_SUFFIX_BY_KIND: dict[str, str] = { + 'aggregate': '.json', + 'samples': '_samples.jsonl', +} + +# Order matters: 'samples' must be checked before 'aggregate' because +# ``_samples.jsonl`` ends in ``.jsonl`` not ``.json`` (different base suffix +# overall) but the kind detector should not be confused by future shape +# changes. +_KIND_ORDER: tuple[str, ...] = ('samples', 'aggregate') + + +# --------------------------------------------------------------------------- +# Suffix recognition +# --------------------------------------------------------------------------- + + +def detect_compression(path: Path | str) -> str: + """Return the compression name implied by ``path``'s trailing suffix. + + Returns one of :data:`COMPRESSION_CHOICES`. ``COMPRESSION_NONE`` for + paths whose final suffix is not a recognized compression suffix. + """ + p = Path(path) + return _COMPRESSION_BY_SUFFIX.get(p.suffix.lower(), COMPRESSION_NONE) + + +def _strip_compression_suffix(name: str) -> str: + """Return ``name`` with one trailing compression suffix removed (if any).""" + for suf in _COMPRESSION_BY_SUFFIX: + if name.lower().endswith(suf): + return name[: -len(suf)] + return name + + +def is_eee_result(path: Path | str) -> str | None: + """Classify a path as ``'aggregate'``, ``'samples'``, or ``None``. + + Recognizes both plain and compressed forms. Returns ``None`` for any + path that does not match an EEE result filename convention. + """ + name = _strip_compression_suffix(Path(path).name) + for kind in _KIND_ORDER: + if name.endswith(_BASE_SUFFIX_BY_KIND[kind]): + return kind + return None + + +def eee_uuid_stem(path: Path | str) -> str | None: + """Return the bare UUID portion of an EEE result filename, or ``None``. + + Strips both the compression suffix (if any) and the kind-specific base + suffix. ``abc_samples.jsonl.gz`` and ``abc.json.zst`` both yield + ``'abc'``. + """ + name = _strip_compression_suffix(Path(path).name) + for kind in _KIND_ORDER: + base = _BASE_SUFFIX_BY_KIND[kind] + if name.endswith(base): + return name[: -len(base)] + return None + + +def add_compression_suffix(path: Path | str, compression: str) -> Path: + """Return ``path`` with the requested compression suffix appended. + + ``add_compression_suffix(Path('a.json'), 'gz')`` -> ``Path('a.json.gz')``. + ``compression='none'`` returns the path unchanged. + """ + p = Path(path) + if compression == COMPRESSION_NONE: + return p + if compression not in _SUFFIX_BY_COMPRESSION: + raise ValueError( + f'unsupported compression {compression!r}; ' + f'choose from {COMPRESSION_CHOICES}' + ) + return p.with_name(p.name + _SUFFIX_BY_COMPRESSION[compression]) + + +# --------------------------------------------------------------------------- +# Open helper +# --------------------------------------------------------------------------- + + +def _missing_codec_msg(codec: str, package: str, extra: str) -> str: + return ( + f'Reading/writing .{codec} EEE files requires the {package!r} ' + f"package. Install with: pip install 'every-eval-ever[{extra}]'" + ) + + +def open_eee_text(path: Path | str, mode: str = 'r') -> TextIO: + """Open an EEE result file (json or jsonl) for text I/O. + + The compression codec is inferred from ``path``'s trailing suffix. + ``mode`` may be ``'r'`` / ``'rt'`` for reading or ``'w'`` / ``'wt'`` + for writing. UTF-8 is assumed throughout. + + Raises ``ImportError`` (with an actionable message) if the path + indicates a codec whose backing dependency is not installed. + """ + if mode in ('r', 'rt'): + text_mode = 'rt' + elif mode in ('w', 'wt'): + text_mode = 'wt' + else: + raise ValueError( + f"open_eee_text mode must be 'r', 'rt', 'w', or 'wt'; got {mode!r}" + ) + p = Path(path) + cs = detect_compression(p) + + if cs == COMPRESSION_NONE: + return open(p, text_mode, encoding='utf-8') + if cs == COMPRESSION_GZ: + import gzip + return gzip.open(p, text_mode, encoding='utf-8') + if cs == COMPRESSION_BZ2: + import bz2 + return bz2.open(p, text_mode, encoding='utf-8') + if cs == COMPRESSION_XZ: + import lzma + return lzma.open(p, text_mode, encoding='utf-8') + if cs == COMPRESSION_ZST: + try: + import zstandard # type: ignore[import-not-found] + except ImportError as exc: # pragma: no cover - exercised in env w/o dep + raise ImportError(_missing_codec_msg('zst', 'zstandard', 'zst')) from exc + if text_mode == 'rt': + reader = zstandard.ZstdDecompressor().stream_reader(open(p, 'rb')) + return _io.TextIOWrapper(reader, encoding='utf-8') + writer = zstandard.ZstdCompressor().stream_writer(open(p, 'wb')) + return _ZstdTextWriter(writer, encoding='utf-8') + if cs == COMPRESSION_LZ4: + try: + import lz4.frame # type: ignore[import-not-found] + except ImportError as exc: # pragma: no cover + raise ImportError(_missing_codec_msg('lz4', 'lz4', 'lz4')) from exc + binmode = 'rb' if text_mode == 'rt' else 'wb' + return _io.TextIOWrapper(lz4.frame.open(p, mode=binmode), encoding='utf-8') + + # _COMPRESSION_BY_SUFFIX is exhaustive over COMPRESSION_CHOICES, so this + # branch is unreachable. + raise AssertionError(f'unhandled compression {cs!r}') # pragma: no cover + + +class _ZstdTextWriter(_io.TextIOWrapper): + """TextIOWrapper that flushes the underlying zstd stream_writer on close. + + ``ZstdCompressor.stream_writer`` requires an explicit ``close()`` / + ``flush(zstandard.FLUSH_FRAME)`` to finalize the frame; without that the + written file is silently truncated. Wrapping in a TextIOWrapper alone + isn't enough — close cascades to the wrapped writer, but ``flush()`` + on the TextIOWrapper alone does not. + """ + + def close(self) -> None: # pragma: no cover - exercised only with zstandard + try: + self.flush() + finally: + super().close() + + +# --------------------------------------------------------------------------- +# Discovery +# --------------------------------------------------------------------------- + + +def iter_eee_results(roots: Iterable[Path | str]) -> Iterator[Path]: + """Yield every EEE result file under each root (recursive). + + Files are yielded in deterministic (sorted) order. Roots that are + themselves EEE result files are yielded as-is. + """ + for root in roots: + rp = Path(root) + if rp.is_file(): + if is_eee_result(rp) is not None: + yield rp + continue + if not rp.is_dir(): + continue + for path in sorted(rp.rglob('*')): + if path.is_file() and is_eee_result(path) is not None: + yield path + + +def find_duplicate_variants( + paths: Iterable[Path | str], +) -> list[tuple[Path, str, str, list[Path]]]: + """Detect ``(folder, uuid, kind)`` groups with more than one variant. + + Each result is ``(folder, uuid_stem, kind, [physical_paths])``. + """ + by_group: dict[tuple[Path, str, str], list[Path]] = defaultdict(list) + seen_files: set[Path] = set() + for path in paths: + p = Path(path) + if p in seen_files: + continue + seen_files.add(p) + kind = is_eee_result(p) + if kind is None: + continue + stem = eee_uuid_stem(p) + if stem is None: + continue + by_group[(p.parent, stem, kind)].append(p) + return [ + (folder, stem, kind, sorted(variants)) + for (folder, stem, kind), variants in by_group.items() + if len(variants) > 1 + ] diff --git a/tests/test_io.py b/tests/test_io.py new file mode 100644 index 000000000..dc3f691f3 --- /dev/null +++ b/tests/test_io.py @@ -0,0 +1,220 @@ +"""Tests for ``every_eval_ever.io`` — transparent compression helpers.""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from every_eval_ever import io as eee_io + + +# --------------------------------------------------------------------------- +# Suffix recognition +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + 'name, expected', + [ + ('abc.json', 'aggregate'), + ('abc.json.gz', 'aggregate'), + ('abc.json.zst', 'aggregate'), + ('abc.json.bz2', 'aggregate'), + ('abc.json.xz', 'aggregate'), + ('abc.json.lz4', 'aggregate'), + ('abc_samples.jsonl', 'samples'), + ('abc_samples.jsonl.gz', 'samples'), + ('abc_samples.jsonl.zst', 'samples'), + ('abc_samples.jsonl.bz2', 'samples'), + ('abc_samples.jsonl.xz', 'samples'), + ('abc_samples.jsonl.lz4', 'samples'), + ('abc.txt', None), + ('abc.parquet', None), + ('manifest.json', 'aggregate'), # any *.json is aggregate by shape + ('abc.zip', None), # zip is not a recognized stream codec + ('abc.json.zip', None), + ], +) +def test_is_eee_result(name: str, expected: str | None) -> None: + assert eee_io.is_eee_result(name) == expected + + +@pytest.mark.parametrize( + 'name, expected_stem', + [ + ('abc.json', 'abc'), + ('abc.json.gz', 'abc'), + ('a-b-c.json.zst', 'a-b-c'), + ('abc_samples.jsonl', 'abc'), + ('abc_samples.jsonl.gz', 'abc'), + ('abc.txt', None), + ('abc.json.zip', None), + ], +) +def test_eee_uuid_stem(name: str, expected_stem: str | None) -> None: + assert eee_io.eee_uuid_stem(name) == expected_stem + + +@pytest.mark.parametrize( + 'name, expected', + [ + ('abc.json', 'none'), + ('abc.json.gz', 'gz'), + ('abc.json.zst', 'zst'), + ('abc.json.bz2', 'bz2'), + ('abc.json.xz', 'xz'), + ('abc.json.lz4', 'lz4'), + ('abc.txt', 'none'), + ], +) +def test_detect_compression(name: str, expected: str) -> None: + assert eee_io.detect_compression(name) == expected + + +def test_add_compression_suffix_none() -> None: + p = Path('/x/y/abc.json') + assert eee_io.add_compression_suffix(p, 'none') == p + + +@pytest.mark.parametrize('cs', ['gz', 'zst', 'bz2', 'xz', 'lz4']) +def test_add_compression_suffix(cs: str) -> None: + p = Path('/x/y/abc.json') + out = eee_io.add_compression_suffix(p, cs) + assert out.name == f'abc.json.{cs}' + + +def test_add_compression_suffix_rejects_unknown() -> None: + with pytest.raises(ValueError): + eee_io.add_compression_suffix(Path('a.json'), 'snappy') + + +# --------------------------------------------------------------------------- +# Round-trip per codec +# --------------------------------------------------------------------------- + + +def _can_import(mod: str) -> bool: + try: + __import__(mod) + return True + except ImportError: + return False + + +_CODEC_REQUIRES = { + 'none': None, + 'gz': None, # stdlib + 'bz2': None, # stdlib + 'xz': None, # stdlib (lzma) + 'zst': 'zstandard', + 'lz4': 'lz4', +} + + +def _codec_param(cs: str) -> pytest.param: + req = _CODEC_REQUIRES[cs] + marks: list = [] + if req is not None and not _can_import(req): + marks.append(pytest.mark.skip(reason=f'requires {req!r}')) + return pytest.param(cs, marks=marks) + + +@pytest.mark.parametrize( + 'cs', [_codec_param(c) for c in eee_io.COMPRESSION_CHOICES] +) +def test_open_eee_text_roundtrip_aggregate(cs: str, tmp_path: Path) -> None: + base = tmp_path / 'abc.json' + out_path = eee_io.add_compression_suffix(base, cs) + payload = {'schema_version': '0.2.2', 'kind': 'aggregate', 'cs': cs} + + with eee_io.open_eee_text(out_path, 'w') as f: + json.dump(payload, f) + + with eee_io.open_eee_text(out_path, 'r') as f: + assert json.load(f) == payload + + +@pytest.mark.parametrize( + 'cs', [_codec_param(c) for c in eee_io.COMPRESSION_CHOICES] +) +def test_open_eee_text_roundtrip_samples(cs: str, tmp_path: Path) -> None: + base = tmp_path / 'abc_samples.jsonl' + out_path = eee_io.add_compression_suffix(base, cs) + rows = [{'i': i, 'cs': cs} for i in range(5)] + + with eee_io.open_eee_text(out_path, 'w') as f: + for row in rows: + f.write(json.dumps(row) + '\n') + + with eee_io.open_eee_text(out_path, 'r') as f: + read_back = [json.loads(line) for line in f if line.strip()] + assert read_back == rows + + +def test_open_eee_text_rejects_bad_mode(tmp_path: Path) -> None: + p = tmp_path / 'abc.json' + p.write_text('{}') + with pytest.raises(ValueError): + eee_io.open_eee_text(p, 'rb') + + +# --------------------------------------------------------------------------- +# Discovery + duplicate-variant detection +# --------------------------------------------------------------------------- + + +def test_iter_eee_results_finds_compressed_and_plain(tmp_path: Path) -> None: + plain = tmp_path / 'sub' / 'a.json' + plain.parent.mkdir() + plain.write_text('{}') + gz = tmp_path / 'sub' / 'b_samples.jsonl.gz' + import gzip + with gzip.open(gz, 'wt', encoding='utf-8') as f: + f.write('{}\n') + junk = tmp_path / 'sub' / 'readme.txt' + junk.write_text('not eee') + + found = sorted(eee_io.iter_eee_results([tmp_path])) + assert found == [plain, gz] + + +def test_iter_eee_results_accepts_file_root(tmp_path: Path) -> None: + p = tmp_path / 'a.json' + p.write_text('{}') + assert list(eee_io.iter_eee_results([p])) == [p] + + +def test_find_duplicate_variants_detects_collision(tmp_path: Path) -> None: + folder = tmp_path / 'data' / 'bench' / 'dev' / 'model' + folder.mkdir(parents=True) + plain = folder / 'abc.json' + plain.write_text('{}') + gz = folder / 'abc.json.gz' + import gzip + with gzip.open(gz, 'wt', encoding='utf-8') as f: + f.write('{}') + + # Distinct kind in the same folder must NOT trigger the rule. + samples = folder / 'abc_samples.jsonl' + samples.write_text('{}\n') + + dups = eee_io.find_duplicate_variants(eee_io.iter_eee_results([tmp_path])) + assert len(dups) == 1 + folder_out, stem, kind, variants = dups[0] + assert folder_out == folder + assert stem == 'abc' + assert kind == 'aggregate' + assert sorted(variants) == sorted([plain, gz]) + + +def test_find_duplicate_variants_clean_tree(tmp_path: Path) -> None: + folder = tmp_path / 'data' + folder.mkdir() + (folder / 'abc.json').write_text('{}') + (folder / 'abc_samples.jsonl').write_text('{}\n') + (folder / 'def.json').write_text('{}') + assert eee_io.find_duplicate_variants( + eee_io.iter_eee_results([tmp_path]) + ) == [] From fce6b7574006438ced7a198f7291bbb40f8cd45d Mon Sep 17 00:00:00 2001 From: joncrall Date: Thu, 7 May 2026 21:58:37 -0400 Subject: [PATCH 2/4] validate: route through io helpers; add duplicate-variant rule MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire ``validate.py`` and ``check_duplicate_entries.py`` through the ``every_eval_ever.io`` helpers so compressed result files (``.gz``, ``.zst``, ``.bz2``, ``.xz``, ``.lz4``) are validated identically to their plain counterparts. Validator changes ----------------- * ``validate_aggregate`` and ``validate_instance_file`` open via ``io.open_eee_text`` rather than ``Path.open`` / ``Path.read_text``. * ``validate_file`` dispatches on ``io.is_eee_result(path)`` rather than ``path.suffix == '.json' / '.jsonl'``. Compressed forms are recognized via the suffix-stripping detector; non-EEE filenames (``.zip``, ``.parquet``, plain ``.csv``) report ``unsupported_extension`` as before. * ``expand_paths`` enumerates EEE result files via ``io.iter_eee_results`` for directory inputs. Files passed explicitly are accepted regardless of suffix and let ``validate_file`` produce the unsupported-extension error. * New ``_duplicate_variant_reports`` produces synthetic ``ValidationReport``s with a ``duplicate_variant`` error type for every ``(folder, uuid_stem, kind)`` group with more than one physical variant. ``validate.main`` runs this check unconditionally before per-file validation, so a CI/PR-bot path cannot silently green-light a folder containing both ``abc.json`` and ``abc.json.gz``. * Missing optional codec dependencies surface as ``codec_unavailable`` validation errors rather than crashing the whole run. check_duplicate_entries changes ------------------------------- * ``expand_paths`` recognizes compressed aggregate files; explicit file inputs that aren't aggregate-shaped are skipped (preserving the historical "ignore non-JSON" behaviour for non-EEE content). * The aggregate-payload reader uses ``io.open_eee_text``. ``_samples.jsonl`` files are intentionally excluded from this command's discovery — duplicate detection runs over aggregate metadata only. io.py refinement ---------------- * ``is_eee_result`` and ``eee_uuid_stem`` are lenient about the ``_samples`` prefix on per-instance files: bare ``*.jsonl`` is still recognized as samples (matching what lm-eval emits when no file UUID is supplied). Strict ``_samples.jsonl`` matching is still preferred for stem extraction. Tests ----- * New ``tests/test_validate_compression.py`` covers: same fixture in plain + ``.gz`` form yields identical validation outcomes for both aggregate and samples; ``expand_paths`` finds compressed files; duplicate-variant reports fire and CI-gate appropriately; distinct kinds in the same folder don't false-positive; missing optional codec surfaces as a typed error rather than an exception. 213 passed, 5 skipped (4 require optional codec deps, 1 is unrelated upstream skip). --- every_eval_ever/check_duplicate_entries.py | 32 +++- every_eval_ever/io.py | 46 +++-- every_eval_ever/validate.py | 107 ++++++++--- tests/test_io.py | 8 + tests/test_validate_compression.py | 201 +++++++++++++++++++++ 5 files changed, 344 insertions(+), 50 deletions(-) create mode 100644 tests/test_validate_compression.py diff --git a/every_eval_ever/check_duplicate_entries.py b/every_eval_ever/check_duplicate_entries.py index 5f6c53342..4dada0034 100644 --- a/every_eval_ever/check_duplicate_entries.py +++ b/every_eval_ever/check_duplicate_entries.py @@ -2,22 +2,36 @@ import hashlib import json import os +from pathlib import Path from typing import Any, Dict, List +from every_eval_ever import io as eee_io + IGNORE_KEYS = {'retrieved_timestamp', 'evaluation_id'} def expand_paths(paths: List[str]) -> List[str]: - """Expand folders to file paths.""" + """Expand folders to aggregate-EEE file paths. + + Recognizes both plain ``.json`` and compressed forms + (``.gz``, ``.zst``, ``.bz2``, ``.xz``, ``.lz4``). + ``_samples.jsonl`` files are intentionally excluded — duplicate + detection runs over aggregate metadata, not per-instance samples. + """ file_paths: List[str] = [] for path in paths: - if os.path.isfile(path) and path.endswith('.json'): - file_paths.append(path) - elif os.path.isdir(path): - for root, _, file_names in os.walk(path): - for file_name in file_names: - if file_name.endswith('.json'): - file_paths.append(os.path.join(root, file_name)) + p = Path(path) + if p.is_file(): + if eee_io.is_eee_result(p) == 'aggregate': + file_paths.append(str(p)) + else: + # Preserve historical behaviour: be explicit when a + # passed file is not an aggregate JSON. + continue + elif p.is_dir(): + for found in eee_io.iter_eee_results([p]): + if eee_io.is_eee_result(found) == 'aggregate': + file_paths.append(str(found)) else: raise Exception(f'Could not find file or directory at path: {path}') return file_paths @@ -84,7 +98,7 @@ def main(argv: List[str] | None = None) -> int: groups: Dict[str, List[Dict[str, Any]]] = {} for file_path in file_paths: try: - with open(file_path, 'r') as f: + with eee_io.open_eee_text(file_path, 'r') as f: payload = json.load(f) except json.JSONDecodeError as e: message = f'JSONDecodeError: {str(e)}' diff --git a/every_eval_ever/io.py b/every_eval_ever/io.py index 351dc0aa0..bafabac41 100644 --- a/every_eval_ever/io.py +++ b/every_eval_ever/io.py @@ -79,16 +79,17 @@ name: suf for suf, name in _COMPRESSION_BY_SUFFIX.items() } -# Canonical kind -> base suffix (the part *before* any compression suffix). -_BASE_SUFFIX_BY_KIND: dict[str, str] = { - 'aggregate': '.json', - 'samples': '_samples.jsonl', +# Canonical filename suffixes for each kind. The kind detector matches on +# the *first* listed suffix that ends the filename (after stripping any +# compression suffix). The ordering matters: ``.jsonl`` must be checked +# before ``.json`` because ``.jsonl`` does not end in ``.json`` — but for +# kinds that share a final extension (none currently do) this would matter. +_KIND_SUFFIXES: dict[str, tuple[str, ...]] = { + 'samples': ('_samples.jsonl', '.jsonl'), + 'aggregate': ('.json',), } -# Order matters: 'samples' must be checked before 'aggregate' because -# ``_samples.jsonl`` ends in ``.jsonl`` not ``.json`` (different base suffix -# overall) but the kind detector should not be confused by future shape -# changes. +# Order in which kinds are tried by the detector. _KIND_ORDER: tuple[str, ...] = ('samples', 'aggregate') @@ -118,28 +119,37 @@ def _strip_compression_suffix(name: str) -> str: def is_eee_result(path: Path | str) -> str | None: """Classify a path as ``'aggregate'``, ``'samples'``, or ``None``. - Recognizes both plain and compressed forms. Returns ``None`` for any - path that does not match an EEE result filename convention. + Recognizes both plain and compressed forms. Lenient about the bare + ``.jsonl`` extension to keep parity with existing converter outputs + that don't always include the ``_samples`` prefix (lm-eval emits + ``samples__.jsonl`` when no UUID is supplied). + Returns ``None`` for any path that does not match an EEE result + filename convention. """ name = _strip_compression_suffix(Path(path).name) for kind in _KIND_ORDER: - if name.endswith(_BASE_SUFFIX_BY_KIND[kind]): - return kind + for suffix in _KIND_SUFFIXES[kind]: + if name.endswith(suffix): + return kind return None def eee_uuid_stem(path: Path | str) -> str | None: """Return the bare UUID portion of an EEE result filename, or ``None``. - Strips both the compression suffix (if any) and the kind-specific base - suffix. ``abc_samples.jsonl.gz`` and ``abc.json.zst`` both yield - ``'abc'``. + Strips the compression suffix (if any) and the kind-specific suffix. + For ``abc_samples.jsonl`` returns ``'abc'``; for ``samples_task.jsonl`` + (which lacks the canonical ``_samples`` prefix) returns + ``'samples_task'``. The duplicate-variant rule keys on this stem, so + files that share a stem and a kind across compressed/uncompressed + variants will be flagged as collisions regardless of which suffix + convention is in use. """ name = _strip_compression_suffix(Path(path).name) for kind in _KIND_ORDER: - base = _BASE_SUFFIX_BY_KIND[kind] - if name.endswith(base): - return name[: -len(base)] + for suffix in _KIND_SUFFIXES[kind]: + if name.endswith(suffix): + return name[: -len(suffix)] return None diff --git a/every_eval_ever/validate.py b/every_eval_ever/validate.py index 46f4d0a3b..1e98b4d10 100644 --- a/every_eval_ever/validate.py +++ b/every_eval_ever/validate.py @@ -4,6 +4,10 @@ Validates aggregate (.json) files against EvaluationLog and instance-level (_samples.jsonl) files against InstanceLevelEvaluationLog. +Compressed forms (``.gz``, ``.zst``, ``.bz2``, ``.xz``, ``.lz4``) are +accepted alongside the plain forms — see ``every_eval_ever.io`` for the +single source of truth on suffix handling. + Usage: uv run python -m every_eval_ever validate data/benchmark/dev/model/uuid.json uv run python -m every_eval_ever validate data/benchmark/dev/model/uuid_samples.jsonl @@ -25,6 +29,7 @@ from rich.panel import Panel from rich.text import Text +from every_eval_ever import io as eee_io from every_eval_ever.eval_types import EvaluationLog from every_eval_ever.instance_level_types import InstanceLevelEvaluationLog @@ -72,19 +77,29 @@ def _pydantic_errors_to_dicts(exc: ValidationError) -> list[dict]: def validate_aggregate(file_path: Path) -> ValidationReport: - """Validate a .json file as an EvaluationLog.""" + """Validate a .json (or .json.gz/.zst/.bz2/.xz/.lz4) file as an EvaluationLog.""" report = ValidationReport( file_path=file_path, valid=True, file_type='aggregate' ) try: - raw = file_path.read_text(encoding='utf-8') + with eee_io.open_eee_text(file_path, 'r') as f: + raw = f.read() except OSError as e: report.valid = False report.errors.append( {'loc': '(file)', 'msg': str(e), 'type': 'io_error'} ) return report + except ImportError as e: + # Missing optional codec dep — surface as a validation error rather + # than a hard crash, so that a single missing codec doesn't take down + # validation of an entire batch. + report.valid = False + report.errors.append( + {'loc': '(file)', 'msg': str(e), 'type': 'codec_unavailable'} + ) + return report try: data = json.loads(raw) @@ -135,19 +150,25 @@ def _validate_instance_line(line: str, line_num: int) -> list[dict]: def validate_instance_file( file_path: Path, max_errors: int = DEFAULT_MAX_ERRORS ) -> ValidationReport: - """Validate a .jsonl file as InstanceLevelEvaluationLog (line-by-line).""" + """Validate a _samples.jsonl (optionally compressed) file line-by-line.""" report = ValidationReport( file_path=file_path, valid=True, file_type='instance' ) try: - f = file_path.open(encoding='utf-8') + f = eee_io.open_eee_text(file_path, 'r') except OSError as e: report.valid = False report.errors.append( {'loc': '(file)', 'msg': str(e), 'type': 'io_error'} ) return report + except ImportError as e: + report.valid = False + report.errors.append( + {'loc': '(file)', 'msg': str(e), 'type': 'codec_unavailable'} + ) + return report with f: for line_num, line in enumerate(f, start=1): @@ -194,40 +215,75 @@ def validate_instance_file( def validate_file( file_path: Path, max_errors: int = DEFAULT_MAX_ERRORS ) -> ValidationReport: - """Dispatch validation by file extension.""" - if file_path.suffix == '.json': + """Dispatch validation by EEE result kind, transparently across codecs.""" + kind = eee_io.is_eee_result(file_path) + if kind == 'aggregate': return validate_aggregate(file_path) - elif file_path.suffix == '.jsonl': + if kind == 'samples': return validate_instance_file(file_path, max_errors) - else: - report = ValidationReport( - file_path=file_path, valid=False, file_type='unsupported' - ) - report.errors.append( - { - 'loc': '(file)', - 'msg': f"Unsupported file extension '{file_path.suffix}'. Expected .json or .jsonl", - 'type': 'unsupported_extension', - } - ) - return report + + report = ValidationReport( + file_path=file_path, valid=False, file_type='unsupported' + ) + report.errors.append( + { + 'loc': '(file)', + 'msg': ( + f"Unsupported filename {file_path.name!r}. Expected an EEE " + f'result file: ``.json`` or ``_samples.jsonl`` ' + f'(optionally compressed with .gz/.zst/.bz2/.xz/.lz4).' + ), + 'type': 'unsupported_extension', + } + ) + return report def expand_paths(paths: list[str]) -> list[Path]: - """Expand directories to .json and .jsonl files recursively.""" + """Expand inputs to a list of EEE result files (recursive on dirs). + + Recognizes both plain and compressed forms (``.gz``, ``.zst``, ``.bz2``, + ``.xz``, ``.lz4``). Files passed explicitly are accepted regardless of + suffix — ``validate_file`` will surface an error for non-EEE inputs. + """ result: list[Path] = [] for p in paths: path = Path(p) if path.is_file(): result.append(path) elif path.is_dir(): - for ext in ('*.json', '*.jsonl'): - result.extend(sorted(path.rglob(ext))) + result.extend(eee_io.iter_eee_results([path])) else: result.append(path) # let validate_file report the error return result +def _duplicate_variant_reports( + paths: list[Path], +) -> list[ValidationReport]: + """Produce one synthetic report per ``(folder, uuid, kind)`` collision.""" + reports: list[ValidationReport] = [] + for folder, stem, kind, variants in eee_io.find_duplicate_variants(paths): + rep = ValidationReport( + file_path=variants[0], valid=False, file_type=kind, + ) + rep.errors.append( + { + 'loc': f'(uuid={stem}, kind={kind})', + 'msg': ( + f'Multiple physical variants of the same EEE result ' + f'present in {folder}: ' + f'{[v.name for v in variants]}. EEE allows at most one ' + f'variant per (uuid, kind); pick a single compressed or ' + f'uncompressed form and remove the others.' + ), + 'type': 'duplicate_variant', + } + ) + reports.append(rep) + return reports + + def _truncate(value: object, max_len: int = 80) -> str: """Truncate a repr for display.""" s = repr(value) @@ -375,7 +431,12 @@ def main(argv: list[str] | None = None) -> int: print('No files found to validate.', file=sys.stderr) return 1 - reports = [ + # Detect duplicate variants up front. Producing a fail-closed + # report per collision means the CI/PR-bot path cannot silently + # green-light a folder containing both `abc.json` and `abc.json.gz`. + duplicate_reports = _duplicate_variant_reports(file_paths) + + reports = duplicate_reports + [ validate_file(fp, max_errors=args.max_errors) for fp in file_paths ] diff --git a/tests/test_io.py b/tests/test_io.py index dc3f691f3..60a8bc6be 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -30,6 +30,11 @@ ('abc_samples.jsonl.bz2', 'samples'), ('abc_samples.jsonl.xz', 'samples'), ('abc_samples.jsonl.lz4', 'samples'), + # Lenient: bare *.jsonl (no _samples prefix) still recognized + # as samples, since lm-eval and other converters produce these. + ('samples_mmlu_2025-01-01.jsonl', 'samples'), + ('whatever.jsonl', 'samples'), + ('whatever.jsonl.gz', 'samples'), ('abc.txt', None), ('abc.parquet', None), ('manifest.json', 'aggregate'), # any *.json is aggregate by shape @@ -49,6 +54,9 @@ def test_is_eee_result(name: str, expected: str | None) -> None: ('a-b-c.json.zst', 'a-b-c'), ('abc_samples.jsonl', 'abc'), ('abc_samples.jsonl.gz', 'abc'), + # Bare .jsonl: stem is the whole pre-extension filename. + ('samples_mmlu_2025.jsonl', 'samples_mmlu_2025'), + ('whatever.jsonl.bz2', 'whatever'), ('abc.txt', None), ('abc.json.zip', None), ], diff --git a/tests/test_validate_compression.py b/tests/test_validate_compression.py new file mode 100644 index 000000000..2b0045abc --- /dev/null +++ b/tests/test_validate_compression.py @@ -0,0 +1,201 @@ +"""Validator tests covering compressed result files + duplicate-variant rule.""" + +from __future__ import annotations + +import gzip +import json +from pathlib import Path + +import pytest + +from every_eval_ever import io as eee_io +from every_eval_ever import validate + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +def _read_fixture_aggregate() -> dict: + """Load the existing aggregate fixture committed under tests/data/.""" + fixture = Path(__file__).parent / 'data' / '98ea850e-7019-4728-a558-8b1819ec47c2.json' + with fixture.open(encoding='utf-8') as f: + return json.load(f) + + +# --------------------------------------------------------------------------- +# Plain + .gz validate identically +# --------------------------------------------------------------------------- + + +def test_validate_aggregate_gz_matches_plain(tmp_path: Path) -> None: + payload = _read_fixture_aggregate() + + plain = tmp_path / 'a.json' + with plain.open('w', encoding='utf-8') as f: + json.dump(payload, f) + rep_plain = validate.validate_file(plain) + + gz = tmp_path / 'b.json.gz' + with gzip.open(gz, 'wt', encoding='utf-8') as f: + json.dump(payload, f) + rep_gz = validate.validate_file(gz) + + # Gzipped vs. plain must produce identical validation outcomes — that's + # the whole point of transparent compression. We don't assert + # ``valid == True`` here because the fixture may legitimately fail + # against the current schema; we only assert that compression doesn't + # change the result. + assert rep_plain.valid == rep_gz.valid + assert rep_plain.file_type == rep_gz.file_type == 'aggregate' + assert [e['type'] for e in rep_plain.errors] == [ + e['type'] for e in rep_gz.errors + ] + assert [e['msg'] for e in rep_plain.errors] == [ + e['msg'] for e in rep_gz.errors + ] + + +def test_validate_samples_gz_matches_plain(tmp_path: Path) -> None: + # Synthesize a minimal valid InstanceLevelEvaluationLog stream. + rows = [ + { + 'schema_version': '0.2.2', + 'evaluation_id': 'fake/eval/1', + 'model_id': 'org/model', + 'evaluation_name': 'fake', + 'sample_id': str(i), + 'sample_hash': 'a' * 64, + 'interaction_type': 'single_turn', + 'input': {'raw': 'q', 'reference': [], 'choices': []}, + 'output': {'raw': ['a'], 'reasoning_trace': []}, + 'evaluation': {'score': 1.0, 'is_correct': True}, + } + for i in range(3) + ] + + plain = tmp_path / 'a_samples.jsonl' + with plain.open('w', encoding='utf-8') as f: + for r in rows: + f.write(json.dumps(r) + '\n') + rep_plain = validate.validate_file(plain) + + gz = tmp_path / 'b_samples.jsonl.gz' + with gzip.open(gz, 'wt', encoding='utf-8') as f: + for r in rows: + f.write(json.dumps(r) + '\n') + rep_gz = validate.validate_file(gz) + + assert rep_plain.valid == rep_gz.valid + assert rep_plain.line_count == rep_gz.line_count == 3 + assert rep_plain.file_type == rep_gz.file_type == 'instance' + + +# --------------------------------------------------------------------------- +# Discovery picks up compressed files +# --------------------------------------------------------------------------- + + +def test_expand_paths_finds_compressed_files(tmp_path: Path) -> None: + a_plain = tmp_path / 'sub' / 'aaa.json' + a_plain.parent.mkdir() + a_plain.write_text('{}') + b_gz = tmp_path / 'sub' / 'bbb_samples.jsonl.gz' + with gzip.open(b_gz, 'wt', encoding='utf-8') as f: + f.write('{}\n') + junk = tmp_path / 'sub' / 'README.md' + junk.write_text('not eee') + + found = sorted(validate.expand_paths([str(tmp_path)])) + assert found == [a_plain, b_gz] + + +# --------------------------------------------------------------------------- +# Duplicate-variant rule +# --------------------------------------------------------------------------- + + +def test_validate_main_fails_on_duplicate_variants(tmp_path: Path) -> None: + folder = tmp_path / 'data' / 'bench' / 'dev' / 'model' + folder.mkdir(parents=True) + plain = folder / 'abc.json' + plain.write_text('{}') # invalid as EvaluationLog, but that's a different error + gz = folder / 'abc.json.gz' + with gzip.open(gz, 'wt', encoding='utf-8') as f: + f.write('{}') + + exit_code = validate.main([str(tmp_path), '--format', 'json']) + assert exit_code == 1 + + +def test_duplicate_variant_reports_emit_typed_errors(tmp_path: Path) -> None: + folder = tmp_path / 'data' + folder.mkdir() + (folder / 'abc.json').write_text('{}') + with gzip.open(folder / 'abc.json.gz', 'wt', encoding='utf-8') as f: + f.write('{}') + + paths = list(eee_io.iter_eee_results([tmp_path])) + reports = validate._duplicate_variant_reports(paths) + assert len(reports) == 1 + err = reports[0].errors[0] + assert err['type'] == 'duplicate_variant' + assert 'abc.json' in err['msg'] + assert 'abc.json.gz' in err['msg'] + + +def test_duplicate_variant_distinct_kinds_dont_collide(tmp_path: Path) -> None: + """Same UUID, different kind: NOT a duplicate.""" + folder = tmp_path / 'data' + folder.mkdir() + (folder / 'abc.json').write_text('{}') + (folder / 'abc_samples.jsonl').write_text('{}\n') + + paths = list(eee_io.iter_eee_results([tmp_path])) + assert validate._duplicate_variant_reports(paths) == [] + + +# --------------------------------------------------------------------------- +# Unsupported / non-EEE filename +# --------------------------------------------------------------------------- + + +def test_validate_file_rejects_zip(tmp_path: Path) -> None: + p = tmp_path / 'abc.json.zip' + p.write_bytes(b'PK\x03\x04') + rep = validate.validate_file(p) + assert rep.valid is False + assert rep.errors[0]['type'] == 'unsupported_extension' + + +def test_validate_file_rejects_unknown_extension(tmp_path: Path) -> None: + p = tmp_path / 'abc.parquet' + p.write_bytes(b'PAR1') + rep = validate.validate_file(p) + assert rep.valid is False + assert rep.errors[0]['type'] == 'unsupported_extension' + + +# --------------------------------------------------------------------------- +# Missing optional codec surfaces as a validation error, not a crash +# --------------------------------------------------------------------------- + + +def test_missing_codec_surfaces_as_validation_error(tmp_path: Path, + monkeypatch) -> None: + pytest.importorskip( # no point if zstandard happens to be installed AND we mocked + 'pytest', reason='sanity' + ) + # Force lz4 import to fail regardless of environment. + import sys + monkeypatch.setitem(sys.modules, 'lz4', None) + monkeypatch.setitem(sys.modules, 'lz4.frame', None) + + p = tmp_path / 'abc.json.lz4' + # Bytes don't matter — we should never reach the actual decompressor. + p.write_bytes(b'\x00\x00\x00\x00') + + rep = validate.validate_file(p) + assert rep.valid is False + assert rep.errors[0]['type'] == 'codec_unavailable' From cba681440913b27a450c2aad08ca60175f964bd8 Mon Sep 17 00:00:00 2001 From: joncrall Date: Thu, 7 May 2026 21:58:37 -0400 Subject: [PATCH 3/4] cli: add --compress / --compress-aggregate / --compress-samples flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Threads compression through the converter writer paths. Defaults to ``--compress none`` everywhere for full backwards compatibility — this commit only enables compression on opt-in. CLI flags (added to all four ``convert`` subcommands) ----------------------------------------------------- * ``--compress {none|gz|zst|bz2|xz|lz4}`` Default codec for both aggregate and per-instance files. * ``--compress-aggregate {none|gz|zst|bz2|xz|lz4}`` Override for aggregate ``.json`` only. Defaults to ``--compress``'s value. Recommended setting: ``none``, since HF and GitHub render uncompressed JSON inline for spot-checking. * ``--compress-samples {none|gz|zst|bz2|xz|lz4}`` Override for per-instance ``_samples.jsonl`` only. Recommended setting for any submission going to a public store: ``gz`` (5–15x size reduction with no UX cost — the samples files are too large for web preview anyway). Argparse rejects unsupported codecs at parse time. Optional codec deps (``zstandard``, ``lz4``) trigger ``ImportError`` with an actionable message at write time when the user picks the codec but hasn't installed the extra. Writer-side plumbing -------------------- * ``cli._write_log`` accepts ``compression``; output filename gets the codec suffix appended (``.json[.gz]``). * ``LMEvalInstanceLevelAdapter.transform_and_save`` accepts ``compression``; the recorded ``DetailedEvaluationResults.checksum`` is computed over the on-disk (possibly compressed) bytes so that downstream verifiers can validate the file as-stored. * ``HELMInstanceLevelDataAdapter`` accepts ``compression`` in its constructor; the adapter's ``self.path`` reflects the on-disk filename including any codec suffix. * ``cli._cmd_convert_helm`` threads the resolved samples-compression via ``metadata_args['samples_compression']`` rather than adding a new positional to the long-existing ``transform_from_directory`` signature. Tests ----- * ``tests/test_cli_compression.py``: full coverage of ``_resolve_compression``, ``_write_log`` round-trips per codec, LM-Eval samples writer compression including checksum-on-disk and the recorded ``file_path``, and parser-level wiring (defaults, per-kind override, invalid codec rejection). * ``tests/test_cli_inspect_uuid.py``: existing fake-write-log stubs updated to accept the new ``compression=`` keyword. 226 passed, 5 skipped. Backwards compatibility preserved across all existing tests. --- every_eval_ever/cli.py | 85 +++++- every_eval_ever/converters/helm/adapter.py | 1 + .../converters/helm/instance_level_adapter.py | 10 +- .../lm_eval/instance_level_adapter.py | 14 +- tests/test_cli_compression.py | 281 ++++++++++++++++++ tests/test_cli_inspect_uuid.py | 8 +- 6 files changed, 383 insertions(+), 16 deletions(-) create mode 100644 tests/test_cli_compression.py diff --git a/every_eval_ever/cli.py b/every_eval_ever/cli.py index c215c89cd..edbf5bce5 100644 --- a/every_eval_ever/cli.py +++ b/every_eval_ever/cli.py @@ -10,6 +10,8 @@ from pathlib import Path from typing import Any +from every_eval_ever import io as eee_io + EVALUATOR_RELATIONSHIP_CHOICES = [ 'first_party', 'third_party', @@ -30,6 +32,21 @@ def _common_metadata(args: argparse.Namespace) -> dict[str, Any]: } +def _resolve_compression( + args: argparse.Namespace, kind: str +) -> str: + """Resolve the compression name for ``kind`` ('aggregate' or 'samples').""" + if kind == 'aggregate': + per_kind = getattr(args, 'compress_aggregate', None) + elif kind == 'samples': + per_kind = getattr(args, 'compress_samples', None) + else: + raise ValueError(f'unknown kind {kind!r}') + if per_kind is not None: + return per_kind + return getattr(args, 'compress', eee_io.COMPRESSION_NONE) + + def _output_dir_for_log(base_output: Path, log: Any) -> Path: dataset = 'unknown' if log.evaluation_results and log.evaluation_results[0].source_data: @@ -46,12 +63,23 @@ def _output_dir_for_log(base_output: Path, log: Any) -> Path: def _write_log( - log: Any, base_output: Path, eval_uuid: str | None = None + log: Any, + base_output: Path, + eval_uuid: str | None = None, + compression: str = eee_io.COMPRESSION_NONE, ) -> Path: + """Write an aggregate evaluation log to ``base_output``. + + ``compression`` selects the codec for the on-disk file; the resulting + filename has the codec suffix appended (e.g. ``.json.gz``). + Defaults to no compression for backwards compatibility. + """ out_dir = _output_dir_for_log(base_output, log) eval_uuid = eval_uuid or str(uuid.uuid4()) - out_file = out_dir / f'{eval_uuid}.json' - with out_file.open('w', encoding='utf-8') as file: + out_file = eee_io.add_compression_suffix( + out_dir / f'{eval_uuid}.json', compression + ) + with eee_io.open_eee_text(out_file, 'w') as file: json.dump( log.model_dump(mode='json', exclude_none=True), file, indent=2 ) @@ -100,9 +128,11 @@ def _cmd_convert_lm_eval(args: argparse.Namespace) -> int: task_name=task_name, output_dir=str(_output_dir_for_log(output_dir, log)), file_uuid=eval_uuid, + compression=_resolve_compression(args, 'samples'), ) log.detailed_evaluation_results = detailed - print(_write_log(log, output_dir, eval_uuid=eval_uuid)) + print(_write_log(log, output_dir, eval_uuid=eval_uuid, + compression=_resolve_compression(args, 'aggregate'))) print(f'Converted {len(logs)} evaluation log(s).') return 0 @@ -139,7 +169,8 @@ def _cmd_convert_inspect(args: argparse.Namespace) -> int: output_dir = Path(args.output_dir) for log, eval_uuid in zip(logs, eval_uuids): - print(_write_log(log, output_dir, eval_uuid=eval_uuid)) + print(_write_log(log, output_dir, eval_uuid=eval_uuid, + compression=_resolve_compression(args, 'aggregate'))) print(f'Converted {len(logs)} evaluation log(s).') return 0 @@ -168,6 +199,7 @@ def _cmd_convert_helm(args: argparse.Namespace) -> int: else: raise FileNotFoundError(f'Path is not a file or directory: {log_path}') + metadata['samples_compression'] = _resolve_compression(args, 'samples') logs = adapter.transform_from_directory( log_path, output_path=str(Path(args.output_dir) / 'helm_output'), @@ -182,7 +214,8 @@ def _cmd_convert_helm(args: argparse.Namespace) -> int: output_dir = Path(args.output_dir) for log, eval_uuid in zip(logs, eval_uuids): - print(_write_log(log, output_dir, eval_uuid=eval_uuid)) + print(_write_log(log, output_dir, eval_uuid=eval_uuid, + compression=_resolve_compression(args, 'aggregate'))) print(f'Converted {len(logs)} evaluation log(s).') return 0 @@ -223,7 +256,10 @@ def _cmd_convert_alpaca_eval(args: argparse.Namespace) -> int: if args.eval_library_version != 'unknown': log.eval_library.version = args.eval_library_version - out_file = _write_log(log, output_dir) + out_file = _write_log( + log, output_dir, + compression=_resolve_compression(args, 'aggregate'), + ) print(f' {out_file}') total += 1 @@ -354,6 +390,41 @@ def build_parser() -> argparse.ArgumentParser: help='Evaluation library version recorded in eval_library.version.', ) + # ----- compression options (apply to all converters) ------------ + source_parser.add_argument( + '--compress', + choices=eee_io.COMPRESSION_CHOICES, + default=eee_io.COMPRESSION_NONE, + help=( + 'Default codec for both aggregate and per-instance ' + 'output files (default: none). Files are written as ' + ".json[.] / _samples.jsonl[.]. " + 'See the --compress-aggregate / --compress-samples ' + 'flags to override per-kind. .zst and .lz4 require the ' + "optional 'zst' / 'lz4' extras." + ), + ) + source_parser.add_argument( + '--compress-aggregate', '--compress_aggregate', + choices=eee_io.COMPRESSION_CHOICES, + default=None, + help=( + "Override --compress for aggregate .json files only. " + 'HF + GitHub render uncompressed JSON inline so the default ' + "(--compress's value) is usually the right call here." + ), + ) + source_parser.add_argument( + '--compress-samples', '--compress_samples', + choices=eee_io.COMPRESSION_CHOICES, + default=None, + help=( + "Override --compress for per-instance _samples.jsonl " + "files only. JSONL compresses 5–15x; recommended setting " + 'when shipping to a public store.' + ), + ) + if source == 'alpaca_eval': source_parser.add_argument( '--version', diff --git a/every_eval_ever/converters/helm/adapter.py b/every_eval_ever/converters/helm/adapter.py index e43eb01b9..939ca3f9e 100644 --- a/every_eval_ever/converters/helm/adapter.py +++ b/every_eval_ever/converters/helm/adapter.py @@ -489,6 +489,7 @@ def _transform_single( Format.jsonl.value, HashAlgorithm.sha256.value, evaluation_dir, + compression=metadata_args.get('samples_compression', 'none'), ).convert_instance_level_logs( dataset_name, model_info.id, diff --git a/every_eval_ever/converters/helm/instance_level_adapter.py b/every_eval_ever/converters/helm/instance_level_adapter.py index 037237ecc..e1262cc29 100644 --- a/every_eval_ever/converters/helm/instance_level_adapter.py +++ b/every_eval_ever/converters/helm/instance_level_adapter.py @@ -2,6 +2,8 @@ from pathlib import Path from typing import Any, List, Tuple +from every_eval_ever import io as eee_io + _HELM_IMPORT_ERROR: Exception | None = None try: from helm.benchmark.adaptation.scenario_state import RequestState @@ -42,20 +44,24 @@ def __init__( format: str, hash_algorithm: str, evaluation_dir: str, + compression: str = eee_io.COMPRESSION_NONE, ): _require_helm_dependencies() self.evaluation_id = evaulation_id self.format = format self.hash_algorithm = hash_algorithm self.evaluation_dir = evaluation_dir - self.path = f'{evaluation_dir}/{evaulation_id}.{format}' + self.compression = compression + # On-disk path includes the codec suffix when compression is set. + base_path = Path(evaluation_dir) / f'{evaulation_id}.{format}' + self.path = str(eee_io.add_compression_suffix(base_path, compression)) def _save_json(self, items: List[InstanceLevelEvaluationLog]): eval_dir_path = Path(self.evaluation_dir) eval_dir_path.mkdir(parents=True, exist_ok=True) path = Path(self.path) - with path.open('w', encoding='utf-8') as f: + with eee_io.open_eee_text(path, 'w') as f: for item in items: json_line = json.dumps( item.model_dump(mode='json'), ensure_ascii=False diff --git a/every_eval_ever/converters/lm_eval/instance_level_adapter.py b/every_eval_ever/converters/lm_eval/instance_level_adapter.py index 1cc393862..c4a30198d 100644 --- a/every_eval_ever/converters/lm_eval/instance_level_adapter.py +++ b/every_eval_ever/converters/lm_eval/instance_level_adapter.py @@ -5,6 +5,7 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Union +from every_eval_ever import io as eee_io from every_eval_ever.converters import SCHEMA_VERSION from every_eval_ever.eval_types import ( DetailedEvaluationResults, @@ -55,12 +56,18 @@ def transform_and_save( task_name: str, output_dir: Optional[Union[str, Path]] = None, file_uuid: Optional[str] = None, + compression: str = eee_io.COMPRESSION_NONE, ) -> Optional[DetailedEvaluationResults]: """Transform samples and save to JSONL, returning a DetailedEvaluationResults pointer. If output_dir is None, returns None (skips instance-level output). If file_uuid is provided, the output file is named {file_uuid}_samples.jsonl so it shares the UUID of the corresponding evaluation result file. + + ``compression`` controls the on-disk codec for the samples file + (see ``every_eval_ever.io.COMPRESSION_CHOICES``). The output + filename has the codec suffix appended; the recorded checksum + is computed over the (possibly compressed) on-disk bytes. """ if output_dir is None: return None @@ -74,11 +81,12 @@ def transform_and_save( output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) if file_uuid: - out_file = output_dir / f'{file_uuid}_samples.jsonl' + base = output_dir / f'{file_uuid}_samples.jsonl' else: - out_file = output_dir / f'samples_{task_name}.jsonl' + base = output_dir / f'samples_{task_name}.jsonl' + out_file = eee_io.add_compression_suffix(base, compression) - with open(out_file, 'w') as f: + with eee_io.open_eee_text(out_file, 'w') as f: for log in logs: f.write( json.dumps(log.model_dump(mode='json'), ensure_ascii=False) diff --git a/tests/test_cli_compression.py b/tests/test_cli_compression.py new file mode 100644 index 000000000..85a53d45b --- /dev/null +++ b/tests/test_cli_compression.py @@ -0,0 +1,281 @@ +"""Writer-side compression tests for the converter CLIs.""" + +from __future__ import annotations + +import argparse +import gzip +import json +from pathlib import Path +from types import SimpleNamespace + +import pytest + +from every_eval_ever import cli +from every_eval_ever import io as eee_io + + +# --------------------------------------------------------------------------- +# _resolve_compression: per-kind override falls back to --compress +# --------------------------------------------------------------------------- + + +def test_resolve_compression_uses_per_kind_override() -> None: + args = SimpleNamespace(compress='gz', compress_aggregate='none', + compress_samples='zst') + assert cli._resolve_compression(args, 'aggregate') == 'none' + assert cli._resolve_compression(args, 'samples') == 'zst' + + +def test_resolve_compression_falls_back_to_compress() -> None: + args = SimpleNamespace(compress='gz', compress_aggregate=None, + compress_samples=None) + assert cli._resolve_compression(args, 'aggregate') == 'gz' + assert cli._resolve_compression(args, 'samples') == 'gz' + + +def test_resolve_compression_default_none() -> None: + args = SimpleNamespace(compress='none', compress_aggregate=None, + compress_samples=None) + assert cli._resolve_compression(args, 'samples') == 'none' + + +def test_resolve_compression_rejects_bad_kind() -> None: + args = SimpleNamespace(compress='gz', compress_aggregate=None, + compress_samples=None) + with pytest.raises(ValueError): + cli._resolve_compression(args, 'metadata') + + +# --------------------------------------------------------------------------- +# _write_log writes the right path, with the right codec, that round-trips +# --------------------------------------------------------------------------- + + +class _FakeSourceData: + def __init__(self, dataset_name: str = 'fake'): + self.dataset_name = dataset_name + + +class _FakeEvalResult: + def __init__(self, dataset_name: str = 'fake'): + self.source_data = _FakeSourceData(dataset_name) + + +class _FakeModelInfo: + def __init__(self, id_: str = 'org/model'): + self.id = id_ + + +class _FakeLog: + """Minimal log-shaped object that supports the bits ``_write_log`` uses.""" + + def __init__(self, dataset: str = 'fake', model_id: str = 'org/model'): + self.evaluation_results = [_FakeEvalResult(dataset)] + self.model_info = _FakeModelInfo(model_id) + self._dump = {'schema_version': '0.2.2', 'kind': 'aggregate', + 'dataset': dataset, 'model': model_id} + + def model_dump(self, *, mode: str, exclude_none: bool): + return self._dump + + +def test_write_log_uncompressed_default(tmp_path: Path) -> None: + log = _FakeLog() + out = cli._write_log(log, tmp_path, eval_uuid='aaa') + assert out.name == 'aaa.json' + assert out.parent == tmp_path / 'fake' / 'org' / 'model' + assert out.exists() + assert json.loads(out.read_text()) == log._dump + + +def test_write_log_gz_appends_suffix_and_compresses(tmp_path: Path) -> None: + log = _FakeLog() + out = cli._write_log(log, tmp_path, eval_uuid='bbb', compression='gz') + assert out.name == 'bbb.json.gz' + # Cannot read as plain text — must decompress. + with gzip.open(out, 'rt', encoding='utf-8') as f: + assert json.load(f) == log._dump + + +def test_write_log_preserves_indent_and_unicode(tmp_path: Path) -> None: + log = _FakeLog() + log._dump['note'] = 'café' # non-ascii + out = cli._write_log(log, tmp_path, eval_uuid='ccc', compression='bz2') + assert out.name == 'ccc.json.bz2' + with eee_io.open_eee_text(out, 'r') as f: + data = json.load(f) + assert data['note'] == 'café' + + +# --------------------------------------------------------------------------- +# LMEvalInstanceLevelAdapter.transform_and_save honors compression +# --------------------------------------------------------------------------- + + +def test_lm_eval_samples_writer_gzip(tmp_path: Path, + monkeypatch: pytest.MonkeyPatch) -> None: + from every_eval_ever.converters.lm_eval.instance_level_adapter import ( + LMEvalInstanceLevelAdapter, + ) + + # Write a fake lm-eval samples.jsonl input. We don't need it to be a + # real lm-eval shape; we just need transform_samples to produce >= 1 + # record. Stub _transform_sample to short-circuit the schema work. + fake_input = tmp_path / 'samples_fake_2025-01-01.jsonl' + fake_input.write_text(json.dumps({'_': 1}) + '\n') + + # Build a fake instance-level log: any Pydantic-validating shape works. + from every_eval_ever.instance_level_types import ( + Evaluation, Input, InstanceLevelEvaluationLog, InteractionType, + Output, + ) + + def _fake_transform(self, sample, evaluation_id, model_id, task_name): + from every_eval_ever.instance_level_types import ( + AnswerAttributionItem, + ) + return InstanceLevelEvaluationLog( + schema_version='0.2.2', + evaluation_id=evaluation_id, + model_id=model_id, + evaluation_name=task_name, + sample_id='1', + sample_hash='a' * 64, + interaction_type=InteractionType.single_turn, + input=Input(raw='q', reference=[], choices=[]), + output=Output(raw=['a'], reasoning_trace=[]), + answer_attribution=[ + AnswerAttributionItem( + turn_idx=0, source='output.raw', extracted_value='a', + extraction_method='exact_match', is_terminal=True, + ) + ], + evaluation=Evaluation(score=1.0, is_correct=True), + ) + + monkeypatch.setattr( + LMEvalInstanceLevelAdapter, '_transform_sample', _fake_transform + ) + + adapter = LMEvalInstanceLevelAdapter() + out_dir = tmp_path / 'out' + detailed = adapter.transform_and_save( + samples_path=fake_input, + evaluation_id='eid', + model_id='org/model', + task_name='fake', + output_dir=out_dir, + file_uuid='zzz', + compression='gz', + ) + + assert detailed is not None + out_file = Path(detailed.file_path) + assert out_file.name == 'zzz_samples.jsonl.gz' + # Round-trips + with gzip.open(out_file, 'rt', encoding='utf-8') as f: + rows = [json.loads(line) for line in f if line.strip()] + assert len(rows) == 1 + # checksum is over the on-disk (compressed) bytes + import hashlib + assert detailed.checksum == hashlib.sha256(out_file.read_bytes()).hexdigest() + + +def test_lm_eval_samples_writer_default_uncompressed( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + from every_eval_ever.converters.lm_eval.instance_level_adapter import ( + LMEvalInstanceLevelAdapter, + ) + from every_eval_ever.instance_level_types import ( + Evaluation, Input, InstanceLevelEvaluationLog, InteractionType, + Output, + ) + + fake_input = tmp_path / 'samples_fake.jsonl' + fake_input.write_text(json.dumps({'_': 1}) + '\n') + + def _fake_transform(self, sample, evaluation_id, model_id, task_name): + from every_eval_ever.instance_level_types import ( + AnswerAttributionItem, + ) + return InstanceLevelEvaluationLog( + schema_version='0.2.2', + evaluation_id=evaluation_id, + model_id=model_id, + evaluation_name=task_name, + sample_id='1', + sample_hash='a' * 64, + interaction_type=InteractionType.single_turn, + input=Input(raw='q', reference=[], choices=[]), + output=Output(raw=['a'], reasoning_trace=[]), + answer_attribution=[ + AnswerAttributionItem( + turn_idx=0, source='output.raw', extracted_value='a', + extraction_method='exact_match', is_terminal=True, + ) + ], + evaluation=Evaluation(score=1.0, is_correct=True), + ) + monkeypatch.setattr( + LMEvalInstanceLevelAdapter, '_transform_sample', _fake_transform + ) + + detailed = LMEvalInstanceLevelAdapter().transform_and_save( + samples_path=fake_input, + evaluation_id='eid', + model_id='org/model', + task_name='fake', + output_dir=tmp_path / 'out', + file_uuid='zzz', + ) + assert detailed is not None + assert Path(detailed.file_path).name == 'zzz_samples.jsonl' + + +# --------------------------------------------------------------------------- +# CLI parser wires the flags through with sensible defaults +# --------------------------------------------------------------------------- + + +def test_cli_parser_defaults_to_none() -> None: + parser = cli.build_parser() + ns = parser.parse_args( + ['convert', 'lm_eval', '--log_path', '/tmp/x', '--output_dir', '/tmp/y'] + ) + assert ns.compress == 'none' + assert ns.compress_aggregate is None + assert ns.compress_samples is None + assert cli._resolve_compression(ns, 'aggregate') == 'none' + assert cli._resolve_compression(ns, 'samples') == 'none' + + +def test_cli_parser_compress_samples_gz() -> None: + parser = cli.build_parser() + ns = parser.parse_args( + ['convert', 'helm', '--log_path', '/tmp/x', + '--compress-samples', 'gz'] + ) + assert ns.compress == 'none' + assert ns.compress_samples == 'gz' + assert cli._resolve_compression(ns, 'aggregate') == 'none' + assert cli._resolve_compression(ns, 'samples') == 'gz' + + +def test_cli_parser_global_compress() -> None: + parser = cli.build_parser() + ns = parser.parse_args( + ['convert', 'inspect', '--log_path', '/tmp/x', '--compress', 'bz2'] + ) + assert ns.compress == 'bz2' + assert cli._resolve_compression(ns, 'aggregate') == 'bz2' + assert cli._resolve_compression(ns, 'samples') == 'bz2' + + +def test_cli_parser_rejects_invalid_codec() -> None: + parser = cli.build_parser() + with pytest.raises(SystemExit): + parser.parse_args( + ['convert', 'lm_eval', '--log_path', '/tmp/x', + '--compress', 'snappy'] + ) diff --git a/tests/test_cli_inspect_uuid.py b/tests/test_cli_inspect_uuid.py index 732f860ae..2d4a60f18 100644 --- a/tests/test_cli_inspect_uuid.py +++ b/tests/test_cli_inspect_uuid.py @@ -63,7 +63,7 @@ def transform_from_directory(self, *_args, **_kwargs): captured_eval_uuids: list[str | None] = [] - def fake_write_log(_log, _base_output, eval_uuid=None): + def fake_write_log(_log, _base_output, eval_uuid=None, compression="none"): captured_eval_uuids.append(eval_uuid) return Path('/tmp/fake_aggregate.json') @@ -113,7 +113,7 @@ def transform_from_directory(self, _path, metadata_args): captured_eval_uuids: list[str | None] = [] - def fake_write_log(_log, _base_output, eval_uuid=None): + def fake_write_log(_log, _base_output, eval_uuid=None, compression="none"): captured_eval_uuids.append(eval_uuid) return Path('/tmp/fake_aggregate.json') @@ -160,7 +160,7 @@ def transform_from_directory( captured_eval_uuids: list[str | None] = [] - def fake_write_log(_log, _base_output, eval_uuid=None): + def fake_write_log(_log, _base_output, eval_uuid=None, compression="none"): captured_eval_uuids.append(eval_uuid) return Path('/tmp/fake_aggregate.json') @@ -208,7 +208,7 @@ def transform_from_directory( captured_eval_uuids: list[str | None] = [] - def fake_write_log(_log, _base_output, eval_uuid=None): + def fake_write_log(_log, _base_output, eval_uuid=None, compression="none"): captured_eval_uuids.append(eval_uuid) return Path('/tmp/fake_aggregate.json') From d35f86f571eabae067c357f58e83ec45b1dd0d9f Mon Sep 17 00:00:00 2001 From: joncrall Date: Thu, 7 May 2026 21:58:37 -0400 Subject: [PATCH 4/4] pyproject + README: optional codec extras and compression docs * Add ``[zst]`` and ``[lz4]`` optional-dependency groups for the third-party codecs not available in the standard library. ``[all]`` now pulls them in alongside ``[inspect]`` and ``[helm]``. * README: new subsection under "Data Validation" documenting the recognized compressed forms, the duplicate-variant rule, and the ``--compress`` / ``--compress-samples`` writer flags. Notes that defaults remain uncompressed for backwards compatibility, and recommends ``--compress-samples gz`` (not the global ``--compress``) for public submissions so aggregate JSON stays browsable on the HF web UI. 226 passed, 5 skipped. --- README.md | 38 ++++++++++++++++++++++++++++++++++++++ pyproject.toml | 7 +++++++ 2 files changed, 45 insertions(+) diff --git a/README.md b/README.md index 262a2ca11..343431925 100644 --- a/README.md +++ b/README.md @@ -177,6 +177,44 @@ uv run python -m every_eval_ever validate file1.json file2_samples.jsonl data/ File type is determined by extension: `.json` validates against `EvaluationLog`, `.jsonl` validates each line against `InstanceLevelEvaluationLog`. +#### Compressed files + +Validation, discovery, and the converter writers all transparently +support compressed result files alongside the plain forms: + +``` +.json{,.gz,.zst,.bz2,.xz,.lz4} +_samples.jsonl{,.gz,.zst,.bz2,.xz,.lz4} +``` + +Codecs match what the [HuggingFace Hub natively decompresses](https://huggingface.co/docs/hub/en/datasets-adding#file-formats). +The schema is unchanged; compression is purely a transport / storage +concern. A given `(folder, uuid, kind)` may have **at most one +physical variant** — the validator emits a `duplicate_variant` error +if both `abc.json` and `abc.json.gz` are committed in the same folder. + +`gzip`, `bzip2`, and `lzma/xz` use the standard library. `zstd` and +`lz4` are optional extras; install with `pip install 'every-eval-ever[zst]'` +or `[lz4]` (or `[all]` to pull in everything). + +To **write** compressed output during conversion, pass one of the +new flags to any `convert` subcommand: + +```sh +# Default codec for both aggregate and per-instance files +uv run python -m every_eval_ever convert helm \ + --log_path runs/ --output_dir data/ \ + --compress gz + +# Only compress per-instance samples (recommended for public submissions — +# aggregate JSON stays browsable on the HF web UI) +uv run python -m every_eval_ever convert helm \ + --log_path runs/ --output_dir data/ \ + --compress-samples gz +``` + +Defaults remain uncompressed for full backwards compatibility. + #### Output formats ```sh diff --git a/pyproject.toml b/pyproject.toml index da63d2c4f..87fb1b038 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,9 +31,16 @@ helm = [ "crfm-helm>=0.5.12", "typer>=0.12,<1.0", ] +# Optional codecs for transparent compression of EEE result files +# (see ``every_eval_ever.io``). gzip, bzip2, and lzma/xz are stdlib; +# zstd and lz4 each need a third-party package. +zst = ["zstandard>=0.22"] +lz4 = ["lz4>=4"] all = [ "every-eval-ever[inspect]", "every-eval-ever[helm]", + "every-eval-ever[zst]", + "every-eval-ever[lz4]", ] [project.scripts]