evaleval · Erotemic · May 8, 2026 · May 8, 2026 · May 8, 2026 · May 8, 2026
diff --git a/README.md b/README.md
@@ -177,6 +177,44 @@ uv run python -m every_eval_ever validate file1.json file2_samples.jsonl data/
 
 File type is determined by extension: `.json` validates against `EvaluationLog`, `.jsonl` validates each line against `InstanceLevelEvaluationLog`.
 
+#### Compressed files
+
+Validation, discovery, and the converter writers all transparently
+support compressed result files alongside the plain forms:
+
+```
+<uuid>.json{,.gz,.zst,.bz2,.xz,.lz4}
+<uuid>_samples.jsonl{,.gz,.zst,.bz2,.xz,.lz4}
+```
+
+Codecs match what the [HuggingFace Hub natively decompresses](https://huggingface.co/docs/hub/en/datasets-adding#file-formats).
+The schema is unchanged; compression is purely a transport / storage
+concern. A given `(folder, uuid, kind)` may have **at most one
+physical variant** — the validator emits a `duplicate_variant` error
+if both `abc.json` and `abc.json.gz` are committed in the same folder.
+
+`gzip`, `bzip2`, and `lzma/xz` use the standard library. `zstd` and
+`lz4` are optional extras; install with `pip install 'every-eval-ever[zst]'`
+or `[lz4]` (or `[all]` to pull in everything).
+
+To **write** compressed output during conversion, pass `--compress`,
+`--compress-aggregate`, or `--compress-samples` to any `convert` subcommand:
+
+```sh
+# Default codec for both aggregate and per-instance files
+uv run python -m every_eval_ever convert helm \
+    --log_path runs/ --output_dir data/ \
+    --compress gz
+
+# Only compress per-instance samples (recommended for public submissions —
+# aggregate JSON stays browsable on the HF web UI)
+uv run python -m every_eval_ever convert helm \
+    --log_path runs/ --output_dir data/ \
+    --compress-samples gz
+```
+
+Defaults remain uncompressed for full backwards compatibility.
+
 #### Output formats
 
 ```sh

diff --git a/every_eval_ever/check_duplicate_entries.py b/every_eval_ever/check_duplicate_entries.py
@@ -2,22 +2,36 @@
 import hashlib
 import json
 import os
+from pathlib import Path
 from typing import Any, Dict, List
 
+from every_eval_ever import io as eee_io
+
 IGNORE_KEYS = {'retrieved_timestamp', 'evaluation_id'}
 
 
 def expand_paths(paths: List[str]) -> List[str]:
-    """Expand folders to file paths."""
+    """Expand folders to aggregate-EEE file paths.
+
+    Recognizes both plain ``<uuid>.json`` and compressed forms
+    (``.gz``, ``.zst``, ``.bz2``, ``.xz``, ``.lz4``).
+    ``<uuid>_samples.jsonl`` files are intentionally excluded — duplicate
+    detection runs over aggregate metadata, not per-instance samples.
+    """
     file_paths: List[str] = []
     for path in paths:
-        if os.path.isfile(path) and path.endswith('.json'):
-            file_paths.append(path)
-        elif os.path.isdir(path):
-            for root, _, file_names in os.walk(path):
-                for file_name in file_names:
-                    if file_name.endswith('.json'):
-                        file_paths.append(os.path.join(root, file_name))
+        p = Path(path)
+        if p.is_file():
+            if eee_io.is_eee_result(p) == 'aggregate':
+                file_paths.append(str(p))
+            else:
+                # Silently skip files that are not aggregate JSON (e.g.
+                # samples). NOTE: the old code raised for non-.json files.
+                continue
+        elif p.is_dir():
+            for found in eee_io.iter_eee_results([p]):
+                if eee_io.is_eee_result(found) == 'aggregate':
+                    file_paths.append(str(found))
         else:
             raise Exception(f'Could not find file or directory at path: {path}')
     return file_paths
@@ -84,7 +98,7 @@ def main(argv: List[str] | None = None) -> int:
     groups: Dict[str, List[Dict[str, Any]]] = {}
     for file_path in file_paths:
         try:
-            with open(file_path, 'r') as f:
+            with eee_io.open_eee_text(file_path, 'r') as f:
                 payload = json.load(f)
         except json.JSONDecodeError as e:
             message = f'JSONDecodeError: {str(e)}'

diff --git a/every_eval_ever/cli.py b/every_eval_ever/cli.py
@@ -10,6 +10,8 @@
 from pathlib import Path
 from typing import Any
 
+from every_eval_ever import io as eee_io
+
 EVALUATOR_RELATIONSHIP_CHOICES = [
     'first_party',
     'third_party',
@@ -30,6 +32,21 @@ def _common_metadata(args: argparse.Namespace) -> dict[str, Any]:
     }
 
 
+def _resolve_compression(
+    args: argparse.Namespace, kind: str
+) -> str:
+    """Resolve the compression name for ``kind`` ('aggregate' or 'samples')."""
+    if kind == 'aggregate':
+        per_kind = getattr(args, 'compress_aggregate', None)
+    elif kind == 'samples':
+        per_kind = getattr(args, 'compress_samples', None)
+    else:
+        raise ValueError(f'unknown kind {kind!r}')
+    if per_kind is not None:
+        return per_kind
+    return getattr(args, 'compress', eee_io.COMPRESSION_NONE)
+
+
 def _output_dir_for_log(base_output: Path, log: Any) -> Path:
     dataset = 'unknown'
     if log.evaluation_results and log.evaluation_results[0].source_data:
@@ -46,12 +63,23 @@ def _output_dir_for_log(base_output: Path, log: Any) -> Path:
 
 
 def _write_log(
-    log: Any, base_output: Path, eval_uuid: str | None = None
+    log: Any,
+    base_output: Path,
+    eval_uuid: str | None = None,
+    compression: str = eee_io.COMPRESSION_NONE,
 ) -> Path:
+    """Write an aggregate evaluation log to ``base_output``.
+
+    ``compression`` selects the codec for the on-disk file; the resulting
+    filename has the codec suffix appended (e.g. ``<uuid>.json.gz``).
+    Defaults to no compression for backwards compatibility.
+    """
     out_dir = _output_dir_for_log(base_output, log)
     eval_uuid = eval_uuid or str(uuid.uuid4())
-    out_file = out_dir / f'{eval_uuid}.json'
-    with out_file.open('w', encoding='utf-8') as file:
+    out_file = eee_io.add_compression_suffix(
+        out_dir / f'{eval_uuid}.json', compression
+    )
+    with eee_io.open_eee_text(out_file, 'w') as file:
         json.dump(
             log.model_dump(mode='json', exclude_none=True), file, indent=2
         )
@@ -100,9 +128,11 @@ def _cmd_convert_lm_eval(args: argparse.Namespace) -> int:
                         task_name=task_name,
                         output_dir=str(_output_dir_for_log(output_dir, log)),
                         file_uuid=eval_uuid,
+                        compression=_resolve_compression(args, 'samples'),
                     )
                     log.detailed_evaluation_results = detailed
-        print(_write_log(log, output_dir, eval_uuid=eval_uuid))
+        print(_write_log(log, output_dir, eval_uuid=eval_uuid,
+                         compression=_resolve_compression(args, 'aggregate')))
 
     print(f'Converted {len(logs)} evaluation log(s).')
     return 0
@@ -139,7 +169,8 @@ def _cmd_convert_inspect(args: argparse.Namespace) -> int:
 
     output_dir = Path(args.output_dir)
     for log, eval_uuid in zip(logs, eval_uuids):
-        print(_write_log(log, output_dir, eval_uuid=eval_uuid))
+        print(_write_log(log, output_dir, eval_uuid=eval_uuid,
+                         compression=_resolve_compression(args, 'aggregate')))
 
     print(f'Converted {len(logs)} evaluation log(s).')
     return 0
@@ -168,6 +199,7 @@ def _cmd_convert_helm(args: argparse.Namespace) -> int:
     else:
         raise FileNotFoundError(f'Path is not a file or directory: {log_path}')
 
+    metadata['samples_compression'] = _resolve_compression(args, 'samples')
     logs = adapter.transform_from_directory(
         log_path,
         output_path=str(Path(args.output_dir) / 'helm_output'),
@@ -182,7 +214,8 @@ def _cmd_convert_helm(args: argparse.Namespace) -> int:
 
     output_dir = Path(args.output_dir)
     for log, eval_uuid in zip(logs, eval_uuids):
-        print(_write_log(log, output_dir, eval_uuid=eval_uuid))
+        print(_write_log(log, output_dir, eval_uuid=eval_uuid,
+                         compression=_resolve_compression(args, 'aggregate')))
 
     print(f'Converted {len(logs)} evaluation log(s).')
     return 0
@@ -223,7 +256,10 @@ def _cmd_convert_alpaca_eval(args: argparse.Namespace) -> int:
             if args.eval_library_version != 'unknown':
                 log.eval_library.version = args.eval_library_version
 
-            out_file = _write_log(log, output_dir)
+            out_file = _write_log(
+                log, output_dir,
+                compression=_resolve_compression(args, 'aggregate'),
+            )
             print(f'  {out_file}')
             total += 1
 
@@ -354,6 +390,41 @@ def build_parser() -> argparse.ArgumentParser:
             help='Evaluation library version recorded in eval_library.version.',
         )
 
+        # ----- compression options (apply to all converters) ------------
+        source_parser.add_argument(
+            '--compress',
+            choices=eee_io.COMPRESSION_CHOICES,
+            default=eee_io.COMPRESSION_NONE,
+            help=(
+                'Default codec for both aggregate and per-instance '
+                'output files (default: none). Files are written as '
+                "<uuid>.json[.<codec>] / <uuid>_samples.jsonl[.<codec>]. "
+                'See the --compress-aggregate / --compress-samples '
+                'flags to override per-kind. .zst and .lz4 require the '
+                "optional 'zst' / 'lz4' extras."
+            ),
+        )
+        source_parser.add_argument(
+            '--compress-aggregate', '--compress_aggregate',
+            choices=eee_io.COMPRESSION_CHOICES,
+            default=None,
+            help=(
+                "Override --compress for aggregate <uuid>.json files only. "
+                'HF + GitHub render uncompressed JSON inline so the default '
+                "(--compress's value) is usually the right call here."
+            ),
+        )
+        source_parser.add_argument(
+            '--compress-samples', '--compress_samples',
+            choices=eee_io.COMPRESSION_CHOICES,
+            default=None,
+            help=(
+                "Override --compress for per-instance <uuid>_samples.jsonl "
+                "files only. JSONL compresses 5–15x; recommended setting "
+                'when shipping to a public store.'
+            ),
+        )
+
         if source == 'alpaca_eval':
             source_parser.add_argument(
                 '--version',

diff --git a/every_eval_ever/converters/helm/adapter.py b/every_eval_ever/converters/helm/adapter.py
@@ -489,6 +489,7 @@ def _transform_single(
                     Format.jsonl.value,
                     HashAlgorithm.sha256.value,
                     evaluation_dir,
+                    compression=metadata_args.get('samples_compression', 'none'),
                 ).convert_instance_level_logs(
                     dataset_name,
                     model_info.id,

diff --git a/every_eval_ever/converters/helm/instance_level_adapter.py b/every_eval_ever/converters/helm/instance_level_adapter.py
@@ -2,6 +2,8 @@
 from pathlib import Path
 from typing import Any, List, Tuple
 
+from every_eval_ever import io as eee_io
+
 _HELM_IMPORT_ERROR: Exception | None = None
 try:
     from helm.benchmark.adaptation.scenario_state import RequestState
@@ -42,20 +44,24 @@ def __init__(
         format: str,
         hash_algorithm: str,
         evaluation_dir: str,
+        compression: str = eee_io.COMPRESSION_NONE,
     ):
         _require_helm_dependencies()
         self.evaluation_id = evaulation_id
         self.format = format
         self.hash_algorithm = hash_algorithm
         self.evaluation_dir = evaluation_dir
-        self.path = f'{evaluation_dir}/{evaulation_id}.{format}'
+        self.compression = compression
+        # On-disk path includes the codec suffix when compression is set.
+        base_path = Path(evaluation_dir) / f'{evaulation_id}.{format}'
+        self.path = str(eee_io.add_compression_suffix(base_path, compression))
 
     def _save_json(self, items: List[InstanceLevelEvaluationLog]):
         eval_dir_path = Path(self.evaluation_dir)
         eval_dir_path.mkdir(parents=True, exist_ok=True)
         path = Path(self.path)
 
-        with path.open('w', encoding='utf-8') as f:
+        with eee_io.open_eee_text(path, 'w') as f:
             for item in items:
                 json_line = json.dumps(
                     item.model_dump(mode='json'), ensure_ascii=False

diff --git a/every_eval_ever/converters/lm_eval/instance_level_adapter.py b/every_eval_ever/converters/lm_eval/instance_level_adapter.py
@@ -5,6 +5,7 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
+from every_eval_ever import io as eee_io
 from every_eval_ever.converters import SCHEMA_VERSION
 from every_eval_ever.eval_types import (
     DetailedEvaluationResults,
@@ -55,12 +56,18 @@ def transform_and_save(
         task_name: str,
         output_dir: Optional[Union[str, Path]] = None,
         file_uuid: Optional[str] = None,
+        compression: str = eee_io.COMPRESSION_NONE,
     ) -> Optional[DetailedEvaluationResults]:
         """Transform samples and save to JSONL, returning a DetailedEvaluationResults pointer.
 
         If output_dir is None, returns None (skips instance-level output).
         If file_uuid is provided, the output file is named {file_uuid}_samples.jsonl
         so it shares the UUID of the corresponding evaluation result file.
+
+        ``compression`` controls the on-disk codec for the samples file
+        (see ``every_eval_ever.io.COMPRESSION_CHOICES``). The output
+        filename has the codec suffix appended; the recorded checksum
+        is computed over the (possibly compressed) on-disk bytes.
         """
         if output_dir is None:
             return None
@@ -74,11 +81,12 @@ def transform_and_save(
         output_dir = Path(output_dir)
         output_dir.mkdir(parents=True, exist_ok=True)
         if file_uuid:
-            out_file = output_dir / f'{file_uuid}_samples.jsonl'
+            base = output_dir / f'{file_uuid}_samples.jsonl'
         else:
-            out_file = output_dir / f'samples_{task_name}.jsonl'
+            base = output_dir / f'samples_{task_name}.jsonl'
+        out_file = eee_io.add_compression_suffix(base, compression)
 
-        with open(out_file, 'w') as f:
+        with eee_io.open_eee_text(out_file, 'w') as f:
             for log in logs:
                 f.write(
                     json.dumps(log.model_dump(mode='json'), ensure_ascii=False)