Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,44 @@ uv run python -m every_eval_ever validate file1.json file2_samples.jsonl data/

File type is determined by extension: `.json` validates against `EvaluationLog`, `.jsonl` validates each line against `InstanceLevelEvaluationLog`.

#### Compressed files

Validation, discovery, and the converter writers all transparently
support compressed result files alongside the plain forms:

```
<uuid>.json{,.gz,.zst,.bz2,.xz,.lz4}
<uuid>_samples.jsonl{,.gz,.zst,.bz2,.xz,.lz4}
```

Codecs match what the [HuggingFace Hub natively decompresses](https://huggingface.co/docs/hub/en/datasets-adding#file-formats).
The schema is unchanged; compression is purely a transport / storage
concern. A given `(folder, uuid, kind)` may have **at most one
physical variant** — the validator emits a `duplicate_variant` error
if both `abc.json` and `abc.json.gz` are committed in the same folder.

`gzip`, `bzip2`, and `lzma/xz` use the standard library. `zstd` and
`lz4` are optional extras; install them with
`pip install 'every-eval-ever[zst]'` or `pip install 'every-eval-ever[lz4]'`
(or `pip install 'every-eval-ever[all]'` to pull in everything).

To **write** compressed output during conversion, pass `--compress`,
`--compress-aggregate`, or `--compress-samples` to any `convert` subcommand:

```sh
# Default codec for both aggregate and per-instance files
uv run python -m every_eval_ever convert helm \
--log_path runs/ --output_dir data/ \
--compress gz

# Only compress per-instance samples (recommended for public submissions —
# aggregate JSON stays browsable on the HF web UI)
uv run python -m every_eval_ever convert helm \
--log_path runs/ --output_dir data/ \
--compress-samples gz
```

Output remains uncompressed by default, for full backwards compatibility.

#### Output formats

```sh
Expand Down
32 changes: 23 additions & 9 deletions every_eval_ever/check_duplicate_entries.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,36 @@
import hashlib
import json
import os
from pathlib import Path
from typing import Any, Dict, List

from every_eval_ever import io as eee_io

IGNORE_KEYS = {'retrieved_timestamp', 'evaluation_id'}


def expand_paths(paths: List[str]) -> List[str]:
"""Expand folders to file paths."""
"""Expand folders to aggregate-EEE file paths.

Recognizes both plain ``<uuid>.json`` and compressed forms
(``.gz``, ``.zst``, ``.bz2``, ``.xz``, ``.lz4``).
``<uuid>_samples.jsonl`` files are intentionally excluded — duplicate
detection runs over aggregate metadata, not per-instance samples.
"""
file_paths: List[str] = []
for path in paths:
if os.path.isfile(path) and path.endswith('.json'):
file_paths.append(path)
elif os.path.isdir(path):
for root, _, file_names in os.walk(path):
for file_name in file_names:
if file_name.endswith('.json'):
file_paths.append(os.path.join(root, file_name))
p = Path(path)
if p.is_file():
if eee_io.is_eee_result(p) == 'aggregate':
file_paths.append(str(p))
else:
# Silently skip a passed file that is_eee_result does not
# classify as 'aggregate'; no error is reported for it.
# NOTE(review): the os.path-based branch above let a file not
# ending in '.json' fall through to the final `raise`, so this
# is not the same behaviour — confirm that skipping is intended.
continue
elif p.is_dir():
for found in eee_io.iter_eee_results([p]):
if eee_io.is_eee_result(found) == 'aggregate':
file_paths.append(str(found))
else:
raise Exception(f'Could not find file or directory at path: {path}')
return file_paths
Expand Down Expand Up @@ -84,7 +98,7 @@ def main(argv: List[str] | None = None) -> int:
groups: Dict[str, List[Dict[str, Any]]] = {}
for file_path in file_paths:
try:
with open(file_path, 'r') as f:
with eee_io.open_eee_text(file_path, 'r') as f:
payload = json.load(f)
except json.JSONDecodeError as e:
message = f'JSONDecodeError: {str(e)}'
Expand Down
85 changes: 78 additions & 7 deletions every_eval_ever/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from pathlib import Path
from typing import Any

from every_eval_ever import io as eee_io

EVALUATOR_RELATIONSHIP_CHOICES = [
'first_party',
'third_party',
Expand All @@ -30,6 +32,21 @@ def _common_metadata(args: argparse.Namespace) -> dict[str, Any]:
}


def _resolve_compression(
    args: argparse.Namespace, kind: str
) -> str:
    """Pick the codec name to use when writing files of the given ``kind``.

    A per-kind override on ``args`` (``compress_aggregate`` for
    ``'aggregate'``, ``compress_samples`` for ``'samples'``) wins whenever it
    is set to something other than ``None``. Otherwise the general
    ``compress`` attribute is returned, falling back to
    ``eee_io.COMPRESSION_NONE`` when ``args`` carries no such attribute.

    Raises:
        ValueError: if ``kind`` is neither ``'aggregate'`` nor ``'samples'``.
    """
    # Reject unknown kinds up front so the lookup below only sees valid ones.
    if kind not in ('aggregate', 'samples'):
        raise ValueError(f'unknown kind {kind!r}')

    override_attr = (
        'compress_aggregate' if kind == 'aggregate' else 'compress_samples'
    )
    override = getattr(args, override_attr, None)
    if override is None:
        # No per-kind override: use the shared --compress setting.
        return getattr(args, 'compress', eee_io.COMPRESSION_NONE)
    return override


def _output_dir_for_log(base_output: Path, log: Any) -> Path:
dataset = 'unknown'
if log.evaluation_results and log.evaluation_results[0].source_data:
Expand All @@ -46,12 +63,23 @@ def _output_dir_for_log(base_output: Path, log: Any) -> Path:


def _write_log(
log: Any, base_output: Path, eval_uuid: str | None = None
log: Any,
base_output: Path,
eval_uuid: str | None = None,
compression: str = eee_io.COMPRESSION_NONE,
) -> Path:
"""Write an aggregate evaluation log to ``base_output``.

``compression`` selects the codec for the on-disk file; the resulting
filename has the codec suffix appended (e.g. ``<uuid>.json.gz``).
Defaults to no compression for backwards compatibility.
"""
out_dir = _output_dir_for_log(base_output, log)
eval_uuid = eval_uuid or str(uuid.uuid4())
out_file = out_dir / f'{eval_uuid}.json'
with out_file.open('w', encoding='utf-8') as file:
out_file = eee_io.add_compression_suffix(
out_dir / f'{eval_uuid}.json', compression
)
with eee_io.open_eee_text(out_file, 'w') as file:
json.dump(
log.model_dump(mode='json', exclude_none=True), file, indent=2
)
Expand Down Expand Up @@ -100,9 +128,11 @@ def _cmd_convert_lm_eval(args: argparse.Namespace) -> int:
task_name=task_name,
output_dir=str(_output_dir_for_log(output_dir, log)),
file_uuid=eval_uuid,
compression=_resolve_compression(args, 'samples'),
)
log.detailed_evaluation_results = detailed
print(_write_log(log, output_dir, eval_uuid=eval_uuid))
print(_write_log(log, output_dir, eval_uuid=eval_uuid,
compression=_resolve_compression(args, 'aggregate')))

print(f'Converted {len(logs)} evaluation log(s).')
return 0
Expand Down Expand Up @@ -139,7 +169,8 @@ def _cmd_convert_inspect(args: argparse.Namespace) -> int:

output_dir = Path(args.output_dir)
for log, eval_uuid in zip(logs, eval_uuids):
print(_write_log(log, output_dir, eval_uuid=eval_uuid))
print(_write_log(log, output_dir, eval_uuid=eval_uuid,
compression=_resolve_compression(args, 'aggregate')))

print(f'Converted {len(logs)} evaluation log(s).')
return 0
Expand Down Expand Up @@ -168,6 +199,7 @@ def _cmd_convert_helm(args: argparse.Namespace) -> int:
else:
raise FileNotFoundError(f'Path is not a file or directory: {log_path}')

metadata['samples_compression'] = _resolve_compression(args, 'samples')
logs = adapter.transform_from_directory(
log_path,
output_path=str(Path(args.output_dir) / 'helm_output'),
Expand All @@ -182,7 +214,8 @@ def _cmd_convert_helm(args: argparse.Namespace) -> int:

output_dir = Path(args.output_dir)
for log, eval_uuid in zip(logs, eval_uuids):
print(_write_log(log, output_dir, eval_uuid=eval_uuid))
print(_write_log(log, output_dir, eval_uuid=eval_uuid,
compression=_resolve_compression(args, 'aggregate')))

print(f'Converted {len(logs)} evaluation log(s).')
return 0
Expand Down Expand Up @@ -223,7 +256,10 @@ def _cmd_convert_alpaca_eval(args: argparse.Namespace) -> int:
if args.eval_library_version != 'unknown':
log.eval_library.version = args.eval_library_version

out_file = _write_log(log, output_dir)
out_file = _write_log(
log, output_dir,
compression=_resolve_compression(args, 'aggregate'),
)
print(f' {out_file}')
total += 1

Expand Down Expand Up @@ -354,6 +390,41 @@ def build_parser() -> argparse.ArgumentParser:
help='Evaluation library version recorded in eval_library.version.',
)

# ----- compression options (apply to all converters) ------------
source_parser.add_argument(
'--compress',
choices=eee_io.COMPRESSION_CHOICES,
default=eee_io.COMPRESSION_NONE,
help=(
'Default codec for both aggregate and per-instance '
'output files (default: none). Files are written as '
"<uuid>.json[.<codec>] / <uuid>_samples.jsonl[.<codec>]. "
'See the --compress-aggregate / --compress-samples '
'flags to override per-kind. .zst and .lz4 require the '
"optional 'zst' / 'lz4' extras."
),
)
source_parser.add_argument(
'--compress-aggregate', '--compress_aggregate',
choices=eee_io.COMPRESSION_CHOICES,
default=None,
help=(
"Override --compress for aggregate <uuid>.json files only. "
'HF + GitHub render uncompressed JSON inline so the default '
"(--compress's value) is usually the right call here."
),
)
source_parser.add_argument(
'--compress-samples', '--compress_samples',
choices=eee_io.COMPRESSION_CHOICES,
default=None,
help=(
"Override --compress for per-instance <uuid>_samples.jsonl "
"files only. JSONL compresses 5–15x; recommended setting "
'when shipping to a public store.'
),
)

if source == 'alpaca_eval':
source_parser.add_argument(
'--version',
Expand Down
1 change: 1 addition & 0 deletions every_eval_ever/converters/helm/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,7 @@ def _transform_single(
Format.jsonl.value,
HashAlgorithm.sha256.value,
evaluation_dir,
compression=metadata_args.get('samples_compression', 'none'),
).convert_instance_level_logs(
dataset_name,
model_info.id,
Expand Down
10 changes: 8 additions & 2 deletions every_eval_ever/converters/helm/instance_level_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from pathlib import Path
from typing import Any, List, Tuple

from every_eval_ever import io as eee_io

_HELM_IMPORT_ERROR: Exception | None = None
try:
from helm.benchmark.adaptation.scenario_state import RequestState
Expand Down Expand Up @@ -42,20 +44,24 @@ def __init__(
format: str,
hash_algorithm: str,
evaluation_dir: str,
compression: str = eee_io.COMPRESSION_NONE,
):
_require_helm_dependencies()
self.evaluation_id = evaulation_id
self.format = format
self.hash_algorithm = hash_algorithm
self.evaluation_dir = evaluation_dir
self.path = f'{evaluation_dir}/{evaulation_id}.{format}'
self.compression = compression
# On-disk path includes the codec suffix when compression is set.
base_path = Path(evaluation_dir) / f'{evaulation_id}.{format}'
self.path = str(eee_io.add_compression_suffix(base_path, compression))

def _save_json(self, items: List[InstanceLevelEvaluationLog]):
eval_dir_path = Path(self.evaluation_dir)
eval_dir_path.mkdir(parents=True, exist_ok=True)
path = Path(self.path)

with path.open('w', encoding='utf-8') as f:
with eee_io.open_eee_text(path, 'w') as f:
for item in items:
json_line = json.dumps(
item.model_dump(mode='json'), ensure_ascii=False
Expand Down
14 changes: 11 additions & 3 deletions every_eval_ever/converters/lm_eval/instance_level_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from every_eval_ever import io as eee_io
from every_eval_ever.converters import SCHEMA_VERSION
from every_eval_ever.eval_types import (
DetailedEvaluationResults,
Expand Down Expand Up @@ -55,12 +56,18 @@ def transform_and_save(
task_name: str,
output_dir: Optional[Union[str, Path]] = None,
file_uuid: Optional[str] = None,
compression: str = eee_io.COMPRESSION_NONE,
) -> Optional[DetailedEvaluationResults]:
"""Transform samples and save to JSONL, returning a DetailedEvaluationResults pointer.

If output_dir is None, returns None (skips instance-level output).
If file_uuid is provided, the output file is named {file_uuid}_samples.jsonl
so it shares the UUID of the corresponding evaluation result file.

``compression`` controls the on-disk codec for the samples file
(see ``every_eval_ever.io.COMPRESSION_CHOICES``). The output
filename has the codec suffix appended; the recorded checksum
is computed over the (possibly compressed) on-disk bytes.
"""
if output_dir is None:
return None
Expand All @@ -74,11 +81,12 @@ def transform_and_save(
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
if file_uuid:
out_file = output_dir / f'{file_uuid}_samples.jsonl'
base = output_dir / f'{file_uuid}_samples.jsonl'
else:
out_file = output_dir / f'samples_{task_name}.jsonl'
base = output_dir / f'samples_{task_name}.jsonl'
out_file = eee_io.add_compression_suffix(base, compression)

with open(out_file, 'w') as f:
with eee_io.open_eee_text(out_file, 'w') as f:
for log in logs:
f.write(
json.dumps(log.model_dump(mode='json'), ensure_ascii=False)
Expand Down
Loading