def _cmd_convert_lighteval(args: argparse.Namespace) -> int:
    """Handle ``convert lighteval``: convert results and write one file per log.

    ``args.log_path`` may be a single results file or a directory; anything
    else raises ``FileNotFoundError``. The path of every written log is
    printed, followed by a summary line. Returns ``0`` on success.
    """
    # Imported lazily so the converter is only loaded when this subcommand runs.
    from every_eval_ever.converters.lighteval.adapter import LightEvalAdapter

    adapter = LightEvalAdapter()
    metadata = _common_metadata(args)
    # Only forward the optional engine overrides when they were actually given.
    for option in ('inference_engine', 'inference_engine_version'):
        override = getattr(args, option)
        if override:
            metadata[option] = override

    log_path = Path(args.log_path)
    if log_path.is_file():
        convert = adapter.transform_from_file
    elif log_path.is_dir():
        convert = adapter.transform_from_directory
    else:
        raise FileNotFoundError(f'Path is not a file or directory: {log_path}')
    logs = convert(log_path, metadata)

    destination = Path(args.output_dir)
    for converted in logs:
        # Each log gets its own freshly generated UUID.
        print(_write_log(converted, destination, eval_uuid=str(uuid.uuid4())))

    print(f'Converted {len(logs)} evaluation log(s).')
    return 0
source, help=f'Convert {source} logs', @@ -385,6 +419,20 @@ def build_parser() -> argparse.ArgumentParser: help='Inference engine version to record in model_info.inference_engine.version.', ) + if source == 'lighteval': + source_parser.add_argument( + '--inference_engine', + '--inference-engine', + default=None, + help='Override inferred inference engine (e.g. vllm, transformers).', + ) + source_parser.add_argument( + '--inference_engine_version', + '--inference-engine-version', + default=None, + help='Inference engine version to record in model_info.inference_engine.version.', + ) + return parser @@ -415,6 +463,8 @@ def main(argv: list[str] | None = None) -> int: if args.command == 'convert': if args.source == 'lm_eval': return _cmd_convert_lm_eval(args) + if args.source == 'lighteval': + return _cmd_convert_lighteval(args) if args.source == 'inspect': return _cmd_convert_inspect(args) if args.source == 'helm': diff --git a/every_eval_ever/converters/common/adapter.py b/every_eval_ever/converters/common/adapter.py index 4598ee6de..e8a6c4a5c 100644 --- a/every_eval_ever/converters/common/adapter.py +++ b/every_eval_ever/converters/common/adapter.py @@ -31,6 +31,7 @@ class SupportedLibrary(Enum): LM_EVAL = 'lm-evaluation-harness' INSPECT_AI = 'inspect-ai' HELM = 'helm' + LIGHTEVAL = 'lighteval' CUSTOM = 'custom' diff --git a/every_eval_ever/converters/lighteval/__init__.py b/every_eval_ever/converters/lighteval/__init__.py new file mode 100644 index 000000000..57fc16811 --- /dev/null +++ b/every_eval_ever/converters/lighteval/__init__.py @@ -0,0 +1 @@ +"""lighteval adapter for every_eval_ever.""" diff --git a/every_eval_ever/converters/lighteval/__main__.py b/every_eval_ever/converters/lighteval/__main__.py new file mode 100644 index 000000000..ebc9464df --- /dev/null +++ b/every_eval_ever/converters/lighteval/__main__.py @@ -0,0 +1,142 @@ +"""CLI for converting lighteval output to every_eval_ever format.""" + +import argparse +import json +import sys +import uuid 
def main():
    """Convert lighteval output to every_eval_ever JSON files.

    Parses the command line, converts the file or directory given by
    ``--log_path`` with ``LightEvalAdapter`` and writes one JSON file per
    resulting log to
    ``output_dir/{evaluation_name}/{developer}/{model_name}/{uuid}.json``.

    Exits with status 1 when ``--log_path`` is neither a file nor a directory.
    """
    parser = argparse.ArgumentParser(
        description='Convert lighteval output to every_eval_ever format'
    )
    parser.add_argument(
        '--log_path',
        type=str,
        required=True,
        help='Path to results JSON file or directory containing results files',
    )
    parser.add_argument(
        '--output_dir',
        type=str,
        default='data',
        help='Output directory for converted files',
    )
    parser.add_argument(
        '--source_organization_name',
        type=str,
        default='',
        help='Name of the organization that ran the evaluation',
    )
    parser.add_argument(
        '--evaluator_relationship',
        type=str,
        default='first_party',
        choices=['first_party', 'third_party', 'collaborative', 'other'],
        help='Relationship of the evaluator to the model',
    )
    parser.add_argument(
        '--source_organization_url',
        type=str,
        default=None,
        help='URL of the source organization',
    )
    parser.add_argument(
        '--source_organization_logo_url',
        type=str,
        default=None,
        help='Logo of the source organization',
    )
    parser.add_argument(
        '--inference_engine',
        type=str,
        default=None,
        help="Override inference engine name (e.g. 'vllm', 'transformers'). "
        'Auto-detected from provider when possible.',
    )
    parser.add_argument(
        '--inference_engine_version',
        type=str,
        default=None,
        help="Inference engine version (e.g. '0.6.0'). "
        'Not available from lighteval logs, so must be provided manually.',
    )
    parser.add_argument(
        '--eval_library_name',
        type=str,
        default='lighteval',
        help='Name of the evaluation library (e.g. inspect_ai, lm_eval, helm, lighteval)',
    )
    parser.add_argument(
        '--eval_library_version',
        type=str,
        default='unknown',
        help='Version of the evaluation library. It should be extracted in the adapter if available in the evaluation log.',
    )

    args = parser.parse_args()

    adapter = LightEvalAdapter()
    output_dir = Path(args.output_dir)

    metadata_args = {
        'source_organization_name': args.source_organization_name,
        'evaluator_relationship': args.evaluator_relationship,
        'source_organization_url': args.source_organization_url,
        # Previously parsed but never forwarded, so the adapter always saw None.
        'source_organization_logo_url': args.source_organization_logo_url,
        'eval_library_name': args.eval_library_name,
        'eval_library_version': args.eval_library_version,
    }
    # The engine keys are only set when given, so the adapter can fall back to
    # detecting the engine from the provider.
    if args.inference_engine:
        metadata_args['inference_engine'] = args.inference_engine
    if args.inference_engine_version:
        metadata_args['inference_engine_version'] = (
            args.inference_engine_version
        )

    log_path = Path(args.log_path)

    if log_path.is_file():
        logs = adapter.transform_from_file(log_path, metadata_args)
    elif log_path.is_dir():
        logs = adapter.transform_from_directory(log_path, metadata_args)
    else:
        print(f'Error: {log_path} is not a file or directory', file=sys.stderr)
        sys.exit(1)

    for log in logs:
        # Organize as: output_dir/{evaluation_name}/{developer}/{model_name}/{uuid}.json
        # Use the first evaluation result's name (before any /filter suffix) as the task name
        if log.evaluation_results:
            eval_name = log.evaluation_results[0].evaluation_name.split('/')[0]
        else:
            eval_name = 'unknown'

        # The first path component of the model id is treated as the developer.
        developer, separator, model_name = log.model_info.id.partition('/')
        if not separator:
            developer = 'unknown'
            model_name = log.model_info.id

        out_path = output_dir / eval_name / developer / model_name
        out_path.mkdir(parents=True, exist_ok=True)

        eval_uuid = str(uuid.uuid4())
        out_file = out_path / f'{eval_uuid}.json'

        # Explicit encoding keeps the output independent of the platform default.
        with open(out_file, 'w', encoding='utf-8') as f:
            json.dump(
                log.model_dump(mode='json', exclude_none=True), f, indent=2
            )

        print(f'  {out_file}')

    print(f'\nConverted {len(logs)} evaluation log(s).')


if __name__ == '__main__':
    main()
class LightEvalAdapter(BaseEvaluationAdapter):
    """Converts lighteval results to every_eval_ever format.

    A results file yields one ``EvaluationLog`` per entry under ``results``
    that carries at least one numeric metric; the aggregated ``all`` entry is
    skipped. Each numeric, non-stderr metric of a task becomes one
    ``EvaluationResult`` in that task's log.
    """

    # Suffix under which the standard error of a metric is stored next to it.
    _STDERR_SUFFIX = '_stderr'
    # Bounds assumed for metrics that are not listed in KNOWN_METRIC_BOUNDS.
    _DEFAULT_BOUNDS = (0.0, 1.0)

    def __init__(self, strict_validation: bool = True):
        super().__init__(strict_validation)

    @property
    def metadata(self) -> AdapterMetadata:
        """Static description of this adapter."""
        return AdapterMetadata(
            name='lighteval-adapter',
            version='0.1.0',
            supported_library_versions=['*'],
            description='Converts lighteval output to every_eval_ever format',
        )

    @property
    def supported_library(self) -> SupportedLibrary:
        """Library handled by this adapter."""
        # Enum members are looked up by their exact name; the member is
        # declared as LIGHTEVAL, so ``SupportedLibrary.lighteval`` raised
        # AttributeError.
        return SupportedLibrary.LIGHTEVAL

    @staticmethod
    def _is_number(value: Any) -> bool:
        """Return True for int/float values, rejecting bools.

        ``bool`` is a subclass of ``int``, so a plain isinstance check would
        accept flags as scores.
        """
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    @staticmethod
    def _get_eval_timestamp(raw_data: Dict[str, Any]) -> Optional[str]:
        """Return ``config_general.start_time`` as an integer string, or None.

        NOTE(review): the bundled fixture has ``start_time`` 7455098.68, which
        is not a plausible Unix epoch -- confirm which clock lighteval records
        here before treating this value as a wall-clock timestamp.
        """
        config_general = raw_data.get('config_general') or {}
        start_time = config_general.get('start_time')
        if start_time is None:
            return None
        return str(int(start_time))

    @classmethod
    def _metric_bounds(cls, metric_key: str) -> tuple:
        """Return ``(min_score, max_score)`` for a lighteval metric key.

        The key is reduced in two steps: parameters after ``:`` are dropped
        (``pass@k:k=1&n=1`` -> ``pass@k``), then the ``@...`` part
        (``pass@k`` -> ``pass``). Both forms are looked up so that table
        entries such as ``pass@k`` and parameterised variants of plain names
        such as ``acc@5`` resolve. Unknown metrics get ``_DEFAULT_BOUNDS``.
        """
        metric_root = metric_key.split(':')[0]
        metric_name = metric_root.split('@')[0]
        for candidate in (metric_root, metric_name):
            bounds = KNOWN_METRIC_BOUNDS.get(candidate)
            if bounds is not None:
                return bounds
        return cls._DEFAULT_BOUNDS

    def _extract_model_info(
        self,
        raw_data: Dict[str, Any],
        metadata_args: Optional[Dict[str, Any]] = None,
    ) -> ModelInfo:
        """Extract model information from lighteval results.

        Args:
            raw_data: Parsed lighteval results file.
            metadata_args: Optional overrides; ``inference_engine`` and
                ``inference_engine_version`` are read from it.

        Returns:
            ``ModelInfo`` whose ``name`` and ``id`` are the model path with
            the provider prefix removed. Scalar, dict and list entries of
            ``model_config`` are copied to ``additional_details`` as strings.
        """
        metadata_args = metadata_args or {}
        config = raw_data.get('config_general') or {}
        model_config = config.get('model_config') or {}

        provider, pretrained = parse_model_name(config.get('model_name', ''))
        developer = pretrained.split('/')[0] if '/' in pretrained else None
        inference_platform = PROVIDER_TO_INFERENCE_PLATFORM.get(provider)

        # Determine inference engine name: CLI override > auto-detection from provider
        engine_name = metadata_args.get(
            'inference_engine'
        ) or PROVIDER_TO_INFERENCE_ENGINE.get(provider)

        inference_engine = None
        if engine_name:
            inference_engine = InferenceEngine(
                name=engine_name,
                version=metadata_args.get('inference_engine_version'),
            )

        # model_name is already carried by name/id, generation_parameters is
        # read by _build_generation_config; api_key and cache_dir are
        # deliberately not copied into the converted log.
        exclude_fields = {
            'model_name',
            'generation_parameters',
            'api_key',
            'cache_dir',
        }

        additional: Dict[str, str] = {}
        for key, value in model_config.items():
            if key in exclude_fields or value is None:
                continue
            if isinstance(value, (str, int, float, bool)):
                additional[key] = str(value)
            elif isinstance(value, (dict, list)):
                additional[key] = json.dumps(value)

        # The provider parsed from model_name wins over model_config['provider'].
        if provider:
            additional['provider'] = provider

        return ModelInfo(
            name=pretrained,
            id=pretrained,
            developer=developer,
            inference_platform=inference_platform,
            inference_engine=inference_engine,
            additional_details=additional if additional else None,
        )

    def _get_tasks(self, raw_data: Dict[str, Any]) -> List[str]:
        """Get the ``results`` keys that have actual metric results.

        lighteval uses keys like "aime25|0" for tasks. The keys are returned
        unchanged (including the ``|...`` suffix) so they can be used to index
        ``results`` and ``config_tasks``. The aggregated ``all`` entry and
        entries without any numeric non-stderr value are skipped.
        """
        results = raw_data.get('results') or {}
        tasks = []
        for task_name, task_results in results.items():
            if task_name == 'all':
                continue
            has_metric = any(
                self._is_number(value)
                for key, value in task_results.items()
                if not key.endswith(self._STDERR_SUFFIX)
            )
            if has_metric:
                tasks.append(task_name)
        return tasks

    def _get_task_config(
        self, raw_data: Dict[str, Any], task_name: str
    ) -> Dict[str, Any]:
        """Resolve ``config_tasks[task_key]`` (same key as in ``results``).

        Falls back to ``config_tasks[task_basename]`` when the key uses a
        ``|...`` suffix but the config is stored under the bare task name.
        Returns an empty dict when neither key is present.
        """
        cfg_tasks = raw_data.get('config_tasks') or {}
        if task_name in cfg_tasks:
            return cfg_tasks[task_name]
        base = task_name.split('|')[0]
        if base != task_name and base in cfg_tasks:
            return cfg_tasks[base]
        return {}

    def _build_source_data(
        self, task_config: Dict[str, Any], task_name: str
    ) -> Union[SourceDataHf, SourceDataPrivate]:
        """Build ``EvaluationResult.source_data`` for a task.

        Returns ``SourceDataHf`` when the task config has a truthy
        ``hf_repo``: ``hf_split`` is the first entry of ``evaluation_splits``
        and ``hf_subset`` is recorded in ``additional_details`` unless it is
        ``'default'``. Otherwise returns ``SourceDataPrivate`` with
        ``source_type='other'``.
        """
        path = task_config.get('hf_repo')
        dataset_name = task_config.get('name', task_name)
        if not path:
            return SourceDataPrivate(
                dataset_name=dataset_name,
                source_type='other',
            )

        hf_subset = task_config.get('hf_subset')
        splits = task_config.get('evaluation_splits') or []
        hf_split = splits[0] if splits else None

        extra: Dict[str, str] = {}
        if hf_subset and hf_subset != 'default':
            extra['hf_subset'] = str(hf_subset)

        return SourceDataHf(
            dataset_name=dataset_name,
            source_type='hf_dataset',
            hf_repo=str(path),
            hf_split=hf_split,
            additional_details=extra if extra else None,
        )

    def _build_generation_config(
        self, raw_data: Dict[str, Any], task_config: Dict[str, Any]
    ) -> Optional[GenerationConfig]:
        """Build generation config from lighteval config.

        Returns None when ``model_config.generation_parameters`` is missing or
        empty. ``temperature``, ``top_p``, ``top_k`` and ``max_new_tokens``
        map to ``GenerationArgs``; every other non-null parameter, plus the
        task's ``num_fewshots``, goes to ``additional_details`` as a string.
        """
        config_general = raw_data.get('config_general') or {}
        model_config = config_general.get('model_config') or {}
        gen_params = model_config.get('generation_parameters') or {}

        if not gen_params:
            return None

        args = GenerationArgs(
            temperature=gen_params.get('temperature'),
            top_p=gen_params.get('top_p'),
            top_k=gen_params.get('top_k'),
            max_tokens=gen_params.get('max_new_tokens'),
        )

        mapped_keys = ('temperature', 'top_p', 'top_k', 'max_new_tokens')
        additional = {
            key: value if isinstance(value, str) else json.dumps(value)
            for key, value in gen_params.items()
            if key not in mapped_keys and value is not None
        }

        if task_config.get('num_fewshots') is not None:
            additional['num_fewshot'] = str(task_config['num_fewshots'])

        return GenerationConfig(
            generation_args=args,
            additional_details=additional if additional else None,
        )

    def _build_evaluation_results(
        self, raw_data: Dict[str, Any], task_name: str
    ) -> List[EvaluationResult]:
        """Build EvaluationResult list for a single task.

        Args:
            raw_data: Parsed lighteval results file.
            task_name: Key of the task under ``raw_data['results']``.

        Returns:
            One ``EvaluationResult`` per numeric metric of the task. Keys
            ending in ``_stderr`` are not results themselves; they are
            attached as the standard error of the metric they belong to.

        Raises:
            KeyError: If ``task_name`` is not present in ``results``.
        """
        task_results = raw_data['results'][task_name]
        task_config = self._get_task_config(raw_data, task_name)

        source_data = self._build_source_data(task_config, task_name)
        gen_config = self._build_generation_config(raw_data, task_config)
        eval_timestamp = self._get_eval_timestamp(raw_data)
        display_task_name = task_name.split('|')[0]

        # Direction per metric key; the first config entry for a name wins.
        higher_is_better: Dict[Any, Any] = {}
        for metric_cfg in task_config.get('metrics') or []:
            higher_is_better.setdefault(
                metric_cfg.get('metric_name'),
                metric_cfg.get('higher_is_better', True),
            )

        # -1 is treated as "unknown" and omitted.
        num_samples = task_config.get('effective_num_docs')
        if num_samples == -1:
            num_samples = None

        results = []
        for metric_key, value in task_results.items():
            if metric_key.endswith(self._STDERR_SUFFIX):
                continue
            if not self._is_number(value):
                continue

            min_score, max_score = self._metric_bounds(metric_key)

            metric_config = MetricConfig(
                evaluation_description=metric_key,
                lower_is_better=not higher_is_better.get(metric_key, True),
                score_type=ScoreType.continuous,
                min_score=min_score,
                max_score=max_score,
            )

            stderr_val = task_results.get(
                f'{metric_key}{self._STDERR_SUFFIX}'
            )
            valid_stderr = self._is_number(stderr_val)

            uncertainty = None
            if valid_stderr or num_samples:
                uncertainty = Uncertainty(
                    standard_error=(
                        StandardError(value=stderr_val, method='bootstrap')
                        if valid_stderr
                        else None
                    ),
                    num_samples=num_samples,
                )

            results.append(
                EvaluationResult(
                    evaluation_name=display_task_name,
                    source_data=source_data,
                    evaluation_timestamp=eval_timestamp,
                    metric_config=metric_config,
                    score_details=ScoreDetails(
                        score=value,
                        uncertainty=uncertainty,
                    ),
                    generation_config=gen_config,
                )
            )

        return results

    def _transform_single(
        self, raw_data: Dict[str, Any], metadata_args: Dict[str, Any]
    ) -> EvaluationLog:
        """Transform a single task's results into an EvaluationLog.

        Expects metadata_args to contain 'task_name' specifying which task.

        Raises:
            KeyError: If ``metadata_args`` has no ``task_name`` or the task is
                missing from ``results``.
        """
        task_name = metadata_args['task_name']
        model_info = self._extract_model_info(raw_data, metadata_args)

        retrieved_timestamp = get_current_unix_timestamp()
        config_general = raw_data.get('config_general') or {}
        eval_timestamp = self._get_eval_timestamp(raw_data)

        evaluation_id = f'{task_name}/{model_info.id}/{retrieved_timestamp}'
        evaluation_results = self._build_evaluation_results(raw_data, task_name)

        evaluator_relationship = EvaluatorRelationship(
            metadata_args.get('evaluator_relationship', 'first_party')
        )

        # A usable sha from the log wins over the caller-supplied version.
        # '?' (as in the bundled fixture), '' and null all mean "not recorded";
        # null must be checked before str() or it would become 'None'.
        sha = config_general.get('lighteval_sha')
        library_version = '' if sha in (None, '', '?') else str(sha)
        eval_library = EvalLibrary(
            name=metadata_args.get('eval_library_name') or 'lighteval',
            version=library_version
            or metadata_args.get('eval_library_version')
            or 'unknown',
        )

        source_metadata = SourceMetadata(
            source_name='lighteval',
            source_type=SourceType.evaluation_run,
            source_organization_name=metadata_args.get(
                'source_organization_name', ''
            ),
            source_organization_url=metadata_args.get(
                'source_organization_url'
            ),
            source_organization_logo_url=metadata_args.get(
                'source_organization_logo_url'
            ),
            evaluator_relationship=evaluator_relationship,
        )

        return EvaluationLog(
            schema_version=SCHEMA_VERSION,
            evaluation_id=evaluation_id,
            retrieved_timestamp=retrieved_timestamp,
            evaluation_timestamp=eval_timestamp,
            source_metadata=source_metadata,
            eval_library=eval_library,
            model_info=model_info,
            evaluation_results=evaluation_results,
        )

    def transform_from_file(
        self, file_path: Union[str, Path], metadata_args: Dict[str, Any]
    ) -> List[EvaluationLog]:
        """Transform a lighteval results JSON file into EvaluationLogs.

        Returns one EvaluationLog per task reported by ``_get_tasks``.
        ``metadata_args`` is not mutated; a copy with ``task_name`` added is
        passed on for each task.
        """
        file_path = Path(file_path)
        raw_data = self._load_file(file_path)

        return [
            self._transform_single(
                raw_data, {**metadata_args, 'task_name': task_name}
            )
            for task_name in self._get_tasks(raw_data)
        ]

    def transform_from_directory(
        self, dir_path: Union[str, Path], metadata_args: Dict[str, Any]
    ) -> List[EvaluationLog]:
        """Transform all lighteval results files in a directory.

        Searches for results_*.json files recursively and processes them in
        sorted path order. Returns an empty list when none are found.
        """
        dir_path = Path(dir_path)

        all_logs: List[EvaluationLog] = []
        for results_file in sorted(dir_path.glob('**/results_*.json')):
            all_logs.extend(
                self.transform_from_file(results_file, metadata_args)
            )

        return all_logs
+ + lighteval uses format like: "hosted_vllm/mistralai/Ministral-3-14B-Instruct-2512" + Returns: (provider, model_path) + """ + if not model_name: + return ('unknown', 'unknown') + + parts = model_name.split('/', 1) + if len(parts) == 2: + provider = parts[0] + model_path = parts[1] + return (provider, model_path) + + return ('unknown', model_name) + + +# Maps lighteval provider values to inference platform +PROVIDER_TO_INFERENCE_PLATFORM = { + 'openai': 'openai', + 'anthropic': 'anthropic', + 'together': 'together', +} + +# Maps lighteval provider values to inference engine names +PROVIDER_TO_INFERENCE_ENGINE = { + 'vllm': 'vllm', + 'hosted_vllm': 'vllm', + 'tgi': 'text-generation-inference', + 'transformers': 'transformers', +} + +# Known metric bounds: metric_name -> (min_score, max_score) +# max_score of None means unbounded +KNOWN_METRIC_BOUNDS = { + 'acc': (0.0, 1.0), + 'acc_norm': (0.0, 1.0), + 'exact_match': (0.0, 1.0), + 'f1': (0.0, 1.0), + 'em': (0.0, 1.0), + 'mc1': (0.0, 1.0), + 'mc2': (0.0, 1.0), + 'pass@k': (0.0, 1.0), + 'avg@n': (0.0, 1.0), + 'mcc': (-1.0, 1.0), + 'bleu': (0.0, 100.0), + 'rouge1': (0.0, 1.0), + 'rouge2': (0.0, 1.0), + 'rougeL': (0.0, 1.0), + 'rougeLsum': (0.0, 1.0), + 'ter': (0.0, None), + 'brier_score': (0.0, 1.0), +} diff --git a/tests/data/lighteval/results_2026-05-11T07-33-49.106520.json b/tests/data/lighteval/results_2026-05-11T07-33-49.106520.json new file mode 100644 index 000000000..23ecbd988 --- /dev/null +++ b/tests/data/lighteval/results_2026-05-11T07-33-49.106520.json @@ -0,0 +1,260 @@ +{ + "config_general": { + "lighteval_sha": "?", + "num_fewshot_seeds": 1, + "max_samples": null, + "job_id": "0", + "start_time": 7455098.686343914, + "end_time": 7465207.973786135, + "total_evaluation_time_secondes": "10109.287442221306", + "model_config": { + "model_name": "hosted_vllm/inference-optimization/Qwen3-235B-A22B-Thinking-2507.w4a16", + "generation_parameters": { + "num_blocks": null, + "block_size": null, + "early_stopping": 
null, + "repetition_penalty": null, + "frequency_penalty": null, + "length_penalty": null, + "presence_penalty": null, + "max_new_tokens": 32000, + "min_new_tokens": null, + "seed": 3344, + "stop_tokens": null, + "temperature": 0.6, + "top_k": 20, + "min_p": 0.0, + "top_p": 0.95, + "truncate_prompt": null, + "cache_implementation": null, + "response_format": null + }, + "system_prompt": null, + "cache_dir": "~/.cache/huggingface/lighteval", + "provider": "hosted_vllm", + "base_url": "http://0.0.0.0:8000/v1", + "api_key": "", + "concurrent_requests": 16, + "verbose": false, + "max_model_length": null, + "api_max_retry": 8, + "api_retry_sleep": 1.0, + "api_retry_multiplier": 2.0, + "timeout": 1200.0 + }, + "model_name": "hosted_vllm/inference-optimization/Qwen3-235B-A22B-Thinking-2507.w4a16" + }, + "results": { + "gpqa:diamond|0": { + "gpqa_pass@k:k=1": 0.7525252525252525, + "gpqa_pass@k:k=1_stderr": 0.03074630074212453 + }, + "aime25|0": { + "pass@k:k=1&n=1": 0.8333333333333334, + "pass@k:k=1&n=1_stderr": 0.06920456654478331, + "avg@n:n=1": 0.8333333333333334, + "avg@n:n=1_stderr": 0.06920456654478331 + }, + "math_500|0": { + "pass@k:k=1&n=1": 0.904, + "pass@k:k=1&n=1_stderr": 0.013187715179792425 + }, + "all": { + "gpqa_pass@k:k=1": 0.7525252525252525, + "gpqa_pass@k:k=1_stderr": 0.03074630074212453, + "pass@k:k=1&n=1": 0.8686666666666667, + "pass@k:k=1&n=1_stderr": 0.041196140862287865, + "avg@n:n=1": 0.8333333333333334, + "avg@n:n=1_stderr": 0.06920456654478331 + } + }, + "versions": {}, + "config_tasks": { + "gpqa:diamond|0": { + "name": "gpqa:diamond", + "prompt_function": "gpqa_instruct_prompt", + "hf_repo": "Idavidrein/gpqa", + "hf_subset": "gpqa_diamond", + "metrics": [ + { + "metric_name": "gpqa_pass@k:k=1", + "higher_is_better": true, + "category": "GENERATIVE", + "sample_level_fn": "PassAtK(normalize=None, strip_strings=False, compute_score=compute, type_exact_match=None, k=1, n=1, attribute_must_be_set=['k'])", + "corpus_level_fn": "mean", + 
"batched_compute": false + } + ], + "solver": [ + "solve" + ], + "scorer": "score", + "sample_fields": "record_to_sample", + "sample_to_fewshot": "sample_to_fewshot", + "filter": null, + "hf_revision": null, + "hf_filter": null, + "hf_avail_splits": [ + "train" + ], + "evaluation_splits": [ + "train" + ], + "few_shots_split": null, + "few_shots_select": null, + "generation_size": 32768, + "generation_grammar": null, + "stop_sequence": [], + "num_samples": null, + "original_num_docs": -1, + "effective_num_docs": -1, + "must_remove_duplicate_docs": false, + "num_fewshots": 0, + "version": 1 + }, + "aime25|0": { + "name": "aime25", + "prompt_function": "aime_prompt", + "hf_repo": "yentinglin/aime_2025", + "hf_subset": "default", + "metrics": [ + { + "metric_name": "pass@k:k=1&n=1", + "higher_is_better": true, + "category": "GENERATIVE", + "sample_level_fn": "PassAtK(normalize=None, strip_strings=True, compute_score=compute, type_exact_match=None, k=1, n=1, attribute_must_be_set=['k'])", + "corpus_level_fn": "mean", + "batched_compute": false + }, + { + "metric_name": "avg@n:n=1", + "higher_is_better": true, + "category": "GENERATIVE", + "sample_level_fn": "AvgAtN(normalize=None, strip_strings=False, compute_score=compute, type_exact_match=None, n=1, attribute_must_be_set=['n'])", + "corpus_level_fn": "mean", + "batched_compute": false + } + ], + "solver": [ + "solve", + "solve" + ], + "scorer": "score", + "sample_fields": "record_to_sample", + "sample_to_fewshot": null, + "filter": null, + "hf_revision": null, + "hf_filter": null, + "hf_avail_splits": [ + "train" + ], + "evaluation_splits": [ + "train" + ], + "few_shots_split": null, + "few_shots_select": null, + "generation_size": null, + "generation_grammar": null, + "stop_sequence": [], + "num_samples": null, + "original_num_docs": -1, + "effective_num_docs": -1, + "must_remove_duplicate_docs": false, + "num_fewshots": 0, + "version": 2 + }, + "math_500|0": { + "name": "math_500", + "prompt_function": 
"math_500_prompt", + "hf_repo": "HuggingFaceH4/MATH-500", + "hf_subset": "default", + "metrics": [ + { + "metric_name": "pass@k:k=1&n=1", + "higher_is_better": true, + "category": "GENERATIVE", + "sample_level_fn": "PassAtK(normalize=None, strip_strings=True, compute_score=compute, type_exact_match=None, k=1, n=1, attribute_must_be_set=['k'])", + "corpus_level_fn": "mean", + "batched_compute": false + } + ], + "solver": [ + "solve", + "solve" + ], + "scorer": "score", + "sample_fields": "record_to_sample", + "sample_to_fewshot": null, + "filter": null, + "hf_revision": null, + "hf_filter": null, + "hf_avail_splits": [ + "test" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": null, + "few_shots_select": null, + "generation_size": 32768, + "generation_grammar": null, + "stop_sequence": [], + "num_samples": null, + "original_num_docs": -1, + "effective_num_docs": -1, + "must_remove_duplicate_docs": false, + "num_fewshots": 0, + "version": 2 + } + }, + "summary_tasks": { + "gpqa:diamond|0": { + "hashes": { + "hash_examples": "7e6ab78504360639", + "hash_full_prompts": "ef46db3751d8e999", + "hash_input_tokens": "42cba4b3cd185311", + "hash_cont_tokens": "42cba4b3cd185311" + }, + "truncated": 0, + "non_truncated": 0, + "padded": 0, + "non_padded": 0 + }, + "aime25|0": { + "hashes": { + "hash_examples": "8df1bb48b826e206", + "hash_full_prompts": "ef46db3751d8e999", + "hash_input_tokens": "f04b89be03da9f73", + "hash_cont_tokens": "f04b89be03da9f73" + }, + "truncated": 0, + "non_truncated": 0, + "padded": 0, + "non_padded": 0 + }, + "math_500|0": { + "hashes": { + "hash_examples": "9f076bfe85016b9f", + "hash_full_prompts": "ef46db3751d8e999", + "hash_input_tokens": "6e2146e74832427e", + "hash_cont_tokens": "6e2146e74832427e" + }, + "truncated": 0, + "non_truncated": 0, + "padded": 0, + "non_padded": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9f27d13c4db0f7b7", + "hash_full_prompts": "15a9db8cd5f71ff9", + "hash_input_tokens": 
"a81faa1610a8e7f6", + "hash_cont_tokens": "a81faa1610a8e7f6" + }, + "truncated": 0, + "non_truncated": 0, + "padded": 0, + "non_padded": 0 + } +} \ No newline at end of file diff --git a/tests/test_lighteval_adapter.py b/tests/test_lighteval_adapter.py new file mode 100644 index 000000000..e4cfb6003 --- /dev/null +++ b/tests/test_lighteval_adapter.py @@ -0,0 +1,290 @@ +import shutil +import tempfile +from pathlib import Path + +from every_eval_ever.converters.common.adapter import AdapterMetadata +from every_eval_ever.converters.lighteval.adapter import LightEvalAdapter +from every_eval_ever.converters.lighteval.utils import parse_model_name +from every_eval_ever.eval_types import ( + EvaluationLog, + EvaluatorRelationship, + SourceDataHf, +) + +DATA_DIR = Path('tests/data/lighteval') +RESULTS_FILE = DATA_DIR / 'results_2026-05-11T07-33-49.106520.json' + + +def _make_metadata_args(**overrides): + args = { + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': EvaluatorRelationship.first_party, + } + args.update(overrides) + return args + + +def _log_for_task(logs: list[EvaluationLog], evaluation_name: str) -> EvaluationLog: + for log in logs: + if not log.evaluation_results: + continue + if log.evaluation_results[0].evaluation_name == evaluation_name: + return log + raise AssertionError(f'no log for task {evaluation_name!r}') + + +# ── Utility tests ────────────────────────────────────────────────────── + + +def test_parse_model_name_hosted_vllm(): + provider, path = parse_model_name( + 'hosted_vllm/inference-optimization/Qwen3-235B-A22B-Thinking-2507.w4a16' + ) + assert provider == 'hosted_vllm' + assert path == 'inference-optimization/Qwen3-235B-A22B-Thinking-2507.w4a16' + + +def test_parse_model_name_empty(): + assert parse_model_name('') == ('unknown', 'unknown') + assert parse_model_name(None) == ('unknown', 'unknown') + + +def test_parse_model_name_no_slash(): + provider, path = parse_model_name('local-checkpoint') + assert provider == 'unknown' 
+ assert path == 'local-checkpoint' + + +def test_parse_model_name_transformers(): + provider, path = parse_model_name( + 'transformers/EleutherAI/pythia-70m' + ) + assert provider == 'transformers' + assert path == 'EleutherAI/pythia-70m' + + +# ── Adapter: _get_tasks ───────────────────────────────────────────────── + + +def test_get_tasks_skips_all_aggregation(): + adapter = LightEvalAdapter() + raw = { + 'results': { + 'all': {'acc': 0.5}, + 'aime25|0': {'pass@k:k=1&n=1': 0.1}, + 'gsm8k|0': {'exact_match': 0.9}, + } + } + tasks = adapter._get_tasks(raw) + assert tasks == ['aime25|0', 'gsm8k|0'] + + +# ── Adapter: transform_from_file ─────────────────────────────────────── + + +def test_transform_from_file_returns_one_log_per_task(): + adapter = LightEvalAdapter() + logs = adapter.transform_from_file(RESULTS_FILE, _make_metadata_args()) + assert len(logs) == 3 + for log in logs: + assert isinstance(log, EvaluationLog) + + +def test_transform_from_file_model_info(): + adapter = LightEvalAdapter() + logs = adapter.transform_from_file(RESULTS_FILE, _make_metadata_args()) + model = logs[0].model_info + + assert ( + model.name + == 'inference-optimization/Qwen3-235B-A22B-Thinking-2507.w4a16' + ) + assert model.id == model.name + assert model.developer == 'inference-optimization' + assert model.inference_platform is None + assert model.inference_engine.name == 'vllm' + assert model.additional_details['provider'] == 'hosted_vllm' + assert model.additional_details['concurrent_requests'] == '16' + + +def test_transform_from_file_source_metadata(): + adapter = LightEvalAdapter() + logs = adapter.transform_from_file(RESULTS_FILE, _make_metadata_args()) + src = logs[0].source_metadata + + assert src.source_name == 'lighteval' + assert src.source_type.value == 'evaluation_run' + assert src.source_organization_name == 'TestOrg' + + +def test_transform_from_file_eval_library_version_unknown_sha(): + adapter = LightEvalAdapter() + logs = 
adapter.transform_from_file(RESULTS_FILE, _make_metadata_args()) + assert logs[0].eval_library.name == 'lighteval' + assert logs[0].eval_library.version == 'unknown' + + +def test_transform_from_file_source_data_hf(): + adapter = LightEvalAdapter() + logs = adapter.transform_from_file(RESULTS_FILE, _make_metadata_args()) + + gpqa = _log_for_task(logs, 'gpqa:diamond') + sd = gpqa.evaluation_results[0].source_data + assert isinstance(sd, SourceDataHf) + assert sd.hf_repo == 'Idavidrein/gpqa' + assert sd.hf_split == 'train' + + aime = _log_for_task(logs, 'aime25') + sd_a = aime.evaluation_results[0].source_data + assert isinstance(sd_a, SourceDataHf) + assert sd_a.hf_repo == 'yentinglin/aime_2025' + assert sd_a.hf_split == 'train' + + math = _log_for_task(logs, 'math_500') + sd_m = math.evaluation_results[0].source_data + assert isinstance(sd_m, SourceDataHf) + assert sd_m.hf_repo == 'HuggingFaceH4/MATH-500' + assert sd_m.hf_split == 'test' + + +def test_transform_from_file_evaluation_results_scores_and_bounds(): + adapter = LightEvalAdapter() + logs = adapter.transform_from_file(RESULTS_FILE, _make_metadata_args()) + + gpqa = _log_for_task(logs, 'gpqa:diamond') + assert len(gpqa.evaluation_results) == 1 + g0 = gpqa.evaluation_results[0] + assert g0.metric_config.evaluation_description == 'gpqa_pass@k:k=1' + assert g0.score_details.score == 0.7525252525252525 + assert g0.metric_config.lower_is_better is False + assert g0.metric_config.min_score == 0.0 + assert g0.metric_config.max_score == 1.0 + + aime = _log_for_task(logs, 'aime25') + assert len(aime.evaluation_results) == 2 + pass_at_k = next( + r + for r in aime.evaluation_results + if r.metric_config.evaluation_description.startswith('pass@k') + ) + assert pass_at_k.score_details.score == 0.8333333333333334 + assert pass_at_k.metric_config.lower_is_better is False + + avg_n = next( + r + for r in aime.evaluation_results + if r.metric_config.evaluation_description.startswith('avg@n') + ) + assert 
avg_n.score_details.score == 0.8333333333333334 + + math = _log_for_task(logs, 'math_500') + assert len(math.evaluation_results) == 1 + assert math.evaluation_results[0].score_details.score == 0.904 + + +def test_transform_from_file_uncertainty_stderr(): + adapter = LightEvalAdapter() + logs = adapter.transform_from_file(RESULTS_FILE, _make_metadata_args()) + + aime = _log_for_task(logs, 'aime25') + pass_at_k = next( + r + for r in aime.evaluation_results + if r.metric_config.evaluation_description.startswith('pass@k') + ) + u = pass_at_k.score_details.uncertainty + assert u is not None + assert u.standard_error.value == 0.06920456654478331 + assert u.standard_error.method == 'bootstrap' + assert u.num_samples is None + + +def test_transform_from_file_uncertainty_stderr_without_num_samples(): + """effective_num_docs -1 yields no num_samples on Uncertainty.""" + adapter = LightEvalAdapter() + logs = adapter.transform_from_file(RESULTS_FILE, _make_metadata_args()) + gpqa = _log_for_task(logs, 'gpqa:diamond') + u = gpqa.evaluation_results[0].score_details.uncertainty + assert u is not None + assert u.standard_error is not None + assert u.standard_error.value == 0.03074630074212453 + assert u.num_samples is None + + +def test_transform_from_file_generation_config(): + adapter = LightEvalAdapter() + logs = adapter.transform_from_file(RESULTS_FILE, _make_metadata_args()) + + gen = logs[0].evaluation_results[0].generation_config + assert gen is not None + assert gen.generation_args.temperature == 0.6 + assert gen.generation_args.max_tokens == 32000 + assert gen.generation_args.top_p == 0.95 + assert gen.generation_args.top_k == 20 + + +def test_transform_from_file_eval_timestamp(): + adapter = LightEvalAdapter() + logs = adapter.transform_from_file(RESULTS_FILE, _make_metadata_args()) + assert logs[0].evaluation_timestamp == '7455098' + + +# ── Adapter: transform_from_directory ──────────────────────────────────── + + +def test_transform_from_directory(): + adapter = 
LightEvalAdapter() + logs = adapter.transform_from_directory(DATA_DIR, _make_metadata_args()) + assert len(logs) == 3 + task_names = { + r.evaluation_name for log in logs for r in log.evaluation_results + } + assert task_names == {'gpqa:diamond', 'aime25', 'math_500'} + + +# ── Adapter: inference engine override ───────────────────────────────── + + +def test_inference_engine_override(): + adapter = LightEvalAdapter() + metadata = _make_metadata_args( + inference_engine='sglang', inference_engine_version='0.4.1' + ) + logs = adapter.transform_from_file(RESULTS_FILE, metadata) + assert logs[0].model_info.inference_engine.name == 'sglang' + assert logs[0].model_info.inference_engine.version == '0.4.1' + + +def test_eval_library_version_cli_override(): + adapter = LightEvalAdapter() + metadata = _make_metadata_args(eval_library_version='9.9.9') + logs = adapter.transform_from_file(RESULTS_FILE, metadata) + assert logs[0].eval_library.version == '9.9.9' + + +# ── Adapter: public adapter metadata ─────────────────────────────────── + + +def test_adapter_metadata_property(): + adapter = LightEvalAdapter() + meta = adapter.metadata + assert isinstance(meta, AdapterMetadata) + assert meta.name == 'lighteval-adapter' + assert meta.version == '0.1.0' + assert 'lighteval' in meta.description.lower() + + +def test_transform_from_directory_finds_nested_results_file(): + """Recursive glob should pick up results under a nested run directory.""" + adapter = LightEvalAdapter() + with tempfile.TemporaryDirectory() as tmpdir: + nested = Path(tmpdir) / 'run_a' + nested.mkdir() + dest = nested / RESULTS_FILE.name + shutil.copy(RESULTS_FILE, dest) + logs = adapter.transform_from_directory(tmpdir, _make_metadata_args()) + assert len(logs) == 3 + task_names = { + r.evaluation_name for log in logs for r in log.evaluation_results + } + assert task_names == {'gpqa:diamond', 'aime25', 'math_500'}