diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml new file mode 100644 index 000000000..1e1de4f8a --- /dev/null +++ b/.github/workflows/pages.yml @@ -0,0 +1,53 @@ +name: Deploy Docs to GitHub Pages + +on: + push: + branches: [main] + pull_request: + branches: [main] + + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + +jobs: + build: + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: '3.2' + bundler-cache: true + + - name: Build site + run: bundle exec jekyll build + + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + path: ./_site + + deploy: + permissions: + contents: read + pages: write + id-token: write + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + needs: build + # Deploy from main on push or manual workflow_dispatch; pull_request runs only build the site. + if: ${{ github.ref == 'refs/heads/main' && github.event_name != 'pull_request' }} + steps: + - name: Deploy + id: deployment + uses: actions/deploy-pages@v5 \ No newline at end of file diff --git a/.gitignore b/.gitignore index 5a5d93546..b925edb9a 100644 --- a/.gitignore +++ b/.gitignore @@ -213,4 +213,14 @@ __marimo__/ *.DS_Store* None/ global-mmlu-lite/ -/data/ \ No newline at end of file +/data/ + +# Ignore folders generated by Bundler +.bundle/ +vendor/ + +# Ignore the default location of the built site, and caches and metadata generated by Jekyll +_site/ +.sass-cache/ +.jekyll-cache/ +.jekyll-metadata \ No newline at end of file diff --git a/Gemfile b/Gemfile new file mode 100644 index 000000000..f10963c73 --- /dev/null +++ b/Gemfile @@ -0,0 +1,2 @@ +source "https://rubygems.org" +gem "just-the-docs" \ No newline at end of file diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 000000000..3af5d9ff2 --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,91
@@ +GEM + remote: https://rubygems.org/ + specs: + addressable (2.9.0) + public_suffix (>= 2.0.2, < 8.0) + base64 (0.3.0) + bigdecimal (4.1.2) + colorator (1.1.0) + concurrent-ruby (1.3.6) + csv (3.3.5) + em-websocket (0.5.3) + eventmachine (>= 0.12.9) + http_parser.rb (~> 0) + eventmachine (1.2.7) + ffi (1.17.4-x86_64-linux-gnu) + forwardable-extended (2.6.0) + google-protobuf (4.35.0-x86_64-linux-gnu) + bigdecimal + rake (~> 13.3) + http_parser.rb (0.8.1) + i18n (1.14.8) + concurrent-ruby (~> 1.0) + jekyll (4.4.1) + addressable (~> 2.4) + base64 (~> 0.2) + colorator (~> 1.0) + csv (~> 3.0) + em-websocket (~> 0.5) + i18n (~> 1.0) + jekyll-sass-converter (>= 2.0, < 4.0) + jekyll-watch (~> 2.0) + json (~> 2.6) + kramdown (~> 2.3, >= 2.3.1) + kramdown-parser-gfm (~> 1.0) + liquid (~> 4.0) + mercenary (~> 0.3, >= 0.3.6) + pathutil (~> 0.9) + rouge (>= 3.0, < 5.0) + safe_yaml (~> 1.0) + terminal-table (>= 1.8, < 4.0) + webrick (~> 1.7) + jekyll-include-cache (0.2.1) + jekyll (>= 3.7, < 5.0) + jekyll-sass-converter (3.1.0) + sass-embedded (~> 1.75) + jekyll-seo-tag (2.9.0) + jekyll (>= 3.8, < 5.0) + jekyll-watch (2.2.1) + listen (~> 3.0) + json (2.19.5) + just-the-docs (0.12.0) + jekyll (>= 3.8.5) + jekyll-include-cache + jekyll-seo-tag (>= 2.0) + rake (>= 12.3.1) + kramdown (2.5.2) + rexml (>= 3.4.4) + kramdown-parser-gfm (1.1.0) + kramdown (~> 2.0) + liquid (4.0.4) + listen (3.10.0) + logger + rb-fsevent (~> 0.10, >= 0.10.3) + rb-inotify (~> 0.9, >= 0.9.10) + logger (1.7.0) + mercenary (0.4.0) + pathutil (0.16.2) + forwardable-extended (~> 2.6) + public_suffix (7.0.5) + rake (13.4.2) + rb-fsevent (0.11.2) + rb-inotify (0.11.1) + ffi (~> 1.0) + rexml (3.4.4) + rouge (4.7.0) + safe_yaml (1.0.5) + sass-embedded (1.99.0-x86_64-linux-gnu) + google-protobuf (~> 4.31) + terminal-table (3.0.2) + unicode-display_width (>= 1.1.1, < 3) + unicode-display_width (2.6.0) + webrick (1.9.2) + +PLATFORMS + x86_64-linux-gnu + +DEPENDENCIES + just-the-docs + +BUNDLED WITH + 2.4.20 diff 
--git a/README.md b/README.md index 262a2ca11..23b277b14 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # Every Eval Ever +📖 **[Documentation](https://docs.evalevalai.com)** + > [EvalEval Coalition](https://evalevalai.com) — "We are a researcher community developing scientifically grounded research outputs and robust deployment infrastructure for broader impact evaluations." **Every Eval Ever** is a shared schema and crowdsourced eval database. It defines a standardized metadata format for storing AI evaluation results — from leaderboard scrapes and research papers to local evaluation runs — so that results from different frameworks can be compared, reproduced, and reused. The three components that make it work: diff --git a/_config.yml b/_config.yml new file mode 100644 index 000000000..523389765 --- /dev/null +++ b/_config.yml @@ -0,0 +1,31 @@ +title: Every Eval Ever +description: Documentation for the Every Eval Ever schema, CLI, and converters +theme: just-the-docs +color_scheme: light + +source: docs + +baseurl: "" +url: "https://docs.evalevalai.com" +repository: evaleval/every_eval_ever + +permalink: pretty + +search_enabled: true +heading_anchors: true + +aux_links: + "Every Eval Ever on GitHub": + - https://github.com/evaleval/every_eval_ever + +defaults: + - scope: + path: "" + values: + layout: default + +nav_sort: case_sensitive + +# Back to top link +back_to_top: true +back_to_top_text: "Back to top" \ No newline at end of file diff --git a/docs/contributing/index.md b/docs/contributing/index.md new file mode 100644 index 000000000..46b5992e7 --- /dev/null +++ b/docs/contributing/index.md @@ -0,0 +1,57 @@ +--- +layout: default +title: Contributing +nav_order: 5 +--- + +# Contributing + +New data can be contributed to the [Hugging Face Dataset](https://huggingface.co/datasets/evaleval/EEE_datastore) using the following process: + +Leaderboard/evaluation data is split-up into files by individual model, and data for each model is stored using 
[eval.schema.json](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json). The repository is structured into folders as `data/{benchmark_name}/{developer_name}/{model_name}/`. + +## TL;DR How to successfully submit + +1. Data must conform to [eval.schema.json](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json) (current version is defined in the schema file) +2. The validation pipeline will automatically verify the data submitted in the pull request, but can also be manually triggered by typing `/eee validate changed` in a comment on the HF PR. +3. An EvalEval member will review and merge your submission + +## PR Naming Convention + +Use these prefixes in your pull request titles: + +- `[Submission]` - New evaluation data +- `[Issue #N]` - Fix for a specific GitHub issue +- `[Feature]` - New functionality not tied to an issue +- `[Docs]` - Documentation changes +- `[ACL Shared Task]` - Shared task submissions (priority review) + +## UUID Naming Convention + +Each JSON file is named with a **UUID (Universally Unique Identifier)** in the format `{uuid}.json`. The UUID is automatically generated (using standard UUID v4) when creating a new evaluation result file. This ensures that: +- **Multiple evaluations** of the same model can exist without conflicts (each gets a unique UUID) +- **Different timestamps** are stored as separate files with different UUIDs (not as separate folders) +- A model may have multiple result files, with each file representing different iterations or runs of the leaderboard/evaluation +- UUIDs can be generated using Python's `uuid.uuid4()` function. + +**Example**: The model `openai/gpt-4o-2024-11-20` might have multiple files like: +- `e70acf51-30ef-4c20-b7cc-51704d114d70.json` (evaluation run #1) +- `a1b2c3d4-5678-90ab-cdef-1234567890ab.json` (evaluation run #2) + +Note: Each file can contain multiple individual results related to one model. 
See [examples in the datastore](https://huggingface.co/datasets/evaleval/EEE_datastore/tree/main/data). + +## How to add a new eval + +1. Add a new folder under [data/](https://huggingface.co/datasets/evaleval/EEE_datastore/tree/main/data) on the Hugging Face datastore with a codename for your eval. +2. For each model, use the Hugging Face (`developer_name/model_name`) naming convention to create a two-tier folder structure. +3. Add a JSON file with results for each model and name it `{uuid}.json`. +4. [Optional] Include a [utils/](https://github.com/evaleval/every_eval_ever/tree/main/utils) folder in your benchmark's folder with any scripts used to generate the data (see e.g. [utils/global-mmlu-lite/adapter.py](https://github.com/evaleval/every_eval_ever/blob/main/utils/global-mmlu-lite/adapter.py)). +5. [Submit] There are two ways to submit your evaluation data: + - **Option A: Drag & drop via Hugging Face** - Go to [evaleval/EEE_datastore](https://huggingface.co/datasets/evaleval/EEE_datastore) -> click "Files and versions" -> "Contribute" -> "Upload files" -> drag and drop your data -> select "Open as a pull request to the main branch". See [step-by-step screenshots](https://docs.google.com/document/d/1dxTQF8ncGCzaAOIj0RX7E9Hg4THmUBzezDOYUp_XdCY/edit?usp=sharing). 
+ - **Option B: Clone & PR** - Clone the [Hugging Face repository](https://huggingface.co/datasets/evaleval/EEE_datastore), add your data under `data/`, and open a pull request + +Before submitting, run: + +```bash +uv run python -m every_eval_ever validate data/ +``` diff --git a/docs/data-structure/index.md b/docs/data-structure/index.md new file mode 100644 index 000000000..87ce1145e --- /dev/null +++ b/docs/data-structure/index.md @@ -0,0 +1,15 @@ +--- +layout: default +title: Data Structure +nav_order: 3 +has_children: true +--- + +# Data Structure + +Evaluation data is represented in two layers: + +- Aggregate JSON records (`{uuid}.json`) +- Instance-level JSONL records (`{uuid}_samples.jsonl`) + +Use the child pages in this section for schema and validation details. diff --git a/docs/data-structure/schema.md b/docs/data-structure/schema.md new file mode 100644 index 000000000..b73000b1b --- /dev/null +++ b/docs/data-structure/schema.md @@ -0,0 +1,50 @@ +--- +layout: default +title: Schema +parent: Data Structure +nav_order: 1 +--- + +# Schema + +The canonical schemas are: + +- [Aggregate schema](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json) +- [Instance-level schema](https://github.com/evaleval/every_eval_ever/blob/main/instance_level_eval.schema.json) + +Schema versions are defined in the canonical JSON Schema files linked above. + +The repository enforces schema compatibility by generating Pydantic models from JSON Schema and applying post-generation patches (`post_codegen.py`). This generation flow is automated in CI and can also be run manually. + +## Schema Instructions + +1. **`model_info`**: Use Hugging Face formatting (`developer_name/model_name`). If a model does not come from Hugging Face, use the exact API reference. Check [examples in data/livecodebenchpro](https://huggingface.co/datasets/evaleval/EEE_datastore/tree/main/data/livecodebenchpro). 
Notably, some model names **include a date**, while others **do not**. For example: +- OpenAI: `gpt-4o-2024-11-20`, `gpt-5-2025-08-07`, `o3-2025-04-16` +- Anthropic: `claude-3-7-sonnet-20250219`, `claude-3-sonnet-20240229` +- Google: `gemini-2.5-pro`, `gemini-2.5-flash` +- xAI (Grok): `grok-2-2024-08-13`, `grok-3-2025-01-15` + +2. **`evaluation_id`**: Use the `{benchmark_name}/{model_id}/{retrieved_timestamp}` format (e.g. `livecodebenchpro/qwen3-235b-a22b-thinking-2507/1760492095.8105888`). + +3. **`inference_platform`** vs **`inference_engine`**: Where possible, specify where the evaluation was run using one of these two fields. +- `inference_platform`: Use this field when the evaluation was run through a remote API (e.g., `openai`, `huggingface`, `openrouter`, `anthropic`, `xai`). +- `inference_engine`: Use this field when the evaluation was run locally. This field is an object with `name` and `version` (e.g. `{"name": "vllm", "version": "0.6.0"}`). + +4. The `source_type` on `source_metadata` has two options: `documentation` and `evaluation_run`. Use `documentation` when results are scraped from a leaderboard or paper. Use `evaluation_run` when the evaluation was run locally (e.g. via an eval converter). + +5. **`source_data`** is specified per evaluation result (inside `evaluation_results`), with three variants: +- `source_type: "url"` - link to a web source (e.g. leaderboard API) +- `source_type: "hf_dataset"` - reference to a Hugging Face dataset (e.g. `{"hf_repo": "google/IFEval"}`) +- `source_type: "other"` - for private or proprietary datasets + +6. The schema is designed to accommodate both numeric and level-based (e.g. Low, Medium, High) metrics. For level-based metrics, the actual `value` should be converted to an integer (e.g. Low = 1, Medium = 2, High = 3), and the `level_names` property should be used to specify the mapping of levels to integers. + +7. 
**Timestamps**: The schema has three timestamp fields - use them as follows: +- `retrieved_timestamp` (required) - when this record was created, in Unix epoch format (e.g. `1760492095.8105888`) +- `evaluation_timestamp` (top-level, optional) - when the evaluation was run +- `evaluation_results[].evaluation_timestamp` (per-result, optional) - when a specific evaluation result was produced, if different results were run at different times + +8. Additional details can be provided in several places in the schema. They are not required, but can be useful for detailed analysis. +- `model_info.additional_details`: Use this field to provide any additional information about the model itself (e.g. number of parameters) +- `evaluation_results[].generation_config.generation_args`: Specify additional arguments used to generate outputs from the model +- `evaluation_results[].generation_config.additional_details`: Use this field to provide any additional information about the evaluation process that is not captured elsewhere \ No newline at end of file diff --git a/docs/data-structure/validation.md b/docs/data-structure/validation.md new file mode 100644 index 000000000..6234f1258 --- /dev/null +++ b/docs/data-structure/validation.md @@ -0,0 +1,50 @@ +--- +layout: default +title: Validation +parent: Data Structure +nav_order: 2 +--- + +# Validation + +Validation uses Pydantic models generated from the JSON schemas. Aggregate `.json` files are validated against `EvaluationLog`, and instance-level `_samples.jsonl` files are validated line-by-line against `InstanceLevelEvaluationLog`. The commands below require [uv](https://docs.astral.sh/uv/). 
+ +## Validate files with the package CLI + +```sh +# Single aggregate file +uv run python -m every_eval_ever validate data/benchmark/dev/model/uuid.json + +# Instance-level JSONL +uv run python -m every_eval_ever validate data/benchmark/dev/model/uuid_samples.jsonl + +# Entire directory (recurses into subdirectories) +uv run python -m every_eval_ever validate data/benchmark/dev/model/ + +# Multiple paths +uv run python -m every_eval_ever validate file1.json file2_samples.jsonl data/ +``` + +File type is determined by extension: `.json` files are validated against `EvaluationLog`, and each line of a `.jsonl` file is validated against `InstanceLevelEvaluationLog`. + +### Output formats + +```sh +# Rich terminal output (default) +uv run python -m every_eval_ever validate data/ + +# Machine-readable JSON +uv run python -m every_eval_ever validate --format json data/ + +# GitHub Actions annotations +uv run python -m every_eval_ever validate --format github data/ +``` + +### Options + +| Flag | Default | Description | +|------|---------|-------------| +| `--format {rich,json,github}` | `rich` | Output format | +| `--max-errors N` | `50` | Maximum errors reported per JSONL file | + +The exit code is `0` if all files pass and `1` if any fail. diff --git a/docs/eval-converters/index.md b/docs/eval-converters/index.md new file mode 100644 index 000000000..444d75762 --- /dev/null +++ b/docs/eval-converters/index.md @@ -0,0 +1,27 @@ +--- +layout: default +title: Eval Converters +nav_order: 4 +--- + +# Eval Converters + +Supported source formats: + +- Inspect AI +- HELM +- lm-eval-harness + +These are the three main general-purpose converters expected to be supported in the core package. 
+ +Example commands: + +```bash +uv run python -m every_eval_ever convert inspect --log_path <log_path> +uv run python -m every_eval_ever convert helm --log_path <log_path> +uv run python -m every_eval_ever convert lm_eval --log_path <log_path> +``` + +Adapter source code lives under [every_eval_ever/converters](https://github.com/evaleval/every_eval_ever/tree/main/every_eval_ever/converters). + +One-off adapters also exist under [utils](https://github.com/evaleval/every_eval_ever/tree/main/utils) for source-specific parsing and business logic. diff --git a/docs/getting-started/index.md b/docs/getting-started/index.md new file mode 100644 index 000000000..136c14409 --- /dev/null +++ b/docs/getting-started/index.md @@ -0,0 +1,33 @@ +--- +layout: default +title: Getting Started +nav_order: 2 +--- + +# Getting Started + +Install the package: + +```bash +pip install every-eval-ever +``` + +Optional converter dependencies: + +```bash +pip install 'every-eval-ever[inspect]' +pip install 'every-eval-ever[helm]' +pip install 'every-eval-ever[all]' +``` + +## Run the CLI + +```bash +uv run python -m every_eval_ever --help +``` + +## Continue + +- See [Data Structure](../data-structure/) +- See [Eval Converters](../eval-converters/) +- See [Contributing](../contributing/) diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 000000000..76f334041 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,37 @@ +--- +layout: default +title: Home +nav_order: 1 +--- + +# Every Eval Ever + +> [EvalEval Coalition](https://evalevalai.com) — "We are a researcher community developing scientifically grounded research outputs and robust deployment infrastructure for broader impact evaluations." + +**Every Eval Ever** is a shared schema and crowdsourced eval database. It defines a standardized metadata format for storing AI evaluation results — from leaderboard scrapes and research papers to local evaluation runs — so that results from different frameworks can be compared, reproduced, and reused. 
The three components that make it work: + +- 📋 **A metadata schema** ([eval.schema.json](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json)) that defines the information needed for meaningful comparison of evaluation results, including [instance-level data](https://github.com/evaleval/every_eval_ever/blob/main/instance_level_eval.schema.json) +- 🔧 **Validation** that checks data against the schema before it enters the repository +- 🔌 **Converters** for [Inspect AI](https://github.com/evaleval/every_eval_ever/tree/main/every_eval_ever/converters/inspect), [HELM](https://github.com/evaleval/every_eval_ever/tree/main/every_eval_ever/converters/helm), and [lm-eval-harness](https://github.com/evaleval/every_eval_ever/tree/main/every_eval_ever/converters/lm_eval), so you can transform your existing evaluation logs into the standard format + +## Project Components + +Every Eval Ever is maintained across three connected components: + +- [GitHub repository](https://github.com/evaleval/every_eval_ever): the `every_eval_ever` Python package with schema definitions, converters/adapters, tests, and core tooling. +- [EEE Datastore](https://huggingface.co/datasets/evaleval/EEE_datastore): the Hugging Face datastore that stores normalized Every Eval Ever evaluation data. +- [EEE Validator](https://huggingface.co/spaces/evaleval/eee_validator): validator and EvalEvalBot checks used on datastore pull requests, built from repository logic plus additional checks that are being upstreamed. + +Install the package: + +```bash +pip install every-eval-ever +``` + +Optional converter dependencies: + +```bash +pip install 'every-eval-ever[inspect]' +pip install 'every-eval-ever[helm]' +pip install 'every-eval-ever[all]' +``` \ No newline at end of file