evaleval · tommasocerruti · May 26, 2026 · May 20, 2026 · May 21, 2026 · May 21, 2026
diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml
@@ -0,0 +1,52 @@
+name: Deploy Docs to GitHub Pages
+
+on:
+  push:
+    branches: [main]
+  pull_request:  # PR runs validate the build only; the deploy job never runs for pull_request events
+    branches: [main]
+
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}  # one concurrency group per workflow + ref
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}  # only PR runs are cancelled when superseded
+
+jobs:
+  build:  # builds the Jekyll site and uploads ./_site as the Pages artifact
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read  # read-only token; this job only checks out and builds
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Ruby
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: '3.2'
+          bundler-cache: true  # lets setup-ruby install and cache the bundle, so no separate install step
+
+      - name: Build site
+        run: bundle exec jekyll build
+
+      - name: Upload artifact
+        uses: actions/upload-pages-artifact@v3
+        with:
+          path: ./_site
+
+  deploy:  # publishes the Pages artifact produced by the build job
+    permissions:
+      contents: read
+      pages: write  # needed by actions/deploy-pages to create the deployment
+      id-token: write  # needed by actions/deploy-pages (OIDC token)
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    runs-on: ubuntu-latest
+    needs: build
+    if: ${{ github.event_name != 'pull_request' && github.ref == 'refs/heads/main' }}  # pushes and manual runs on main deploy; PR builds never do
+    steps:
+      - name: Deploy
+        id: deployment
+        uses: actions/deploy-pages@v5
diff --git a/.gitignore b/.gitignore
@@ -213,4 +213,14 @@ __marimo__/
 *.DS_Store*
 None/
 global-mmlu-lite/
-/data/
+/data/
+
+# Ignore folders generated by Bundler
+.bundle/
+vendor/
+
+# Ignore the default location of the built site, and caches and metadata generated by Jekyll
+_site/
+.sass-cache/
+.jekyll-cache/
+.jekyll-metadata
diff --git a/Gemfile b/Gemfile
@@ -0,0 +1,2 @@
+source "https://rubygems.org"
+gem "just-the-docs"
diff --git a/Gemfile.lock b/Gemfile.lock
@@ -0,0 +1,91 @@
+GEM
+  remote: https://rubygems.org/
+  specs:
+    addressable (2.9.0)
+      public_suffix (>= 2.0.2, < 8.0)
+    base64 (0.3.0)
+    bigdecimal (4.1.2)
+    colorator (1.1.0)
+    concurrent-ruby (1.3.6)
+    csv (3.3.5)
+    em-websocket (0.5.3)
+      eventmachine (>= 0.12.9)
+      http_parser.rb (~> 0)
+    eventmachine (1.2.7)
+    ffi (1.17.4-x86_64-linux-gnu)
+    forwardable-extended (2.6.0)
+    google-protobuf (4.35.0-x86_64-linux-gnu)
+      bigdecimal
+      rake (~> 13.3)
+    http_parser.rb (0.8.1)
+    i18n (1.14.8)
+      concurrent-ruby (~> 1.0)
+    jekyll (4.4.1)
+      addressable (~> 2.4)
+      base64 (~> 0.2)
+      colorator (~> 1.0)
+      csv (~> 3.0)
+      em-websocket (~> 0.5)
+      i18n (~> 1.0)
+      jekyll-sass-converter (>= 2.0, < 4.0)
+      jekyll-watch (~> 2.0)
+      json (~> 2.6)
+      kramdown (~> 2.3, >= 2.3.1)
+      kramdown-parser-gfm (~> 1.0)
+      liquid (~> 4.0)
+      mercenary (~> 0.3, >= 0.3.6)
+      pathutil (~> 0.9)
+      rouge (>= 3.0, < 5.0)
+      safe_yaml (~> 1.0)
+      terminal-table (>= 1.8, < 4.0)
+      webrick (~> 1.7)
+    jekyll-include-cache (0.2.1)
+      jekyll (>= 3.7, < 5.0)
+    jekyll-sass-converter (3.1.0)
+      sass-embedded (~> 1.75)
+    jekyll-seo-tag (2.9.0)
+      jekyll (>= 3.8, < 5.0)
+    jekyll-watch (2.2.1)
+      listen (~> 3.0)
+    json (2.19.5)
+    just-the-docs (0.12.0)
+      jekyll (>= 3.8.5)
+      jekyll-include-cache
+      jekyll-seo-tag (>= 2.0)
+      rake (>= 12.3.1)
+    kramdown (2.5.2)
+      rexml (>= 3.4.4)
+    kramdown-parser-gfm (1.1.0)
+      kramdown (~> 2.0)
+    liquid (4.0.4)
+    listen (3.10.0)
+      logger
+      rb-fsevent (~> 0.10, >= 0.10.3)
+      rb-inotify (~> 0.9, >= 0.9.10)
+    logger (1.7.0)
+    mercenary (0.4.0)
+    pathutil (0.16.2)
+      forwardable-extended (~> 2.6)
+    public_suffix (7.0.5)
+    rake (13.4.2)
+    rb-fsevent (0.11.2)
+    rb-inotify (0.11.1)
+      ffi (~> 1.0)
+    rexml (3.4.4)
+    rouge (4.7.0)
+    safe_yaml (1.0.5)
+    sass-embedded (1.99.0-x86_64-linux-gnu)
+      google-protobuf (~> 4.31)
+    terminal-table (3.0.2)
+      unicode-display_width (>= 1.1.1, < 3)
+    unicode-display_width (2.6.0)
+    webrick (1.9.2)
+
+PLATFORMS
+  x86_64-linux-gnu
+
+DEPENDENCIES
+  just-the-docs
+
+BUNDLED WITH
+   2.4.20
diff --git a/README.md b/README.md
@@ -1,5 +1,7 @@
 # Every Eval Ever
 
+📖 **[Documentation](https://docs.evalevalai.com)**
+
 > [EvalEval Coalition](https://evalevalai.com) — "We are a researcher community developing scientifically grounded research outputs and robust deployment infrastructure for broader impact evaluations."
 
 **Every Eval Ever** is a shared schema and crowdsourced eval database. It defines a standardized metadata format for storing AI evaluation results — from leaderboard scrapes and research papers to local evaluation runs — so that results from different frameworks can be compared, reproduced, and reused. The three components that make it work:

diff --git a/_config.yml b/_config.yml
@@ -0,0 +1,31 @@
+title: Every Eval Ever
+description: Documentation for the Every Eval Ever schema, CLI, and converters
+theme: just-the-docs
+color_scheme: light
+
+source: docs  # site pages are read from docs/ rather than the repository root
+
+baseurl: ""  # no path prefix; the site is served from the root of `url`
+url: "https://docs.evalevalai.com"
+repository: evaleval/every_eval_ever
+
+permalink: pretty
+
+search_enabled: true
+heading_anchors: true
+
+aux_links:
+  "Every Eval Ever on GitHub":
+    - https://github.com/evaleval/every_eval_ever
+
+defaults:
+  - scope:
+      path: ""  # empty path: the values below apply to every file
+    values:
+      layout: default
+
+nav_sort: case_sensitive
+
+# Back to top link
+back_to_top: true
+back_to_top_text: "Back to top"
diff --git a/docs/contributing/index.md b/docs/contributing/index.md
@@ -0,0 +1,57 @@
+---
+layout: default
+title: Contributing
+nav_order: 5
+---
+
+# Contributing
+
+New data can be contributed to the [Hugging Face Dataset](https://huggingface.co/datasets/evaleval/EEE_datastore) using the following process:
+
+Leaderboard/evaluation data is split up into files by individual model, and data for each model is stored using [eval.schema.json](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json). The repository is structured into folders as `data/{benchmark_name}/{developer_name}/{model_name}/`.
+
+## TL;DR How to successfully submit
+
+1. Data must conform to [eval.schema.json](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json) (current version is defined in the schema file)
+2. The validation pipeline will automatically verify the data submitted in the pull request, but can also be manually triggered by typing `/eee validate changed` in a comment on the HF PR.
+3. An EvalEval member will review and merge your submission
+
+## PR Naming Convention
+
+Use these prefixes in your pull request titles:
+
+- `[Submission]` - New evaluation data
+- `[Issue #N]` - Fix for a specific GitHub issue
+- `[Feature]` - New functionality not tied to an issue
+- `[Docs]` - Documentation changes
+- `[ACL Shared Task]` - Shared task submissions (priority review)
+
+## UUID Naming Convention
+
+Each JSON file is named with a **UUID (Universally Unique Identifier)** in the format `{uuid}.json`. The UUID is automatically generated (using standard UUID v4) when creating a new evaluation result file. This ensures that:
+- **Multiple evaluations** of the same model can exist without conflicts (each gets a unique UUID)
+- **Different timestamps** are stored as separate files with different UUIDs (not as separate folders)
+- A model may have multiple result files, with each file representing different iterations or runs of the leaderboard/evaluation
+- UUIDs can be generated using Python's `uuid.uuid4()` function.
+
+**Example**: The model `openai/gpt-4o-2024-11-20` might have multiple files like:
+- `e70acf51-30ef-4c20-b7cc-51704d114d70.json` (evaluation run #1)
+- `a1b2c3d4-5678-90ab-cdef-1234567890ab.json` (evaluation run #2)
+
+Note: Each file can contain multiple individual results related to one model. See [examples in the datastore](https://huggingface.co/datasets/evaleval/EEE_datastore/tree/main/data).
+
+## How to add new eval
+
+1. Add a new folder under [data/](https://huggingface.co/datasets/evaleval/EEE_datastore/tree/main/data) on the Hugging Face datastore with a codename for your eval.
+2. For each model, use the Hugging Face (`developer_name/model_name`) naming convention to create a 2-tier folder structure.
+3. Add a JSON file with results for each model and name it `{uuid}.json`.
+4. [Optional] Include a [utils/](https://github.com/evaleval/every_eval_ever/tree/main/utils) folder in your benchmark name folder with any scripts used to generate the data (see e.g. [utils/global-mmlu-lite/adapter.py](https://github.com/evaleval/every_eval_ever/blob/main/utils/global-mmlu-lite/adapter.py)).
+5. [Submit] Two ways to submit your evaluation data:
+	- **Option A: Drag & drop via Hugging Face** - Go to [evaleval/EEE_datastore](https://huggingface.co/datasets/evaleval/EEE_datastore) -> click "Files and versions" -> "Contribute" -> "Upload files" -> drag and drop your data -> select "Open as a pull request to the main branch". See [step-by-step screenshots](https://docs.google.com/document/d/1dxTQF8ncGCzaAOIj0RX7E9Hg4THmUBzezDOYUp_XdCY/edit?usp=sharing).
+	- **Option B: Clone & PR** - Clone the [Hugging Face repository](https://huggingface.co/datasets/evaleval/EEE_datastore), add your data under `data/`, and open a pull request
+
+Before submitting, run:
+
+```bash
+uv run python -m every_eval_ever validate data/
+```
diff --git a/docs/data-structure/index.md b/docs/data-structure/index.md
@@ -0,0 +1,15 @@
+---
+layout: default
+title: Data Structure
+nav_order: 3
+has_children: true
+---
+
+# Data Structure
+
+Evaluation data is represented in two layers:
+
+- Aggregate JSON records (`{uuid}.json`)
+- Instance-level JSONL records (`{uuid}_samples.jsonl`)
+
+Use the child pages in this section for schema and validation details.
diff --git a/docs/data-structure/schema.md b/docs/data-structure/schema.md
@@ -0,0 +1,50 @@
+---
+layout: default
+title: Schema
+parent: Data Structure
+nav_order: 1
+---
+
+# Schema
+
+The canonical schemas are:
+
+- [Aggregate schema](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json)
+- [Instance-level schema](https://github.com/evaleval/every_eval_ever/blob/main/instance_level_eval.schema.json)
+
+Schema versions are defined in the canonical JSON Schema files linked above.
+
+The repository enforces schema compatibility by generating Pydantic models from JSON Schema and applying post-generation patches (`post_codegen.py`). This generation flow is automated in CI and can also be run manually.
+
+## Schema Instructions
+
+1. **`model_info`**: Use Hugging Face formatting (`developer_name/model_name`). If a model does not come from Hugging Face, use the exact API reference. Check [examples in data/livecodebenchpro](https://huggingface.co/datasets/evaleval/EEE_datastore/tree/main/data/livecodebenchpro). Notably, some do have a **date included in the model name**, but others **do not**. For example:
+   - OpenAI: `gpt-4o-2024-11-20`, `gpt-5-2025-08-07`, `o3-2025-04-16`
+   - Anthropic: `claude-3-7-sonnet-20250219`, `claude-3-sonnet-20240229`
+   - Google: `gemini-2.5-pro`, `gemini-2.5-flash`
+   - xAI (Grok): `grok-2-2024-08-13`, `grok-3-2025-01-15`
+
+2. **`evaluation_id`**: Use `{benchmark_name}/{model_id}/{retrieved_timestamp}` format (e.g. `livecodebenchpro/qwen3-235b-a22b-thinking-2507/1760492095.8105888`).
+
+3. **`inference_platform`** vs **`inference_engine`**: Where possible specify where the evaluation was run using one of these two fields.
+   - `inference_platform`: Use this field when the evaluation was run through a remote API (e.g., `openai`, `huggingface`, `openrouter`, `anthropic`, `xai`).
+   - `inference_engine`: Use this field when the evaluation was run locally. This is now an object with `name` and `version` (e.g. `{"name": "vllm", "version": "0.6.0"}`).
+
+4. The `source_type` on `source_metadata` has two options: `documentation` and `evaluation_run`. Use `documentation` when results are scraped from a leaderboard or paper. Use `evaluation_run` when the evaluation was run locally (e.g. via an eval converter).
+
+5. **`source_data`** is specified per evaluation result (inside `evaluation_results`), with three variants:
+   - `source_type: "url"` - link to a web source (e.g. leaderboard API)
+   - `source_type: "hf_dataset"` - reference to a Hugging Face dataset (e.g. `{"hf_repo": "google/IFEval"}`)
+   - `source_type: "other"` - for private or proprietary datasets
+
+6. The schema is designed to accommodate both numeric and level-based (e.g. Low, Medium, High) metrics. For level-based metrics, the actual `value` should be converted to an integer (e.g. Low = 1, Medium = 2, High = 3), and the `level_names` property should be used to specify the mapping of levels to integers.
+
+7. **Timestamps**: The schema has three timestamp fields - use them as follows:
+   - `retrieved_timestamp` (required) - when this record was created, in Unix epoch format (e.g. `1760492095.8105888`)
+   - `evaluation_timestamp` (top-level, optional) - when the evaluation was run
+   - `evaluation_results[].evaluation_timestamp` (per-result, optional) - when a specific evaluation result was produced, if different results were run at different times
+
+8. Additional details can be provided in several places in the schema. They are not required, but can be useful for detailed analysis.
+   - `model_info.additional_details`: Use this field to provide any additional information about the model itself (e.g. number of parameters)
+   - `evaluation_results.generation_config.generation_args`: Specify additional arguments used to generate outputs from the model
+   - `evaluation_results.generation_config.additional_details`: Use this field to provide any additional information about the evaluation process that is not captured elsewhere
diff --git a/docs/data-structure/validation.md b/docs/data-structure/validation.md
@@ -0,0 +1,50 @@
+---
+layout: default
+title: Validation
+parent: Data Structure
+nav_order: 2
+---
+
+# Validation
+
+Validation uses Pydantic models generated from the JSON schemas. This validates aggregate `.json` files against `EvaluationLog` and instance-level `_samples.jsonl` files line-by-line against `InstanceLevelEvaluationLog`. Requires [uv](https://docs.astral.sh/uv/).
+
+## Validate files with the package CLI
+
+```sh
+# Single aggregate file
+uv run python -m every_eval_ever validate data/benchmark/dev/model/uuid.json
+
+# Instance-level JSONL
+uv run python -m every_eval_ever validate data/benchmark/dev/model/uuid_samples.jsonl
+
+# Entire directory (recurses into subdirectories)
+uv run python -m every_eval_ever validate data/benchmark/dev/model/
+
+# Multiple paths
+uv run python -m every_eval_ever validate file1.json file2_samples.jsonl data/
+```
+
+File type is determined by extension: `.json` validates against `EvaluationLog`, `.jsonl` validates each line against `InstanceLevelEvaluationLog`.
+
+### Output formats
+
+```sh
+# Rich terminal output (default)
+uv run python -m every_eval_ever validate data/
+
+# Machine-readable JSON
+uv run python -m every_eval_ever validate --format json data/
+
+# GitHub Actions annotations
+uv run python -m every_eval_ever validate --format github data/
+```
+
+### Options
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--format {rich,json,github}` | `rich` | Output format |
+| `--max-errors N` | `50` | Maximum errors reported per JSONL file |
+
+Exit code is `0` if all files pass and `1` if any fail.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		source "https://rubygems.org"
		gem "just-the-docs"