From e2263b9e7449ee2eea5ec31950c3d4555119b0ca Mon Sep 17 00:00:00 2001 From: gbemike Date: Wed, 20 May 2026 23:51:59 +0100 Subject: [PATCH 01/16] add unique inference --- every_eval_ever/helpers/eee_stats.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/every_eval_ever/helpers/eee_stats.py b/every_eval_ever/helpers/eee_stats.py index f4556c1b8..1879cdd9a 100644 --- a/every_eval_ever/helpers/eee_stats.py +++ b/every_eval_ever/helpers/eee_stats.py @@ -192,9 +192,10 @@ def analyze_data(con, schema_table, instance_table, csv_path) -> None: WHEN eval_library.name IN ('unknown', 'custom') THEN 'unknown/custom' ELSE 'named harness' END AS harness_status, - COUNT(DISTINCT evaluation_id) AS n_evaluation_runs, - ROUND(100.0 * COUNT(DISTINCT evaluation_id) / SUM(COUNT(DISTINCT evaluation_id)) OVER (), 1) AS pct - FROM {schema_table} + COUNT(*) AS n_evaluation_runs, + ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER (), 1) AS pct + FROM {schema_table}, + LATERAL UNNEST(evaluation_results) AS t(er) GROUP BY 1 ORDER BY n_evaluation_runs DESC; """, @@ -208,6 +209,17 @@ def analyze_data(con, schema_table, instance_table, csv_path) -> None: ) section(f'eval harness percentage saved to {csv_path}') + unique_inference = execute_query( + con, + f""" + SELECT DISTINCT + -- COALESCE(model_info.inference_platform, 'unreported') AS platform_inference -- + COUNT(DISTINCT model_info.inference_platform) AS platform_inference + FROM {schema_table} + """ + ) + section(f'unique inference platforms {unique_inference}') + count_inference_platform = execute_query( con, f""" @@ -820,7 +832,7 @@ def main(): sys.exit(1) analyze_data(con, schema_table, instance_table, csv_path) - create_visualisations(con, schema_table, instance_table, csv_path) + # create_visualisations(con, schema_table, instance_table, csv_path) if __name__ == '__main__': From 093770f32679fa9030cb61f491801e448db1cb1c Mon Sep 17 00:00:00 2001 From: gbemike Date: Thu, 21 May 2026 18:30:46 +0100 
Subject: [PATCH 02/16] feat: add docs first pass --- .github/workflows/pages.yml | 0 .gitignore | 12 +++- Gemfile | 2 + Gemfile.lock | 91 +++++++++++++++++++++++++++++++ _config.yml | 22 ++++++++ docs/contributing/index.md | 26 +++++++++ docs/data-structure/index.md | 15 +++++ docs/data-structure/schema.md | 24 ++++++++ docs/data-structure/validation.md | 24 ++++++++ docs/eval-converters/index.md | 27 +++++++++ docs/getting-started/index.md | 33 +++++++++++ docs/index.md | 37 +++++++++++++ 12 files changed, 312 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/pages.yml create mode 100644 Gemfile create mode 100644 Gemfile.lock create mode 100644 _config.yml create mode 100644 docs/contributing/index.md create mode 100644 docs/data-structure/index.md create mode 100644 docs/data-structure/schema.md create mode 100644 docs/data-structure/validation.md create mode 100644 docs/eval-converters/index.md create mode 100644 docs/getting-started/index.md create mode 100644 docs/index.md diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml new file mode 100644 index 000000000..e69de29bb diff --git a/.gitignore b/.gitignore index 5a5d93546..b925edb9a 100644 --- a/.gitignore +++ b/.gitignore @@ -213,4 +213,14 @@ __marimo__/ *.DS_Store* None/ global-mmlu-lite/ -/data/ \ No newline at end of file +/data/ + +# Ignore folders generated by Bundler +.bundle/ +vendor/ + +# Ignore the default location of the built site, and caches and metadata generated by Jekyll +_site/ +.sass-cache/ +.jekyll-cache/ +.jekyll-metadata \ No newline at end of file diff --git a/Gemfile b/Gemfile new file mode 100644 index 000000000..f10963c73 --- /dev/null +++ b/Gemfile @@ -0,0 +1,2 @@ +source "https://rubygems.org" +gem "just-the-docs" \ No newline at end of file diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 000000000..3af5d9ff2 --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,91 @@ +GEM + remote: https://rubygems.org/ + specs: + addressable (2.9.0) + 
public_suffix (>= 2.0.2, < 8.0) + base64 (0.3.0) + bigdecimal (4.1.2) + colorator (1.1.0) + concurrent-ruby (1.3.6) + csv (3.3.5) + em-websocket (0.5.3) + eventmachine (>= 0.12.9) + http_parser.rb (~> 0) + eventmachine (1.2.7) + ffi (1.17.4-x86_64-linux-gnu) + forwardable-extended (2.6.0) + google-protobuf (4.35.0-x86_64-linux-gnu) + bigdecimal + rake (~> 13.3) + http_parser.rb (0.8.1) + i18n (1.14.8) + concurrent-ruby (~> 1.0) + jekyll (4.4.1) + addressable (~> 2.4) + base64 (~> 0.2) + colorator (~> 1.0) + csv (~> 3.0) + em-websocket (~> 0.5) + i18n (~> 1.0) + jekyll-sass-converter (>= 2.0, < 4.0) + jekyll-watch (~> 2.0) + json (~> 2.6) + kramdown (~> 2.3, >= 2.3.1) + kramdown-parser-gfm (~> 1.0) + liquid (~> 4.0) + mercenary (~> 0.3, >= 0.3.6) + pathutil (~> 0.9) + rouge (>= 3.0, < 5.0) + safe_yaml (~> 1.0) + terminal-table (>= 1.8, < 4.0) + webrick (~> 1.7) + jekyll-include-cache (0.2.1) + jekyll (>= 3.7, < 5.0) + jekyll-sass-converter (3.1.0) + sass-embedded (~> 1.75) + jekyll-seo-tag (2.9.0) + jekyll (>= 3.8, < 5.0) + jekyll-watch (2.2.1) + listen (~> 3.0) + json (2.19.5) + just-the-docs (0.12.0) + jekyll (>= 3.8.5) + jekyll-include-cache + jekyll-seo-tag (>= 2.0) + rake (>= 12.3.1) + kramdown (2.5.2) + rexml (>= 3.4.4) + kramdown-parser-gfm (1.1.0) + kramdown (~> 2.0) + liquid (4.0.4) + listen (3.10.0) + logger + rb-fsevent (~> 0.10, >= 0.10.3) + rb-inotify (~> 0.9, >= 0.9.10) + logger (1.7.0) + mercenary (0.4.0) + pathutil (0.16.2) + forwardable-extended (~> 2.6) + public_suffix (7.0.5) + rake (13.4.2) + rb-fsevent (0.11.2) + rb-inotify (0.11.1) + ffi (~> 1.0) + rexml (3.4.4) + rouge (4.7.0) + safe_yaml (1.0.5) + sass-embedded (1.99.0-x86_64-linux-gnu) + google-protobuf (~> 4.31) + terminal-table (3.0.2) + unicode-display_width (>= 1.1.1, < 3) + unicode-display_width (2.6.0) + webrick (1.9.2) + +PLATFORMS + x86_64-linux-gnu + +DEPENDENCIES + just-the-docs + +BUNDLED WITH + 2.4.20 diff --git a/_config.yml b/_config.yml new file mode 100644 index 
000000000..bc2e9898d --- /dev/null +++ b/_config.yml @@ -0,0 +1,22 @@ +title: Every Eval Ever +description: Documentation for the Every Eval Ever schema, CLI, and converters +theme: just-the-docs +color_scheme: light + +source: docs + +url: https://evaleval.github.io +baseurl: /every_eval_ever + +search_enabled: true +heading_anchors: true + +aux_links: + GitHub: + - https://github.com/evaleval/every_eval_ever + +defaults: + - scope: + path: "" + values: + layout: default \ No newline at end of file diff --git a/docs/contributing/index.md b/docs/contributing/index.md new file mode 100644 index 000000000..9fb2e43f8 --- /dev/null +++ b/docs/contributing/index.md @@ -0,0 +1,26 @@ +--- +layout: default +title: Contributing +nav_order: 5 +--- + +# Contributing + +Data contributions land in the datastore, while validation gates run through the validator/EvalEvalBot workflow. + +To contribute evaluation data: + +1. Add files under `data/{benchmark}/{developer}/{model}/` +2. Name aggregate files as `{uuid}.json` +3. Optionally add instance-level `{uuid}_samples.jsonl` +4. Validate before submission + +Datastore: https://huggingface.co/datasets/evaleval/EEE_datastore + +The validator checks datastore pull requests using core checks from this repository and additional checks that are being upstreamed. + +Before submitting, run: + +```bash +uv run python -m every_eval_ever validate data/ +``` diff --git a/docs/data-structure/index.md b/docs/data-structure/index.md new file mode 100644 index 000000000..87ce1145e --- /dev/null +++ b/docs/data-structure/index.md @@ -0,0 +1,15 @@ +--- +layout: default +title: Data Structure +nav_order: 3 +has_children: true +--- + +# Data Structure + +Evaluation data is represented in two layers: + +- Aggregate JSON records (`{uuid}.json`) +- Instance-level JSONL records (`{uuid}_samples.jsonl`) + +Use the child pages in this section for schema and validation details. 
diff --git a/docs/data-structure/schema.md b/docs/data-structure/schema.md new file mode 100644 index 000000000..c1720fa89 --- /dev/null +++ b/docs/data-structure/schema.md @@ -0,0 +1,24 @@ +--- +layout: default +title: Schema +parent: Data Structure +nav_order: 1 +--- + +# Schema + +The canonical schemas are: + +- [Aggregate schema](../../eval.schema.json) +- [Instance-level schema](../../instance_level_eval.schema.json) + +Both schema definitions are currently version `0.2.2`. + +The repository enforces schema compatibility by generating Pydantic models from JSON Schema and applying post-generation patches (`post_codegen.py`). This generation flow is automated in CI and can also be run manually. + +For aggregate records, keep these conventions: + +1. `evaluation_id` uses `{benchmark_name}/{model_id}/{retrieved_timestamp}` +2. `source_metadata.source_type` is `documentation` or `evaluation_run` +3. `source_data` is set per result (`url`, `hf_dataset`, or `other`) +4. Level-based metrics use integer values plus `level_names` diff --git a/docs/data-structure/validation.md b/docs/data-structure/validation.md new file mode 100644 index 000000000..56cdd7048 --- /dev/null +++ b/docs/data-structure/validation.md @@ -0,0 +1,24 @@ +--- +layout: default +title: Validation +parent: Data Structure +nav_order: 2 +--- + +# Validation + +Validate aggregate `.json` files and instance-level `.jsonl` files: + +```bash +uv run python -m every_eval_ever validate data/ +``` + +Output formats: + +```bash +uv run python -m every_eval_ever validate --format rich data/ +uv run python -m every_eval_ever validate --format json data/ +uv run python -m every_eval_ever validate --format github data/ +``` + +Exit code is `0` when all files pass and `1` when any file fails. 
diff --git a/docs/eval-converters/index.md b/docs/eval-converters/index.md new file mode 100644 index 000000000..8388a8af0 --- /dev/null +++ b/docs/eval-converters/index.md @@ -0,0 +1,27 @@ +--- +layout: default +title: Eval Converters +nav_order: 4 +--- + +# Eval Converters + +Supported conversion targets: + +- Inspect AI +- HELM +- lm-evaluation-harness + +These are the three main general-purpose converters expected to be supported in the core package. + +Example commands: + +```bash +uv run python -m every_eval_ever convert inspect --log_path +uv run python -m every_eval_ever convert helm --log_path +uv run python -m every_eval_ever convert lm_eval --log_path +``` + +Adapter source code lives under [every_eval_ever/converters](../../every_eval_ever/converters/). + +One-off adapters also exist under [utils](../../utils/) for source-specific parsing and business logic. diff --git a/docs/getting-started/index.md b/docs/getting-started/index.md new file mode 100644 index 000000000..136c14409 --- /dev/null +++ b/docs/getting-started/index.md @@ -0,0 +1,33 @@ +--- +layout: default +title: Getting Started +nav_order: 2 +--- + +# Getting Started + +Install the package: + +```bash +pip install every-eval-ever +``` + +Optional converter dependencies: + +```bash +pip install 'every-eval-ever[inspect]' +pip install 'every-eval-ever[helm]' +pip install 'every-eval-ever[all]' +``` + +## Run the CLI + +```bash +uv run python -m every_eval_ever --help +``` + +## Continue + +- See [Data Structure](../data-structure/) +- See [Eval Converters](../eval-converters/) +- See [Contributing](../contributing/) diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 000000000..468f0f87f --- /dev/null +++ b/docs/index.md @@ -0,0 +1,37 @@ +--- +layout: default +title: Home +nav_order: 1 +--- + +# Every Eval Ever + +> [EvalEval Coalition](https://evalevalai.com) — "We are a researcher community developing scientifically grounded research outputs and robust deployment 
infrastructure for broader impact evaluations." + +**Every Eval Ever** is a shared schema and crowdsourced eval database. It defines a standardized metadata format for storing AI evaluation results — from leaderboard scrapes and research papers to local evaluation runs — so that results from different frameworks can be compared, reproduced, and reused. The three components that make it work: + +- 📋 **A metadata schema** ([eval.schema.json](../eval.schema.json)) that defines the information needed for meaningful comparison of evaluation results, including [instance-level data](../instance_level_eval.schema.json) +- 🔧 **Validation** that checks data against the schema before it enters the repository +- 🔌 **Converters** for [Inspect AI](../every_eval_ever/converters/inspect/), [HELM](../every_eval_ever/converters/helm/), and [lm-eval-harness](../every_eval_ever/converters/lm_eval/), so you can transform your existing evaluation logs into the standard format + +## Project Components + +Every Eval Ever is maintained across three connected components: + +- [GitHub repository](https://github.com/evaleval/every_eval_ever): the `every_eval_ever` Python package with schema definitions, converters/adapters, tests, and core tooling. +- [EEE Datastore](https://huggingface.co/datasets/evaleval/EEE_datastore): the Hugging Face datastore that stores normalized Every Eval Ever evaluation data. +- [EEE Validator](https://huggingface.co/spaces/evaleval/eee_validator): validator and EvalEvalBot checks used on datastore pull requests, built from repository logic plus additional checks that are being upstreamed. 
+ +Install the package: + +```bash +pip install every-eval-ever +``` + +Optional converter dependencies: + +```bash +pip install 'every-eval-ever[inspect]' +pip install 'every-eval-ever[helm]' +pip install 'every-eval-ever[all]' +``` \ No newline at end of file From ea9794d807a3b04d35661cf543b2415b6072dea2 Mon Sep 17 00:00:00 2001 From: gbemike Date: Fri, 22 May 2026 13:09:14 +0100 Subject: [PATCH 03/16] feat: add pages.yml github actions jekyll deployment --- .github/workflows/pages.yml | 51 +++++++++++++++++++++++++++++++++++++ docs/index.md | 4 +-- 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index e69de29bb..9db4c15a1 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -0,0 +1,51 @@ +# Sample workflow for building and deploying a Jekyll site to GitHub Pages +name: Deploy Jekyll with GitHub Pages dependencies preinstalled + +on: + # Runs on pushes targeting the default branch + push: + branches: ["add-read-the-docs"] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 
+concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + # Build job + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Setup Pages + uses: actions/configure-pages@v5 + - name: Build with Jekyll + uses: actions/jekyll-build-pages@v1 + with: + source: ./ + destination: ./_site + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + + # Deployment job + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + needs: build + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v5 \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 468f0f87f..e16385303 100644 --- a/docs/index.md +++ b/docs/index.md @@ -10,9 +10,9 @@ nav_order: 1 **Every Eval Ever** is a shared schema and crowdsourced eval database. It defines a standardized metadata format for storing AI evaluation results — from leaderboard scrapes and research papers to local evaluation runs — so that results from different frameworks can be compared, reproduced, and reused. 
The three components that make it work: -- 📋 **A metadata schema** ([eval.schema.json](../eval.schema.json)) that defines the information needed for meaningful comparison of evaluation results, including [instance-level data](../instance_level_eval.schema.json) +- 📋 **A metadata schema** ([eval.schema.json](https://github.com/gbemike/every_eval_ever/blob/add-read-the-docs/eval.schema.json)) that defines the information needed for meaningful comparison of evaluation results, including [instance-level data](https://github.com/gbemike/every_eval_ever/blob/add-read-the-docs/instance_level_eval.schema.json) - 🔧 **Validation** that checks data against the schema before it enters the repository -- 🔌 **Converters** for [Inspect AI](../every_eval_ever/converters/inspect/), [HELM](../every_eval_ever/converters/helm/), and [lm-eval-harness](../every_eval_ever/converters/lm_eval/), so you can transform your existing evaluation logs into the standard format +- 🔌 **Converters** for [Inspect AI](https://github.com/gbemike/every_eval_ever/tree/add-read-the-docs/every_eval_ever/converters/inspect), [HELM](https://github.com/gbemike/every_eval_ever/blob/add-read-the-docs/every_eval_ever/converters/helm), and [lm-eval-harness](https://github.com/gbemike/every_eval_ever/blob/add-read-the-docs/every_eval_ever/converters/lm_eval), so you can transform your existing evaluation logs into the standard format ## Project Components From 6cc1a9fbf7bf42ade8cb7d8099b14f7d2c70b993 Mon Sep 17 00:00:00 2001 From: gbemike Date: Fri, 22 May 2026 13:22:43 +0100 Subject: [PATCH 04/16] fix: refactor config.yml fields --- _config.yml | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/_config.yml b/_config.yml index bc2e9898d..dd06788be 100644 --- a/_config.yml +++ b/_config.yml @@ -1,22 +1,28 @@ title: Every Eval Ever description: Documentation for the Every Eval Ever schema, CLI, and converters -theme: just-the-docs -color_scheme: light +color_scheme: nil -source: 
docs +baseurl: "" +url: https://evalevalai.com/ +repository: every_eval_ever/every_eval_ever -url: https://evaleval.github.io -baseurl: /every_eval_ever +permalink: pretty search_enabled: true heading_anchors: true aux_links: - GitHub: + "Every Eval Ever on GitHub": - https://github.com/evaleval/every_eval_ever defaults: - scope: path: "" values: - layout: default \ No newline at end of file + layout: default + +nav_sort: case_sensitive + +# Back to top link +back_to_top: true +back_to_top_text: "Back to top" \ No newline at end of file From bd1aa6f9941b540bf5d56f673572fdfa985f2101 Mon Sep 17 00:00:00 2001 From: gbemike Date: Fri, 22 May 2026 13:28:10 +0100 Subject: [PATCH 05/16] add source docs as source --- _config.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/_config.yml b/_config.yml index dd06788be..605567d89 100644 --- a/_config.yml +++ b/_config.yml @@ -2,6 +2,8 @@ title: Every Eval Ever description: Documentation for the Every Eval Ever schema, CLI, and converters color_scheme: nil +source: docs + baseurl: "" url: https://evalevalai.com/ repository: every_eval_ever/every_eval_ever From 417cc6fb58c83d3ec707607037bef66b3bbdd9ff Mon Sep 17 00:00:00 2001 From: gbemike Date: Fri, 22 May 2026 13:42:02 +0100 Subject: [PATCH 06/16] add just-the-docs as value for theme field --- _config.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/_config.yml b/_config.yml index 605567d89..2841b0377 100644 --- a/_config.yml +++ b/_config.yml @@ -1,6 +1,7 @@ title: Every Eval Ever description: Documentation for the Every Eval Ever schema, CLI, and converters -color_scheme: nil +theme: just-the-docs +color_scheme: light source: docs From 726c6595678bd7bb36f1335e4932123ea2ac7300 Mon Sep 17 00:00:00 2001 From: gbemike Date: Fri, 22 May 2026 13:46:35 +0100 Subject: [PATCH 07/16] ci: build pages with builder --- .github/workflows/pages.yml | 24 ++++++++++++++---------- _config.yml | 2 +- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git 
a/.github/workflows/pages.yml b/.github/workflows/pages.yml index 9db4c15a1..5d5613b1f 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -1,5 +1,4 @@ -# Sample workflow for building and deploying a Jekyll site to GitHub Pages -name: Deploy Jekyll with GitHub Pages dependencies preinstalled +name: Deploy Docs to GitHub Pages on: # Runs on pushes targeting the default branch @@ -22,23 +21,28 @@ concurrency: cancel-in-progress: false jobs: - # Build job build: runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 + + - name: Setup Ruby + uses: ruby/setup-ruby@v1 + with: + bundler-cache: true + - name: Setup Pages uses: actions/configure-pages@v5 - - name: Build with Jekyll - uses: actions/jekyll-build-pages@v1 - with: - source: ./ - destination: ./_site + + - name: Build site + run: bundle exec jekyll build + - name: Upload artifact uses: actions/upload-pages-artifact@v3 + with: + path: ./_site - # Deployment job deploy: environment: name: github-pages @@ -46,6 +50,6 @@ jobs: runs-on: ubuntu-latest needs: build steps: - - name: Deploy to GitHub Pages + - name: Deploy id: deployment uses: actions/deploy-pages@v5 \ No newline at end of file diff --git a/_config.yml b/_config.yml index 2841b0377..a909c26e1 100644 --- a/_config.yml +++ b/_config.yml @@ -7,7 +7,7 @@ source: docs baseurl: "" url: https://evalevalai.com/ -repository: every_eval_ever/every_eval_ever +repository: /every_eval_ever permalink: pretty From 8366fc8592bfb5a6c61ee0483980b58a9764f505 Mon Sep 17 00:00:00 2001 From: gbemike Date: Fri, 22 May 2026 13:55:46 +0100 Subject: [PATCH 08/16] fix pages baseurl/url --- _config.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/_config.yml b/_config.yml index a909c26e1..e880bea1d 100644 --- a/_config.yml +++ b/_config.yml @@ -5,9 +5,9 @@ color_scheme: light source: docs -baseurl: "" -url: https://evalevalai.com/ -repository: /every_eval_ever +baseurl: "/every_eval_ever" +url: 
"https://gbemike.github.io" +repository: gbemike/every_eval_ever permalink: pretty From 686644ea57780e595355a13ce4734170244c5d18 Mon Sep 17 00:00:00 2001 From: gbemike Date: Fri, 22 May 2026 14:02:15 +0100 Subject: [PATCH 09/16] fix: specify ruby-version in github actions workflow --- .github/workflows/pages.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index 5d5613b1f..2f178dc4d 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -30,6 +30,7 @@ jobs: - name: Setup Ruby uses: ruby/setup-ruby@v1 with: + ruby-version: '3.2' bundler-cache: true - name: Setup Pages From 44ea36864d640de86d7c9d8f4176d98295af31ea Mon Sep 17 00:00:00 2001 From: gbemike Date: Sat, 23 May 2026 15:00:28 +0100 Subject: [PATCH 10/16] docs: configure pages for docs.evalevalai.com --- _config.yml | 6 +++--- docs/CNAME | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) create mode 100644 docs/CNAME diff --git a/_config.yml b/_config.yml index e880bea1d..523389765 100644 --- a/_config.yml +++ b/_config.yml @@ -5,9 +5,9 @@ color_scheme: light source: docs -baseurl: "/every_eval_ever" -url: "https://gbemike.github.io" -repository: gbemike/every_eval_ever +baseurl: "" +url: "https://docs.evalevalai.com" +repository: evaleval/every_eval_ever permalink: pretty diff --git a/docs/CNAME b/docs/CNAME new file mode 100644 index 000000000..d1695a523 --- /dev/null +++ b/docs/CNAME @@ -0,0 +1 @@ +docs.evalevalai.com From d0eb386ce27be137fb8da2d6694098cacc6b47a8 Mon Sep 17 00:00:00 2001 From: gbemike Date: Sat, 23 May 2026 15:13:26 +0100 Subject: [PATCH 11/16] feat: ready pages.yml for main branch merge --- .github/workflows/pages.yml | 7 +------ docs/CNAME | 1 - 2 files changed, 1 insertion(+), 7 deletions(-) delete mode 100644 docs/CNAME diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index 2f178dc4d..df2f0ab3c 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -1,21 
+1,16 @@ name: Deploy Docs to GitHub Pages on: - # Runs on pushes targeting the default branch push: - branches: ["add-read-the-docs"] + branches: [main] - # Allows you to run this workflow manually from the Actions tab workflow_dispatch: -# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages permissions: contents: read pages: write id-token: write -# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. -# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. concurrency: group: "pages" cancel-in-progress: false diff --git a/docs/CNAME b/docs/CNAME deleted file mode 100644 index d1695a523..000000000 --- a/docs/CNAME +++ /dev/null @@ -1 +0,0 @@ -docs.evalevalai.com From 27febff087e406b469f54d8c3f46a715d5b49fd7 Mon Sep 17 00:00:00 2001 From: gbemike Date: Sun, 24 May 2026 22:42:25 +0100 Subject: [PATCH 12/16] refactor: fix links and add documentation link to README --- .github/workflows/pages.yml | 5 +++- README.md | 2 ++ docs/contributing/index.md | 47 +++++++++++++++++++++++++------ docs/data-structure/schema.md | 42 +++++++++++++++++++++------ docs/data-structure/validation.md | 40 +++++++++++++++++++++----- docs/eval-converters/index.md | 6 ++-- docs/index.md | 4 +-- 7 files changed, 117 insertions(+), 29 deletions(-) diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index df2f0ab3c..5b3f109ae 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -3,6 +3,8 @@ name: Deploy Docs to GitHub Pages on: push: branches: [main] + pull_request: + branches: [main] workflow_dispatch: @@ -45,7 +47,8 @@ jobs: url: ${{ steps.deployment.outputs.page_url }} runs-on: ubuntu-latest needs: build + if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} steps: - name: Deploy id: deployment - uses: actions/deploy-pages@v5 \ No newline at end of file + uses: actions/deploy-pages@v5 diff --git a/README.md 
b/README.md index 262a2ca11..23b277b14 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # Every Eval Ever +📖 **[Documentation](https://docs.evalevalai.com)** + > [EvalEval Coalition](https://evalevalai.com) — "We are a researcher community developing scientifically grounded research outputs and robust deployment infrastructure for broader impact evaluations." **Every Eval Ever** is a shared schema and crowdsourced eval database. It defines a standardized metadata format for storing AI evaluation results — from leaderboard scrapes and research papers to local evaluation runs — so that results from different frameworks can be compared, reproduced, and reused. The three components that make it work: diff --git a/docs/contributing/index.md b/docs/contributing/index.md index 9fb2e43f8..46b5992e7 100644 --- a/docs/contributing/index.md +++ b/docs/contributing/index.md @@ -6,18 +6,49 @@ nav_order: 5 # Contributing -Data contributions land in the datastore, while validation gates run through the validator/EvalEvalBot workflow. +New data can be contributed to the [Hugging Face Dataset](https://huggingface.co/datasets/evaleval/EEE_datastore) using the following process: -To contribute evaluation data: +Leaderboard/evaluation data is split up into files by individual model, and data for each model is stored using [eval.schema.json](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json). The repository is structured into folders as `data/{benchmark_name}/{developer_name}/{model_name}/`. -1. Add files under `data/{benchmark}/{developer}/{model}/` -2. Name aggregate files as `{uuid}.json` -3. Optionally add instance-level `{uuid}_samples.jsonl` -4. Validate before submission +## TL;DR How to successfully submit -Datastore: https://huggingface.co/datasets/evaleval/EEE_datastore +1. Data must conform to [eval.schema.json](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json) (current version is defined in the schema file) +2. 
The validation pipeline will automatically verify the data submitted in the pull request, but can also be manually triggered by typing `/eee validate changed` in a comment on the HF PR. +3. An EvalEval member will review and merge your submission -The validator checks datastore pull requests using core checks from this repository and additional checks that are being upstreamed. +## PR Naming Convention + +Use these prefixes in your pull request titles: + +- `[Submission]` - New evaluation data +- `[Issue #N]` - Fix for a specific GitHub issue +- `[Feature]` - New functionality not tied to an issue +- `[Docs]` - Documentation changes +- `[ACL Shared Task]` - Shared task submissions (priority review) + +## UUID Naming Convention + +Each JSON file is named with a **UUID (Universally Unique Identifier)** in the format `{uuid}.json`. The UUID is automatically generated (using standard UUID v4) when creating a new evaluation result file. This ensures that: +- **Multiple evaluations** of the same model can exist without conflicts (each gets a unique UUID) +- **Different timestamps** are stored as separate files with different UUIDs (not as separate folders) +- A model may have multiple result files, with each file representing different iterations or runs of the leaderboard/evaluation +- UUIDs can be generated using Python's `uuid.uuid4()` function. + +**Example**: The model `openai/gpt-4o-2024-11-20` might have multiple files like: +- `e70acf51-30ef-4c20-b7cc-51704d114d70.json` (evaluation run #1) +- `a1b2c3d4-5678-90ab-cdef-1234567890ab.json` (evaluation run #2) + +Note: Each file can contain multiple individual results related to one model. See [examples in the datastore](https://huggingface.co/datasets/evaleval/EEE_datastore/tree/main/data). + +## How to add a new eval + +1. Add a new folder under [data/](https://huggingface.co/datasets/evaleval/EEE_datastore/tree/main/data) on the Hugging Face datastore with a codename for your eval. +2. 
For each model, use the Hugging Face (`developer_name/model_name`) naming convention to create a 2-tier folder structure. +3. Add a JSON file with results for each model and name it `{uuid}.json`. +4. [Optional] Include a [utils/](https://github.com/evaleval/every_eval_ever/tree/main/utils) folder in your benchmark name folder with any scripts used to generate the data (see e.g. [utils/global-mmlu-lite/adapter.py](https://github.com/evaleval/every_eval_ever/blob/main/utils/global-mmlu-lite/adapter.py)). +5. [Submit] Two ways to submit your evaluation data: + - **Option A: Drag & drop via Hugging Face** - Go to [evaleval/EEE_datastore](https://huggingface.co/datasets/evaleval/EEE_datastore) -> click "Files and versions" -> "Contribute" -> "Upload files" -> drag and drop your data -> select "Open as a pull request to the main branch". See [step-by-step screenshots](https://docs.google.com/document/d/1dxTQF8ncGCzaAOIj0RX7E9Hg4THmUBzezDOYUp_XdCY/edit?usp=sharing). + - **Option B: Clone & PR** - Clone the [Hugging Face repository](https://huggingface.co/datasets/evaleval/EEE_datastore), add your data under `data/`, and open a pull request Before submitting, run: diff --git a/docs/data-structure/schema.md b/docs/data-structure/schema.md index c1720fa89..b73000b1b 100644 --- a/docs/data-structure/schema.md +++ b/docs/data-structure/schema.md @@ -9,16 +9,42 @@ nav_order: 1 The canonical schemas are: -- [Aggregate schema](../../eval.schema.json) -- [Instance-level schema](../../instance_level_eval.schema.json) +- [Aggregate schema](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json) +- [Instance-level schema](https://github.com/evaleval/every_eval_ever/blob/main/instance_level_eval.schema.json) -Both schema definitions are currently version `0.2.2`. +Schema versions are defined in the canonical JSON Schema files linked above. 
The repository enforces schema compatibility by generating Pydantic models from JSON Schema and applying post-generation patches (`post_codegen.py`). This generation flow is automated in CI and can also be run manually. -For aggregate records, keep these conventions: +## Schema Instructions -1. `evaluation_id` uses `{benchmark_name}/{model_id}/{retrieved_timestamp}` -2. `source_metadata.source_type` is `documentation` or `evaluation_run` -3. `source_data` is set per result (`url`, `hf_dataset`, or `other`) -4. Level-based metrics use integer values plus `level_names` +1. **`model_info`**: Use Hugging Face formatting (`developer_name/model_name`). If a model does not come from Hugging Face, use the exact API reference. Check [examples in data/livecodebenchpro](https://huggingface.co/datasets/evaleval/EEE_datastore/tree/main/data/livecodebenchpro). Notably, some do have a **date included in the model name**, but others **do not**. For example: +- OpenAI: `gpt-4o-2024-11-20`, `gpt-5-2025-08-07`, `o3-2025-04-16` +- Anthropic: `claude-3-7-sonnet-20250219`, `claude-3-sonnet-20240229` +- Google: `gemini-2.5-pro`, `gemini-2.5-flash` +- xAI (Grok): `grok-2-2024-08-13`, `grok-3-2025-01-15` + +2. **`evaluation_id`**: Use `{benchmark_name}/{model_id}/{retrieved_timestamp}` format (e.g. `livecodebenchpro/qwen3-235b-a22b-thinking-2507/1760492095.8105888`). + +3. **`inference_platform`** vs **`inference_engine`**: Where possible, specify where the evaluation was run using one of these two fields. +- `inference_platform`: Use this field when the evaluation was run through a remote API (e.g., `openai`, `huggingface`, `openrouter`, `anthropic`, `xai`). +- `inference_engine`: Use this field when the evaluation was run locally. This is now an object with `name` and `version` (e.g. `{"name": "vllm", "version": "0.6.0"}`). + +4. The `source_type` on `source_metadata` has two options: `documentation` and `evaluation_run`. 
Use `documentation` when results are scraped from a leaderboard or paper. Use `evaluation_run` when the evaluation was run locally (e.g. via an eval converter). + +5. **`source_data`** is specified per evaluation result (inside `evaluation_results`), with three variants: +- `source_type: "url"` - link to a web source (e.g. leaderboard API) +- `source_type: "hf_dataset"` - reference to a Hugging Face dataset (e.g. `{"hf_repo": "google/IFEval"}`) +- `source_type: "other"` - for private or proprietary datasets + +6. The schema is designed to accommodate both numeric and level-based (e.g. Low, Medium, High) metrics. For level-based metrics, the actual `value` should be converted to an integer (e.g. Low = 1, Medium = 2, High = 3), and the `level_names` property should be used to specify the mapping of levels to integers. + +7. **Timestamps**: The schema has three timestamp fields - use them as follows: +- `retrieved_timestamp` (required) - when this record was created, in Unix epoch format (e.g. `1760492095.8105888`) +- `evaluation_timestamp` (top-level, optional) - when the evaluation was run +- `evaluation_results[].evaluation_timestamp` (per-result, optional) - when a specific evaluation result was produced, if different results were run at different times + +8. Additional details can be provided in several places in the schema. They are not required, but can be useful for detailed analysis. +- `model_info.additional_details`: Use this field to provide any additional information about the model itself (e.g. 
number of parameters) +- `evaluation_results.generation_config.generation_args`: Specify additional arguments used to generate outputs from the model +- `evaluation_results.generation_config.additional_details`: Use this field to provide any additional information about the evaluation process that is not captured elsewhere \ No newline at end of file diff --git a/docs/data-structure/validation.md b/docs/data-structure/validation.md index 56cdd7048..6234f1258 100644 --- a/docs/data-structure/validation.md +++ b/docs/data-structure/validation.md @@ -7,18 +7,44 @@ nav_order: 2 # Validation -Validate aggregate `.json` files and instance-level `.jsonl` files: +Validation uses Pydantic models generated from the JSON schemas. This validates aggregate `.json` files against `EvaluationLog` and instance-level `_samples.jsonl` files line-by-line against `InstanceLevelEvaluationLog`. Requires [uv](https://docs.astral.sh/uv/). -```bash -uv run python -m every_eval_ever validate data/ +## Validate files with the package CLI + +```sh +# Single aggregate file +uv run python -m every_eval_ever validate data/benchmark/dev/model/uuid.json + +# Instance-level JSONL +uv run python -m every_eval_ever validate data/benchmark/dev/model/uuid_samples.jsonl + +# Entire directory (recurses into subdirectories) +uv run python -m every_eval_ever validate data/benchmark/dev/model/ + +# Multiple paths +uv run python -m every_eval_ever validate file1.json file2_samples.jsonl data/ ``` -Output formats: +File type is determined by extension: `.json` validates against `EvaluationLog`, `.jsonl` validates each line against `InstanceLevelEvaluationLog`. 
+ +### Output formats -```bash -uv run python -m every_eval_ever validate --format rich data/ +```sh +# Rich terminal output (default) +uv run python -m every_eval_ever validate data/ + +# Machine-readable JSON uv run python -m every_eval_ever validate --format json data/ + +# GitHub Actions annotations uv run python -m every_eval_ever validate --format github data/ ``` -Exit code is `0` when all files pass and `1` when any file fails. +### Options + +| Flag | Default | Description | +|------|---------|-------------| +| `--format {rich,json,github}` | `rich` | Output format | +| `--max-errors N` | `50` | Maximum errors reported per JSONL file | + +Exit code is `0` if all files pass and `1` if any fail. diff --git a/docs/eval-converters/index.md b/docs/eval-converters/index.md index 8388a8af0..444d75762 100644 --- a/docs/eval-converters/index.md +++ b/docs/eval-converters/index.md @@ -10,7 +10,7 @@ Supported conversion targets: - Inspect AI - HELM -- lm-evaluation-harness +- lm-eval-harness These are the three main general-purpose converters expected to be supported in the core package. @@ -22,6 +22,6 @@ uv run python -m every_eval_ever convert helm --log_path uv run python -m every_eval_ever convert lm_eval --log_path ``` -Adapter source code lives under [every_eval_ever/converters](../../every_eval_ever/converters/). +Adapter source code lives under [every_eval_ever/converters](https://github.com/evaleval/every_eval_ever/tree/main/every_eval_ever/converters). -One-off adapters also exist under [utils](../../utils/) for source-specific parsing and business logic. +One-off adapters also exist under [utils](https://github.com/evaleval/every_eval_ever/tree/main/utils) for source-specific parsing and business logic. diff --git a/docs/index.md b/docs/index.md index e16385303..76f334041 100644 --- a/docs/index.md +++ b/docs/index.md @@ -10,9 +10,9 @@ nav_order: 1 **Every Eval Ever** is a shared schema and crowdsourced eval database. 
It defines a standardized metadata format for storing AI evaluation results — from leaderboard scrapes and research papers to local evaluation runs — so that results from different frameworks can be compared, reproduced, and reused. The three components that make it work: -- 📋 **A metadata schema** ([eval.schema.json](https://github.com/gbemike/every_eval_ever/blob/add-read-the-docs/eval.schema.json)) that defines the information needed for meaningful comparison of evaluation results, including [instance-level data](https://github.com/gbemike/every_eval_ever/blob/add-read-the-docs/instance_level_eval.schema.json) +- 📋 **A metadata schema** ([eval.schema.json](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json)) that defines the information needed for meaningful comparison of evaluation results, including [instance-level data](https://github.com/evaleval/every_eval_ever/blob/main/instance_level_eval.schema.json) - 🔧 **Validation** that checks data against the schema before it enters the repository -- 🔌 **Converters** for [Inspect AI](https://github.com/gbemike/every_eval_ever/tree/add-read-the-docs/every_eval_ever/converters/inspect), [HELM](https://github.com/gbemike/every_eval_ever/blob/add-read-the-docs/every_eval_ever/converters/helm), and [lm-eval-harness](https://github.com/gbemike/every_eval_ever/blob/add-read-the-docs/every_eval_ever/converters/lm_eval), so you can transform your existing evaluation logs into the standard format +- 🔌 **Converters** for [Inspect AI](https://github.com/evaleval/every_eval_ever/tree/main/every_eval_ever/converters/inspect), [HELM](https://github.com/evaleval/every_eval_ever/tree/main/every_eval_ever/converters/helm), and [lm-eval-harness](https://github.com/evaleval/every_eval_ever/tree/main/every_eval_ever/converters/lm_eval), so you can transform your existing evaluation logs into the standard format ## Project Components From 8cf811a7fbfb41c96e084e3b9c1ebccc37f00e8d Mon Sep 17 00:00:00 2001 From: gbemike 
Date: Tue, 26 May 2026 18:55:58 +0100 Subject: [PATCH 13/16] fix github actions build block --- .github/workflows/pages.yml | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index 5b3f109ae..1e1de4f8a 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -8,18 +8,15 @@ on: workflow_dispatch: -permissions: - contents: read - pages: write - id-token: write - concurrency: - group: "pages" - cancel-in-progress: false + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} jobs: build: runs-on: ubuntu-latest + permissions: + contents: read steps: - name: Checkout uses: actions/checkout@v4 @@ -30,9 +27,6 @@ jobs: ruby-version: '3.2' bundler-cache: true - - name: Setup Pages - uses: actions/configure-pages@v5 - - name: Build site run: bundle exec jekyll build @@ -42,6 +36,10 @@ jobs: path: ./_site deploy: + permissions: + contents: read + pages: write + id-token: write environment: name: github-pages url: ${{ steps.deployment.outputs.page_url }} @@ -51,4 +49,4 @@ jobs: steps: - name: Deploy id: deployment - uses: actions/deploy-pages@v5 + uses: actions/deploy-pages@v5 \ No newline at end of file From 01fdf757689771e7a02258803059c8c0eeeaab92 Mon Sep 17 00:00:00 2001 From: gbemike Date: Tue, 26 May 2026 19:59:04 +0100 Subject: [PATCH 14/16] refactor: revert eee_stats script to normal --- every_eval_ever/helpers/eee_stats.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/every_eval_ever/helpers/eee_stats.py b/every_eval_ever/helpers/eee_stats.py index 1879cdd9a..c3b296ac7 100644 --- a/every_eval_ever/helpers/eee_stats.py +++ b/every_eval_ever/helpers/eee_stats.py @@ -192,10 +192,9 @@ def analyze_data(con, schema_table, instance_table, csv_path) -> None: WHEN eval_library.name IN ('unknown', 'custom') THEN 'unknown/custom' ELSE 'named harness' END AS 
harness_status, - COUNT(*) AS n_evaluation_runs, - ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER (), 1) AS pct - FROM {schema_table}, - LATERAL UNNEST(evaluation_results) AS t(er) + COUNT(DISTINCT evaluation_id) AS n_evaluation_runs, + ROUND(100.0 * COUNT(DISTINCT evaluation_id) / SUM(COUNT(DISTINCT evaluation_id)) OVER (), 1) AS pct + FROM {schema_table} GROUP BY 1 ORDER BY n_evaluation_runs DESC; """, @@ -209,17 +208,6 @@ def analyze_data(con, schema_table, instance_table, csv_path) -> None: ) section(f'eval harness percentage saved to {csv_path}') - unique_inference = execute_query( - con, - f""" - SELECT DISTINCT - -- COALESCE(model_info.inference_platform, 'unreported') AS platform_inference -- - COUNT(DISTINCT model_info.inference_platform) AS platform_inference - FROM {schema_table} - """ - ) - section(f'unique inference platforms {unique_inference}') - count_inference_platform = execute_query( con, f""" @@ -832,8 +820,8 @@ def main(): sys.exit(1) analyze_data(con, schema_table, instance_table, csv_path) - # create_visualisations(con, schema_table, instance_table, csv_path) + create_visualisations(con, schema_table, instance_table, csv_path) if __name__ == '__main__': - main() + main() \ No newline at end of file From e76dc6ad76ddfbcada8375576682674e6de44c60 Mon Sep 17 00:00:00 2001 From: Tommaso Cerruti <79256764+tommasocerruti@users.noreply.github.com> Date: Tue, 26 May 2026 21:06:10 +0200 Subject: [PATCH 15/16] Add newline at end of eee_stats.py Add a newline at the end of the file for better formatting. 
--- every_eval_ever/helpers/eee_stats.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/every_eval_ever/helpers/eee_stats.py b/every_eval_ever/helpers/eee_stats.py index c3b296ac7..4871a2e11 100644 --- a/every_eval_ever/helpers/eee_stats.py +++ b/every_eval_ever/helpers/eee_stats.py @@ -824,4 +824,5 @@ def main(): if __name__ == '__main__': - main() \ No newline at end of file + main() + From 6b8371216096eb90158fb3864cb900bb1ac6b560 Mon Sep 17 00:00:00 2001 From: Tommaso Cerruti <79256764+tommasocerruti@users.noreply.github.com> Date: Tue, 26 May 2026 21:09:40 +0200 Subject: [PATCH 16/16] nit: restoring previous state prior to PR --- every_eval_ever/helpers/eee_stats.py | 1 - 1 file changed, 1 deletion(-) diff --git a/every_eval_ever/helpers/eee_stats.py b/every_eval_ever/helpers/eee_stats.py index 4871a2e11..f4556c1b8 100644 --- a/every_eval_ever/helpers/eee_stats.py +++ b/every_eval_ever/helpers/eee_stats.py @@ -825,4 +825,3 @@ def main(): if __name__ == '__main__': main() -