From fde324e4f04cbfd90378a77e25f7c88f1323e9e7 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 12:23:39 -0600
Subject: [PATCH 01/61] build: bootstrap uv project for evals lib

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/__init__.py     |   0
 evals/lib/__init__.py |   0
 pyproject.toml        |  29 ++++
 tests/__init__.py     |   0
 tests/lib/__init__.py |   0
 uv.lock               | 325 ++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 354 insertions(+)
 create mode 100644 evals/__init__.py
 create mode 100644 evals/lib/__init__.py
 create mode 100644 pyproject.toml
 create mode 100644 tests/__init__.py
 create mode 100644 tests/lib/__init__.py
 create mode 100644 uv.lock

diff --git a/evals/__init__.py b/evals/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/evals/lib/__init__.py b/evals/lib/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..b64b1ff
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,29 @@
+[project]
+name = "agent-skills-evals"
+version = "0.1.0"
+description = "Eval harness + shared grading lib for StackHawk agent skills"
+requires-python = ">=3.11"
+dependencies = [
+    "pydantic>=2.6",
+    "pyyaml>=6.0",
+    "rich>=13.0",
+]
+
+[dependency-groups]
+dev = ["pytest>=8.0"]
+
+[project.scripts]
+evals = "evals.cli:main"
+compare = "evals.cli:compare"
+regrade = "evals.cli:regrade"
+validate = "evals.cli:validate"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["evals"]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/lib/__init__.py b/tests/lib/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/uv.lock b/uv.lock
new file mode 100644
index 0000000..851950e
--- /dev/null
+++ b/uv.lock
@@ -0,0 +1,325 @@
+version = 1
+revision = 2
+requires-python = ">=3.11"
+
+[[package]]
+name = "agent-skills-evals"
+version = "0.1.0"
+source = { editable = "." }
+dependencies = [
+    { name = "pydantic" },
+    { name = "pyyaml" },
+    { name = "rich" },
+]
+
+[package.dev-dependencies]
+dev = [
+    { name = "pytest" },
+]
+
+[package.metadata]
+requires-dist = [
+    { name = "pydantic", specifier = ">=2.6" },
+    { name = "pyyaml", specifier = ">=6.0" },
+    { name = "rich", specifier = ">=13.0" },
+]
+
+[package.metadata.requires-dev]
+dev = [{ name = "pytest", specifier = ">=8.0" }]
+
+[[package]]
+name = "annotated-types"
+version = "0.7.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },
+]
+
+[[package]]
+name = "colorama"
+version = "0.4.6"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
+]
+
+[[package]]
+name = "iniconfig"
+version = "2.3.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" },
+]
+
+[[package]]
+name = "markdown-it-py"
+version = "4.2.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "mdurl" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/06/ff/7841249c247aa650a76b9ee4bbaeae59370dc8bfd2f6c01f3630c35eb134/markdown_it_py-4.2.0.tar.gz", hash = "sha256:04a21681d6fbb623de53f6f364d352309d4094dd4194040a10fd51833e418d49", size = 82454, upload-time = "2026-05-07T12:08:28.36Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b3/81/4da04ced5a082363ecfa159c010d200ecbd959ae410c10c0264a38cac0f5/markdown_it_py-4.2.0-py3-none-any.whl", hash = "sha256:9f7ebbcd14fe59494226453aed97c1070d83f8d24b6fc3a3bcf9a38092641c4a", size = 91687, upload-time = "2026-05-07T12:08:27.182Z" },
+]
+
+[[package]]
+name = "mdurl"
+version = "0.1.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" },
+]
+
+[[package]]
+name = "packaging"
+version = "26.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d7/f1/e7a6dd94a8d4a5626c03e4e99c87f241ba9e350cd9e6d75123f992427270/packaging-26.2.tar.gz", hash = "sha256:ff452ff5a3e828ce110190feff1178bb1f2ea2281fa2075aadb987c2fb221661", size = 228134, upload-time = "2026-04-24T20:15:23.917Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/df/b2/87e62e8c3e2f4b32e5fe99e0b86d576da1312593b39f47d8ceef365e95ed/packaging-26.2-py3-none-any.whl", hash = "sha256:5fc45236b9446107ff2415ce77c807cee2862cb6fac22b8a73826d0693b0980e", size = 100195, upload-time = "2026-04-24T20:15:22.081Z" },
+]
+
+[[package]]
+name = "pluggy"
+version = "1.6.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" },
+]
+
+[[package]]
+name = "pydantic"
+version = "2.13.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "annotated-types" },
+    { name = "pydantic-core" },
+    { name = "typing-extensions" },
+    { name = "typing-inspection" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/18/a5/b60d21ac674192f8ab0ba4e9fd860690f9b4a6e51ca5df118733b487d8d6/pydantic-2.13.4.tar.gz", hash = "sha256:c40756b57adaa8b1efeeced5c196f3f3b7c435f90e84ea7f443901bec8099ef6", size = 844775, upload-time = "2026-05-06T13:43:05.343Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/fd/7b/122376b1fd3c62c1ed9dc80c931ace4844b3c55407b6fb2d199377c9736f/pydantic-2.13.4-py3-none-any.whl", hash = "sha256:45a282cde31d808236fd7ea9d919b128653c8b38b393d1c4ab335c62924d9aba", size = 472262, upload-time = "2026-05-06T13:43:02.641Z" },
+]
+
+[[package]]
+name = "pydantic-core"
+version = "2.46.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/9d/56/921726b776ace8d8f5db44c4ef961006580d91dc52b803c489fafd1aa249/pydantic_core-2.46.4.tar.gz", hash = "sha256:62f875393d7f270851f20523dd2e29f082bcc82292d66db2b64ea71f64b6e1c1", size = 471464, upload-time = "2026-05-06T13:37:06.98Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5c/fa/6d7708d2cfc1a832acb6aeb0cd16e801902df8a0f583bb3b4b527fde022e/pydantic_core-2.46.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:0e96592440881c74a213e5ad528e2b24d3d4f940de2766bed9010ab1d9e51594", size = 2111872, upload-time = "2026-05-06T13:40:27.596Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/6f/aa064a3e74b5745afbdf250594f38e7ead05e2d651bcb35994b9417a0d4d/pydantic_core-2.46.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e0d65b8c354be7fb5f720c3caa8bc940bc2d20ce749c8e06135f07f8ed95dd7c", size = 1948255, upload-time = "2026-05-06T13:39:12.574Z" },
+    { url = "https://files.pythonhosted.org/packages/43/3a/41114a9f7569b84b4d84e7a018c57c56347dac30c0d4a872946ec4e36c46/pydantic_core-2.46.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bfb192b3f4b9e8a89b6277b6ce787564f62cfd272055f6e685726b111dc7826", size = 1972827, upload-time = "2026-05-06T13:38:19.841Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/25/1ab42e8048fe551934d9884e8d64daa7e990ad386f310a15981aeb6a5b08/pydantic_core-2.46.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9037063db01f09b09e237c282b6792bd4da634b5402c4e7f0c61effed7701a04", size = 2041051, upload-time = "2026-05-06T13:38:10.447Z" },
+    { url = "https://files.pythonhosted.org/packages/94/c2/1a934597ddf08da410385b3b7aae91956a5a76c635effef456074fad7e88/pydantic_core-2.46.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fc010ab034c8c7452522748bf937df58020d256ccae0874463d1f4d01758af8e", size = 2221314, upload-time = "2026-05-06T13:40:13.089Z" },
+    { url = "https://files.pythonhosted.org/packages/02/6d/9e8ad178c9c4df27ad3c8f25d1fe2a7ab0d2ba0559fad4aee5d3d1f16771/pydantic_core-2.46.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8c5dac79fa1614d1e06ca695109c6105923bd9c7d1d6c918d4e637b7e6b32fd3", size = 2285146, upload-time = "2026-05-06T13:38:59.224Z" },
+    { url = "https://files.pythonhosted.org/packages/80/50/540cd3aeefc041beb111125c4bff779831a2111fc6b15a9138cda277d32c/pydantic_core-2.46.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9fa868638bf362d3d138ea55829cefb3d5f4b0d7f142234382a15e2485dbec4", size = 2089685, upload-time = "2026-05-06T13:38:17.762Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/a4/b440ad35f05f6a38f89fa0f149accb3f0e02be94ca5e15f3c449a61b4bc9/pydantic_core-2.46.4-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:17299feefe090f2caa5b8e37222bb5f663e4935a8bfa6931d4102e5df1a9f398", size = 2115420, upload-time = "2026-05-06T13:37:58.195Z" },
+    { url = "https://files.pythonhosted.org/packages/99/61/de4f55db8dfd57bfdfa9a12ec90fe1b57c4f41062f7ca86f08586b3e0ac0/pydantic_core-2.46.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4c63ebc82684aa89d9a3bcbd13d515b3be44250dc68dd3bd81526c1cb31286c3", size = 2165122, upload-time = "2026-05-06T13:37:01.167Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/52/7c529d7bdb2d1068bd52f51fe32572c8301f9a4febf1948f10639f1436f5/pydantic_core-2.46.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:aaa2a54443eff1950ba5ddc6b6ccda0d9c84a364276a62f969bdf2a390650848", size = 2182573, upload-time = "2026-05-06T13:38:45.04Z" },
+    { url = "https://files.pythonhosted.org/packages/37/b3/7c40325848ba78247f2812dcf9c7274e38cd801820ca6dd9fe63bcfb0eb4/pydantic_core-2.46.4-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:18e5ceec2ab67e6d5f1a9085e5a24c9c4e2ac4545730bfe668680bca05e555f3", size = 2317139, upload-time = "2026-05-06T13:37:15.539Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/37/f913f81a657c865b75da6c0dbed79876073c2a43b5bd9edbe8da785e4d49/pydantic_core-2.46.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a0f62d0a58f4e7da165457e995725421e0064f2255d8eccebc49f41bbc23b109", size = 2360433, upload-time = "2026-05-06T13:37:30.099Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/67/6acaa1be2567f9256b056d8477158cac7240813956ce86e49deae8e173b4/pydantic_core-2.46.4-cp311-cp311-win32.whl", hash = "sha256:041bde0a48fd37cf71cab1c9d56d3e8625a3793fef1f7dd232b3ff37e978ecda", size = 1985513, upload-time = "2026-05-06T13:38:15.669Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/e6/c505f83dfeda9a2e5c995cfd872949e4d05e12f7feb3dca72f633daefa94/pydantic_core-2.46.4-cp311-cp311-win_amd64.whl", hash = "sha256:6f2eeda33a839975441c86a4119e1383c50b47faf0cbb5176985565c6bb02c33", size = 2071114, upload-time = "2026-05-06T13:40:35.416Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/da/7a263a96d965d9d0df5e8de8a475f33495451117035b09acb110288c381f/pydantic_core-2.46.4-cp311-cp311-win_arm64.whl", hash = "sha256:14f4c5d6db102bd796a627bbb3a17b4cf4574b9ae861d8b7c9a9661c6dd3362d", size = 2044298, upload-time = "2026-05-06T13:38:29.754Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/8c/af022f0af448d7747c5154288d46b5f2bc5f17366eaa0e23e9aa04d59f3b/pydantic_core-2.46.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:3245406455a5d98187ec35530fd772b1d799b26667980872c8d4614991e2c4a2", size = 2106158, upload-time = "2026-05-06T13:38:57.215Z" },
+    { url = "https://files.pythonhosted.org/packages/19/95/6195171e385007300f0f5574592e467c568becce2d937a0b6804f218bc49/pydantic_core-2.46.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:962ccbab7b642487b1d8b7df90ef677e03134cf1fd8880bf698649b22a69371f", size = 1951724, upload-time = "2026-05-06T13:37:02.697Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/bc/f47d1ff9cbb1620e1b5b697eef06010035735f07820180e74178226b27b3/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8233f2947cf85404441fd7e0085f53b10c93e0ee78611099b5c7237e36aacbf7", size = 1975742, upload-time = "2026-05-06T13:37:09.448Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/11/9b9a5b0306345664a2da6410877af6e8082481b5884b3ddd78d47c6013ce/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3a233125ac121aa3ffba9a2b59edfc4a985a76092dc8279586ab4b71390875e7", size = 2052418, upload-time = "2026-05-06T13:37:38.234Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/b7/a65fec226f5d78fc39f4a13c4cc0c768c22b113438f60c14adc9d2865038/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5b712b53160b79a5850310b912a5ef8e57e56947c8ad690c227f5c9d7e561712", size = 2232274, upload-time = "2026-05-06T13:38:27.753Z" },
+    { url = "https://files.pythonhosted.org/packages/68/f0/92039db98b907ef49269a8271f67db9cb78ae2fc68062ef7e4e77adb5f61/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9401557acd873c3a7f3eb9383edef8ac4968f9510e340f4808d427e75667e7b4", size = 2309940, upload-time = "2026-05-06T13:38:05.353Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/97/2aab507d3d00ca626e8e57c1eac6a79e4e5fbcc63eb99733ff55d1717f65/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:926c9541b14b12b1681dca8a0b75feb510b06c6341b70a8e500c2fdcff837cce", size = 2094516, upload-time = "2026-05-06T13:39:10.577Z" },
+    { url = "https://files.pythonhosted.org/packages/22/37/a8aca44d40d737dde2bc05b3c6c07dff0de07ce6f82e9f3167aeaf4d5dea/pydantic_core-2.46.4-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:56cb4851bcaf3d117eddcef4fe66afd750a50274b0da8e22be256d10e5611987", size = 2136854, upload-time = "2026-05-06T13:40:22.59Z" },
+    { url = "https://files.pythonhosted.org/packages/24/99/fcef1b79238c06a8cbec70819ac722ba76e02bc8ada9b0fd66eba40da01b/pydantic_core-2.46.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c68fcd102d71ea85c5b2dfac3f4f8476eff42a9e078fd5faefff6d145063536b", size = 2180306, upload-time = "2026-05-06T13:40:10.666Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/6c/fc44000918855b42779d007ae63b0532794739027b2f417321cddbc44f6a/pydantic_core-2.46.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b2f69dec1725e79a012d920df1707de5caf7ed5e08f3be4435e25803efc47458", size = 2190044, upload-time = "2026-05-06T13:40:43.231Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/65/d9cadc9f1920d7a127ad2edba16c1db7916e59719285cd6c94600b0080ba/pydantic_core-2.46.4-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:8d0820e8192167f80d88d64038e609c31452eeca865b4e1d9950a27a4609b00b", size = 2329133, upload-time = "2026-05-06T13:39:57.365Z" },
+    { url = "https://files.pythonhosted.org/packages/d0/cf/c873d91679f3a30bcf5e7ac280ce5573483e72295307685120d0d5ad3416/pydantic_core-2.46.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:fbdb89b3e1c94a30cc5edfce477c6e6a5dc4d8f84665b455c27582f211a1c72c", size = 2374464, upload-time = "2026-05-06T13:38:06.976Z" },
+    { url = "https://files.pythonhosted.org/packages/47/bd/6f2fc8188f31bf10590f1e98e7b306336161fac930a8c514cd7bd828c7dc/pydantic_core-2.46.4-cp312-cp312-win32.whl", hash = "sha256:9aa768456404a8bf48a4406685ac2bec8e72b62c69313734fa3b73cf33b3a894", size = 1974823, upload-time = "2026-05-06T13:40:47.985Z" },
+    { url = "https://files.pythonhosted.org/packages/40/8c/985c1d41ea1107c2534abd9870e4ed5c8e7669b5c308297835c001e7a1c4/pydantic_core-2.46.4-cp312-cp312-win_amd64.whl", hash = "sha256:e9c26f834c65f5752f3f06cb08cb86a913ceb7274d0db6e267808a708b46bc89", size = 2072919, upload-time = "2026-05-06T13:39:21.153Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/ba/f463d006e0c47373ca7ec5e1a261c59dc01ef4d62b2657af925fb0deee3a/pydantic_core-2.46.4-cp312-cp312-win_arm64.whl", hash = "sha256:4fc73cb559bdb54b1134a706a2802a4cddd27a0633f5abb7e53056268751ac6a", size = 2027604, upload-time = "2026-05-06T13:39:03.753Z" },
+    { url = "https://files.pythonhosted.org/packages/51/a2/5d30b469c5267a17b39dec53208222f76a8d351dfac4af661888c5aee77d/pydantic_core-2.46.4-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:5d5902252db0d3cedf8d4a1bc68f70eeb430f7e4c7104c8c476753519b423008", size = 2106306, upload-time = "2026-05-06T13:37:48.029Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/81/4fa520eaffa8bd7d1525e644cd6d39e7d60b1592bc5b516693c7340b50f1/pydantic_core-2.46.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c94f0688e7b8d0a67abf40e57a7eaaecd17cc9586706a31b76c031f63df052b4", size = 1951906, upload-time = "2026-05-06T13:37:17.012Z" },
+    { url = "https://files.pythonhosted.org/packages/03/d5/fd02da45b659668b05923b17ba3a0100a0a3d5541e3bd8fcc4ecb711309e/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f027324c56cd5406ca49c124b0db10e56c69064fec039acc571c29020cc87c76", size = 1976802, upload-time = "2026-05-06T13:37:35.113Z" },
+    { url = "https://files.pythonhosted.org/packages/21/f2/95727e1368be3d3ed485eaab7adbd7dda408f33f7a36e8b48e0144002b91/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e739fee756ba1010f8bcccb534252e85a35fe45ae92c295a06059ce58b74ccd3", size = 2052446, upload-time = "2026-05-06T13:37:12.313Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/86/5d99feea3f77c7234b8718075b23db11532773c1a0dbd9b9490215dc2eeb/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9d56801be94b86a9da183e5f3766e6310752b99ff647e38b09a9500d88e46e76", size = 2232757, upload-time = "2026-05-06T13:39:01.149Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/3a/508ac615935ef7588cf6d9e9b91309fdc2da751af865e02a9098de88258c/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2412e734dcb48da14d4e4006b82b46b74f2518b8a26ee7e58c6844a6cd6d03c4", size = 2309275, upload-time = "2026-05-06T13:37:41.406Z" },
+    { url = "https://files.pythonhosted.org/packages/07/f8/41db9de19d7987d6b04715a02b3b40aea467000275d9d758ffaa31af7d50/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9551187363ffc0de2a00b2e47c25aeaeb1020b69b668762966df15fc5659dd5a", size = 2094467, upload-time = "2026-05-06T13:39:18.847Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/e2/f35033184cb11d0052daf4416e8e10a502ea2ac006fc4f459aee872727d1/pydantic_core-2.46.4-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:0186750b482eefa11d7f435892b09c5c606193ef3375bcf94aa00ae6bfb66262", size = 2134417, upload-time = "2026-05-06T13:40:17.944Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/7b/6ceeb1cc90e193862f444ebe373d8fdf613f0a82572dde03fb10734c6c71/pydantic_core-2.46.4-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5855698a4856556d86e8e6cd8434bc3ac0314ee8e12089ae0e143f64c6256e4e", size = 2179782, upload-time = "2026-05-06T13:40:32.618Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/f2/c8d7773ede6af08036423a00ae0ceffce266c3c52a096c435d68c896083f/pydantic_core-2.46.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:cbaf13819775b7f769bf4a1f066cb6df7a28d4480081a589828ef190226881cd", size = 2188782, upload-time = "2026-05-06T13:36:51.018Z" },
+    { url = "https://files.pythonhosted.org/packages/59/31/0c864784e31f09f05cdd87606f08923b9c9e7f6e51dd27f20f62f975ce9f/pydantic_core-2.46.4-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:633147d34cf4550417f12e2b1a0383973bdf5cdfde212cb09e9a581cf10820be", size = 2328334, upload-time = "2026-05-06T13:40:37.764Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/eb/4f6c8a41efa30baa755590f4141abf3a8c370fab610915733e74134a7270/pydantic_core-2.46.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:82cf5301172168103724d49a1444d3378cb20cdee30b116a1bd6031236298a5d", size = 2372986, upload-time = "2026-05-06T13:39:34.152Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/24/b375a480d53113860c299764bfe9f349a3dc9108b3adc0d7f0d786492ebf/pydantic_core-2.46.4-cp313-cp313-win32.whl", hash = "sha256:9fa8ae11da9e2b3126c6426f147e0fba88d96d65921799bb30c6abd1cb2c97fb", size = 1973693, upload-time = "2026-05-06T13:37:55.072Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/e8/cff247591966f2d22ec8c003cd7587e27b7ba7b81ab2fb888e3ab75dc285/pydantic_core-2.46.4-cp313-cp313-win_amd64.whl", hash = "sha256:6b3ace8194b0e5204818c92802dcdca7fc6d88aabbb799d7c795540d9cd6d292", size = 2071819, upload-time = "2026-05-06T13:38:49.139Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/1a/f4aee670d5670e9e148e0c82c7db98d780be566c6e6a97ee8035528ca0b3/pydantic_core-2.46.4-cp313-cp313-win_arm64.whl", hash = "sha256:184c081504d17f1c1066e430e117142b2c77d9448a97f7b65c6ac9fd9aee238d", size = 2027411, upload-time = "2026-05-06T13:40:45.796Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/74/228a26ddad29c6672b805d9fd78e8d251cd04004fa7eed0e622096cd0250/pydantic_core-2.46.4-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:428e04521a40150c85216fc8b85e8d39fece235a9cf5e383761238c7fa9b96fb", size = 2102079, upload-time = "2026-05-06T13:38:41.019Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/1f/8970b150a4b4365623ae00fc88603491f763c627311ae8031e3111356d6e/pydantic_core-2.46.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:23ace664830ee0bfe014a0c7bc248b1f7f25ed7ad103852c317624a1083af462", size = 1952179, upload-time = "2026-05-06T13:36:59.812Z" },
+    { url = "https://files.pythonhosted.org/packages/95/30/5211a831ae054928054b2f79731661087a2bc5c01e825c672b3a4a8f1b3e/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce5c1d2a8b27468f433ca974829c44060b8097eedc39933e3c206a90ee49c4a9", size = 1978926, upload-time = "2026-05-06T13:37:39.933Z" },
+    { url = "https://files.pythonhosted.org/packages/57/e9/689668733b1eb67adeef047db3c2e8788fcf65a7fd9c9e2b46b7744fe245/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7283d57845ecf5a163403eb0702dfc220cc4fbdd18919cb5ccea4f95ee1cdab4", size = 2046785, upload-time = "2026-05-06T13:38:01.995Z" },
+    { url = "https://files.pythonhosted.org/packages/60/d9/6715260422ff50a2109878fd24d948a6c3446bb2664f34ee78cd972b3acd/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8daafc69c93ee8a0204506a3b6b30f586ef54028f52aeeeb5c4cfc5184fd5914", size = 2228733, upload-time = "2026-05-06T13:40:50.371Z" },
+    { url = "https://files.pythonhosted.org/packages/18/ae/fdb2f64316afca925640f8e70bb1a564b0ec2721c1389e25b8eb4bf9a299/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd2213145bcc2ba85884d0ac63d222fece9209678f77b9b4d76f054c561adb28", size = 2307534, upload-time = "2026-05-06T13:37:21.531Z" },
+    { url = "https://files.pythonhosted.org/packages/89/1d/8eff589b45bb8190a9d12c49cfad0f176a5cbd1534908a6b5125e2886239/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a5f930472650a82629163023e630d160863fce524c616f4e5186e5de9d9a49b", size = 2099732, upload-time = "2026-05-06T13:39:31.942Z" },
+    { url = "https://files.pythonhosted.org/packages/06/d5/ee5a3366637fee41dee51a1fc91562dcf12ddbc68fda34e6b253da2324bb/pydantic_core-2.46.4-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:c1b3f518abeca3aa13c712fd202306e145abf59a18b094a6bafb2d2bbf59192c", size = 2129627, upload-time = "2026-05-06T13:37:25.033Z" },
+    { url = "https://files.pythonhosted.org/packages/94/33/2414be571d2c6a6c4d08be21f9292b6d3fdb08949a97b6dfe985017821db/pydantic_core-2.46.4-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1a7dd0b3ee80d90150e3495a3a13ac34dbcbfd4f012996a6a1d8900e91b5c0fb", size = 2179141, upload-time = "2026-05-06T13:37:14.046Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/79/7daa95be995be0eecc4cf75064cb33f9bbbfe3fe0158caf2f0d4a996a5c7/pydantic_core-2.46.4-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:3fb702cd90b0446a3a1c5e470bfa0dd23c0233b676a9099ddcc964fa6ca13898", size = 2184325, upload-time = "2026-05-06T13:36:53.615Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/cb/d0a382f5c0de8a222dc61c65348e0ce831b1f68e0a018450d31c2cace3a5/pydantic_core-2.46.4-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:b8458003118a712e66286df6a707db01c52c0f52f7db8e4a38f0da1d3b94fc4e", size = 2323990, upload-time = "2026-05-06T13:40:29.971Z" },
+    { url = "https://files.pythonhosted.org/packages/05/db/d9ba624cc4a5aced1598e88c04fdbd8310c8a69b9d38b9a3d39ce3a61ed7/pydantic_core-2.46.4-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:372429a130e469c9cd698925ce5fc50940b7a1336b0d82038e63d5bbc4edc519", size = 2369978, upload-time = "2026-05-06T13:37:23.027Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/20/d15df15ba918c423461905802bfd2981c3af0bfa0e40d05e13edbfa48bc3/pydantic_core-2.46.4-cp314-cp314-win32.whl", hash = "sha256:85bb3611ff1802f3ee7fdd7dbff26b56f343fb432d57a4728fdd49b6ef35e2f4", size = 1966354, upload-time = "2026-05-06T13:38:03.499Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/b6/6b8de4c0a7d7ab3004c439c80c5c1e0a3e8d78bbae19379b01960383d9e5/pydantic_core-2.46.4-cp314-cp314-win_amd64.whl", hash = "sha256:811ff8e9c313ab425368bcbb36e5c4ebd7108c2bbf4e4089cfbb0b01eff63fac", size = 2072238, upload-time = "2026-05-06T13:39:40.807Z" },
+    { url = "https://files.pythonhosted.org/packages/32/36/51eb763beec1f4cf59b1db243a7dcc39cbb41230f050a09b9d69faaf0a48/pydantic_core-2.46.4-cp314-cp314-win_arm64.whl", hash = "sha256:bfec22eab3c8cc2ceec0248aec886624116dc079afa027ecc8ad4a7e62010f8a", size = 2018251, upload-time = "2026-05-06T13:37:26.72Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/91/855af51d625b23aa987116a19e231d2aaef9c4a415273ddc189b79a45fee/pydantic_core-2.46.4-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:af8244b2bef6aaad6d92cda81372de7f8c8d36c9f0c3ea36e827c60e7d9467a0", size = 2099593, upload-time = "2026-05-06T13:39:47.682Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/1b/8784a54c65edb5f49f0a14d6977cf1b209bba85a4c77445b255c2de58ab3/pydantic_core-2.46.4-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5a4330cdbc57162e4b3aa303f588ba752257694c9c9be3e7ebb11b4aca659b5d", size = 1935226, upload-time = "2026-05-06T13:40:40.428Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/e7/1955d28d1afc56dd4b3ad7cc0cf39df1b9852964cf16e5d13912756d6d6b/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:29c61fc04a3d840155ff08e475a04809278972fe6aef51e2720554e96367e34b", size = 1974605, upload-time = "2026-05-06T13:37:32.029Z" },
+    { url = "https://files.pythonhosted.org/packages/93/e2/3fedbf0ba7a22850e6e9fd78117f1c0f10f950182344d8a6c535d468fdd8/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c50f2528cf200c5eed56faf3f4e22fcd5f38c157a8b78576e6ba3168ec35f000", size = 2030777, upload-time = "2026-05-06T13:38:55.239Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/61/46be275fcaaba0b4f5b9669dd852267ce1ff616592dccf7a7845588df091/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0cbe8b01f948de4286c74cdd6c667aceb38f5c1e26f0693b3983d9d74887c65e", size = 2236641, upload-time = "2026-05-06T13:37:08.096Z" },
+    { url = "https://files.pythonhosted.org/packages/60/db/12e93e46a8bac9988be3c016860f83293daea8c716c029c9ace279036f2f/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:617d7e2ca7dcb8c5cf6bcb8c59b8832c94b36196bbf1cbd1bfb56ed341905edd", size = 2286404, upload-time = "2026-05-06T13:40:20.221Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/4a/4d8b19008f38d31c53b8219cfedc2e3d5de5fe99d90076b7e767de29274f/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7027560ee92211647d0d34e3f7cd6f50da56399d26a9c8ad0da286d3869a53f3", size = 2109219, upload-time = "2026-05-06T13:38:12.153Z" },
+    { url = "https://files.pythonhosted.org/packages/88/70/3cbc40978fefb7bb09c6708d40d4ad1a5d70fd7213c3d17f971de868ec1f/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:f99626688942fb746e545232e7726926f3be91b5975f8b55327665fafda991c7", size = 2110594, upload-time = "2026-05-06T13:40:02.971Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/20/b8d36736216e29491125531685b2f9e61aa5b4b2599893f8268551da3338/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fc3e9034a63de20e15e8ade85358bc6efc614008cab72898b4b4952bea0509ff", size = 2159542, upload-time = "2026-05-06T13:39:27.506Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/a2/367df868eb584dacf6bf82a389272406d7178e301c4ac82545ab98bc2dd9/pydantic_core-2.46.4-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:97e7cf2be5c77b7d1a9713a05605d49460d02c6078d38d8bef3cbe323c548424", size = 2168146, upload-time = "2026-05-06T13:38:31.93Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/b8/4460f77f7e201893f649a29ab355dddd3beee8a97bcb1a320db414f9a06e/pydantic_core-2.46.4-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:3bf92c5d0e00fefaab325a4d27828fe6b6e2a21848686b5b60d2d9eeb09d76c6", size = 2306309, upload-time = "2026-05-06T13:37:44.717Z" },
+    { url = "https://files.pythonhosted.org/packages/64/c4/be2639293acd87dc8ddbcec41a73cee9b2ebf996fe6d892a1a74e88ad3f7/pydantic_core-2.46.4-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:3ecbc122d18468d06ca279dc26a8c2e2d5acb10943bb35e36ae92096dc3b5565", size = 2369736, upload-time = "2026-05-06T13:37:05.645Z" },
+    { url = "https://files.pythonhosted.org/packages/30/a6/9f9f380dbb301f67023bf8f707aaa75daadf84f7152d95c410fd7e81d994/pydantic_core-2.46.4-cp314-cp314t-win32.whl", hash = "sha256:e846ae7835bf0703ae43f534ab79a867146dadd59dc9ca5c8b53d5c8f7c9ef02", size = 1955575, upload-time = "2026-05-06T13:38:51.116Z" },
+    { url = "https://files.pythonhosted.org/packages/40/1f/f1eb9eb350e795d1af8586289746f5c5677d16043040d63710e22abc43c9/pydantic_core-2.46.4-cp314-cp314t-win_amd64.whl", hash = "sha256:2108ba5c1c1eca18030634489dc544844144ee36357f2f9f780b93e7ddbb44b5", size = 2051624, upload-time = "2026-05-06T13:38:21.672Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/d2/42dd53d0a85c27606f316d3aa5d2869c4e8470a5ed6dec30e4a1abe19192/pydantic_core-2.46.4-cp314-cp314t-win_arm64.whl", hash = "sha256:4fcbe087dbc2068af7eda3aa87634eba216dbda64d1ae73c8684b621d33f6596", size = 2017325, upload-time = "2026-05-06T13:40:52.723Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/a4/73995fd4ebbb46ba0ee51e6fa049b8f02c40daebb762208feda8a6b7894d/pydantic_core-2.46.4-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:14d4edf427bdcf950a8a02d7cb44a08614388dd6e1bdcbf4f67504fa7887da9c", size = 2111589, upload-time = "2026-05-06T13:37:10.817Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/7f/f37d3a5e8bfcc2e403f5c57a730f2d815693fb42119e8ea48b3789335af1/pydantic_core-2.46.4-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:0ce40cd7b21210e99342afafbd4d0f76d784eb5b1d60f3bdc566be4983c6c73b", size = 1944552, upload-time = "2026-05-06T13:36:56.717Z" },
+    { url = "https://files.pythonhosted.org/packages/15/3c/d7eb777b3ff43e8433a4efb39a17aa8fd98a4ee8561a24a67ef5db07b2d6/pydantic_core-2.46.4-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:90884113d8b48f760e9587002789ddd741e76ab9f89518cd1e43b1f1a52ec44b", size = 1982984, upload-time = "2026-05-06T13:39:06.207Z" },
+    { url = "https://files.pythonhosted.org/packages/63/87/70b9f40170a81afd55ca26c9b2acb25c20d64bcfbf888fafecb3ba077d4c/pydantic_core-2.46.4-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66ce7632c22d837c95301830e111ad0128a32b8207533b60896a96c4915192ea", size = 2138417, upload-time = "2026-05-06T13:39:45.476Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/1d/8987ad40f65ae1432753072f214fb5c74fe47ffbd0698bb9cbbb585664f8/pydantic_core-2.46.4-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:1d8ba486450b14f3b1d63bc521d410ec7565e52f887b9fb671791886436a42f7", size = 2095527, upload-time = "2026-05-06T13:39:52.283Z" },
+    { url = "https://files.pythonhosted.org/packages/64/d3/84c282a7eee1d3ac4c0377546ef5a1ea436ce26840d9ac3b7ed54a377507/pydantic_core-2.46.4-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:3009f12e4e90b7f88b4f9adb1b0c4a3d58fe7820f3238c190047209d148026df", size = 1936024, upload-time = "2026-05-06T13:40:15.671Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/ca/eac61596cdeb4d7e174d3dc0bd8a6238f14f75f97a24e7b7db4c7e7340a0/pydantic_core-2.46.4-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ad785e92e6dc634c21555edc8bd6b64957ab844541bcb96a1366c202951ae526", size = 1990696, upload-time = "2026-05-06T13:38:34.717Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/c3/7c8b240552251faf6b3a957db200fcfbbcec36763c050428b601e0c9b83b/pydantic_core-2.46.4-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00c603d540afdd6b80eb39f078f33ebd46211f02f33e34a32d9f053bba711de0", size = 2147590, upload-time = "2026-05-06T13:39:29.883Z" },
+    { url = "https://files.pythonhosted.org/packages/11/cb/428de0385b6c8d44b716feba566abfacfbd23ee3c4439faa789a1456242f/pydantic_core-2.46.4-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:0c563b08bca408dc7f65f700633d8442fffb2421fc47b8101377e9fd65051ff0", size = 2112782, upload-time = "2026-05-06T13:37:04.016Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/b5/6a17bdadd0fc1f170adfd05a20d37c832f52b117b4d9131da1f41bb097ce/pydantic_core-2.46.4-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:db06ffe51636ffe9ca531fe9023dd64bdd794be8754cb5df57c5498ae5b518a7", size = 1952146, upload-time = "2026-05-06T13:39:43.092Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/dc/03734d80e362cd43ef65428e9de77c730ce7f2f11c60d2b1e1b39f0fbf99/pydantic_core-2.46.4-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:133878133d271ade3d41d1bfb2a45ec38dbdbda40bc065921c6b04e4630127e2", size = 2134492, upload-time = "2026-05-06T13:36:58.124Z" },
+    { url = "https://files.pythonhosted.org/packages/de/df/5e5ffc085ed07cc22d298134d3d911c63e91f6a0eb91fe646750a3209910/pydantic_core-2.46.4-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9bc519fbf2b7578398853d815009ae5e4d4603d12f4e3f91da8c06852d3da3e9", size = 2156604, upload-time = "2026-05-06T13:37:49.88Z" },
+    { url = "https://files.pythonhosted.org/packages/81/44/6e112a4253e56f5705467cbab7ab5e91ee7398ba3d56d358635958893d3e/pydantic_core-2.46.4-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:c7a7bd4e39e8e4c12c39cd480356842b6a8a06e41b23a55a5e3e191718838ddf", size = 2183828, upload-time = "2026-05-06T13:37:43.053Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/ad/5565071e937d8e752842ac241463944c9eb14c87e2d269f2658a5bd05e98/pydantic_core-2.46.4-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:d396ec2b979760aaf3218e76c24e65bd0aca24983298653b3a9d7a45f9e47b30", size = 2310000, upload-time = "2026-05-06T13:37:56.694Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/c3/66883a5cec183e7fba4d024b4cbbe61851a63750ef606b0afecc46d1f2bf/pydantic_core-2.46.4-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:86e1a4418c6cd97d60c95c71164158eaf7324fae7b0923264016baa993eba6fc", size = 2361286, upload-time = "2026-05-06T13:40:05.667Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/2d/69abac8f838090bbecd5df894befb2c2619e7996a98ddb949db9f3b93225/pydantic_core-2.46.4-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:d51026d73fcfd93610abc7b27789c26b313920fcfb20e27462d74a7f8b06e983", size = 2193071, upload-time = "2026-05-06T13:38:08.682Z" },
+]
+
+[[package]]
+name = "pygments"
+version = "2.20.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f", size = 4955991, upload-time = "2026-03-29T13:29:33.898Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" },
+]
+
+[[package]]
+name = "pytest"
+version = "9.0.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+    { name = "iniconfig" },
+    { name = "packaging" },
+    { name = "pluggy" },
+    { name = "pygments" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/7d/0d/549bd94f1a0a402dc8cf64563a117c0f3765662e2e668477624baeec44d5/pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c", size = 1572165, upload-time = "2026-04-07T17:16:18.027Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9", size = 375249, upload-time = "2026-04-07T17:16:16.13Z" },
+]
+
+[[package]]
+name = "pyyaml"
+version = "6.0.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/6d/16/a95b6757765b7b031c9374925bb718d55e0a9ba8a1b6a12d25962ea44347/pyyaml-6.0.3-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:44edc647873928551a01e7a563d7452ccdebee747728c1080d881d68af7b997e", size = 185826, upload-time = "2025-09-25T21:31:58.655Z" },
+    { url = "https://files.pythonhosted.org/packages/16/19/13de8e4377ed53079ee996e1ab0a9c33ec2faf808a4647b7b4c0d46dd239/pyyaml-6.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:652cb6edd41e718550aad172851962662ff2681490a8a711af6a4d288dd96824", size = 175577, upload-time = "2025-09-25T21:32:00.088Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/62/d2eb46264d4b157dae1275b573017abec435397aa59cbcdab6fc978a8af4/pyyaml-6.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:10892704fc220243f5305762e276552a0395f7beb4dbf9b14ec8fd43b57f126c", size = 775556, upload-time = "2025-09-25T21:32:01.31Z" },
+    { url = "https://files.pythonhosted.org/packages/10/cb/16c3f2cf3266edd25aaa00d6c4350381c8b012ed6f5276675b9eba8d9ff4/pyyaml-6.0.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:850774a7879607d3a6f50d36d04f00ee69e7fc816450e5f7e58d7f17f1ae5c00", size = 882114, upload-time = "2025-09-25T21:32:03.376Z" },
+    { url = "https://files.pythonhosted.org/packages/71/60/917329f640924b18ff085ab889a11c763e0b573da888e8404ff486657602/pyyaml-6.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b8bb0864c5a28024fac8a632c443c87c5aa6f215c0b126c449ae1a150412f31d", size = 806638, upload-time = "2025-09-25T21:32:04.553Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/6f/529b0f316a9fd167281a6c3826b5583e6192dba792dd55e3203d3f8e655a/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37d57ad971609cf3c53ba6a7e365e40660e3be0e5175fa9f2365a379d6095a", size = 767463, upload-time = "2025-09-25T21:32:06.152Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/6a/b627b4e0c1dd03718543519ffb2f1deea4a1e6d42fbab8021936a4d22589/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:37503bfbfc9d2c40b344d06b2199cf0e96e97957ab1c1b546fd4f87e53e5d3e4", size = 794986, upload-time = "2025-09-25T21:32:07.367Z" },
+    { url = "https://files.pythonhosted.org/packages/45/91/47a6e1c42d9ee337c4839208f30d9f09caa9f720ec7582917b264defc875/pyyaml-6.0.3-cp311-cp311-win32.whl", hash = "sha256:8098f252adfa6c80ab48096053f512f2321f0b998f98150cea9bd23d83e1467b", size = 142543, upload-time = "2025-09-25T21:32:08.95Z" },
+    { url = "https://files.pythonhosted.org/packages/da/e3/ea007450a105ae919a72393cb06f122f288ef60bba2dc64b26e2646fa315/pyyaml-6.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:9f3bfb4965eb874431221a3ff3fdcddc7e74e3b07799e0e84ca4a0f867d449bf", size = 158763, upload-time = "2025-09-25T21:32:09.96Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size = 182063, upload-time = "2025-09-25T21:32:11.445Z" },
+    { url = "https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size = 173973, upload-time = "2025-09-25T21:32:12.492Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size = 775116, upload-time = "2025-09-25T21:32:13.652Z" },
+    { url = "https://files.pythonhosted.org/packages/65/30/d7353c338e12baef4ecc1b09e877c1970bd3382789c159b4f89d6a70dc09/pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c", size = 844011, upload-time = "2025-09-25T21:32:15.21Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/9d/b3589d3877982d4f2329302ef98a8026e7f4443c765c46cfecc8858c6b4b/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc", size = 807870, upload-time = "2025-09-25T21:32:16.431Z" },
+    { url = "https://files.pythonhosted.org/packages/05/c0/b3be26a015601b822b97d9149ff8cb5ead58c66f981e04fedf4e762f4bd4/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e", size = 761089, upload-time = "2025-09-25T21:32:17.56Z" },
+    { url = "https://files.pythonhosted.org/packages/be/8e/98435a21d1d4b46590d5459a22d88128103f8da4c2d4cb8f14f2a96504e1/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea", size = 790181, upload-time = "2025-09-25T21:32:18.834Z" },
+    { url = "https://files.pythonhosted.org/packages/74/93/7baea19427dcfbe1e5a372d81473250b379f04b1bd3c4c5ff825e2327202/pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5", size = 137658, upload-time = "2025-09-25T21:32:20.209Z" },
+    { url = "https://files.pythonhosted.org/packages/86/bf/899e81e4cce32febab4fb42bb97dcdf66bc135272882d1987881a4b519e9/pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b", size = 154003, upload-time = "2025-09-25T21:32:21.167Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/11/0fd08f8192109f7169db964b5707a2f1e8b745d4e239b784a5a1dd80d1db/pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8", size = 181669, upload-time = "2025-09-25T21:32:23.673Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/16/95309993f1d3748cd644e02e38b75d50cbc0d9561d21f390a76242ce073f/pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1", size = 173252, upload-time = "2025-09-25T21:32:25.149Z" },
+    { url = "https://files.pythonhosted.org/packages/50/31/b20f376d3f810b9b2371e72ef5adb33879b25edb7a6d072cb7ca0c486398/pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c", size = 767081, upload-time = "2025-09-25T21:32:26.575Z" },
+    { url = "https://files.pythonhosted.org/packages/49/1e/a55ca81e949270d5d4432fbbd19dfea5321eda7c41a849d443dc92fd1ff7/pyyaml-6.0.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5", size = 841159, upload-time = "2025-09-25T21:32:27.727Z" },
+    { url = "https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6", size = 801626, upload-time = "2025-09-25T21:32:28.878Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/11/ba845c23988798f40e52ba45f34849aa8a1f2d4af4b798588010792ebad6/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6", size = 753613, upload-time = "2025-09-25T21:32:30.178Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/e0/7966e1a7bfc0a45bf0a7fb6b98ea03fc9b8d84fa7f2229e9659680b69ee3/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be", size = 794115, upload-time = "2025-09-25T21:32:31.353Z" },
+    { url = "https://files.pythonhosted.org/packages/de/94/980b50a6531b3019e45ddeada0626d45fa85cbe22300844a7983285bed3b/pyyaml-6.0.3-cp313-cp313-win32.whl", hash = "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26", size = 137427, upload-time = "2025-09-25T21:32:32.58Z" },
+    { url = "https://files.pythonhosted.org/packages/97/c9/39d5b874e8b28845e4ec2202b5da735d0199dbe5b8fb85f91398814a9a46/pyyaml-6.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c", size = 154090, upload-time = "2025-09-25T21:32:33.659Z" },
+    { url = "https://files.pythonhosted.org/packages/73/e8/2bdf3ca2090f68bb3d75b44da7bbc71843b19c9f2b9cb9b0f4ab7a5a4329/pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb", size = 140246, upload-time = "2025-09-25T21:32:34.663Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = "2025-09-25T21:32:35.712Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = "2025-09-25T21:32:36.789Z" },
+    { url = "https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" },
+    { url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = "2025-09-25T21:32:39.178Z" },
+    { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time = "2025-09-25T21:32:40.865Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = "2025-09-25T21:32:43.362Z" },
+    { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = "2025-09-25T21:32:57.844Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = "2025-09-25T21:32:44.377Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261, upload-time = "2025-09-25T21:32:51.808Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" },
+    { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" },
+]
+
+[[package]]
+name = "rich"
+version = "15.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "markdown-it-py" },
+    { name = "pygments" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c0/8f/0722ca900cc807c13a6a0c696dacf35430f72e0ec571c4275d2371fca3e9/rich-15.0.0.tar.gz", hash = "sha256:edd07a4824c6b40189fb7ac9bc4c52536e9780fbbfbddf6f1e2502c31b068c36", size = 230680, upload-time = "2026-04-12T08:24:00.75Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/82/3b/64d4899d73f91ba49a8c18a8ff3f0ea8f1c1d75481760df8c68ef5235bf5/rich-15.0.0-py3-none-any.whl", hash = "sha256:33bd4ef74232fb73fe9279a257718407f169c09b78a87ad3d296f548e27de0bb", size = 310654, upload-time = "2026-04-12T08:24:02.83Z" },
+]
+
+[[package]]
+name = "typing-extensions"
+version = "4.15.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" },
+]
+
+[[package]]
+name = "typing-inspection"
+version = "0.4.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949, upload-time = "2025-10-01T02:14:41.687Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" },
+]

From 2c4fd368ba7252afee4954e502d3773e23ac6c09 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 12:25:36 -0600
Subject: [PATCH 02/61] feat(evals): Pydantic data models with strict
 validation

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/lib/models.py      | 80 ++++++++++++++++++++++++++++++++++++++++
 tests/lib/test_models.py | 54 +++++++++++++++++++++++++++
 2 files changed, 134 insertions(+)
 create mode 100644 evals/lib/models.py
 create mode 100644 tests/lib/test_models.py

diff --git a/evals/lib/models.py b/evals/lib/models.py
new file mode 100644
index 0000000..4c34ea3
--- /dev/null
+++ b/evals/lib/models.py
@@ -0,0 +1,80 @@
+"""Pydantic data contracts for the eval system. extra='forbid' makes config
+typos hard load-time errors instead of silently-ignored fields."""
+from __future__ import annotations
+from enum import Enum
+from typing import Literal
+
+from pydantic import BaseModel, ConfigDict, model_validator
+
+
+class BudgetSpec(BaseModel):  # per-axis budget values; every axis is optional (None = unset)
+    model_config = ConfigDict(extra="forbid")  # misspelled axis names fail validation
+    cost_usd: float | None = None  # presumably a spend ceiling in US dollars -- confirm
+    bash_commands: int | None = None  # presumably a cap on the number of bash commands
+    output_tokens: int | None = None  # presumably a cap on generated tokens
+    wall_seconds: float | None = None  # presumably a cap on wall-clock duration in seconds
+
+
+class ExpectedCheck(BaseModel):  # one expectation; exactly one of the three fields is set
+    model_config = ConfigDict(extra="forbid")  # unknown keys are validation errors
+    check_id: str | None = None      # reference an existing process-check by id
+    signal: str | None = None        # ad-hoc substring that MUST appear
+    anti_pattern: str | None = None  # substring that must NOT appear
+
+    @model_validator(mode="after")  # runs on the instance once its fields are validated
+    def _exactly_one(self) -> "ExpectedCheck":  # returns self unchanged when valid
+        set_count = sum(x is not None for x in (self.check_id, self.signal, self.anti_pattern))
+        if set_count != 1:  # rejects "none set" and "more than one set"; "" still counts as set
+            raise ValueError("ExpectedCheck must set exactly one of "
+                             "check_id / signal / anti_pattern")
+        return self
+
+
+class PromptConfig(BaseModel):  # one prompt entry of a skill's eval config
+    model_config = ConfigDict(extra="forbid")  # unknown keys are validation errors
+    id: str  # prompt identifier
+    should_trigger: bool  # presumably whether the skill is expected to fire -- confirm
+    invocation_type: Literal["explicit", "implicit", "contextual", "negative"]  # closed set
+    prompt: str
+    notes: str = ""  # optional free-form text
+    budget: BudgetSpec | None = None  # None when no budget is given
+    expected: list[ExpectedCheck] = []  # safe: pydantic copies mutable defaults per instance
+
+
+class Verdict(str, Enum):  # str mixin: members compare equal to their string values
+    PASS = "pass"
+    PASS_SLOW = "pass-slow"  # NOTE(review): presumably a pass that breached a budget -- confirm
+    FAIL = "fail"
+
+
+class ParsedRun(BaseModel):  # presumably the parsed output of one run; all fields default
+    bash_commands: list[str] = []
+    files_written: list[str] = []
+    files_edited: list[str] = []
+    output_text: str = ""
+    cost_usd: float = 0.0
+    output_tokens: int | None = None  # defaults to None, not 0
+    wall_seconds: float | None = None  # defaults to None, not 0.0
+    error: str | None = None  # presumably a failure description; None by default -- confirm
+
+
+class ProcessCheckResult(BaseModel):  # outcome of one process check (no extra="forbid" here)
+    id: str  # presumably the id of the check that produced this result -- confirm
+    passed: bool
+    severity: Literal["blocking", "warning"]  # closed set; other values fail validation
+    signal_found: str | None = None  # optional; presumably the matched signal text
+    anti_found: str | None = None  # optional; presumably the matched anti-pattern text
+
+
+class EvalResult(BaseModel):  # result record for one evaluated run
+    platform: str
+    skill: str
+    run_id: str
+    should_trigger: bool
+    did_trigger: bool
+    trigger_correct: bool  # stored as given; this model does not derive it from the two above
+    verdict: Verdict
+    budget_breaches: list[str] = []  # defaults to empty
+    process_checks: list[ProcessCheckResult] = []  # defaults to empty
+    score: int  # required; NOTE(review): the scale is not defined in this module
+    cost_usd: float = 0.0
diff --git a/tests/lib/test_models.py b/tests/lib/test_models.py
new file mode 100644
index 0000000..2f95d78
--- /dev/null
+++ b/tests/lib/test_models.py
@@ -0,0 +1,54 @@
+# tests/lib/test_models.py
+import pytest
+from pydantic import ValidationError
+from evals.lib.models import (
+    BudgetSpec, ExpectedCheck, PromptConfig, ParsedRun, Verdict,
+)
+
+
+def test_prompt_config_minimal():  # only the required fields are supplied
+    p = PromptConfig(id="hw-01", should_trigger=True,
+                     invocation_type="explicit", prompt="scan it")
+    assert p.budget is None  # optional fields fall back to their declared defaults
+    assert p.expected == []
+    assert p.notes == ""
+
+
+def test_prompt_config_rejects_unknown_field():  # exercises extra="forbid" on PromptConfig
+    with pytest.raises(ValidationError):
+        PromptConfig(id="hw-01", should_trigger=True,
+                     invocation_type="explicit", prompt="x", budget_usd=0.1)  # not a field
+
+
+def test_budget_spec_rejects_unknown_axis():  # exercises extra="forbid" on BudgetSpec
+    with pytest.raises(ValidationError):
+        BudgetSpec(cost_dollars=0.1)  # the declared axis is cost_usd
+
+
+def test_expected_check_requires_exactly_one():  # covers the one / none / two-set cases
+    ExpectedCheck(signal="hawk scan")            # ok
+    ExpectedCheck(check_id="step1")              # ok
+    ExpectedCheck(anti_pattern="curl")           # ok
+    with pytest.raises(ValidationError):
+        ExpectedCheck()                          # none set
+    with pytest.raises(ValidationError):
+        ExpectedCheck(signal="a", anti_pattern="b")  # two set
+
+
+def test_invocation_type_is_constrained():  # invocation_type is a Literal of four values
+    with pytest.raises(ValidationError):
+        PromptConfig(id="x", should_trigger=True,
+                     invocation_type="bogus", prompt="x")  # "bogus" is not an allowed value
+
+
+def test_verdict_values():  # compares members to plain strings via Verdict's str mixin
+    assert Verdict.PASS == "pass"
+    assert Verdict.PASS_SLOW == "pass-slow"
+    assert Verdict.FAIL == "fail"
+
+
+def test_parsed_run_defaults():  # ParsedRun can be built with no arguments
+    r = ParsedRun()
+    assert r.bash_commands == []
+    assert r.cost_usd == 0.0
+    assert r.output_tokens is None  # None by default, not 0

From 10415429813d9549d1caf3040defbb08a25c4b7c Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 12:27:22 -0600
Subject: [PATCH 03/61] feat(evals): skill config loader with validation

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/lib/config.py      | 39 +++++++++++++++++++
 tests/lib/test_config.py | 82 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 121 insertions(+)
 create mode 100644 evals/lib/config.py
 create mode 100644 tests/lib/test_config.py

diff --git a/evals/lib/config.py b/evals/lib/config.py
new file mode 100644
index 0000000..4736749
--- /dev/null
+++ b/evals/lib/config.py
@@ -0,0 +1,39 @@
+"""Load and validate a skill's eval config (prompts.yaml + process-checks.json)."""
+from __future__ import annotations
+import json
+from pathlib import Path
+
+import yaml
+from pydantic import BaseModel
+
+from evals.lib.models import PromptConfig
+
+EVALS_DIR = Path(__file__).resolve().parent.parent  # repo/evals
+
+
+class SkillConfig(BaseModel):
+    skill: str  # skill directory name, as passed to load_skill
+    prompts: list[PromptConfig]  # one validated entry per row of prompts.yaml
+    checks: list[dict]  # the "checks" array of process-checks.json, kept as plain dicts
+
+
+def load_skill(skill: str, base_dir: Path | None = None) -> SkillConfig:
+    base = base_dir or EVALS_DIR
+    skill_dir = base / skill
+    prompts_raw = yaml.safe_load((skill_dir / "prompts.yaml").read_text()) or []
+    prompts = [PromptConfig(**row) for row in prompts_raw]  # raises on bad fields
+
+    ids = [p.id for p in prompts]
+    dupes = {i for i in ids if ids.count(i) > 1}
+    if dupes:
+        raise ValueError(f"duplicate prompt id(s) in {skill}: {sorted(dupes)}")
+
+    checks = json.loads((skill_dir / "process-checks.json").read_text())["checks"]
+    id_set = set(ids)
+    for c in checks:
+        for target in c.get("applies_to", []):
+            if target not in id_set:
+                raise ValueError(
+                    f"check '{c['id']}' applies_to references unknown prompt '{target}'")
+
+    return SkillConfig(skill=skill, prompts=prompts, checks=checks)
diff --git a/tests/lib/test_config.py b/tests/lib/test_config.py
new file mode 100644
index 0000000..8f64c2e
--- /dev/null
+++ b/tests/lib/test_config.py
@@ -0,0 +1,82 @@
+# tests/lib/test_config.py
+import json
+import textwrap
+import pytest
+from pydantic import ValidationError
+from evals.lib.config import load_skill, SkillConfig
+
+
+def _write_skill(tmp_path, prompts_yaml: str, checks: dict):
+    skill_dir = tmp_path / "demo"
+    skill_dir.mkdir()
+    (skill_dir / "prompts.yaml").write_text(prompts_yaml)
+    (skill_dir / "process-checks.json").write_text(json.dumps(checks))
+    return skill_dir
+
+
+def test_load_skill_parses_prompts_and_checks(tmp_path):
+    yaml_text = textwrap.dedent("""
+      - id: d-01
+        should_trigger: true
+        invocation_type: explicit
+        prompt: do the thing
+        budget:
+          bash_commands: 5
+        expected:
+          - signal: "hawk scan"
+    """)
+    checks = {"skill": "demo", "checks": [
+        {"id": "c1", "type": "command_executed", "signals": ["hawk scan"],
+         "severity": "blocking"}]}
+    skill_dir = _write_skill(tmp_path, yaml_text, checks)
+
+    cfg = load_skill("demo", base_dir=skill_dir.parent)
+    assert isinstance(cfg, SkillConfig)
+    assert cfg.skill == "demo"
+    assert len(cfg.prompts) == 1
+    assert cfg.prompts[0].budget.bash_commands == 5
+    assert cfg.checks[0]["id"] == "c1"
+
+
+def test_load_skill_rejects_bad_prompt_field(tmp_path):
+    yaml_text = textwrap.dedent("""
+      - id: d-01
+        should_trigger: true
+        invocation_type: explicit
+        prompt: x
+        budget_usd: 0.1
+    """)
+    skill_dir = _write_skill(tmp_path, yaml_text, {"skill": "demo", "checks": []})
+    with pytest.raises(ValidationError):
+        load_skill("demo", base_dir=skill_dir.parent)
+
+
+def test_load_skill_rejects_duplicate_ids(tmp_path):
+    yaml_text = textwrap.dedent("""
+      - id: dup
+        should_trigger: true
+        invocation_type: explicit
+        prompt: a
+      - id: dup
+        should_trigger: false
+        invocation_type: negative
+        prompt: b
+    """)
+    skill_dir = _write_skill(tmp_path, yaml_text, {"skill": "demo", "checks": []})
+    with pytest.raises(ValueError, match="duplicate prompt id"):
+        load_skill("demo", base_dir=skill_dir.parent)
+
+
+def test_load_skill_rejects_applies_to_unknown_prompt(tmp_path):
+    yaml_text = textwrap.dedent("""
+      - id: d-01
+        should_trigger: true
+        invocation_type: explicit
+        prompt: x
+    """)
+    checks = {"skill": "demo", "checks": [
+        {"id": "c1", "type": "command_executed", "signals": ["x"],
+         "severity": "warning", "applies_to": ["nope"]}]}
+    skill_dir = _write_skill(tmp_path, yaml_text, checks)
+    with pytest.raises(ValueError, match="applies_to references unknown prompt"):
+        load_skill("demo", base_dir=skill_dir.parent)

From 23f82e276027738d9837711345be8dc9bb37b6db Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 12:29:41 -0600
Subject: [PATCH 04/61] feat(evals): grading with per-prompt expected + budget
 verdict

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/lib/grading.py      | 122 ++++++++++++++++++++++++++++++++++++++
 tests/lib/test_grading.py |  98 ++++++++++++++++++++++++++++++
 2 files changed, 220 insertions(+)
 create mode 100644 evals/lib/grading.py
 create mode 100644 tests/lib/test_grading.py

diff --git a/evals/lib/grading.py b/evals/lib/grading.py
new file mode 100644
index 0000000..821a99c
--- /dev/null
+++ b/evals/lib/grading.py
@@ -0,0 +1,122 @@
+"""Grading: process checks (ported from the claude-code harness), per-prompt
+ad-hoc expectations, budget scoring, and the three-state verdict."""
+from __future__ import annotations
+import re
+
+from evals.lib.models import (
+    ParsedRun, PromptConfig, BudgetSpec, ExpectedCheck, Verdict,
+    ProcessCheckResult, EvalResult,
+)
+
+
+def applicable_checks(checks: list[dict], prompt_id: str) -> list[dict]:
+    """A check applies if it has no applies_to (global) or names this prompt id."""
+    out = []
+    for c in checks:
+        targets = c.get("applies_to")
+        if not targets or prompt_id in targets:
+            out.append(c)
+    return out
+
+
+def _haystack(run: ParsedRun) -> str:
+    return " ".join([*run.bash_commands, run.output_text]).lower()
+
+
+def run_process_checks(run: ParsedRun, checks: list[dict]) -> list[ProcessCheckResult]:
+    haystack = _haystack(run)
+    all_files = " ".join(run.files_written + run.files_edited).lower()
+    results: list[ProcessCheckResult] = []
+
+    for check in checks:
+        ctype = check.get("type", "command_executed")
+        signals = [s.lower() for s in check.get("signals", [])]
+        antis = [a.lower() for a in check.get("anti_patterns", [])]
+        signal_hit = next((s for s in signals if s in haystack), None)
+        anti_hit = next((a for a in antis if a in haystack), None)
+
+        if ctype in ("command_negative", "file_content_negative", "output_negative"):
+            passed = anti_hit is None
+        elif ctype == "file_absent":
+            target = check.get("target_file", "").lower()
+            passed = target not in all_files
+        elif ctype == "conditional_command":
+            condition_str = check.get("condition", "")
+            m = re.search(r"'([^']+)'", condition_str)
+            keyword = m.group(1).lower() if m else None
+            passed = True if (keyword and keyword not in haystack) else signal_hit is not None
+        elif ctype == "command_preference":
+            preferred = [p.lower() for p in check.get("preferred", [])]
+            passed = any(p in haystack for p in preferred) and anti_hit is None
+        else:
+            passed = signal_hit is not None and (anti_hit is None if antis else True)
+
+        results.append(ProcessCheckResult(
+            id=check["id"], passed=passed,
+            severity=check.get("severity", "warning"),
+            signal_found=signal_hit, anti_found=anti_hit,
+        ))
+    return results
+
+
+def run_adhoc_expected(run: ParsedRun, expected: list[ExpectedCheck]) -> list[ProcessCheckResult]:
+    """Per-prompt expectations. signal/anti_pattern are blocking; check_id refs are
+    resolved by the caller against process-checks and skipped here."""
+    haystack = _haystack(run)
+    results: list[ProcessCheckResult] = []
+    for i, exp in enumerate(expected):
+        if exp.check_id is not None:
+            continue  # handled via applies_to / process checks
+        if exp.signal is not None:
+            hit = exp.signal.lower() in haystack
+            results.append(ProcessCheckResult(
+                id=f"expected[{i}]:signal", passed=hit, severity="blocking",
+                signal_found=exp.signal if hit else None))
+        elif exp.anti_pattern is not None:
+            hit = exp.anti_pattern.lower() in haystack
+            results.append(ProcessCheckResult(
+                id=f"expected[{i}]:anti", passed=not hit, severity="blocking",
+                anti_found=exp.anti_pattern if hit else None))
+    return results
+
+
+def check_budget(run: ParsedRun, budget: BudgetSpec) -> list[str]:
+    breaches: list[str] = []
+    if budget.cost_usd is not None and run.cost_usd > budget.cost_usd:
+        breaches.append(f"cost_usd {run.cost_usd:.3f} > {budget.cost_usd}")
+    if budget.bash_commands is not None and len(run.bash_commands) > budget.bash_commands:
+        breaches.append(f"bash_commands {len(run.bash_commands)} > {budget.bash_commands}")
+    if budget.output_tokens is not None and (run.output_tokens or 0) > budget.output_tokens:
+        breaches.append(f"output_tokens {run.output_tokens} > {budget.output_tokens}")
+    if budget.wall_seconds is not None and (run.wall_seconds or 0) > budget.wall_seconds:
+        breaches.append(f"wall_seconds {run.wall_seconds:.0f} > {budget.wall_seconds}")
+    return breaches
+
+
+def _score(checks: list[ProcessCheckResult]) -> int:
+    blocking = sum(1 for c in checks if not c.passed and c.severity == "blocking")
+    warning = sum(1 for c in checks if not c.passed and c.severity == "warning")
+    return max(0, 100 - blocking * 15 - warning * 5)
+
+
+def grade(prompt: PromptConfig, run: ParsedRun, checks: list[dict], *,
+          platform: str, skill: str, did_trigger: bool) -> EvalResult:
+    proc = run_process_checks(run, applicable_checks(checks, prompt.id))
+    proc += run_adhoc_expected(run, prompt.expected)
+
+    blocking_failed = any(not c.passed and c.severity == "blocking" for c in proc)
+    verdict = Verdict.FAIL if blocking_failed else Verdict.PASS
+
+    breaches: list[str] = []
+    if verdict == Verdict.PASS and prompt.budget is not None:
+        breaches = check_budget(run, prompt.budget)
+        if breaches:
+            verdict = Verdict.PASS_SLOW
+
+    return EvalResult(
+        platform=platform, skill=skill, run_id=prompt.id,
+        should_trigger=prompt.should_trigger, did_trigger=did_trigger,
+        trigger_correct=(did_trigger == prompt.should_trigger),
+        verdict=verdict, budget_breaches=breaches, process_checks=proc,
+        score=_score(proc), cost_usd=run.cost_usd,
+    )
diff --git a/tests/lib/test_grading.py b/tests/lib/test_grading.py
new file mode 100644
index 0000000..e1ba94e
--- /dev/null
+++ b/tests/lib/test_grading.py
@@ -0,0 +1,98 @@
+# tests/lib/test_grading.py
+from evals.lib.models import ParsedRun, PromptConfig, BudgetSpec, ExpectedCheck, Verdict
+from evals.lib.grading import (
+    applicable_checks, run_process_checks, run_adhoc_expected, check_budget, grade,
+)
+
+
+def _prompt(**kw):
+    base = dict(id="d-01", should_trigger=True, invocation_type="explicit", prompt="x")
+    base.update(kw)
+    return PromptConfig(**base)
+
+
+def test_applicable_checks_global_and_scoped():
+    checks = [
+        {"id": "global", "type": "command_executed", "signals": ["a"], "severity": "warning"},
+        {"id": "scoped", "type": "command_executed", "signals": ["b"], "severity": "warning",
+         "applies_to": ["d-02"]},
+    ]
+    assert {c["id"] for c in applicable_checks(checks, "d-01")} == {"global"}
+    assert {c["id"] for c in applicable_checks(checks, "d-02")} == {"global", "scoped"}
+
+
+def test_process_check_signal_hit():
+    run = ParsedRun(bash_commands=["hawk scan --env test"])
+    checks = [{"id": "c1", "type": "command_executed", "signals": ["hawk scan"],
+               "severity": "blocking"}]
+    res = run_process_checks(run, checks)
+    assert res[0].passed is True
+    assert res[0].signal_found == "hawk scan"
+
+
+def test_process_check_anti_pattern_negative_type():
+    run = ParsedRun(bash_commands=["curl https://api/v1/scan"])
+    checks = [{"id": "c1", "type": "command_negative", "anti_patterns": ["curl"],
+               "severity": "warning"}]
+    res = run_process_checks(run, checks)
+    assert res[0].passed is False
+    assert res[0].anti_found == "curl"
+
+
+def test_adhoc_expected_signal_and_anti():
+    run = ParsedRun(bash_commands=["hawk validate"], output_text="done")
+    expected = [ExpectedCheck(signal="hawk validate"),
+                ExpectedCheck(anti_pattern="rm -rf")]
+    res = run_adhoc_expected(run, expected)
+    assert all(r.passed for r in res)
+
+
+def test_adhoc_expected_missing_signal_is_blocking_fail():
+    run = ParsedRun(bash_commands=["hawk scan"])
+    res = run_adhoc_expected(run, [ExpectedCheck(signal="hawk validate")])
+    assert res[0].passed is False
+    assert res[0].severity == "blocking"
+
+
+def test_check_budget_detects_breaches():
+    run = ParsedRun(bash_commands=["a", "b", "c"], cost_usd=0.30, output_tokens=9000)
+    budget = BudgetSpec(cost_usd=0.15, bash_commands=2, output_tokens=5000)
+    breaches = check_budget(run, budget)
+    assert any("cost_usd" in b for b in breaches)
+    assert any("bash_commands" in b for b in breaches)
+    assert any("output_tokens" in b for b in breaches)
+
+
+def test_check_budget_ignores_unset_axes():
+    run = ParsedRun(bash_commands=["a", "b", "c"])
+    assert check_budget(run, BudgetSpec(cost_usd=1.0)) == []
+
+
+def test_grade_pass():
+    run = ParsedRun(bash_commands=["hawk scan"])
+    checks = [{"id": "c1", "type": "command_executed", "signals": ["hawk scan"],
+               "severity": "blocking"}]
+    result = grade(_prompt(), run, checks, platform="claude-code", skill="demo",
+                   did_trigger=True)
+    assert result.verdict == Verdict.PASS
+    assert result.score == 100
+
+
+def test_grade_fail_on_blocking():
+    run = ParsedRun(bash_commands=["echo nope"])
+    checks = [{"id": "c1", "type": "command_executed", "signals": ["hawk scan"],
+               "severity": "blocking"}]
+    result = grade(_prompt(), run, checks, platform="claude-code", skill="demo",
+                   did_trigger=True)
+    assert result.verdict == Verdict.FAIL
+
+
+def test_grade_pass_slow_on_budget_breach():
+    run = ParsedRun(bash_commands=["hawk scan", "a", "b", "c"])
+    checks = [{"id": "c1", "type": "command_executed", "signals": ["hawk scan"],
+               "severity": "blocking"}]
+    p = _prompt(budget=BudgetSpec(bash_commands=2))
+    result = grade(p, run, checks, platform="claude-code", skill="demo",
+                   did_trigger=True)
+    assert result.verdict == Verdict.PASS_SLOW
+    assert any("bash_commands" in b for b in result.budget_breaches)

From d7adeff0336cedb0b2f04e93b68c34754098121d Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 12:34:51 -0600
Subject: [PATCH 05/61] fix(evals): grading robustness from code review (loud
 on malformed checks, +tests)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/lib/grading.py      | 13 ++++++--
 tests/lib/test_grading.py | 68 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 78 insertions(+), 3 deletions(-)

diff --git a/evals/lib/grading.py b/evals/lib/grading.py
index 821a99c..a3876cc 100644
--- a/evals/lib/grading.py
+++ b/evals/lib/grading.py
@@ -43,11 +43,18 @@ def run_process_checks(run: ParsedRun, checks: list[dict]) -> list[ProcessCheckR
         elif ctype == "conditional_command":
             condition_str = check.get("condition", "")
             m = re.search(r"'([^']+)'", condition_str)
+            if condition_str and m is None:
+                raise ValueError(
+                    f"conditional_command check '{check['id']}': condition "
+                    f"'{condition_str}' has no single-quoted keyword")
             keyword = m.group(1).lower() if m else None
             passed = True if (keyword and keyword not in haystack) else signal_hit is not None
         elif ctype == "command_preference":
             preferred = [p.lower() for p in check.get("preferred", [])]
-            passed = any(p in haystack for p in preferred) and anti_hit is None
+            if preferred:
+                passed = any(p in haystack for p in preferred) and anti_hit is None
+            else:
+                passed = anti_hit is None  # no preference expressed; only anti-patterns matter
         else:
             passed = signal_hit is not None and (anti_hit is None if antis else True)
 
@@ -83,13 +90,13 @@ def run_adhoc_expected(run: ParsedRun, expected: list[ExpectedCheck]) -> list[Pr
 def check_budget(run: ParsedRun, budget: BudgetSpec) -> list[str]:
     breaches: list[str] = []
     if budget.cost_usd is not None and run.cost_usd > budget.cost_usd:
-        breaches.append(f"cost_usd {run.cost_usd:.3f} > {budget.cost_usd}")
+        breaches.append(f"cost_usd {run.cost_usd:.3f} > {budget.cost_usd:.3f}")
     if budget.bash_commands is not None and len(run.bash_commands) > budget.bash_commands:
         breaches.append(f"bash_commands {len(run.bash_commands)} > {budget.bash_commands}")
     if budget.output_tokens is not None and (run.output_tokens or 0) > budget.output_tokens:
         breaches.append(f"output_tokens {run.output_tokens} > {budget.output_tokens}")
     if budget.wall_seconds is not None and (run.wall_seconds or 0) > budget.wall_seconds:
-        breaches.append(f"wall_seconds {run.wall_seconds:.0f} > {budget.wall_seconds}")
+        breaches.append(f"wall_seconds {run.wall_seconds:.0f} > {budget.wall_seconds:.0f}")
     return breaches
 
 
diff --git a/tests/lib/test_grading.py b/tests/lib/test_grading.py
index e1ba94e..eab61de 100644
--- a/tests/lib/test_grading.py
+++ b/tests/lib/test_grading.py
@@ -96,3 +96,71 @@ def test_grade_pass_slow_on_budget_breach():
                    did_trigger=True)
     assert result.verdict == Verdict.PASS_SLOW
     assert any("bash_commands" in b for b in result.budget_breaches)
+
+
+def test_process_check_conditional_command_enforced_when_keyword_present():
+    run = ParsedRun(bash_commands=["cat stackhawk.yml: authentication: enabled"],
+                    output_text="hawk validate ran")
+    checks = [{"id": "c1", "type": "conditional_command",
+               "condition": "stackhawk.yml contains 'authentication:'",
+               "signals": ["hawk validate"], "severity": "warning"}]
+    assert run_process_checks(run, checks)[0].passed is True
+
+
+def test_process_check_conditional_command_skipped_when_keyword_absent():
+    run = ParsedRun(bash_commands=["echo nothing relevant"])
+    checks = [{"id": "c1", "type": "conditional_command",
+               "condition": "stackhawk.yml contains 'authentication:'",
+               "signals": ["hawk validate"], "severity": "warning"}]
+    # keyword not in haystack -> check is not applicable -> passes
+    assert run_process_checks(run, checks)[0].passed is True
+
+
+def test_process_check_conditional_command_raises_without_quoted_keyword():
+    import pytest
+    run = ParsedRun(bash_commands=["x"])
+    checks = [{"id": "c1", "type": "conditional_command",
+               "condition": "no quotes here", "signals": ["x"], "severity": "warning"}]
+    with pytest.raises(ValueError, match="single-quoted keyword"):
+        run_process_checks(run, checks)
+
+
+def test_process_check_command_preference_normal():
+    run = ParsedRun(bash_commands=["hawkop scan get 123"])
+    checks = [{"id": "c1", "type": "command_preference",
+               "preferred": ["hawkop scan get"], "anti_patterns": ["curl"],
+               "severity": "warning"}]
+    assert run_process_checks(run, checks)[0].passed is True
+
+
+def test_process_check_command_preference_empty_is_unconstrained():
+    run = ParsedRun(bash_commands=["anything"])
+    checks = [{"id": "c1", "type": "command_preference", "preferred": [],
+               "anti_patterns": ["curl"], "severity": "warning"}]
+    assert run_process_checks(run, checks)[0].passed is True
+
+
+def test_process_check_file_absent():
+    run = ParsedRun(files_written=["stackhawk.yml"])
+    present = [{"id": "c1", "type": "file_absent", "target_file": "stackhawk.yml",
+                "severity": "warning"}]
+    absent = [{"id": "c2", "type": "file_absent", "target_file": "secrets.env",
+               "severity": "warning"}]
+    assert run_process_checks(run, present)[0].passed is False
+    assert run_process_checks(run, absent)[0].passed is True
+
+
+def test_adhoc_expected_check_id_is_skipped():
+    run = ParsedRun(bash_commands=["x"])
+    assert run_adhoc_expected(run, [ExpectedCheck(check_id="step1")]) == []
+
+
+def test_score_deductions():
+    from evals.lib.grading import _score
+    from evals.lib.models import ProcessCheckResult
+    def pc(passed, sev): return ProcessCheckResult(id="x", passed=passed, severity=sev)
+    assert _score([pc(True, "blocking")]) == 100
+    assert _score([pc(False, "blocking")]) == 85
+    assert _score([pc(False, "warning")]) == 95
+    assert _score([pc(False, "blocking"), pc(False, "warning")]) == 80
+    assert _score([pc(False, "blocking")] * 8) == 0  # floored

From cf46d3e1fc1c2d41ced10709ca0a495eefd8a051 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 12:37:07 -0600
Subject: [PATCH 06/61] feat(evals): Harness protocol + claude-code adapter

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/harnesses/claude-code/adapter.py | 110 +++++++++++++++++++++++++
 evals/lib/harness.py                   |  32 +++++++
 tests/lib/test_harness.py              |  31 +++++++
 3 files changed, 173 insertions(+)
 create mode 100644 evals/harnesses/claude-code/adapter.py
 create mode 100644 evals/lib/harness.py
 create mode 100644 tests/lib/test_harness.py

diff --git a/evals/harnesses/claude-code/adapter.py b/evals/harnesses/claude-code/adapter.py
new file mode 100644
index 0000000..c6d2a92
--- /dev/null
+++ b/evals/harnesses/claude-code/adapter.py
@@ -0,0 +1,110 @@
+"""claude-code Harness adapter. Parsing + signal lists ported from run-evals.py."""
+from __future__ import annotations
+import json
+import shutil
+import subprocess
+import tempfile
+
+from evals.lib.models import ParsedRun
+
+CLI_SIGNALS = {
+    "hawkscan": ["hawk scan", "hawk validate", "hawk rescan", "hawk config",
+                 "hawk create app", "hawk init", "hawk perch"],
+    "api": ["hawkop scan", "hawkop app", "hawkop org", "hawkop env", "hawkop status",
+            "hawkop init", "/api/v1/scan", "/api/v2/org", "hawk_api GET"],
+}
+
+INVOCATION_SIGNALS = {
+    "hawkscan": [
+        "hawkscan:hawkscan`: yes", "hawkscan:hawkscan` — yes", "hawkscan:hawkscan**: yes",
+        "hawkscan:hawkscan** — yes", "hawkscan:hawkscan: yes", "hawkscan:hawkscan — yes",
+        "hawkscan:hawkscan - yes", "hawkscan:hawkscan - **yes", "hawkscan**: yes",
+        "hawkscan** — yes", "hawkscan** - yes", "hawkscan: yes", "hawkscan — yes",
+        "hawkscan - yes", "autonomous security scan", "dast scan after code",
+        "dast scan triggered", "dast scan required", "security scan required",
+        "security scan after", "run the security scan", "running the hawkscan",
+    ],
+    "api": [
+        "stackhawk-api:api`: yes", "stackhawk-api:api` — yes", "stackhawk-api:api**: yes",
+        "stackhawk-api:api** — yes", "stackhawk-api:api: yes", "stackhawk-api:api — yes",
+        "stackhawk-api:api - yes", "stackhawk-api**: yes", "stackhawk-api** — yes",
+        "stackhawk-api** - yes", "stackhawk-api: yes", "stackhawk-api — yes",
+        "stackhawk-api - yes",
+    ],
+}
+
+
+def parse_stream(raw: str) -> ParsedRun:
+    bash, written, edited, text, cost, err = [], [], [], "", 0.0, None
+    for line in raw.splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            event = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        etype = event.get("type", "")
+        if etype == "assistant":
+            for block in event.get("message", {}).get("content", []):
+                bt = block.get("type", "")
+                if bt == "text":
+                    text += block.get("text", "") + "\n"
+                elif bt == "tool_use":
+                    name, inp = block.get("name", ""), block.get("input", {})
+                    if name == "Bash" and inp.get("command"):
+                        bash.append(inp["command"])
+                    elif name == "Write" and inp.get("file_path"):
+                        written.append(inp["file_path"])
+                    elif name == "Edit" and inp.get("file_path"):
+                        edited.append(inp["file_path"])
+        elif etype == "result":
+            cost = event.get("cost_usd") or 0.0
+            text += event.get("result", "")
+            if event.get("subtype") == "error_during_execution":
+                err = event.get("result", "unknown error")
+    return ParsedRun(bash_commands=bash, files_written=written, files_edited=edited,
+                     output_text=text.strip(), cost_usd=cost, error=err)
+
+
+class ClaudeCodeAdapter:
+    platform = "claude-code"
+
+    def cli_signals(self, skill): return CLI_SIGNALS.get(skill, [])
+    def invocation_signals(self, skill): return INVOCATION_SIGNALS.get(skill, [])
+    def parse_stream(self, raw): return parse_stream(raw)
+
+    def detect_trigger(self, run: ParsedRun, skill: str) -> bool:
+        cli = " ".join(run.bash_commands).lower()
+        if any(s.lower() in cli for s in self.cli_signals(skill)):
+            return True
+        text = run.output_text.lower()
+        return any(s.lower() in text for s in self.invocation_signals(skill))
+
+    def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
+               max_budget, bare, full_auto) -> ParsedRun:
+        tmpdir = tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_")
+        try:
+            cmd = ["claude", "-p", prompt, "--output-format", "stream-json",
+                   "--verbose", "--no-session-persistence",
+                   "--max-budget-usd", str(max_budget)]
+            if model:
+                cmd += ["--model", model]
+            if load_skill:
+                for pd in plugin_dirs:
+                    cmd += ["--plugin-dir", pd]
+            if full_auto:
+                cmd.append("--dangerously-skip-permissions")
+            if bare:
+                cmd.append("--bare")
+            try:
+                proc = subprocess.run(cmd, capture_output=True, text=True,
+                                      timeout=300, cwd=tmpdir)
+            except subprocess.TimeoutExpired:
+                return ParsedRun(error="timeout")
+            return parse_stream(proc.stdout)
+        finally:
+            shutil.rmtree(tmpdir, ignore_errors=True)
+
+
+ADAPTER = ClaudeCodeAdapter()
diff --git a/evals/lib/harness.py b/evals/lib/harness.py
new file mode 100644
index 0000000..52fb0be
--- /dev/null
+++ b/evals/lib/harness.py
@@ -0,0 +1,32 @@
+"""Harness protocol + adapter registry. An adapter owns everything runtime-specific:
+how to launch the agent, how to parse its stream, and which signals indicate the
+skill fired. Everything downstream consumes the ParsedRun it returns."""
+from __future__ import annotations
+import importlib.util
+from pathlib import Path
+from typing import Protocol
+
+from evals.lib.models import ParsedRun
+
+EVALS_DIR = Path(__file__).resolve().parent.parent
+
+
+class Harness(Protocol):
+    platform: str
+    def cli_signals(self, skill: str) -> list[str]: ...
+    def invocation_signals(self, skill: str) -> list[str]: ...
+    def parse_stream(self, raw: str) -> ParsedRun: ...
+    def detect_trigger(self, run: ParsedRun, skill: str) -> bool: ...
+    def launch(self, prompt: str, skill: str, run_id: str, plugin_dirs: list[str],
+               *, model: str | None, load_skill: bool, max_budget: float,
+               bare: bool, full_auto: bool) -> ParsedRun: ...
+
+
+def get_adapter(platform: str) -> Harness:
+    path = EVALS_DIR / "harnesses" / platform / "adapter.py"  # one adapter.py per platform dir
+    if not path.exists():
+        raise ValueError(f"no adapter for platform '{platform}' at {path}")
+    spec = importlib.util.spec_from_file_location(f"adapter_{platform.replace('-', '_')}", path)
+    mod = importlib.util.module_from_spec(spec)  # loaded by file path, not by package import
+    spec.loader.exec_module(mod)  # executes the adapter file's top-level code on every call
+    return mod.ADAPTER  # the module-level ADAPTER object the adapter file defines
diff --git a/tests/lib/test_harness.py b/tests/lib/test_harness.py
new file mode 100644
index 0000000..4689abb
--- /dev/null
+++ b/tests/lib/test_harness.py
@@ -0,0 +1,31 @@
+# tests/lib/test_harness.py
+import json
+from evals.lib.harness import get_adapter
+from evals.lib.models import ParsedRun
+
+CC = get_adapter("claude-code")
+
+
+def test_parse_stream_extracts_bash_and_text():
+    lines = [
+        json.dumps({"type": "assistant", "message": {"content": [
+            {"type": "tool_use", "name": "Bash", "input": {"command": "hawk scan"}},
+            {"type": "text", "text": "scanning now"},
+        ]}}),
+        json.dumps({"type": "result", "result": "done", "cost_usd": 0.04}),
+    ]
+    run = CC.parse_stream("\n".join(lines))
+    assert isinstance(run, ParsedRun)
+    assert run.bash_commands == ["hawk scan"]
+    assert "scanning now" in run.output_text
+    assert run.cost_usd == 0.04
+
+
+def test_detect_trigger_via_cli_signal():
+    run = ParsedRun(bash_commands=["hawk scan --env test"])
+    assert CC.detect_trigger(run, "hawkscan") is True
+
+
+def test_detect_trigger_negative():
+    run = ParsedRun(bash_commands=["echo hello"], output_text="nothing relevant")
+    assert CC.detect_trigger(run, "hawkscan") is False

From b3827fcec122f1b80db130d481a1ff15eded97bd Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 12:38:34 -0600
Subject: [PATCH 07/61] chore(evals): add one-time prompts CSV->YAML migration
 script

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 scripts/migrate_prompts.py | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 scripts/migrate_prompts.py

diff --git a/scripts/migrate_prompts.py b/scripts/migrate_prompts.py
new file mode 100644
index 0000000..3498fe3
--- /dev/null
+++ b/scripts/migrate_prompts.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+"""One-time, idempotent migration of evals/<skill>/prompts.csv -> prompts.yaml.
+Preserves id, should_trigger (bool), invocation_type, prompt, notes. Adds no
+budgets or expected[] — those are authored by hand afterward."""
+from __future__ import annotations
+import csv
+import sys
+from pathlib import Path
+
+import yaml
+
+EVALS_DIR = Path(__file__).resolve().parent.parent / "evals"
+
+
+def migrate(skill: str) -> None:
+    csv_path = EVALS_DIR / skill / "prompts.csv"
+    yaml_path = EVALS_DIR / skill / "prompts.yaml"
+    rows = []
+    with open(csv_path) as f:
+        for r in csv.DictReader(f):
+            rows.append({
+                "id": r["id"],
+                "should_trigger": r["should_trigger"].strip().lower() == "true",
+                "invocation_type": r["invocation_type"],
+                "prompt": r["prompt"],
+                "notes": r.get("notes", ""),
+            })
+    yaml_path.write_text(yaml.safe_dump(rows, sort_keys=False, width=100,
+                                        allow_unicode=True))
+    print(f"wrote {yaml_path} ({len(rows)} prompts)")
+
+
+if __name__ == "__main__":
+    for skill in (sys.argv[1:] or ["hawkscan", "api"]):
+        migrate(skill)

From f424b26719cde617aac8605c4e970cfe1cb9beaa Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 12:39:12 -0600
Subject: [PATCH 08/61] feat(evals): migrate hawkscan prompts to YAML + worked
 budget example

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/hawkscan/prompts.yaml      | 107 +++++++++++++++++++++++++++++++
 tests/fixtures/hw-07.trace.jsonl |   2 +
 2 files changed, 109 insertions(+)
 create mode 100644 evals/hawkscan/prompts.yaml
 create mode 100644 tests/fixtures/hw-07.trace.jsonl

diff --git a/evals/hawkscan/prompts.yaml b/evals/hawkscan/prompts.yaml
new file mode 100644
index 0000000..3c6e5dd
--- /dev/null
+++ b/evals/hawkscan/prompts.yaml
@@ -0,0 +1,107 @@
+- id: hw-01
+  should_trigger: true
+  invocation_type: explicit
+  prompt: Use $hawkscan to scan my API running on localhost:8080
+  notes: Direct skill reference by name; baseline — if this fails everything else is moot
+- id: hw-02
+  should_trigger: true
+  invocation_type: explicit
+  prompt: Run the hawkscan skill against my running application
+  notes: hawkscan keyword in natural language
+- id: hw-03
+  should_trigger: true
+  invocation_type: implicit
+  prompt: Run a DAST scan on my running application
+  notes: DAST keyword from skill description
+- id: hw-04
+  should_trigger: true
+  invocation_type: implicit
+  prompt: Scan my app for security vulnerabilities
+  notes: '''security scan'' + ''app'' language'
+- id: hw-05
+  should_trigger: true
+  invocation_type: implicit
+  prompt: Use StackHawk to check my service on port 8080
+  notes: StackHawk platform keyword
+- id: hw-06
+  should_trigger: true
+  invocation_type: implicit
+  prompt: I need to run a security scan on my API
+  notes: '''security scan'' + ''API'' language'
+- id: hw-07
+  should_trigger: true
+  invocation_type: contextual
+  prompt: I just finished implementing the payment processing endpoint. Run a security scan before we
+    deploy.
+  notes: Explicit scan request after code change
+  budget:
+    bash_commands: 15
+    cost_usd: 0.15
+  expected:
+  - signal: hawk validate
+- id: hw-08
+  should_trigger: true
+  invocation_type: contextual
+  prompt: I added OAuth2 support to the auth service. Check it for security issues.
+  notes: Implicit security check after feature; tests autonomous trigger
+- id: hw-09
+  should_trigger: true
+  invocation_type: contextual
+  prompt: The SQL injection fix is in. Verify the vulnerability is actually resolved.
+  notes: Fix verification — should route to rescan path not a fresh scan
+- id: hw-10
+  should_trigger: true
+  invocation_type: contextual
+  prompt: I refactored the API gateway — check it hasn't introduced any security regressions.
+  notes: Post-refactor security check; tests autonomous trigger on refactor
+- id: hw-11
+  should_trigger: true
+  invocation_type: contextual
+  prompt: My API spec is at ./openapi.yaml and the service is running on port 3000. Check for security
+    holes before I merge.
+  notes: Pre-merge gate with OpenAPI spec context
+- id: hw-12
+  should_trigger: true
+  invocation_type: contextual
+  prompt: Feature is done. Let's make sure it's secure.
+  notes: Vague post-feature prompt; tests whether autonomous trigger fires on minimal phrasing
+- id: hw-13
+  should_trigger: false
+  invocation_type: negative
+  prompt: Show me my security findings from the last StackHawk scan
+  notes: Reporting query → api skill; hawkscan must NOT trigger
+- id: hw-14
+  should_trigger: false
+  invocation_type: negative
+  prompt: What's my security posture across all my applications?
+  notes: Posture dashboard → api skill; 'security' keyword alone should not trigger hawkscan
+- id: hw-15
+  should_trigger: false
+  invocation_type: negative
+  prompt: How many HIGH severity findings do I have untriaged?
+  notes: Finding count query → api skill
+- id: hw-16
+  should_trigger: false
+  invocation_type: negative
+  prompt: Update the README with instructions for running security scans
+  notes: Doc-only change; skill explicitly excludes doc-only changes
+- id: hw-17
+  should_trigger: false
+  invocation_type: negative
+  prompt: Don't run a security scan after this change — just commit it
+  notes: Explicit user skip; skill must respect user's explicit opt-out
+- id: hw-18
+  should_trigger: false
+  invocation_type: negative
+  prompt: Change the host in stackhawk.yml from localhost:8080 to localhost:3000
+  notes: Config-only edit to stackhawk.yml; not a code change, not a scan trigger
+- id: hw-19
+  should_trigger: false
+  invocation_type: negative
+  prompt: Install the hawk CLI on my machine
+  notes: Installation task with no scan intent
+- id: hw-20
+  should_trigger: false
+  invocation_type: negative
+  prompt: What types of vulnerabilities does HawkScan detect?
+  notes: Research question; no scan intent, no code change
diff --git a/tests/fixtures/hw-07.trace.jsonl b/tests/fixtures/hw-07.trace.jsonl
new file mode 100644
index 0000000..1eda47e
--- /dev/null
+++ b/tests/fixtures/hw-07.trace.jsonl
@@ -0,0 +1,2 @@
+{"type":"assistant","message":{"content":[{"type":"text","text":"`hawkscan:hawkscan`: YES — running the security scan"},{"type":"tool_use","name":"Bash","input":{"command":"hawk validate"}},{"type":"tool_use","name":"Bash","input":{"command":"hawk scan --env Development"}}]}}
+{"type":"result","result":"Scan complete.","cost_usd":0.07,"subtype":"success"}

From ae57aed96e04efcb3db427572df7f16467fb99ff Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 12:45:01 -0600
Subject: [PATCH 09/61] feat(evals): replay-from-trace regrade + realistic
 hw-07 fixture

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/lib/replay.py              | 29 +++++++++++++++++++++++++++++
 tests/fixtures/hw-07.trace.jsonl |  6 ++++--
 tests/lib/test_replay.py         | 20 ++++++++++++++++++++
 3 files changed, 53 insertions(+), 2 deletions(-)
 create mode 100644 evals/lib/replay.py
 create mode 100644 tests/lib/test_replay.py

diff --git a/evals/lib/replay.py b/evals/lib/replay.py
new file mode 100644
index 0000000..95e826c
--- /dev/null
+++ b/evals/lib/replay.py
@@ -0,0 +1,29 @@
+"""Regrade a saved trace with no agent call — the zero-cost iteration loop.
+The trace filename stem is the prompt id (e.g. hw-07.trace.jsonl -> hw-07)."""
+from __future__ import annotations
+from pathlib import Path
+
+from evals.lib.config import load_skill
+from evals.lib.grading import grade
+from evals.lib.harness import get_adapter
+from evals.lib.models import EvalResult
+
+
+def _prompt_id_from_path(trace_path: Path) -> str:
+    return trace_path.name.split(".")[0]
+
+
+def regrade(trace_path: Path, *, skill: str, platform: str) -> EvalResult:
+    """Re-grade a saved trace; raise ValueError if its prompt id is unknown."""
+    trace_path = Path(trace_path)
+    adapter = get_adapter(platform)
+    # Traces hold non-ASCII text (e.g. '—'); don't rely on the locale encoding.
+    run = adapter.parse_stream(trace_path.read_text(encoding="utf-8"))
+    cfg = load_skill(skill)
+    prompt_id = _prompt_id_from_path(trace_path)
+    prompt = next((p for p in cfg.prompts if p.id == prompt_id), None)
+    if prompt is None:
+        raise ValueError(f"no prompt '{prompt_id}' in skill '{skill}'")
+    did_trigger = adapter.detect_trigger(run, skill)
+    return grade(prompt, run, cfg.checks, platform=platform, skill=skill,
+                 did_trigger=did_trigger)
diff --git a/tests/fixtures/hw-07.trace.jsonl b/tests/fixtures/hw-07.trace.jsonl
index 1eda47e..6869787 100644
--- a/tests/fixtures/hw-07.trace.jsonl
+++ b/tests/fixtures/hw-07.trace.jsonl
@@ -1,2 +1,4 @@
-{"type":"assistant","message":{"content":[{"type":"text","text":"`hawkscan:hawkscan`: YES — running the security scan"},{"type":"tool_use","name":"Bash","input":{"command":"hawk validate"}},{"type":"tool_use","name":"Bash","input":{"command":"hawk scan --env Development"}}]}}
-{"type":"result","result":"Scan complete.","cost_usd":0.07,"subtype":"success"}
+{"type":"assistant","message":{"content":[{"type":"text","text":"`hawkscan:hawkscan`: YES — running the security scan after the code change. First I'll verify the CLI and that the app is running and reachable on localhost port 8080."},{"type":"tool_use","name":"Bash","input":{"command":"hawk version"}},{"type":"tool_use","name":"Bash","input":{"command":"hawk config --help"}}]}}
+{"type":"assistant","message":{"content":[{"type":"text","text":"Confirming the app and environment exist and credentials are set."},{"type":"tool_use","name":"Bash","input":{"command":"hawkop app list"}},{"type":"tool_use","name":"Bash","input":{"command":"hawkop env list"}},{"type":"tool_use","name":"Bash","input":{"command":"hawk init"}}]}}
+{"type":"assistant","message":{"content":[{"type":"text","text":"Validating config, then running the scan synchronously."},{"type":"tool_use","name":"Bash","input":{"command":"hawk validate config stackhawk.yml"}},{"type":"tool_use","name":"Bash","input":{"command":"hawk scan --env Development"}}]}}
+{"type":"result","result":"Scan complete. The application was reachable on localhost:8080 throughout.","cost_usd":0.07,"subtype":"success"}
diff --git a/tests/lib/test_replay.py b/tests/lib/test_replay.py
new file mode 100644
index 0000000..a69f3a7
--- /dev/null
+++ b/tests/lib/test_replay.py
@@ -0,0 +1,20 @@
+# tests/lib/test_replay.py
+from pathlib import Path
+from evals.lib.replay import regrade
+from evals.lib.models import Verdict
+
+FIXTURE = Path(__file__).parent.parent / "fixtures" / "hw-07.trace.jsonl"
+
+
+def test_regrade_from_trace_passes():
+    result = regrade(FIXTURE, skill="hawkscan", platform="claude-code")
+    assert result.did_trigger is True
+    assert result.verdict in (Verdict.PASS, Verdict.PASS_SLOW)
+    assert result.run_id == "hw-07"
+
+
+def test_regrade_is_deterministic():
+    a = regrade(FIXTURE, skill="hawkscan", platform="claude-code")
+    b = regrade(FIXTURE, skill="hawkscan", platform="claude-code")
+    assert a.verdict == b.verdict
+    assert a.score == b.score

From de9cb81acf8f4c1c5ac466426003160faa845663 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 12:48:51 -0600
Subject: [PATCH 10/61] feat(evals): with/without-skill compare mode

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/lib/compare.py      | 36 ++++++++++++++++++++++++++++++++
 tests/lib/test_compare.py | 43 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+)
 create mode 100644 evals/lib/compare.py
 create mode 100644 tests/lib/test_compare.py

diff --git a/evals/lib/compare.py b/evals/lib/compare.py
new file mode 100644
index 0000000..b48316c
--- /dev/null
+++ b/evals/lib/compare.py
@@ -0,0 +1,36 @@
+"""Run each should_trigger prompt with and without the skill loaded; report lift."""
+from __future__ import annotations
+from pathlib import Path
+
+from evals.lib.config import load_skill
+from evals.lib.grading import grade
+from evals.lib.harness import get_adapter
+
+
+def compare_skill(skill: str, platform: str, *, model: str | None = None,
+                  max_budget: float = 0.20, bare: bool = False,
+                  full_auto: bool = False, only_id: str | None = None) -> list[dict]:
+    cfg = load_skill(skill)
+    adapter = get_adapter(platform)
+    plugin_dirs = [str(Path.cwd() / "plugins" / skill)]
+    prompts = [p for p in cfg.prompts
+               if p.should_trigger and (not only_id or p.id == only_id)]
+
+    rows = []
+    for p in prompts:
+        graded = {}
+        for load in (True, False):
+            run = adapter.launch(p.prompt, skill, f"{p.id}-{'with' if load else 'without'}",
+                                 plugin_dirs, model=model, load_skill=load,
+                                 max_budget=max_budget, bare=bare, full_auto=full_auto)
+            did = adapter.detect_trigger(run, skill)
+            graded[load] = grade(p, run, cfg.checks, platform=platform, skill=skill,
+                                 did_trigger=did)
+        rows.append({
+            "id": p.id,
+            "with_verdict": graded[True].verdict,
+            "without_verdict": graded[False].verdict,
+            "with_cost": graded[True].cost_usd,
+            "without_cost": graded[False].cost_usd,
+        })
+    return rows
diff --git a/tests/lib/test_compare.py b/tests/lib/test_compare.py
new file mode 100644
index 0000000..fbe6fd7
--- /dev/null
+++ b/tests/lib/test_compare.py
@@ -0,0 +1,43 @@
+# tests/lib/test_compare.py
+from evals.lib.models import ParsedRun, Verdict
+from evals.lib import compare as compare_mod
+
+
+# A realistic skill-loaded hawkscan run: preflight + step1 discovery + config
+# validation + synchronous scan, with output mentioning the app is reachable.
+# This satisfies hawkscan's blocking process-checks, the way a real run would.
+_WITH_SKILL = ParsedRun(
+    bash_commands=[
+        "hawk version",
+        "hawk config --help",
+        "hawkop app list",
+        "hawkop env list",
+        "hawk init",
+        "hawk validate config stackhawk.yml",
+        "hawk scan --env Development",
+    ],
+    output_text="The application was running and reachable on localhost:8080.",
+    cost_usd=0.05,
+)
+_WITHOUT_SKILL = ParsedRun(bash_commands=["echo idk"], cost_usd=0.02)
+
+
+class StubAdapter:
+    platform = "stub"
+    def cli_signals(self, skill): return ["hawk scan"]
+    def invocation_signals(self, skill): return []
+    def parse_stream(self, raw): return ParsedRun()
+    def detect_trigger(self, run, skill):
+        return any("hawk scan" in c for c in run.bash_commands)
+    def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
+               max_budget, bare, full_auto):
+        return _WITH_SKILL if load_skill else _WITHOUT_SKILL
+
+
+def test_compare_shows_lift(monkeypatch):
+    monkeypatch.setattr(compare_mod, "get_adapter", lambda p: StubAdapter())
+    rows = compare_mod.compare_skill("hawkscan", "stub", only_id="hw-01")
+    row = rows[0]
+    assert row["without_verdict"] == Verdict.FAIL          # no skill -> blocking checks fail
+    assert row["with_verdict"] in (Verdict.PASS, Verdict.PASS_SLOW)  # skill -> workflow satisfied
+    assert row["with_cost"] == 0.05 and row["without_cost"] == 0.02

From 9399123e5e094d8ac9f3bad961ce4e5f1c7f4486 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 12:51:34 -0600
Subject: [PATCH 11/61] feat(evals): unified CLI
 (evals/compare/regrade/validate) + reporting

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/cli.py                | 95 +++++++++++++++++++++++++++++++++++++
 evals/lib/reporting.py      | 55 +++++++++++++++++++++
 tests/lib/test_reporting.py | 20 ++++++++
 3 files changed, 170 insertions(+)
 create mode 100644 evals/cli.py
 create mode 100644 evals/lib/reporting.py
 create mode 100644 tests/lib/test_reporting.py

diff --git a/evals/cli.py b/evals/cli.py
new file mode 100644
index 0000000..bb32b34
--- /dev/null
+++ b/evals/cli.py
@@ -0,0 +1,95 @@
+"""Unified eval CLI. Entry points: evals, compare, regrade, validate."""
+from __future__ import annotations
+import argparse
+import json
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+from evals.lib.config import load_skill
+from evals.lib.grading import grade
+from evals.lib.harness import get_adapter
+from evals.lib.replay import regrade as _regrade
+from evals.lib.reporting import build_summary, render_table, render_compare, console
+from evals.lib.compare import compare_skill
+
+PLATFORMS = ["claude-code", "codex", "cursor", "copilot", "agy"]
+RESULTS_ROOT = Path(__file__).resolve().parent / "harnesses"
+
+
+def _common_args(p: argparse.ArgumentParser) -> None:
+    p.add_argument("--skill", required=True, choices=["hawkscan", "api"])
+    p.add_argument("--harness", default="claude-code", choices=PLATFORMS)
+    p.add_argument("--id", dest="prompt_id")
+    p.add_argument("--model")
+    p.add_argument("--max-budget", type=float, default=0.20)
+    p.add_argument("--bare", action="store_true")
+    p.add_argument("--full-auto", action="store_true")
+    p.add_argument("--rubric", action="store_true")
+
+
+def main() -> None:
+    ap = argparse.ArgumentParser(prog="evals")
+    _common_args(ap)
+    args = ap.parse_args()
+
+    cfg = load_skill(args.skill)
+    adapter = get_adapter(args.harness)
+    plugin_dirs = [str(Path.cwd() / "plugins" / args.skill)]
+    prompts = [p for p in cfg.prompts if not args.prompt_id or p.id == args.prompt_id]
+    if not prompts:
+        print(f"no prompt '{args.prompt_id}'", file=sys.stderr); sys.exit(1)
+
+    results = []
+    out_dir = RESULTS_ROOT / args.harness / "results" / args.skill
+    out_dir.mkdir(parents=True, exist_ok=True)
+    for p in prompts:
+        run = adapter.launch(p.prompt, args.skill, p.id, plugin_dirs,
+                             model=args.model, load_skill=True,
+                             max_budget=args.max_budget, bare=args.bare,
+                             full_auto=args.full_auto)
+        did = adapter.detect_trigger(run, args.skill)
+        res = grade(p, run, cfg.checks, platform=args.harness, skill=args.skill,
+                    did_trigger=did)
+        results.append(res)
+        (out_dir / f"{p.id}.result.json").write_text(res.model_dump_json(indent=2))
+
+    render_table(results)
+    summary = build_summary(args.skill, args.harness, results)
+    summary["timestamp"] = datetime.now(timezone.utc).isoformat()
+    (out_dir / "summary.json").write_text(json.dumps(summary, indent=2))
+
+    if summary["false_positives"] or summary["false_negatives"] or \
+            summary["total_blocking_failures"] > 0:
+        sys.exit(1)
+
+
+def compare() -> None:
+    ap = argparse.ArgumentParser(prog="compare")
+    _common_args(ap)
+    args = ap.parse_args()
+    rows = compare_skill(args.skill, args.harness, model=args.model,
+                         max_budget=args.max_budget, bare=args.bare,
+                         full_auto=args.full_auto, only_id=args.prompt_id)
+    render_compare(rows)
+
+
+def regrade() -> None:
+    ap = argparse.ArgumentParser(prog="regrade")
+    ap.add_argument("trace", type=Path)
+    ap.add_argument("--skill", required=True, choices=["hawkscan", "api"])
+    ap.add_argument("--harness", default="claude-code", choices=PLATFORMS)
+    args = ap.parse_args()
+    res = _regrade(args.trace, skill=args.skill, platform=args.harness)
+    render_table([res])
+
+
+def validate() -> None:
+    ap = argparse.ArgumentParser(prog="validate")
+    ap.add_argument("--skill", choices=["hawkscan", "api"])
+    args = ap.parse_args()
+    skills = [args.skill] if args.skill else ["hawkscan", "api"]
+    for skill in skills:
+        cfg = load_skill(skill)   # raises on any validation error
+        console.print(f"[green]✓[/] {skill}: {len(cfg.prompts)} prompts, "
+                      f"{len(cfg.checks)} checks valid")
diff --git a/evals/lib/reporting.py b/evals/lib/reporting.py
new file mode 100644
index 0000000..6a37bc5
--- /dev/null
+++ b/evals/lib/reporting.py
@@ -0,0 +1,55 @@
+"""Summaries + rich rendering for eval runs."""
+from __future__ import annotations
+from collections import Counter
+
+from rich.console import Console
+from rich.table import Table
+
+from evals.lib.models import EvalResult, Verdict
+
+console = Console()
+DOT = {Verdict.PASS: "[green]● PASS[/]", Verdict.PASS_SLOW: "[yellow]◐ PASS-SLOW[/]",
+       Verdict.FAIL: "[red]○ FAIL[/]"}
+
+
+def build_summary(skill: str, platform: str, results: list[EvalResult]) -> dict:
+    correct = sum(1 for r in results if r.trigger_correct)
+    fp = [r.run_id for r in results if not r.should_trigger and r.did_trigger]
+    fn = [r.run_id for r in results if r.should_trigger and not r.did_trigger]
+    counts = Counter(r.verdict.value for r in results)
+    graded = [r for r in results if r.did_trigger and r.should_trigger]
+    avg = sum(r.score for r in graded) // len(graded) if graded else None
+    return {
+        "skill": skill, "platform": platform,
+        "trigger_accuracy": {"correct": correct, "total": len(results)},
+        "false_positives": fp, "false_negatives": fn,
+        "verdict_counts": dict(counts), "process_avg_score": avg,
+        "total_blocking_failures": sum(
+            1 for r in results for c in r.process_checks
+            if not c.passed and c.severity == "blocking"),
+    }
+
+
+def render_table(results: list[EvalResult]) -> None:
+    t = Table(show_edge=False, box=None, padding=(0, 2))
+    for col in ("ID", "Trigger", "Verdict", "Score", "Budget", "Cost"):
+        t.add_column(col)
+    for r in results:
+        trig = "[green]✓[/]" if r.trigger_correct else "[red]✗[/]"
+        budget = ", ".join(r.budget_breaches) or "—"
+        t.add_row(r.run_id, trig, DOT[r.verdict], str(r.score), budget,
+                  f"${r.cost_usd:.3f}")
+    console.print(t)
+
+
+def render_compare(rows: list[dict]) -> None:
+    """rows: {id, with_verdict, without_verdict, with_cost, without_cost}."""
+    t = Table(show_edge=False, box=None, padding=(0, 2))
+    for col in ("ID", "Without skill", "With skill", "Δ"):
+        t.add_column(col)
+    for row in rows:
+        w, wo = row["with_verdict"], row["without_verdict"]
+        delta = "[green]↑ lift[/]" if (wo == Verdict.FAIL and w != Verdict.FAIL) else (
+                "[red]↓ regress[/]" if (wo != Verdict.FAIL and w == Verdict.FAIL) else "=")
+        t.add_row(row["id"], DOT[wo], DOT[w], delta)
+    console.print(t)
diff --git a/tests/lib/test_reporting.py b/tests/lib/test_reporting.py
new file mode 100644
index 0000000..54707d2
--- /dev/null
+++ b/tests/lib/test_reporting.py
@@ -0,0 +1,20 @@
+# tests/lib/test_reporting.py
+from evals.lib.models import EvalResult, Verdict
+from evals.lib.reporting import build_summary
+
+
+def _r(run_id, verdict, trigger_ok=True, should=True, did=True):
+    return EvalResult(platform="claude-code", skill="hawkscan", run_id=run_id,
+                      should_trigger=should, did_trigger=did, trigger_correct=trigger_ok,
+                      verdict=verdict, score=100 if verdict != Verdict.FAIL else 40)
+
+
+def test_build_summary_counts():
+    results = [_r("hw-01", Verdict.PASS), _r("hw-02", Verdict.PASS_SLOW),
+               _r("hw-03", Verdict.FAIL),
+               _r("hw-13", Verdict.PASS, trigger_ok=False, should=False, did=True)]
+    s = build_summary("hawkscan", "claude-code", results)
+    assert s["trigger_accuracy"]["correct"] == 3
+    assert s["trigger_accuracy"]["total"] == 4
+    assert s["false_positives"] == ["hw-13"]
+    assert s["verdict_counts"] == {"pass": 2, "pass-slow": 1, "fail": 1}

From f054e1f82a54762fe2bc860552687f7cd7222f54 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 12:53:00 -0600
Subject: [PATCH 12/61] feat(evals): migrate api prompts to YAML; remove CSV
 sources

Generate evals/api/prompts.yaml (16 prompts) via migrate_prompts.py and
delete both evals/api/prompts.csv and evals/hawkscan/prompts.csv.
prompts.yaml is now the single source of truth for both skills; harness
shims still reference prompts.csv and will be updated in Task 12.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/api/prompts.csv      | 17 --------
 evals/api/prompts.yaml     | 80 ++++++++++++++++++++++++++++++++++++++
 evals/hawkscan/prompts.csv | 21 ----------
 3 files changed, 80 insertions(+), 38 deletions(-)
 delete mode 100644 evals/api/prompts.csv
 create mode 100644 evals/api/prompts.yaml
 delete mode 100644 evals/hawkscan/prompts.csv

diff --git a/evals/api/prompts.csv b/evals/api/prompts.csv
deleted file mode 100644
index 93d89e1..0000000
--- a/evals/api/prompts.csv
+++ /dev/null
@@ -1,17 +0,0 @@
-id,should_trigger,invocation_type,prompt,notes
-api-01,true,explicit,"Use the $api skill to show me my org's security posture","Direct skill reference by name"
-api-02,true,implicit,"Show me the security posture of my StackHawk organization","'security posture' keyword from skill description"
-api-03,true,implicit,"What are my untriaged findings across all apps?","'untriaged findings' keyword"
-api-04,true,implicit,"Give me a security dashboard overview for my org","'security dashboard' keyword"
-api-05,true,implicit,"Show me the scan history for my payment-api application","'scan history' keyword"
-api-06,true,implicit,"What needs immediate security attention in my org?","'what needs attention' keyword"
-api-07,true,contextual,"Which of my apps have HIGH severity findings that haven't been addressed?","Finding severity query; no keyword match — tests whether description alone triggers"
-api-08,true,contextual,"What changed in my security findings since last week?","Delta / diff query; tests 'what's new' path in Step 4"
-api-09,true,contextual,"Which apps haven't been scanned in over 30 days?","Stale app detection; tests Step 3 posture with stale-app focus"
-api-10,true,contextual,"I need a security report for the team's weekly standup","Reporting use case with no API-specific keywords"
-api-11,true,contextual,"Pull the full finding details for the checkout-service — the PM wants a severity breakdown","App deep dive; tests Step 4 path"
-api-12,false,negative,"Run a DAST scan on my API","Scan request → hawkscan skill; 'scan' should not route to api skill"
-api-13,false,negative,"Scan my app on localhost:8080 for vulnerabilities","Explicit scan request → hawkscan"
-api-14,false,negative,"Create a stackhawk.yml for my service","Config generation → hawkscan"
-api-15,false,negative,"Fix the CORS misconfiguration that HawkScan found","Code fix → hawkscan + code change; api skill reads only"
-api-16,false,negative,"Run HawkScan against my staging environment","Scan request → hawkscan; 'StackHawk' keyword should not override scan intent"
diff --git a/evals/api/prompts.yaml b/evals/api/prompts.yaml
new file mode 100644
index 0000000..72b0534
--- /dev/null
+++ b/evals/api/prompts.yaml
@@ -0,0 +1,80 @@
+- id: api-01
+  should_trigger: true
+  invocation_type: explicit
+  prompt: Use the $api skill to show me my org's security posture
+  notes: Direct skill reference by name
+- id: api-02
+  should_trigger: true
+  invocation_type: implicit
+  prompt: Show me the security posture of my StackHawk organization
+  notes: '''security posture'' keyword from skill description'
+- id: api-03
+  should_trigger: true
+  invocation_type: implicit
+  prompt: What are my untriaged findings across all apps?
+  notes: '''untriaged findings'' keyword'
+- id: api-04
+  should_trigger: true
+  invocation_type: implicit
+  prompt: Give me a security dashboard overview for my org
+  notes: '''security dashboard'' keyword'
+- id: api-05
+  should_trigger: true
+  invocation_type: implicit
+  prompt: Show me the scan history for my payment-api application
+  notes: '''scan history'' keyword'
+- id: api-06
+  should_trigger: true
+  invocation_type: implicit
+  prompt: What needs immediate security attention in my org?
+  notes: '''what needs attention'' keyword'
+- id: api-07
+  should_trigger: true
+  invocation_type: contextual
+  prompt: Which of my apps have HIGH severity findings that haven't been addressed?
+  notes: Finding severity query; no keyword match — tests whether description alone triggers
+- id: api-08
+  should_trigger: true
+  invocation_type: contextual
+  prompt: What changed in my security findings since last week?
+  notes: Delta / diff query; tests 'what's new' path in Step 4
+- id: api-09
+  should_trigger: true
+  invocation_type: contextual
+  prompt: Which apps haven't been scanned in over 30 days?
+  notes: Stale app detection; tests Step 3 posture with stale-app focus
+- id: api-10
+  should_trigger: true
+  invocation_type: contextual
+  prompt: I need a security report for the team's weekly standup
+  notes: Reporting use case with no API-specific keywords
+- id: api-11
+  should_trigger: true
+  invocation_type: contextual
+  prompt: Pull the full finding details for the checkout-service — the PM wants a severity breakdown
+  notes: App deep dive; tests Step 4 path
+- id: api-12
+  should_trigger: false
+  invocation_type: negative
+  prompt: Run a DAST scan on my API
+  notes: Scan request → hawkscan skill; 'scan' should not route to api skill
+- id: api-13
+  should_trigger: false
+  invocation_type: negative
+  prompt: Scan my app on localhost:8080 for vulnerabilities
+  notes: Explicit scan request → hawkscan
+- id: api-14
+  should_trigger: false
+  invocation_type: negative
+  prompt: Create a stackhawk.yml for my service
+  notes: Config generation → hawkscan
+- id: api-15
+  should_trigger: false
+  invocation_type: negative
+  prompt: Fix the CORS misconfiguration that HawkScan found
+  notes: Code fix → hawkscan + code change; api skill reads only
+- id: api-16
+  should_trigger: false
+  invocation_type: negative
+  prompt: Run HawkScan against my staging environment
+  notes: Scan request → hawkscan; 'StackHawk' keyword should not override scan intent
diff --git a/evals/hawkscan/prompts.csv b/evals/hawkscan/prompts.csv
deleted file mode 100644
index 20e1ebd..0000000
--- a/evals/hawkscan/prompts.csv
+++ /dev/null
@@ -1,21 +0,0 @@
-id,should_trigger,invocation_type,prompt,notes
-hw-01,true,explicit,"Use $hawkscan to scan my API running on localhost:8080","Direct skill reference by name; baseline — if this fails everything else is moot"
-hw-02,true,explicit,"Run the hawkscan skill against my running application","hawkscan keyword in natural language"
-hw-03,true,implicit,"Run a DAST scan on my running application","DAST keyword from skill description"
-hw-04,true,implicit,"Scan my app for security vulnerabilities","'security scan' + 'app' language"
-hw-05,true,implicit,"Use StackHawk to check my service on port 8080","StackHawk platform keyword"
-hw-06,true,implicit,"I need to run a security scan on my API","'security scan' + 'API' language"
-hw-07,true,contextual,"I just finished implementing the payment processing endpoint. Run a security scan before we deploy.","Explicit scan request after code change"
-hw-08,true,contextual,"I added OAuth2 support to the auth service. Check it for security issues.","Implicit security check after feature; tests autonomous trigger"
-hw-09,true,contextual,"The SQL injection fix is in. Verify the vulnerability is actually resolved.","Fix verification — should route to rescan path not a fresh scan"
-hw-10,true,contextual,"I refactored the API gateway — check it hasn't introduced any security regressions.","Post-refactor security check; tests autonomous trigger on refactor"
-hw-11,true,contextual,"My API spec is at ./openapi.yaml and the service is running on port 3000. Check for security holes before I merge.","Pre-merge gate with OpenAPI spec context"
-hw-12,true,contextual,"Feature is done. Let's make sure it's secure.","Vague post-feature prompt; tests whether autonomous trigger fires on minimal phrasing"
-hw-13,false,negative,"Show me my security findings from the last StackHawk scan","Reporting query → api skill; hawkscan must NOT trigger"
-hw-14,false,negative,"What's my security posture across all my applications?","Posture dashboard → api skill; 'security' keyword alone should not trigger hawkscan"
-hw-15,false,negative,"How many HIGH severity findings do I have untriaged?","Finding count query → api skill"
-hw-16,false,negative,"Update the README with instructions for running security scans","Doc-only change; skill explicitly excludes doc-only changes"
-hw-17,false,negative,"Don't run a security scan after this change — just commit it","Explicit user skip; skill must respect user's explicit opt-out"
-hw-18,false,negative,"Change the host in stackhawk.yml from localhost:8080 to localhost:3000","Config-only edit to stackhawk.yml; not a code change, not a scan trigger"
-hw-19,false,negative,"Install the hawk CLI on my machine","Installation task with no scan intent"
-hw-20,false,negative,"What types of vulnerabilities does HawkScan detect?","Research question; no scan intent, no code change"

From 5472ed220e86fb31ef7629eb88c647d1b3eb8410 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 12:55:41 -0600
Subject: [PATCH 13/61] refactor(evals): harness scripts become shims into
 unified CLI

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/harnesses/_manual_harness.py       |  25 +-
 evals/harnesses/agy/run-evals.py         | 376 +------------
 evals/harnesses/claude-code/run-evals.py | 651 +----------------------
 evals/harnesses/codex/run-evals.py       | 593 +--------------------
 evals/harnesses/copilot/run-evals.py     | 392 +-------------
 evals/harnesses/cursor/run-evals.py      | 452 +---------------
 6 files changed, 42 insertions(+), 2447 deletions(-)

diff --git a/evals/harnesses/_manual_harness.py b/evals/harnesses/_manual_harness.py
index 7b400a9..f996e44 100644
--- a/evals/harnesses/_manual_harness.py
+++ b/evals/harnesses/_manual_harness.py
@@ -3,13 +3,13 @@
 Import this from platform-specific run-evals.py files.
 """
 
-import csv
 import json
-import os
 import sys
 from datetime import datetime, timezone
 from pathlib import Path
 
+from evals.lib.config import load_skill
+
 
 HARNESS_ROOT = Path(__file__).parent.resolve()
 EVALS_DIR    = HARNESS_ROOT.parent
@@ -36,23 +36,22 @@ def run_manual_evals(
     prompt_id: str | None,
     rubric: bool,
 ) -> None:
-    prompts_path = EVALS_DIR / skill / "prompts.csv"
-    checks_path  = EVALS_DIR / skill / "process-checks.json"
     results_dir  = HARNESS_ROOT / platform / "results" / skill
 
-    with open(prompts_path) as f:
-        all_prompts = list(csv.DictReader(f))
-    checks = json.loads(checks_path.read_text())["checks"]
+    cfg = load_skill(skill)
+    all_prompts = cfg.prompts
+    checks = cfg.checks
     blocking_checks = [c for c in checks if c.get("severity") == "blocking"]
 
     rubric_items = None
     if rubric:
+        # rubric-items.json is not yet part of evals.lib — loaded directly for now
         rubric_path = EVALS_DIR / skill / "rubric-items.json"
         if rubric_path.exists():
             rubric_items = json.loads(rubric_path.read_text())["checks"]
 
     if prompt_id:
-        prompts = [p for p in all_prompts if p["id"] == prompt_id]
+        prompts = [p for p in all_prompts if p.id == prompt_id]
         if not prompts:
             print(f"ERROR: No prompt with id '{prompt_id}'", file=sys.stderr)
             sys.exit(1)
@@ -70,11 +69,11 @@ def run_manual_evals(
     all_results = []
 
     for row in prompts:
-        run_id         = row["id"]
-        prompt         = row["prompt"]
-        should_trigger = row["should_trigger"].lower() == "true"
-        itype          = row.get("invocation_type", "")
-        notes          = row.get("notes", "")
+        run_id         = row.id
+        prompt         = row.prompt
+        should_trigger = row.should_trigger
+        itype          = row.invocation_type
+        notes          = row.notes
 
         print(f"\n{'─' * 68}")
         print(f"[{run_id}]  {itype:<12}  should_trigger={'Y' if should_trigger else 'N'}")
diff --git a/evals/harnesses/agy/run-evals.py b/evals/harnesses/agy/run-evals.py
index c485b1d..52d7fd7 100644
--- a/evals/harnesses/agy/run-evals.py
+++ b/evals/harnesses/agy/run-evals.py
@@ -1,375 +1,11 @@
 #!/usr/bin/env python3
-"""
-Antigravity (agy) eval harness for StackHawk agent skills.
-
-Uses `agy -p --print-timeout` (headless mode). Skills are installed via:
-    agy plugin install /path/to/agent-skills/plugins/hawkscan
-    agy plugin install /path/to/agent-skills/plugins/api
-
-agy outputs plain text (no --output-format stream-json), so trigger detection
-scans the full text output for CLI signals and skill-invocation phrases.
-
-Usage:
-    python3 evals/harnesses/agy/run-evals.py --skill hawkscan
-    python3 evals/harnesses/agy/run-evals.py --skill api
-    python3 evals/harnesses/agy/run-evals.py --skill hawkscan --id hw-07
-    python3 evals/harnesses/agy/run-evals.py --skill hawkscan --dry-run
-
-Requirements:
-    - agy CLI installed and authenticated
-    - StackHawk plugins installed:
-        agy plugin install /path/to/agent-skills/plugins/hawkscan
-        agy plugin install /path/to/agent-skills/plugins/api
-    - Run from the agent-skills repo root
-
-Known limitations:
-    - agy connects to a shared server process. Background tasks from your
-      main agy session can bleed into eval runs — run evals when your main
-      agy session is idle.
-    - Some contextual prompts take >180s; use --print-timeout to increase.
-    - Process check scores will be low (agy in print mode doesn't execute
-      full workflows).
-"""
-
-import argparse
-import csv
-import json
-import os
-import re
-import subprocess
+"""Back-compat shim. The eval logic now lives in evals/cli.py and evals/lib/.
+Run `uv run evals --harness agy --skill <skill>` instead.
+This shim forwards old invocations to the new CLI."""
 import sys
-import tempfile
-import shutil
-from datetime import datetime, timezone
-from pathlib import Path
-
-# ---------------------------------------------------------------------------
-# Paths
-# ---------------------------------------------------------------------------
-HARNESS_DIR = Path(__file__).parent.resolve()
-EVALS_DIR   = HARNESS_DIR.parent.parent
-REPO_ROOT   = EVALS_DIR.parent
-RESULTS_DIR = HARNESS_DIR / "results"
-
-# ---------------------------------------------------------------------------
-# Trigger signals
-# agy outputs plain text, so ALL signals are searched against output_text.
-# CLI_SIGNALS: hawk/hawkop commands that appear in agent's description of work.
-# INVOCATION_SIGNALS: phrases the agent uses when explicitly invoking a skill.
-# ---------------------------------------------------------------------------
-ALL_SIGNALS = {
-    # Explicit skill declarations injected by the OBSERVE_SUFFIX.
-    # The suffix asks the agent to state 'SKILL: hawkscan', 'SKILL: api', or 'SKILL: none'.
-    # This is far more reliable than inferring intent from CLI command mentions.
-    "hawkscan": [
-        "skill: hawkscan",
-        "skill:hawkscan",
-    ],
-    "api": [
-        "skill: api",
-        "skill:api",
-        "skill: stackhawk-api",
-    ],
-}
-
-# Negative signals — if these appear, the agent is explicitly NOT using the skill
-NEGATIVE_SIGNALS = {
-    "hawkscan": [
-        # Agent explicitly declines the scan
-        "i cannot run",
-        "i can't run",
-        "cannot perform a scan",
-        "not able to scan",
-        "no application to scan",
-    ],
-    "api": [],
-}
-
-
-# ---------------------------------------------------------------------------
-# Text parsing — agy outputs plain text, not JSONL
-# ---------------------------------------------------------------------------
-
-def parse_output(text: str) -> dict:
-    return {
-        "bash_commands":  [],   # no JSON tool calls in agy text mode
-        "files_written":  [],
-        "output_text":    text.strip(),
-        "usage":          {},
-        "error":          None,
-    }
-
-
-# ---------------------------------------------------------------------------
-# Trigger detection — text-only approach
-# ---------------------------------------------------------------------------
-
-def detect_trigger(parsed: dict, skill: str) -> bool:
-    haystack = parsed["output_text"].lower()
-    if not haystack:
-        return False
-    return any(s.lower() in haystack for s in ALL_SIGNALS.get(skill, []))
-
-
-# ---------------------------------------------------------------------------
-# Process checks
-# ---------------------------------------------------------------------------
-
-def run_process_checks(parsed: dict, checks: list) -> list[dict]:
-    haystack = " ".join([
-        *parsed["bash_commands"],
-        parsed["output_text"],
-    ]).lower()
-    all_files = " ".join(parsed["files_written"]).lower()
-
-    results = []
-    for check in checks:
-        ctype   = check.get("type", "command_executed")
-        signals = [s.lower() for s in check.get("signals", [])]
-        antis   = [a.lower() for a in check.get("anti_patterns", [])]
-
-        signal_hit = next((s for s in signals if s in haystack), None)
-        anti_hit   = next((a for a in antis   if a in haystack), None)
-
-        if ctype in ("command_negative", "file_content_negative", "output_negative"):
-            passed = anti_hit is None
-        elif ctype == "file_absent":
-            target = check.get("target_file", "").lower()
-            passed = target not in all_files
-        elif ctype == "conditional_command":
-            m = re.search(r"'([^']+)'", check.get("condition", ""))
-            condition_keyword = m.group(1).lower() if m else None
-            if condition_keyword and condition_keyword not in haystack:
-                passed = True
-            else:
-                passed = signal_hit is not None
-        elif ctype == "command_preference":
-            preferred = [p.lower() for p in check.get("preferred", [])]
-            passed = any(p in haystack for p in preferred) and anti_hit is None
-        else:
-            passed = signal_hit is not None
-            if antis:
-                passed = passed and anti_hit is None
-
-        results.append({
-            "id":           check["id"],
-            "pass":         passed,
-            "severity":     check.get("severity", "warning"),
-            "signal_found": signal_hit,
-            "anti_found":   anti_hit,
-        })
-    return results
-
-
-def score_checks(results: list[dict]) -> dict:
-    blocking_failed = sum(1 for r in results if not r["pass"] and r["severity"] == "blocking")
-    warning_failed  = sum(1 for r in results if not r["pass"] and r["severity"] == "warning")
-    return {
-        "total":           len(results),
-        "passed":          sum(1 for r in results if r["pass"]),
-        "blocking_failed": blocking_failed,
-        "warning_failed":  warning_failed,
-        "score":           max(0, 100 - blocking_failed * 15 - warning_failed * 5),
-    }
-
-
-# ---------------------------------------------------------------------------
-# Run agy
-# ---------------------------------------------------------------------------
-
-OBSERVE_SUFFIX = (
-    "\n\n(Eval mode: before responding, state which skill you would invoke: "
-    "'SKILL: hawkscan', 'SKILL: api', or 'SKILL: none'. Then proceed with your response.)"
-)
-
-
-def run_agy(
-    prompt: str,
-    skill: str,
-    run_id: str,
-    model: str | None = None,
-    print_timeout: str = "120s",
-    observe: bool = True,
-) -> tuple[dict, int]:
-    # In observe mode, append a suffix so agy describes its plan without
-    # blocking on tool call approvals (which hang forever in --print mode).
-    effective_prompt = prompt + OBSERVE_SUFFIX if observe else prompt
-    tmpdir = Path(tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_"))
-    try:
-        cmd = ["agy", "-p", effective_prompt, "--print-timeout", print_timeout]
-        if model:
-            cmd += ["--model", model]
-
-        proc = subprocess.run(
-            cmd,
-            capture_output=True,
-            text=True,
-            timeout=int(print_timeout.rstrip("s")) + 30,
-            cwd=str(tmpdir),
-            env={**os.environ},
-        )
-
-        trace_dir = RESULTS_DIR / skill
-        trace_dir.mkdir(parents=True, exist_ok=True)
-        (trace_dir / f"{run_id}.txt").write_text(proc.stdout)
-
-        parsed = parse_output(proc.stdout)
-        if proc.returncode != 0 and not parsed["output_text"]:
-            stderr = proc.stderr.strip()
-            if stderr:
-                parsed["error"] = stderr[:300]
-
-        return parsed, proc.returncode
-
-    except subprocess.TimeoutExpired:
-        return {"bash_commands": [], "files_written": [], "output_text": "",
-                "usage": {}, "error": "timeout"}, 1
-    except FileNotFoundError:
-        print("ERROR: 'agy' CLI not found.", file=sys.stderr)
-        sys.exit(1)
-    finally:
-        shutil.rmtree(tmpdir, ignore_errors=True)
-
-
-# ---------------------------------------------------------------------------
-# Main
-# ---------------------------------------------------------------------------
-
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Antigravity (agy) eval harness for StackHawk agent skills",
-    )
-    parser.add_argument("--skill", required=True, choices=["hawkscan", "api"])
-    parser.add_argument("--id", dest="prompt_id", metavar="RUN_ID")
-    parser.add_argument("--dry-run", action="store_true")
-    parser.add_argument("--model", metavar="MODEL_ID",
-                        help="Model override (passed to agy --model)")
-    parser.add_argument("--print-timeout", default="180s",
-                        help="Per-prompt timeout for agy (default: 180s)")
-    args = parser.parse_args()
-
-    skill = args.skill
-    prompts_path = EVALS_DIR / skill / "prompts.csv"
-    checks_path  = EVALS_DIR / skill / "process-checks.json"
-
-    with open(prompts_path) as f:
-        all_prompts = list(csv.DictReader(f))
-    checks = json.loads(checks_path.read_text())["checks"]
-
-    if args.prompt_id:
-        prompts = [p for p in all_prompts if p["id"] == args.prompt_id]
-        if not prompts:
-            print(f"ERROR: No prompt with id '{args.prompt_id}'", file=sys.stderr)
-            sys.exit(1)
-    else:
-        prompts = all_prompts
-
-    model_label = f"  |  Model: {args.model}" if args.model else ""
-    print(f"\nSkill: {skill}  |  Platform: agy  |  Mode: observe{model_label}  |  Prompts: {len(prompts)}")
-    if args.dry_run:
-        print("[dry-run — no agy calls]")
-    print("─" * 68)
-
-    all_results = []
-
-    for row in prompts:
-        run_id         = row["id"]
-        prompt         = row["prompt"]
-        should_trigger = row["should_trigger"].lower() == "true"
-        itype          = row.get("invocation_type", "")
-
-        print(f"\n[{run_id}] {itype:<12}  trigger={'Y' if should_trigger else 'N'}")
-        print(f"  {prompt[:92]}{'…' if len(prompt) > 92 else ''}")
-
-        if args.dry_run:
-            print("  → skipped")
-            continue
-
-        parsed, _exit = run_agy(
-            prompt, skill, run_id,
-            model=args.model,
-            print_timeout=args.print_timeout,
-            observe=True,
-        )
-
-        if parsed.get("error"):
-            print(f"  ERROR: {parsed['error']}")
-
-        did_trigger = detect_trigger(parsed, skill)
-        trigger_ok  = did_trigger == should_trigger
-
-        process_results: list[dict] = []
-        scoring = {"total": 0, "passed": 0, "blocking_failed": 0, "warning_failed": 0, "score": 0}
-        if should_trigger and did_trigger:
-            process_results = run_process_checks(parsed, checks)
-            scoring = score_checks(process_results)
-
-        result = {
-            "platform":        "agy",
-            "skill":           skill,
-            "run_id":          run_id,
-            "prompt":          prompt,
-            "should_trigger":  should_trigger,
-            "did_trigger":     did_trigger,
-            "trigger_correct": trigger_ok,
-            "bash_commands":   parsed["bash_commands"],
-            "files_written":   parsed["files_written"],
-            "process_checks":  process_results,
-            "scoring":         scoring,
-            "timestamp":       datetime.now(timezone.utc).isoformat(),
-        }
-        all_results.append(result)
-
-        out_dir = RESULTS_DIR / skill
-        out_dir.mkdir(parents=True, exist_ok=True)
-        (out_dir / f"{run_id}.result.json").write_text(json.dumps(result, indent=2))
-
-        t_icon    = "✓" if trigger_ok else "✗"
-        score_str = f"score={scoring['score']}/100" if process_results else "—"
-        print(f"  {t_icon} did_trigger={did_trigger}  {score_str}")
-        for pr in process_results:
-            if not pr["pass"] and pr["severity"] == "blocking":
-                print(f"    BLOCKING FAIL: {pr['id']}")
-
-    if args.dry_run or not all_results:
-        return
-
-    trigger_correct = sum(1 for r in all_results if r["trigger_correct"])
-    total = len(all_results)
-    false_pos = [r for r in all_results if not r["should_trigger"] and r["did_trigger"]]
-    false_neg = [r for r in all_results if r["should_trigger"] and not r["did_trigger"]]
-    proc_runs = [r for r in all_results if r["process_checks"]]
-    avg_score = (sum(r["scoring"]["score"] for r in proc_runs) // len(proc_runs)
-                 if proc_runs else None)
-    total_blocking = sum(r["scoring"]["blocking_failed"] for r in proc_runs) if proc_runs else 0
-
-    print("\n" + "═" * 68)
-    print(f"SUMMARY  skill={skill}  platform=agy")
-    print(f"  Trigger accuracy : {trigger_correct}/{total} ({100 * trigger_correct // total}%)")
-    if false_pos:
-        print(f"  False positives  : {', '.join(r['run_id'] for r in false_pos)}")
-    if false_neg:
-        print(f"  False negatives  : {', '.join(r['run_id'] for r in false_neg)}")
-    if avg_score is not None:
-        print(f"  Process avg score: {avg_score}/100  (blocking failures: {total_blocking})")
-    print(f"  Results in       : {RESULTS_DIR / skill}/")
-
-    summary = {
-        "skill": skill, "platform": "agy",
-        "timestamp": datetime.now(timezone.utc).isoformat(),
-        "trigger_accuracy": {"correct": trigger_correct, "total": total},
-        "false_positives":  [r["run_id"] for r in false_pos],
-        "false_negatives":  [r["run_id"] for r in false_neg],
-        "process_avg_score": avg_score,
-        "total_blocking_failures": total_blocking,
-        "runs": [{"run_id": r["run_id"], "trigger_correct": r["trigger_correct"],
-                  "score": r["scoring"]["score"]} for r in all_results],
-    }
-    (RESULTS_DIR / skill / "summary.json").write_text(json.dumps(summary, indent=2))
-
-    if false_pos or false_neg or total_blocking > 0:
-        sys.exit(1)
-
+from evals.cli import main
 
 if __name__ == "__main__":
+    if "--harness" not in sys.argv:
+        sys.argv += ["--harness", "agy"]
     main()
diff --git a/evals/harnesses/claude-code/run-evals.py b/evals/harnesses/claude-code/run-evals.py
index 6d8679f..9489d2b 100644
--- a/evals/harnesses/claude-code/run-evals.py
+++ b/evals/harnesses/claude-code/run-evals.py
@@ -1,650 +1,11 @@
 #!/usr/bin/env python3
-"""
-Claude Code eval harness for StackHawk agent skills.
-
-Usage:
-    python3 run-evals.py --skill hawkscan          # all prompts
-    python3 run-evals.py --skill api               # all prompts
-    python3 run-evals.py --skill hawkscan --id hw-07    # single prompt
-    python3 run-evals.py --skill hawkscan --dry-run     # print prompts, no claude calls
-    python3 run-evals.py --skill hawkscan --full-auto   # allow agent to execute commands
-    python3 run-evals.py --skill hawkscan --rubric      # also run qualitative rubric grader
-    python3 run-evals.py --skill hawkscan --bare        # CI mode: ANTHROPIC_API_KEY only, no keychain
-
-Requirements:
-    - claude CLI installed and authenticated (https://claude.ai/code)
-    - Run from the agent-skills repo root (plugin dirs are auto-detected)
-
-Output:
-    evals/harnesses/claude-code/results/<skill>/<run-id>.jsonl       raw trace
-    evals/harnesses/claude-code/results/<skill>/<run-id>.result.json scored result
-    evals/harnesses/claude-code/results/<skill>/summary.json         run summary
-"""
-
-import argparse
-import csv
-import json
-import os
-import shutil
-import subprocess
+"""Back-compat shim. The eval logic now lives in evals/cli.py and evals/lib/.
+Run `uv run evals --harness claude-code --skill <skill>` instead.
+This shim forwards old invocations to the new CLI."""
 import sys
-import tempfile
-from datetime import datetime, timezone
-from pathlib import Path
-
-# ---------------------------------------------------------------------------
-# Paths
-# ---------------------------------------------------------------------------
-HARNESS_DIR = Path(__file__).parent.resolve()
-EVALS_DIR = HARNESS_DIR.parent.parent
-REPO_ROOT = EVALS_DIR.parent
-RESULTS_DIR = HARNESS_DIR / "results"
-
-# ---------------------------------------------------------------------------
-# Trigger signals
-# Any of these appearing in bash commands or output text means the skill fired.
-# ---------------------------------------------------------------------------
-# CLI signals — checked against bash_commands only (prevents documentation content
-# from creating false positives when the agent writes README/guides about HawkScan).
-CLI_SIGNALS = {
-    "hawkscan": [
-        "hawk scan",
-        "hawk validate",
-        "hawk rescan",
-        # "hawk version" intentionally excluded: running 'hawk version' alone is common
-        # for installation-check tasks and would cause false positives. The preflight
-        # workflow always runs 'hawk config --help' in the same command, so 'hawk config'
-        # below is sufficient to distinguish scan-intent from install-check tasks.
-        "hawk config",
-        "hawk create app",
-        "hawk init",
-        "hawk perch",
-    ],
-    "api": [
-        "hawkop scan",
-        "hawkop app",
-        "hawkop org",
-        "hawkop env",
-        "hawkop status",
-        "hawkop init",
-        "/api/v1/scan",
-        "/api/v2/org",
-        "hawk_api GET",
-    ],
-}
-
-# Invocation signals — checked against output_text only. Catches contextual prompts
-# where the agent correctly identifies the skill should trigger and says so explicitly,
-# but can't reach the CLI workflow (empty working dir, no running app, etc.).
-#
-# These are intentionally specific to action-intent phrases, NOT the generic
-# "hawkscan:hawkscan: yes" pattern (which also fires on educational/informational
-# responses where the agent answers "what does HawkScan detect?" type questions).
-INVOCATION_SIGNALS = {
-    "hawkscan": [
-        # Generic YES-evaluation signals — catch any run where the agent explicitly
-        # evaluates hawkscan as YES regardless of phrasing. Models vary in their markdown
-        # formatting: backtick (`` `hawkscan:hawkscan` ``), bold (**hawkscan:hawkscan**),
-        # or plain text. Each produces a different character sequence around `: YES`.
-        # Safe because SKILL.md now instructs NO for educational questions (hw-20),
-        # doc-only changes (hw-16/17/18), installation tasks (hw-19), and explicit skips.
-        "hawkscan:hawkscan`: yes",   # "`hawkscan:hawkscan`: YES" — backtick + colon (Sonnet/Haiku)
-        "hawkscan:hawkscan` — yes",  # "`hawkscan:hawkscan` — YES" — backtick + em-dash
-        "hawkscan:hawkscan**: yes",  # "**hawkscan:hawkscan**: YES" — bold + colon
-        "hawkscan:hawkscan** — yes", # "**hawkscan:hawkscan** — YES" — bold + em-dash
-        "hawkscan:hawkscan: yes",    # "hawkscan:hawkscan: YES" — plain colon
-        "hawkscan:hawkscan — yes",   # "hawkscan:hawkscan — YES" — em-dash
-        "hawkscan:hawkscan - yes",   # "hawkscan:hawkscan - YES" — plain hyphen (Opus 4.7)
-        "hawkscan:hawkscan - **yes", # "hawkscan:hawkscan - **YES**" — bold YES (Opus 4.7)
-        # Plugin name only — Opus 4.7 sometimes omits :hawkscan suffix
-        "hawkscan**: yes",           # "**hawkscan**: YES" — bold, no skill suffix
-        "hawkscan** — yes",          # bold + em-dash, no skill suffix
-        "hawkscan** - yes",          # "**hawkscan:hawkscan** - YES" — bold name + hyphen (Opus)
-        "hawkscan: yes",             # plain colon, no skill suffix
-        "hawkscan — yes",            # em-dash, no skill suffix
-        "hawkscan - yes",            # plain hyphen, no skill suffix
-        # Specific action-intent phrases as belt-and-suspenders for unusual formats
-        "autonomous security scan",
-        "dast scan after code",
-        "dast scan triggered",
-        "dast scan required",
-        "security scan required",
-        "security scan after",
-        "run the security scan",
-        "running the hawkscan",
-    ],
-    "api": [
-        # Full skill name (plugin:skill) — Sonnet/Haiku format
-        "stackhawk-api:api`: yes",   # backtick + colon
-        "stackhawk-api:api` — yes",  # backtick + em-dash
-        "stackhawk-api:api**: yes",  # bold + colon
-        "stackhawk-api:api** — yes", # bold + em-dash
-        "stackhawk-api:api: yes",    # plain colon
-        "stackhawk-api:api — yes",   # em-dash
-        "stackhawk-api:api - yes",   # plain hyphen (Opus 4.7)
-        # Plugin name only (Opus 4.7 sometimes omits :api suffix)
-        "stackhawk-api**: yes",      # bold + colon, no skill suffix
-        "stackhawk-api** — yes",     # bold + em-dash, no skill suffix
-        "stackhawk-api** - yes",     # bold + plain hyphen, no skill suffix (Opus)
-        "stackhawk-api: yes",        # plain colon, no skill suffix
-        "stackhawk-api — yes",       # em-dash, no skill suffix
-        "stackhawk-api - yes",       # plain hyphen, no skill suffix
-    ],
-}
-
-# ---------------------------------------------------------------------------
-# Stream-json parsing
-# ---------------------------------------------------------------------------
-
-def parse_stream(jsonl: str) -> dict:
-    """Extract structured data from a claude --output-format stream-json run."""
-    bash_commands: list[str] = []
-    files_written: list[str] = []
-    files_edited: list[str] = []
-    output_text = ""
-    cost_usd = 0.0
-    error = None
-
-    for line in jsonl.splitlines():
-        line = line.strip()
-        if not line:
-            continue
-        try:
-            event = json.loads(line)
-        except json.JSONDecodeError:
-            continue
-
-        etype = event.get("type", "")
-
-        if etype == "assistant":
-            for block in event.get("message", {}).get("content", []):
-                btype = block.get("type", "")
-                if btype == "text":
-                    output_text += block.get("text", "") + "\n"
-                elif btype == "tool_use":
-                    name = block.get("name", "")
-                    inp = block.get("input", {})
-                    if name == "Bash":
-                        cmd = inp.get("command", "")
-                        if cmd:
-                            bash_commands.append(cmd)
-                    elif name == "Write":
-                        path = inp.get("file_path", "")
-                        if path:
-                            files_written.append(path)
-                    elif name == "Edit":
-                        path = inp.get("file_path", "")
-                        if path:
-                            files_edited.append(path)
-
-        elif etype == "result":
-            cost_usd = event.get("cost_usd") or 0.0
-            output_text += event.get("result", "")
-            if event.get("subtype") == "error_during_execution":
-                error = event.get("result", "unknown error")
-
-    return {
-        "bash_commands": bash_commands,
-        "files_written": files_written,
-        "files_edited": files_edited,
-        "output_text": output_text.strip(),
-        "cost_usd": cost_usd,
-        "error": error,
-    }
-
-
-# ---------------------------------------------------------------------------
-# Trigger detection
-# ---------------------------------------------------------------------------
-
-def detect_trigger(parsed: dict, skill: str) -> bool:
-    # CLI signals are checked only against actual bash commands executed — prevents
-    # documentation content (README guides, educational answers) from triggering.
-    cli_haystack = " ".join(parsed["bash_commands"]).lower()
-    if any(s.lower() in cli_haystack for s in CLI_SIGNALS.get(skill, [])):
-        return True
-
-    # Invocation signals are checked only against output text — catches cases where
-    # the agent evaluated the skill as YES but couldn't run CLI commands (e.g. empty
-    # working dir, permission blocks on hawkop, no running app).
-    text_haystack = parsed["output_text"].lower()
-    return any(s.lower() in text_haystack for s in INVOCATION_SIGNALS.get(skill, []))
-
-
-# ---------------------------------------------------------------------------
-# Process checks
-# ---------------------------------------------------------------------------
-
-def run_process_checks(parsed: dict, checks: list) -> list[dict]:
-    haystack = " ".join([
-        *parsed["bash_commands"],
-        parsed["output_text"],
-    ]).lower()
-    all_files = " ".join(parsed["files_written"] + parsed["files_edited"]).lower()
-
-    results = []
-    for check in checks:
-        ctype = check.get("type", "command_executed")
-        signals = [s.lower() for s in check.get("signals", [])]
-        antis = [a.lower() for a in check.get("anti_patterns", [])]
-
-        signal_hit = next((s for s in signals if s in haystack), None)
-        anti_hit = next((a for a in antis if a in haystack), None)
-
-        if ctype in ("command_negative", "file_content_negative", "output_negative"):
-            passed = anti_hit is None
-        elif ctype == "file_absent":
-            target = check.get("target_file", "").lower()
-            passed = target not in all_files
-        elif ctype == "conditional_command":
-            # Only enforce when the condition's keyword appears in the trace.
-            # Extract the keyword inside single quotes from the condition string,
-            # e.g. "stackhawk.yml contains 'authentication:'" → "authentication:"
-            import re as _re
-            condition_str = check.get("condition", "")
-            m = _re.search(r"'([^']+)'", condition_str)
-            condition_keyword = m.group(1).lower() if m else None
-            if condition_keyword and condition_keyword not in haystack:
-                passed = True  # condition not met — check is not applicable
-            else:
-                passed = signal_hit is not None
-        elif ctype == "command_preference":
-            preferred = [p.lower() for p in check.get("preferred", [])]
-            passed = any(p in haystack for p in preferred) and anti_hit is None
-        else:
-            passed = signal_hit is not None
-            if antis:
-                passed = passed and anti_hit is None
-
-        results.append({
-            "id": check["id"],
-            "pass": passed,
-            "severity": check.get("severity", "warning"),
-            "signal_found": signal_hit,
-            "anti_found": anti_hit,
-        })
-
-    return results
-
-
-def score_checks(results: list[dict]) -> dict:
-    blocking_failed = sum(1 for r in results if not r["pass"] and r["severity"] == "blocking")
-    warning_failed  = sum(1 for r in results if not r["pass"] and r["severity"] == "warning")
-    score = max(0, 100 - blocking_failed * 15 - warning_failed * 5)
-    return {
-        "total": len(results),
-        "passed": sum(1 for r in results if r["pass"]),
-        "blocking_failed": blocking_failed,
-        "warning_failed": warning_failed,
-        "score": score,
-    }
-
-
-# ---------------------------------------------------------------------------
-# Run claude -p
-# ---------------------------------------------------------------------------
-
-def run_claude(
-    prompt: str,
-    skill: str,
-    run_id: str,
-    plugin_dirs: list[str],
-    full_auto: bool = False,
-    bare: bool = False,
-    max_budget: float = 0.20,
-    model: str | None = None,
-) -> tuple[dict, int]:
-    # Each eval runs in a fresh temp dir so there is no state leakage.
-    tmpdir = tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_")
-    try:
-        cmd = [
-            "claude", "-p", prompt,
-            "--output-format", "stream-json",
-            "--verbose",
-            "--no-session-persistence",
-            "--max-budget-usd", str(max_budget),
-        ]
-        if model:
-            cmd += ["--model", model]
-        for pd in plugin_dirs:
-            cmd += ["--plugin-dir", pd]
-        if full_auto:
-            cmd.append("--dangerously-skip-permissions")
-        if bare:
-            cmd.append("--bare")
-
-        proc = subprocess.run(
-            cmd,
-            capture_output=True,
-            text=True,
-            timeout=300,
-            cwd=tmpdir,
-        )
-
-        trace_dir = RESULTS_DIR / skill
-        trace_dir.mkdir(parents=True, exist_ok=True)
-        (trace_dir / f"{run_id}.jsonl").write_text(proc.stdout)
-
-        return parse_stream(proc.stdout), proc.returncode
-
-    except subprocess.TimeoutExpired:
-        return {
-            "bash_commands": [], "files_written": [], "files_edited": [],
-            "output_text": "", "cost_usd": 0.0, "error": "timeout",
-        }, 1
-    except FileNotFoundError:
-        print(
-            "ERROR: 'claude' CLI not found. "
-            "Install Claude Code (https://claude.ai/code) and ensure it is in PATH.",
-            file=sys.stderr,
-        )
-        sys.exit(1)
-    finally:
-        shutil.rmtree(tmpdir, ignore_errors=True)
-
-
-# ---------------------------------------------------------------------------
-# Rubric grader (qualitative, model-assisted, optional)
-# ---------------------------------------------------------------------------
-
-def run_rubric_grader(
-    parsed: dict,
-    skill: str,
-    run_id: str,
-    plugin_dirs: list[str],
-    bare: bool = False,
-) -> dict | None:
-    rubric_path = EVALS_DIR / skill / "rubric-items.json"
-    schema_path = EVALS_DIR / "rubric-schema.json"
-    if not rubric_path.exists() or not schema_path.exists():
-        print("  [rubric] rubric-items.json or rubric-schema.json not found — skipping",
-              file=sys.stderr)
-        return None
-
-    rubric_data = json.loads(rubric_path.read_text())
-    schema = json.loads(schema_path.read_text())
-
-    grader_prompt = f"""{rubric_data['grader_prompt']}
-
-## Bash Commands Executed:
-{json.dumps(parsed['bash_commands'], indent=2)}
-
-## Files Written/Edited:
-{json.dumps(parsed['files_written'] + parsed['files_edited'], indent=2)}
-
-## Agent Output (first 4000 chars):
-{parsed['output_text'][:4000]}
-
-## Rubric Checks to Grade:
-{json.dumps(rubric_data['checks'], indent=2)}
-
-Populate the JSON result with:
-  skill = "{skill}"
-  run_id = "{run_id}"
-  overall_pass = true if all checks pass and score >= 70
-  score = 0-100
-  checks = one entry per check id listed above"""
-
-    cmd = [
-        "claude", "-p", grader_prompt,
-        "--output-format", "json",
-        "--no-session-persistence",
-        "--json-schema", json.dumps(schema),
-        "--max-budget-usd", "0.10",
-    ]
-    for pd in plugin_dirs:
-        cmd += ["--plugin-dir", pd]
-    if bare:
-        cmd.append("--bare")
-
-    try:
-        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
-        envelope = json.loads(proc.stdout)
-        # --output-format json wraps the response: {"result": "<json_string>", ...}
-        raw_result = envelope.get("result", "{}")
-        if isinstance(raw_result, dict):
-            return raw_result
-        return json.loads(raw_result)
-    except Exception as exc:
-        print(f"  [rubric] grader failed: {exc}", file=sys.stderr)
-        return None
-
-
-# ---------------------------------------------------------------------------
-# Main
-# ---------------------------------------------------------------------------
-
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Claude Code eval harness for StackHawk agent skills",
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-    )
-    parser.add_argument("--skill", required=True, choices=["hawkscan", "api"])
-    parser.add_argument("--id", dest="prompt_id", metavar="RUN_ID",
-                        help="Run a single prompt by id (e.g. hw-07)")
-    parser.add_argument("--dry-run", action="store_true",
-                        help="Print prompts without calling claude")
-    parser.add_argument("--rubric", action="store_true",
-                        help="Run qualitative rubric grader after process checks (extra cost + time)")
-    parser.add_argument("--full-auto", action="store_true",
-                        help="Pass --dangerously-skip-permissions so the agent can execute commands")
-    parser.add_argument("--bare", action="store_true",
-                        help="Pass --bare to claude: ANTHROPIC_API_KEY only, no keychain/hooks/CLAUDE.md (recommended for CI)")
-    parser.add_argument("--max-budget", type=float, default=0.20, metavar="USD",
-                        help="Max spend per eval run in USD (default: 0.20)")
-    parser.add_argument("--plugin-dir", action="append", dest="plugin_dirs",
-                        help="Plugin dir to load; auto-detected from repo root if omitted")
-    parser.add_argument("--model", metavar="MODEL_ID",
-                        help="Override the Claude model (e.g. claude-haiku-4-5-20251001, claude-sonnet-4-6)")
-    args = parser.parse_args()
-
-    skill = args.skill
-    plugin_dirs = args.plugin_dirs or [str(REPO_ROOT / "plugins" / skill)]
-
-    prompts_path = EVALS_DIR / skill / "prompts.csv"
-    checks_path  = EVALS_DIR / skill / "process-checks.json"
-
-    with open(prompts_path) as f:
-        all_prompts = list(csv.DictReader(f))
-    checks = json.loads(checks_path.read_text())["checks"]
-
-    if args.prompt_id:
-        prompts = [p for p in all_prompts if p["id"] == args.prompt_id]
-        if not prompts:
-            print(f"ERROR: No prompt with id '{args.prompt_id}'", file=sys.stderr)
-            sys.exit(1)
-    else:
-        prompts = all_prompts
-
-    mode = "full-auto" if args.full_auto else "observe"
-    if args.bare:
-        mode += "+bare"
-    model_label = f"  |  Model: {args.model}" if args.model else ""
-    print(f"\nSkill: {skill}  |  Platform: claude-code  |  Mode: {mode}{model_label}  |  Prompts: {len(prompts)}")
-    if args.dry_run:
-        print("[dry-run — no claude calls]")
-    print("─" * 68)
-
-    all_results = []
-    total_cost = 0.0
-
-    for row in prompts:
-        run_id        = row["id"]
-        prompt        = row["prompt"]
-        should_trigger = row["should_trigger"].lower() == "true"
-        itype         = row.get("invocation_type", "")
-
-        print(f"\n[{run_id}] {itype:<12}  trigger={'Y' if should_trigger else 'N'}")
-        print(f"  {prompt[:92]}{'…' if len(prompt) > 92 else ''}")
-
-        if args.dry_run:
-            print("  → skipped")
-            continue
-
-        parsed, _exit = run_claude(
-            prompt, skill, run_id, plugin_dirs,
-            full_auto=args.full_auto,
-            bare=args.bare,
-            max_budget=args.max_budget,
-            model=args.model,
-        )
-        total_cost += parsed.get("cost_usd", 0.0)
-
-        if parsed.get("error"):
-            print(f"  ERROR: {parsed['error']}")
-
-        did_trigger  = detect_trigger(parsed, skill)
-        trigger_ok   = did_trigger == should_trigger
-
-        process_results: list[dict] = []
-        scoring = {"total": 0, "passed": 0, "blocking_failed": 0, "warning_failed": 0, "score": 0}
-        if should_trigger and did_trigger:
-            process_results = run_process_checks(parsed, checks)
-            scoring = score_checks(process_results)
-
-        rubric_result = None
-        if args.rubric and should_trigger and did_trigger:
-            print("  [rubric] grading…", end=" ", flush=True)
-            rubric_result = run_rubric_grader(parsed, skill, run_id, plugin_dirs, bare=args.bare)
-            print(f"score={rubric_result.get('score', '?')}" if rubric_result else "failed")
-
-        result = {
-            "platform": "claude-code",
-            "skill": skill,
-            "run_id": run_id,
-            "prompt": prompt,
-            "should_trigger": should_trigger,
-            "did_trigger": did_trigger,
-            "trigger_correct": trigger_ok,
-            "bash_commands": parsed["bash_commands"],
-            "files_written": parsed["files_written"],
-            "process_checks": process_results,
-            "scoring": scoring,
-            "rubric_result": rubric_result,
-            "cost_usd": parsed.get("cost_usd", 0.0),
-            "timestamp": datetime.now(timezone.utc).isoformat(),
-        }
-        all_results.append(result)
-
-        out_dir = RESULTS_DIR / skill
-        out_dir.mkdir(parents=True, exist_ok=True)
-        (out_dir / f"{run_id}.result.json").write_text(json.dumps(result, indent=2))
-
-        t_icon   = "✓" if trigger_ok else "✗"
-        score_str = f"score={scoring['score']}/100" if process_results else "—"
-        print(f"  {t_icon} did_trigger={did_trigger}  {score_str}  ${parsed.get('cost_usd', 0):.3f}")
-
-        for pr in process_results:
-            if not pr["pass"] and pr["severity"] == "blocking":
-                print(f"    BLOCKING FAIL: {pr['id']}")
-
-    if args.dry_run or not all_results:
-        return
-
-    # ── Final summary ──────────────────────────────────────────────────────
-    trigger_correct = sum(1 for r in all_results if r["trigger_correct"])
-    total = len(all_results)
-    false_pos = [r for r in all_results if not r["should_trigger"] and r["did_trigger"]]
-    false_neg = [r for r in all_results if r["should_trigger"] and not r["did_trigger"]]
-    process_runs = [r for r in all_results if r["process_checks"]]
-    avg_score = (sum(r["scoring"]["score"] for r in process_runs) // len(process_runs)
-                 if process_runs else None)
-    total_blocking = (sum(r["scoring"]["blocking_failed"] for r in process_runs)
-                      if process_runs else 0)
-
-    print("\n" + "═" * 68)
-    print(f"SUMMARY  skill={skill}  platform=claude-code")
-    print(f"  Trigger accuracy : {trigger_correct}/{total} ({100 * trigger_correct // total}%)")
-    if false_pos:
-        print(f"  False positives  : {', '.join(r['run_id'] for r in false_pos)}")
-    if false_neg:
-        print(f"  False negatives  : {', '.join(r['run_id'] for r in false_neg)}")
-    if avg_score is not None:
-        print(f"  Process avg score: {avg_score}/100  (blocking failures: {total_blocking})")
-    print(f"  Total cost       : ${total_cost:.3f}")
-    print(f"  Results in       : {RESULTS_DIR / skill}/")
-
-    summary = {
-        "skill": skill,
-        "platform": "claude-code",
-        "timestamp": datetime.now(timezone.utc).isoformat(),
-        "trigger_accuracy": {"correct": trigger_correct, "total": total},
-        "false_positives": [r["run_id"] for r in false_pos],
-        "false_negatives": [r["run_id"] for r in false_neg],
-        "process_avg_score": avg_score,
-        "total_blocking_failures": total_blocking,
-        "total_cost_usd": round(total_cost, 4),
-        "runs": [
-            {
-                "run_id": r["run_id"],
-                "trigger_correct": r["trigger_correct"],
-                "score": r["scoring"]["score"],
-                "cost_usd": r["cost_usd"],
-            }
-            for r in all_results
-        ],
-    }
-    (RESULTS_DIR / skill / "summary.json").write_text(json.dumps(summary, indent=2))
-
-    # ── GitHub Actions step summary ────────────────────────────────────────
-    step_summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
-    if step_summary_path:
-        _write_step_summary(
-            step_summary_path, skill, all_results,
-            false_pos, false_neg, avg_score, total_blocking, total_cost,
-        )
-
-    # ── Exit non-zero for CI on any regression ─────────────────────────────
-    if false_pos or false_neg or total_blocking > 0:
-        sys.exit(1)
-
-
-def _write_step_summary(
-    path: str,
-    skill: str,
-    results: list[dict],
-    false_pos: list[dict],
-    false_neg: list[dict],
-    avg_score: int | None,
-    total_blocking: int,
-    total_cost: float,
-) -> None:
-    correct = sum(1 for r in results if r["trigger_correct"])
-    total = len(results)
-    trigger_icon = "✅" if correct == total else "❌"
-    score_icon = "✅" if (avg_score or 0) >= 70 and total_blocking == 0 else "❌"
-
-    lines = [
-        f"## Skill Eval: `{skill}` (claude-code)\n",
-        "| Metric | Value |",
-        "|---|---|",
-        f"| Trigger accuracy | {trigger_icon} {correct}/{total} |",
-    ]
-    if false_pos:
-        lines.append(f"| False positives | ⚠️ {', '.join(r['run_id'] for r in false_pos)} |")
-    if false_neg:
-        lines.append(f"| False negatives | ⚠️ {', '.join(r['run_id'] for r in false_neg)} |")
-    if avg_score is not None:
-        lines.append(f"| Process avg score | {score_icon} {avg_score}/100 |")
-        lines.append(f"| Blocking failures | {'❌' if total_blocking else '✅'} {total_blocking} |")
-    lines.append(f"| Total cost | ${total_cost:.3f} |")
-    lines.append("")
-
-    # Per-run table
-    lines += [
-        "<details><summary>Per-run results</summary>\n",
-        "| ID | Trigger | Score | Cost |",
-        "|---|---|---|---|",
-    ]
-    for r in results:
-        t = "✅" if r["trigger_correct"] else "❌"
-        score = r["scoring"]["score"] if r["process_checks"] else "—"
-        lines.append(f"| {r['run_id']} | {t} | {score} | ${r['cost_usd']:.3f} |")
-    lines.append("\n</details>\n")
-
-    with open(path, "a") as f:
-        f.write("\n".join(lines) + "\n")
-
+from evals.cli import main
 
 if __name__ == "__main__":
+    if "--harness" not in sys.argv:
+        sys.argv += ["--harness", "claude-code"]
     main()
diff --git a/evals/harnesses/codex/run-evals.py b/evals/harnesses/codex/run-evals.py
index 3c0828f..24df734 100644
--- a/evals/harnesses/codex/run-evals.py
+++ b/evals/harnesses/codex/run-evals.py
@@ -1,592 +1,11 @@
 #!/usr/bin/env python3
-"""
-Codex eval harness for StackHawk agent skills.
-
-Usage:
-    python3 run-evals.py --skill hawkscan          # all prompts
-    python3 run-evals.py --skill api               # all prompts
-    python3 run-evals.py --skill hawkscan --id hw-07    # single prompt
-    python3 run-evals.py --skill hawkscan --dry-run     # print prompts, no codex calls
-    python3 run-evals.py --skill hawkscan --rubric      # also run qualitative rubric grader
-
-Requirements:
-    - codex CLI installed and authenticated (https://openai.com/codex)
-    - Run from the agent-skills repo root
-
-Output:
-    evals/harnesses/codex/results/<skill>/<run-id>.jsonl       raw JSONL trace
-    evals/harnesses/codex/results/<skill>/<run-id>.result.json scored result
-    evals/harnesses/codex/results/<skill>/summary.json         run summary
-"""
-
-import argparse
-import csv
-import json
-import os
-import shutil
-import subprocess
+"""Back-compat shim. The eval logic now lives in evals/cli.py and evals/lib/.
+Run `uv run evals --harness codex --skill <skill>` instead.
+This shim forwards old invocations to the new CLI."""
 import sys
-import tempfile
-from datetime import datetime, timezone
-from pathlib import Path
-
-# ---------------------------------------------------------------------------
-# Paths
-# ---------------------------------------------------------------------------
-HARNESS_DIR = Path(__file__).parent.resolve()
-EVALS_DIR   = HARNESS_DIR.parent.parent
-REPO_ROOT   = EVALS_DIR.parent
-RESULTS_DIR = HARNESS_DIR / "results"
-
-# ---------------------------------------------------------------------------
-# Trigger signals
-# ---------------------------------------------------------------------------
-# CLI signals — checked against bash_commands only (prevents documentation content
-# from creating false positives when the agent writes README/guides about HawkScan).
-CLI_SIGNALS = {
-    "hawkscan": [
-        "hawk scan",
-        "hawk validate",
-        "hawk rescan",
-        # "hawk version" excluded: running 'hawk version' alone is common for
-        # installation-check tasks and would cause false positives. The preflight
-        # workflow always also runs 'hawk config --help', so 'hawk config' below suffices.
-        "hawk config",
-        "hawk create app",
-        "hawk init",
-        "hawk perch",
-    ],
-    # Signals specific to the api reporting workflow — avoids false positives
-    # from hawkop status/app/env commands that the hawkscan skill also runs.
-    "api": [
-        "hawkop scan get",     # api Step 4: app deep dive
-        "hawkop org get",      # api Step 1: establish orgId
-        "hawkop org set",      # api Step 1: switch org
-        "/api/v2/org",         # api Step 3: org posture endpoint (hawkop doesn't wrap it)
-        "/api/v1/scan",        # api Step 4: raw scan drill-down
-        "hawk_api GET",        # api raw API helper function
-    ],
-}
-
-# Invocation signals — checked against output_text only. In full-auto mode these are
-# belt-and-suspenders: the agent usually runs CLI commands directly. They catch
-# contextual prompts where the skill fires but the agent finds an empty working dir
-# and stops before reaching the CLI (same as observe mode in Claude Code harness).
-INVOCATION_SIGNALS = {
-    "hawkscan": [
-        # All markdown formatting variants the model uses around `: YES` or ` — YES`
-        "hawkscan:hawkscan`: yes",   # backtick + colon
-        "hawkscan:hawkscan` — yes",  # backtick + dash
-        "hawkscan:hawkscan**: yes",  # bold + colon
-        "hawkscan:hawkscan** — yes", # bold + dash
-        "hawkscan:hawkscan: yes",    # plain colon
-        "hawkscan:hawkscan — yes",   # plain dash
-        # Specific action-intent phrases
-        "autonomous security scan",
-        "dast scan after code",
-        "dast scan triggered",
-        "dast scan required",
-        "security scan required",
-        "security scan after",
-        "run the security scan",
-        "running the hawkscan",
-    ],
-    "api": [
-        "stackhawk-api:api`: yes",
-        "stackhawk-api:api` — yes",
-        "stackhawk-api:api: yes",
-        "stackhawk-api:api — yes",
-    ],
-}
-
-# ---------------------------------------------------------------------------
-# JSONL parsing
-# Codex --json event stream: item.started / item.completed / turn.completed
-# ---------------------------------------------------------------------------
-
-def parse_stream(jsonl: str) -> dict:
-    commands: list[str] = []
-    output_text = ""
-    input_tokens = 0
-    output_tokens = 0
-    error = None
-
-    seen_commands: set[str] = set()
-
-    for line in jsonl.splitlines():
-        line = line.strip()
-        if not line:
-            continue
-        try:
-            event = json.loads(line)
-        except json.JSONDecodeError:
-            continue
-
-        etype = event.get("type", "")
-
-        if etype == "item.started":
-            item = event.get("item", {})
-            if item.get("type") == "command_execution":
-                cmd = item.get("command", "")
-                # Deduplicate: item.started fires before item.completed for the same cmd
-                if cmd and cmd not in seen_commands:
-                    commands.append(cmd)
-                    seen_commands.add(cmd)
-
-        elif etype == "item.completed":
-            item = event.get("item", {})
-            # Capture any assistant message text — Codex uses "agent_message" type
-            if item.get("type") in ("message", "agent_message"):
-                text = item.get("text", "")
-                if text:
-                    output_text += text + "\n"
-                content = item.get("content", "")
-                if isinstance(content, str):
-                    output_text += content + "\n"
-                elif isinstance(content, list):
-                    for block in content:
-                        if isinstance(block, dict) and block.get("type") == "text":
-                            output_text += block.get("text", "") + "\n"
-
-        elif etype == "turn.completed":
-            usage = event.get("usage", {})
-            input_tokens  += usage.get("input_tokens", 0)
-            output_tokens += usage.get("output_tokens", 0)
-
-        elif etype == "error":
-            error = event.get("message", "unknown error")
-
-    return {
-        "bash_commands": commands,
-        "files_written": [],  # populated by scanning tmpdir after run
-        "files_edited":  [],
-        "output_text":   output_text.strip(),
-        "input_tokens":  input_tokens,
-        "output_tokens": output_tokens,
-        "error":         error,
-    }
-
-
-def _setup_skill_in_dir(skill: str, target_dir: Path) -> None:
-    """No-op: skills are installed globally via 'codex plugin add <skill>@stackhawk'.
-    Run: codex plugin marketplace add /path/to/agent-skills
-         codex plugin add hawkscan@stackhawk
-         codex plugin add stackhawk-api@stackhawk
-    """
-    pass
-
-
-# ---------------------------------------------------------------------------
-# Trigger detection
-# ---------------------------------------------------------------------------
-
-def detect_trigger(parsed: dict, skill: str) -> bool:
-    # CLI signals checked against actual bash commands only — prevents README/educational
-    # output text from creating false positives.
-    cli_haystack = " ".join(parsed["bash_commands"]).lower()
-    if any(s.lower() in cli_haystack for s in CLI_SIGNALS.get(skill, [])):
-        return True
-
-    # Invocation signals checked against output text only — belt-and-suspenders for
-    # contextual prompts where the skill fires but no CLI commands run (empty dir, etc.)
-    text_haystack = parsed["output_text"].lower()
-    return any(s.lower() in text_haystack for s in INVOCATION_SIGNALS.get(skill, []))
-
-
-# ---------------------------------------------------------------------------
-# Process checks
-# ---------------------------------------------------------------------------
-
-def run_process_checks(parsed: dict, checks: list) -> list[dict]:
-    haystack = " ".join([
-        *parsed["bash_commands"],
-        parsed["output_text"],
-    ]).lower()
-    all_files = " ".join(parsed["files_written"] + parsed["files_edited"]).lower()
-
-    results = []
-    for check in checks:
-        ctype   = check.get("type", "command_executed")
-        signals = [s.lower() for s in check.get("signals", [])]
-        antis   = [a.lower() for a in check.get("anti_patterns", [])]
-
-        signal_hit = next((s for s in signals if s in haystack), None)
-        anti_hit   = next((a for a in antis   if a in haystack), None)
-
-        if ctype in ("command_negative", "file_content_negative", "output_negative"):
-            passed = anti_hit is None
-        elif ctype == "file_absent":
-            target = check.get("target_file", "").lower()
-            passed = target not in all_files
-        elif ctype == "conditional_command":
-            # Only enforce when the condition's keyword appears in the trace.
-            import re as _re
-            condition_str = check.get("condition", "")
-            m = _re.search(r"'([^']+)'", condition_str)
-            condition_keyword = m.group(1).lower() if m else None
-            if condition_keyword and condition_keyword not in haystack:
-                passed = True  # condition not met — check not applicable
-            else:
-                passed = signal_hit is not None
-        elif ctype == "command_preference":
-            preferred = [p.lower() for p in check.get("preferred", [])]
-            passed = any(p in haystack for p in preferred) and anti_hit is None
-        else:
-            passed = signal_hit is not None
-            if antis:
-                passed = passed and anti_hit is None
-
-        results.append({
-            "id":           check["id"],
-            "pass":         passed,
-            "severity":     check.get("severity", "warning"),
-            "signal_found": signal_hit,
-            "anti_found":   anti_hit,
-        })
-
-    return results
-
-
-def score_checks(results: list[dict]) -> dict:
-    blocking_failed = sum(1 for r in results if not r["pass"] and r["severity"] == "blocking")
-    warning_failed  = sum(1 for r in results if not r["pass"] and r["severity"] == "warning")
-    score = max(0, 100 - blocking_failed * 15 - warning_failed * 5)
-    return {
-        "total":            len(results),
-        "passed":           sum(1 for r in results if r["pass"]),
-        "blocking_failed":  blocking_failed,
-        "warning_failed":   warning_failed,
-        "score":            score,
-    }
-
-
-# ---------------------------------------------------------------------------
-# Run codex exec
-# ---------------------------------------------------------------------------
-
-def run_codex(
-    prompt: str,
-    skill: str,
-    run_id: str,
-    full_auto: bool = True,
-    max_budget: float = 0.20,
-    model: str | None = None,
-) -> tuple[dict, int]:
-    tmpdir = Path(tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_"))
-    try:
-        _setup_skill_in_dir(skill, tmpdir)
-
-        cmd = [
-            "codex", "exec", "--json",
-            "--sandbox", "workspace-write",
-            "--skip-git-repo-check",
-        ]
-        if model:
-            cmd += ["-m", model]
-        if not full_auto:
-            cmd += ["--sandbox", "read-only"]
-        cmd.append(prompt)
-
-        proc = subprocess.run(
-            cmd,
-            capture_output=True,
-            text=True,
-            timeout=300,
-            cwd=str(tmpdir),
-        )
-
-        trace_dir = RESULTS_DIR / skill
-        trace_dir.mkdir(parents=True, exist_ok=True)
-        (trace_dir / f"{run_id}.jsonl").write_text(proc.stdout)
-
-        parsed = parse_stream(proc.stdout)
-
-        # Scan tmpdir for files created during the run (more reliable than JSONL parsing)
-        created = [
-            str(p.relative_to(tmpdir))
-            for p in tmpdir.rglob("*")
-            if p.is_file() and not str(p).startswith(str(tmpdir / ".codex"))
-        ]
-        parsed["files_written"] = created
-
-        return parsed, proc.returncode
-
-    except subprocess.TimeoutExpired:
-        return {
-            "bash_commands": [], "files_written": [], "files_edited": [],
-            "output_text": "", "input_tokens": 0, "output_tokens": 0, "error": "timeout",
-        }, 1
-    except FileNotFoundError:
-        print(
-            "ERROR: 'codex' CLI not found. "
-            "Install the Codex CLI and ensure it is in PATH.",
-            file=sys.stderr,
-        )
-        sys.exit(1)
-    finally:
-        shutil.rmtree(tmpdir, ignore_errors=True)
-
-
-# ---------------------------------------------------------------------------
-# Rubric grader
-# Uses: codex exec "<prompt>" --output-schema <schema> -o <output_file>
-# ---------------------------------------------------------------------------
-
-def run_rubric_grader(parsed: dict, skill: str, run_id: str) -> dict | None:
-    rubric_path = EVALS_DIR / skill / "rubric-items.json"
-    schema_path = EVALS_DIR / "rubric-schema.json"
-    if not rubric_path.exists() or not schema_path.exists():
-        return None
-
-    rubric_data = json.loads(rubric_path.read_text())
-
-    grader_prompt = f"""{rubric_data['grader_prompt']}
-
-## Commands Executed:
-{json.dumps(parsed['bash_commands'], indent=2)}
-
-## Files Created:
-{json.dumps(parsed['files_written'], indent=2)}
-
-## Agent Output (first 4000 chars):
-{parsed['output_text'][:4000]}
-
-## Rubric Checks to Grade:
-{json.dumps(rubric_data['checks'], indent=2)}
-
-Populate: skill="{skill}", run_id="{run_id}", overall_pass, score 0-100, checks array."""
-
-    tmpdir = Path(tempfile.mkdtemp(prefix=f"hawkrubric_{run_id}_"))
-    try:
-        output_file = tmpdir / "rubric_result.json"
-        cmd = [
-            "codex", "exec",
-            grader_prompt,
-            "--output-schema", str(schema_path),
-            "-o", str(output_file),
-        ]
-        subprocess.run(cmd, capture_output=True, text=True, timeout=120, cwd=str(tmpdir))
-
-        if output_file.exists():
-            return json.loads(output_file.read_text())
-        return None
-    except Exception as exc:
-        print(f"  [rubric] grader failed: {exc}", file=sys.stderr)
-        return None
-    finally:
-        shutil.rmtree(tmpdir, ignore_errors=True)
-
-
-# ---------------------------------------------------------------------------
-# Main
-# ---------------------------------------------------------------------------
-
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Codex eval harness for StackHawk agent skills",
-    )
-    parser.add_argument("--skill", required=True, choices=["hawkscan", "api"])
-    parser.add_argument("--id", dest="prompt_id", metavar="RUN_ID",
-                        help="Run a single prompt by id (e.g. hw-07)")
-    parser.add_argument("--dry-run", action="store_true",
-                        help="Print prompts without calling codex")
-    parser.add_argument("--rubric", action="store_true",
-                        help="Run qualitative rubric grader after process checks (extra cost)")
-    parser.add_argument("--no-full-auto", action="store_true",
-                        help="Run without --full-auto (restricts filesystem access)")
-    parser.add_argument("--max-budget", type=float, default=0.20, metavar="USD",
-                        help="Max spend per eval run in USD (default: 0.20)")
-    parser.add_argument("--model", metavar="MODEL_ID",
-                        help="Override the Codex model (e.g. o3, o4-mini, gpt-4o)")
-    args = parser.parse_args()
-
-    skill = args.skill
-    full_auto = not args.no_full_auto
-
-    prompts_path = EVALS_DIR / skill / "prompts.csv"
-    checks_path  = EVALS_DIR / skill / "process-checks.json"
-
-    with open(prompts_path) as f:
-        all_prompts = list(csv.DictReader(f))
-    checks = json.loads(checks_path.read_text())["checks"]
-
-    if args.prompt_id:
-        prompts = [p for p in all_prompts if p["id"] == args.prompt_id]
-        if not prompts:
-            print(f"ERROR: No prompt with id '{args.prompt_id}'", file=sys.stderr)
-            sys.exit(1)
-    else:
-        prompts = all_prompts
-
-    mode = "full-auto" if full_auto else "sandbox"
-    model_label = f"  |  Model: {args.model}" if args.model else ""
-    print(f"\nSkill: {skill}  |  Platform: codex  |  Mode: {mode}{model_label}  |  Prompts: {len(prompts)}")
-    if args.dry_run:
-        print("[dry-run — no codex calls]")
-    print("─" * 68)
-
-    all_results = []
-    total_cost = 0.0
-
-    for row in prompts:
-        run_id         = row["id"]
-        prompt         = row["prompt"]
-        should_trigger = row["should_trigger"].lower() == "true"
-        itype          = row.get("invocation_type", "")
-
-        print(f"\n[{run_id}] {itype:<12}  trigger={'Y' if should_trigger else 'N'}")
-        print(f"  {prompt[:92]}{'…' if len(prompt) > 92 else ''}")
-
-        if args.dry_run:
-            print("  → skipped")
-            continue
-
-        parsed, _exit = run_codex(
-            prompt, skill, run_id,
-            full_auto=full_auto,
-            max_budget=args.max_budget,
-            model=args.model,
-        )
-
-        # Codex doesn't report USD cost directly; estimate from token usage
-        tokens = parsed.get("input_tokens", 0) + parsed.get("output_tokens", 0)
-        est_cost = tokens * 0.000015  # rough estimate
-        total_cost += est_cost
-
-        if parsed.get("error"):
-            print(f"  ERROR: {parsed['error']}")
-
-        did_trigger = detect_trigger(parsed, skill)
-        trigger_ok  = did_trigger == should_trigger
-
-        process_results: list[dict] = []
-        scoring = {"total": 0, "passed": 0, "blocking_failed": 0, "warning_failed": 0, "score": 0}
-        if should_trigger and did_trigger:
-            process_results = run_process_checks(parsed, checks)
-            scoring = score_checks(process_results)
-
-        rubric_result = None
-        if args.rubric and should_trigger and did_trigger:
-            print("  [rubric] grading…", end=" ", flush=True)
-            rubric_result = run_rubric_grader(parsed, skill, run_id)
-            print(f"score={rubric_result.get('score', '?')}" if rubric_result else "failed")
-
-        result = {
-            "platform":       "codex",
-            "skill":          skill,
-            "run_id":         run_id,
-            "prompt":         prompt,
-            "should_trigger": should_trigger,
-            "did_trigger":    did_trigger,
-            "trigger_correct": trigger_ok,
-            "bash_commands":  parsed["bash_commands"],
-            "files_written":  parsed["files_written"],
-            "process_checks": process_results,
-            "scoring":        scoring,
-            "rubric_result":  rubric_result,
-            "tokens":         {"input": parsed.get("input_tokens", 0), "output": parsed.get("output_tokens", 0)},
-            "timestamp":      datetime.now(timezone.utc).isoformat(),
-        }
-        all_results.append(result)
-
-        out_dir = RESULTS_DIR / skill
-        out_dir.mkdir(parents=True, exist_ok=True)
-        (out_dir / f"{run_id}.result.json").write_text(json.dumps(result, indent=2))
-
-        t_icon    = "✓" if trigger_ok else "✗"
-        score_str = f"score={scoring['score']}/100" if process_results else "—"
-        print(f"  {t_icon} did_trigger={did_trigger}  {score_str}")
-
-        for pr in process_results:
-            if not pr["pass"] and pr["severity"] == "blocking":
-                print(f"    BLOCKING FAIL: {pr['id']}")
-
-    if args.dry_run or not all_results:
-        return
-
-    # ── Summary ────────────────────────────────────────────────────────────
-    trigger_correct = sum(1 for r in all_results if r["trigger_correct"])
-    total      = len(all_results)
-    false_pos  = [r for r in all_results if not r["should_trigger"] and r["did_trigger"]]
-    false_neg  = [r for r in all_results if r["should_trigger"] and not r["did_trigger"]]
-    proc_runs  = [r for r in all_results if r["process_checks"]]
-    avg_score  = (sum(r["scoring"]["score"] for r in proc_runs) // len(proc_runs)
-                  if proc_runs else None)
-    total_blocking = sum(r["scoring"]["blocking_failed"] for r in proc_runs) if proc_runs else 0
-
-    print("\n" + "═" * 68)
-    print(f"SUMMARY  skill={skill}  platform=codex")
-    print(f"  Trigger accuracy : {trigger_correct}/{total} ({100 * trigger_correct // total}%)")
-    if false_pos:
-        print(f"  False positives  : {', '.join(r['run_id'] for r in false_pos)}")
-    if false_neg:
-        print(f"  False negatives  : {', '.join(r['run_id'] for r in false_neg)}")
-    if avg_score is not None:
-        print(f"  Process avg score: {avg_score}/100  (blocking failures: {total_blocking})")
-    print(f"  Results in       : {RESULTS_DIR / skill}/")
-
-    summary = {
-        "skill":    skill,
-        "platform": "codex",
-        "timestamp": datetime.now(timezone.utc).isoformat(),
-        "trigger_accuracy": {"correct": trigger_correct, "total": total},
-        "false_positives":  [r["run_id"] for r in false_pos],
-        "false_negatives":  [r["run_id"] for r in false_neg],
-        "process_avg_score": avg_score,
-        "total_blocking_failures": total_blocking,
-        "runs": [
-            {"run_id": r["run_id"], "trigger_correct": r["trigger_correct"], "score": r["scoring"]["score"]}
-            for r in all_results
-        ],
-    }
-    (RESULTS_DIR / skill / "summary.json").write_text(json.dumps(summary, indent=2))
-
-    # ── GitHub Actions step summary ─────────────────────────────────────────
-    step_summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
-    if step_summary_path:
-        _write_step_summary(step_summary_path, skill, all_results, false_pos, false_neg, avg_score, total_blocking)
-
-    if false_pos or false_neg or total_blocking > 0:
-        sys.exit(1)
-
-
-def _write_step_summary(
-    path: str, skill: str, results: list[dict],
-    false_pos: list[dict], false_neg: list[dict],
-    avg_score: int | None, total_blocking: int,
-) -> None:
-    correct = sum(1 for r in results if r["trigger_correct"])
-    total = len(results)
-    trigger_icon = "✅" if correct == total else "❌"
-    score_icon   = "✅" if (avg_score or 0) >= 70 and total_blocking == 0 else "❌"
-
-    lines = [
-        f"## Skill Eval: `{skill}` (codex)\n",
-        "| Metric | Value |", "|---|---|",
-        f"| Trigger accuracy | {trigger_icon} {correct}/{total} |",
-    ]
-    if false_pos:
-        lines.append(f"| False positives | ⚠️ {', '.join(r['run_id'] for r in false_pos)} |")
-    if false_neg:
-        lines.append(f"| False negatives | ⚠️ {', '.join(r['run_id'] for r in false_neg)} |")
-    if avg_score is not None:
-        lines.append(f"| Process avg score | {score_icon} {avg_score}/100 |")
-        lines.append(f"| Blocking failures | {'❌' if total_blocking else '✅'} {total_blocking} |")
-    lines.append("")
-
-    lines += [
-        "<details><summary>Per-run results</summary>\n",
-        "| ID | Trigger | Score |", "|---|---|---|",
-    ]
-    for r in results:
-        t = "✅" if r["trigger_correct"] else "❌"
-        score = r["scoring"]["score"] if r["process_checks"] else "—"
-        lines.append(f"| {r['run_id']} | {t} | {score} |")
-    lines.append("\n</details>\n")
-
-    with open(path, "a") as f:
-        f.write("\n".join(lines) + "\n")
-
+from evals.cli import main
 
 if __name__ == "__main__":
+    if "--harness" not in sys.argv:
+        sys.argv += ["--harness", "codex"]
     main()
diff --git a/evals/harnesses/copilot/run-evals.py b/evals/harnesses/copilot/run-evals.py
index 9779110..d04c71e 100644
--- a/evals/harnesses/copilot/run-evals.py
+++ b/evals/harnesses/copilot/run-evals.py
@@ -1,391 +1,11 @@
 #!/usr/bin/env python3
-"""
-GitHub Copilot CLI eval harness for StackHawk agent skills.
-
-Uses `copilot -p --output-format json --allow-all-tools --plugin-dir`.
-Skills are loaded from plugins/<skill>/ via --plugin-dir.
-
-The trigger detection is uniquely reliable: Copilot emits an explicit
-  tool.execution_start {"toolName":"skill","arguments":{"skill":"hawkscan"}}
-event when the skill fires. No heuristic text-matching needed.
-
-Usage:
-    python3 evals/harnesses/copilot/run-evals.py --skill hawkscan
-    python3 evals/harnesses/copilot/run-evals.py --skill api
-    python3 evals/harnesses/copilot/run-evals.py --skill hawkscan --id hw-07
-    python3 evals/harnesses/copilot/run-evals.py --skill hawkscan --dry-run
-    python3 evals/harnesses/copilot/run-evals.py --skill hawkscan --model gpt-5.3-codex
-
-Requirements:
-    - GitHub Copilot CLI installed and authenticated (copilot login)
-    - Run from the agent-skills repo root
-
-Note: Copilot actually executes commands (--allow-all-tools), so process
-check scores reflect real hawk workflow completion — not just observations.
-"""
-
-import argparse
-import csv
-import json
-import os
-import re
-import shutil
-import subprocess
+"""Back-compat shim. The eval logic now lives in evals/cli.py and evals/lib/.
+Run `uv run evals --harness copilot --skill <skill>` instead.
+This shim forwards old invocations to the new CLI."""
 import sys
-import tempfile
-from datetime import datetime, timezone
-from pathlib import Path
-
-# ---------------------------------------------------------------------------
-# Paths
-# ---------------------------------------------------------------------------
-HARNESS_DIR = Path(__file__).parent.resolve()
-EVALS_DIR   = HARNESS_DIR.parent.parent
-REPO_ROOT   = EVALS_DIR.parent
-RESULTS_DIR = HARNESS_DIR / "results"
-
-# ---------------------------------------------------------------------------
-# Trigger detection
-# Copilot emits an unambiguous tool.execution_start event when a skill fires:
-#   {"type":"tool.execution_start","data":{"toolName":"skill","arguments":{"skill":"hawkscan"}}}
-# This eliminates all heuristic signal-matching needed for other platforms.
-# ---------------------------------------------------------------------------
-
-def detect_trigger(parsed: dict, skill: str) -> bool:
-    # Primary: explicit skill tool call (unambiguous)
-    for call in parsed.get("skill_calls", []):
-        if call.lower() == skill.lower() or call.lower() == f"stackhawk-{skill}".lower():
-            return True
-
-    # Fallback: CLI signals in bash commands (belt-and-suspenders)
-    cli_signals = {
-        "hawkscan": ["hawk scan", "hawk validate", "hawk rescan", "hawk config",
-                     "hawk create app", "hawk init", "hawk perch"],
-        "api": ["hawkop scan get", "hawkop org get", "/api/v2/org", "/api/v1/scan"],
-    }
-    cmd_haystack = " ".join(parsed.get("bash_commands", [])).lower()
-    return any(s.lower() in cmd_haystack for s in cli_signals.get(skill, []))
-
-
-# ---------------------------------------------------------------------------
-# Stream-json parsing — Copilot JSONL event format:
-#   tool.execution_start  {"toolName":"bash","arguments":{"command":"..."}}
-#   tool.execution_start  {"toolName":"skill","arguments":{"skill":"hawkscan"}}
-#   tool.execution_partial_result {"partialOutput":"..."}
-#   assistant.message     {"content":"..."}
-#   result                {}
-# ---------------------------------------------------------------------------
-
-def parse_stream(jsonl: str) -> dict:
-    bash_commands: list[str] = []
-    files_written: list[str] = []
-    skill_calls:   list[str] = []
-    output_text  = ""
-    usage: dict  = {}
-    error = None
-
-    for line in jsonl.splitlines():
-        line = line.strip()
-        if not line:
-            continue
-        try:
-            event = json.loads(line)
-        except json.JSONDecodeError:
-            continue
-
-        etype = event.get("type", "")
-        data  = event.get("data", {})
-
-        if etype == "tool.execution_start":
-            tool_name = data.get("toolName", "")
-            args      = data.get("arguments", {})
-
-            if tool_name == "bash":
-                cmd = args.get("command", "")
-                if cmd:
-                    bash_commands.append(cmd)
-
-            elif tool_name == "skill":
-                skill_name = args.get("skill", "")
-                if skill_name:
-                    skill_calls.append(skill_name)
-
-            elif tool_name in ("write_file", "create_file", "str_replace_editor"):
-                path = args.get("path") or args.get("file_path") or ""
-                if path:
-                    files_written.append(path)
-
-        elif etype == "assistant.message":
-            content = data.get("content", "")
-            if content:
-                output_text += content + "\n"
-
-        elif etype == "result":
-            usage = data.get("usage", {})
-            if data.get("error"):
-                error = str(data["error"])
-
-    return {
-        "bash_commands": bash_commands,
-        "files_written": files_written,
-        "skill_calls":   skill_calls,
-        "output_text":   output_text.strip(),
-        "usage":         usage,
-        "error":         error,
-    }
-
-
-# ---------------------------------------------------------------------------
-# Process checks
-# ---------------------------------------------------------------------------
-
-def run_process_checks(parsed: dict, checks: list) -> list[dict]:
-    haystack = " ".join([
-        *parsed["bash_commands"],
-        parsed["output_text"],
-    ]).lower()
-    all_files = " ".join(parsed["files_written"]).lower()
-
-    results = []
-    for check in checks:
-        ctype   = check.get("type", "command_executed")
-        signals = [s.lower() for s in check.get("signals", [])]
-        antis   = [a.lower() for a in check.get("anti_patterns", [])]
-
-        signal_hit = next((s for s in signals if s in haystack), None)
-        anti_hit   = next((a for a in antis   if a in haystack), None)
-
-        if ctype in ("command_negative", "file_content_negative", "output_negative"):
-            passed = anti_hit is None
-        elif ctype == "file_absent":
-            target = check.get("target_file", "").lower()
-            passed = target not in all_files
-        elif ctype == "conditional_command":
-            m = re.search(r"'([^']+)'", check.get("condition", ""))
-            condition_keyword = m.group(1).lower() if m else None
-            if condition_keyword and condition_keyword not in haystack:
-                passed = True
-            else:
-                passed = signal_hit is not None
-        elif ctype == "command_preference":
-            preferred = [p.lower() for p in check.get("preferred", [])]
-            passed = any(p in haystack for p in preferred) and anti_hit is None
-        else:
-            passed = signal_hit is not None
-            if antis:
-                passed = passed and anti_hit is None
-
-        results.append({
-            "id":           check["id"],
-            "pass":         passed,
-            "severity":     check.get("severity", "warning"),
-            "signal_found": signal_hit,
-            "anti_found":   anti_hit,
-        })
-    return results
-
-
-def score_checks(results: list[dict]) -> dict:
-    blocking_failed = sum(1 for r in results if not r["pass"] and r["severity"] == "blocking")
-    warning_failed  = sum(1 for r in results if not r["pass"] and r["severity"] == "warning")
-    return {
-        "total":           len(results),
-        "passed":          sum(1 for r in results if r["pass"]),
-        "blocking_failed": blocking_failed,
-        "warning_failed":  warning_failed,
-        "score":           max(0, 100 - blocking_failed * 15 - warning_failed * 5),
-    }
-
-
-# ---------------------------------------------------------------------------
-# Run copilot
-# ---------------------------------------------------------------------------
-
-def run_copilot(
-    prompt: str,
-    skill: str,
-    run_id: str,
-    model: str | None = None,
-) -> tuple[dict, int]:
-    tmpdir = Path(tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_"))
-    try:
-        plugin_dir = str(REPO_ROOT / "plugins" / skill)
-
-        cmd = [
-            "copilot", "-p", prompt,
-            "--output-format", "json",
-            "--allow-all-tools",
-            "--plugin-dir", plugin_dir,
-            "--no-ask-user",
-        ]
-        if model:
-            cmd += ["--model", model]
-
-        proc = subprocess.run(
-            cmd,
-            capture_output=True,
-            text=True,
-            timeout=600,
-            cwd=str(tmpdir),
-            env={**os.environ},
-        )
-
-        trace_dir = RESULTS_DIR / skill
-        trace_dir.mkdir(parents=True, exist_ok=True)
-        (trace_dir / f"{run_id}.jsonl").write_text(proc.stdout)
-
-        parsed = parse_stream(proc.stdout)
-        if proc.returncode != 0 and not parsed["output_text"] and not parsed["skill_calls"]:
-            stderr = proc.stderr.strip()
-            if stderr:
-                parsed["error"] = stderr[:300]
-
-        return parsed, proc.returncode
-
-    except subprocess.TimeoutExpired:
-        return {"bash_commands": [], "files_written": [], "skill_calls": [],
-                "output_text": "", "usage": {}, "error": "timeout"}, 1
-    except FileNotFoundError:
-        print("ERROR: 'copilot' CLI not found. Install GitHub Copilot CLI.", file=sys.stderr)
-        sys.exit(1)
-    finally:
-        shutil.rmtree(tmpdir, ignore_errors=True)
-
-
-# ---------------------------------------------------------------------------
-# Main
-# ---------------------------------------------------------------------------
-
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="GitHub Copilot CLI eval harness for StackHawk agent skills",
-    )
-    parser.add_argument("--skill", required=True, choices=["hawkscan", "api"])
-    parser.add_argument("--id", dest="prompt_id", metavar="RUN_ID")
-    parser.add_argument("--dry-run", action="store_true")
-    parser.add_argument("--model", metavar="MODEL_ID",
-                        help="Model override (e.g. gpt-5.3-codex)")
-    args = parser.parse_args()
-
-    skill = args.skill
-    prompts_path = EVALS_DIR / skill / "prompts.csv"
-    checks_path  = EVALS_DIR / skill / "process-checks.json"
-
-    with open(prompts_path) as f:
-        all_prompts = list(csv.DictReader(f))
-    checks = json.loads(checks_path.read_text())["checks"]
-
-    if args.prompt_id:
-        prompts = [p for p in all_prompts if p["id"] == args.prompt_id]
-        if not prompts:
-            print(f"ERROR: No prompt with id '{args.prompt_id}'", file=sys.stderr)
-            sys.exit(1)
-    else:
-        prompts = all_prompts
-
-    model_label = f"  |  Model: {args.model}" if args.model else ""
-    print(f"\nSkill: {skill}  |  Platform: copilot  |  Mode: full-auto{model_label}  |  Prompts: {len(prompts)}")
-    if args.dry_run:
-        print("[dry-run — no copilot calls]")
-    print("─" * 68)
-
-    all_results = []
-
-    for row in prompts:
-        run_id         = row["id"]
-        prompt         = row["prompt"]
-        should_trigger = row["should_trigger"].lower() == "true"
-        itype          = row.get("invocation_type", "")
-
-        print(f"\n[{run_id}] {itype:<12}  trigger={'Y' if should_trigger else 'N'}")
-        print(f"  {prompt[:92]}{'…' if len(prompt) > 92 else ''}")
-
-        if args.dry_run:
-            print("  → skipped")
-            continue
-
-        parsed, _exit = run_copilot(prompt, skill, run_id, model=args.model)
-
-        if parsed.get("error"):
-            print(f"  ERROR: {parsed['error']}")
-
-        did_trigger = detect_trigger(parsed, skill)
-        trigger_ok  = did_trigger == should_trigger
-
-        process_results: list[dict] = []
-        scoring = {"total": 0, "passed": 0, "blocking_failed": 0, "warning_failed": 0, "score": 0}
-        if should_trigger and did_trigger:
-            process_results = run_process_checks(parsed, checks)
-            scoring = score_checks(process_results)
-
-        result = {
-            "platform":        "copilot",
-            "skill":           skill,
-            "run_id":          run_id,
-            "prompt":          prompt,
-            "should_trigger":  should_trigger,
-            "did_trigger":     did_trigger,
-            "trigger_correct": trigger_ok,
-            "bash_commands":   parsed["bash_commands"],
-            "files_written":   parsed["files_written"],
-            "skill_calls":     parsed["skill_calls"],
-            "process_checks":  process_results,
-            "scoring":         scoring,
-            "usage":           parsed.get("usage", {}),
-            "timestamp":       datetime.now(timezone.utc).isoformat(),
-        }
-        all_results.append(result)
-
-        out_dir = RESULTS_DIR / skill
-        out_dir.mkdir(parents=True, exist_ok=True)
-        (out_dir / f"{run_id}.result.json").write_text(json.dumps(result, indent=2))
-
-        t_icon    = "✓" if trigger_ok else "✗"
-        score_str = f"score={scoring['score']}/100" if process_results else "—"
-        print(f"  {t_icon} did_trigger={did_trigger}  skill_calls={parsed['skill_calls']}  {score_str}")
-        for pr in process_results:
-            if not pr["pass"] and pr["severity"] == "blocking":
-                print(f"    BLOCKING FAIL: {pr['id']}")
-
-    if args.dry_run or not all_results:
-        return
-
-    trigger_correct = sum(1 for r in all_results if r["trigger_correct"])
-    total = len(all_results)
-    false_pos = [r for r in all_results if not r["should_trigger"] and r["did_trigger"]]
-    false_neg = [r for r in all_results if r["should_trigger"] and not r["did_trigger"]]
-    proc_runs = [r for r in all_results if r["process_checks"]]
-    avg_score = (sum(r["scoring"]["score"] for r in proc_runs) // len(proc_runs)
-                 if proc_runs else None)
-    total_blocking = sum(r["scoring"]["blocking_failed"] for r in proc_runs) if proc_runs else 0
-
-    print("\n" + "═" * 68)
-    print(f"SUMMARY  skill={skill}  platform=copilot")
-    print(f"  Trigger accuracy : {trigger_correct}/{total} ({100 * trigger_correct // total}%)")
-    if false_pos:
-        print(f"  False positives  : {', '.join(r['run_id'] for r in false_pos)}")
-    if false_neg:
-        print(f"  False negatives  : {', '.join(r['run_id'] for r in false_neg)}")
-    if avg_score is not None:
-        print(f"  Process avg score: {avg_score}/100  (blocking failures: {total_blocking})")
-    print(f"  Results in       : {RESULTS_DIR / skill}/")
-
-    summary = {
-        "skill": skill, "platform": "copilot",
-        "timestamp": datetime.now(timezone.utc).isoformat(),
-        "trigger_accuracy": {"correct": trigger_correct, "total": total},
-        "false_positives":  [r["run_id"] for r in false_pos],
-        "false_negatives":  [r["run_id"] for r in false_neg],
-        "process_avg_score": avg_score,
-        "total_blocking_failures": total_blocking,
-        "runs": [{"run_id": r["run_id"], "trigger_correct": r["trigger_correct"],
-                  "score": r["scoring"]["score"]} for r in all_results],
-    }
-    (RESULTS_DIR / skill / "summary.json").write_text(json.dumps(summary, indent=2))
-
-    if false_pos or false_neg or total_blocking > 0:
-        sys.exit(1)
-
+from evals.cli import main
 
 if __name__ == "__main__":
+    if "--harness" not in sys.argv:
+        sys.argv += ["--harness", "copilot"]
     main()
diff --git a/evals/harnesses/cursor/run-evals.py b/evals/harnesses/cursor/run-evals.py
index 364a3f7..d83ce7a 100644
--- a/evals/harnesses/cursor/run-evals.py
+++ b/evals/harnesses/cursor/run-evals.py
@@ -1,451 +1,11 @@
 #!/usr/bin/env python3
-"""
-Cursor Agent eval harness for StackHawk agent skills.
-
-Uses `agent --print --output-format stream-json` (Cursor's headless CLI).
-Skills are loaded from cursor/.cursor/rules/*.mdc (alwaysApply rules).
-
-Usage:
-    python3 evals/harnesses/cursor/run-evals.py --skill hawkscan
-    python3 evals/harnesses/cursor/run-evals.py --skill api
-    python3 evals/harnesses/cursor/run-evals.py --skill hawkscan --id hw-07
-    python3 evals/harnesses/cursor/run-evals.py --skill hawkscan --dry-run
-    python3 evals/harnesses/cursor/run-evals.py --skill hawkscan --full-auto   # actually execute commands
-
-Requirements:
-    - Cursor CLI installed and authenticated (`agent status`)
-    - Run from the agent-skills repo root
-    - cursor/.cursor/rules/ contains generated .mdc files (run generate-cursor-rules.sh)
-"""
-
-import argparse
-import csv
-import json
-import os
-import shutil
-import subprocess
+"""Back-compat shim. The eval logic now lives in evals/cli.py and evals/lib/.
+Run `uv run evals --harness cursor --skill <skill>` instead.
+This shim forwards old invocations to the new CLI."""
 import sys
-import tempfile
-from datetime import datetime, timezone
-from pathlib import Path
-
-# ---------------------------------------------------------------------------
-# Paths
-# ---------------------------------------------------------------------------
-HARNESS_DIR = Path(__file__).parent.resolve()
-EVALS_DIR   = HARNESS_DIR.parent.parent
-REPO_ROOT   = EVALS_DIR.parent
-RESULTS_DIR = HARNESS_DIR / "results"
-# cursor/.cursor/rules/ contains the alwaysApply .mdc skill rules
-CURSOR_RULES_DIR = REPO_ROOT / "cursor" / ".cursor" / "rules"
-
-# ---------------------------------------------------------------------------
-# Trigger signals — Cursor-specific tuning.
-# Cursor goes directly into execution without the Claude Code "EVALUATE: YES/NO"
-# evaluation step, so invocation signals focus on narrative phrases the agent
-# uses when kicking off a skill workflow.
-# CLI_SIGNALS are checked against shell commands the agent attempted to run.
-# ---------------------------------------------------------------------------
-CLI_SIGNALS = {
-    "hawkscan": [
-        "hawk scan",
-        "hawk validate",
-        "hawk rescan",
-        "hawk config",
-        "hawk create app",
-        "hawk init",
-        "hawk perch",
-    ],
-    # Cursor api: the agent runs hawkop status as its first step, then
-    # deeper hawkop commands. Include broader hawkop signals since Cursor
-    # doesn't have the false-positive risk of Codex full-auto mode.
-    "api": [
-        "hawkop status",
-        "hawkop scan get",
-        "hawkop org get",
-        "hawkop org set",
-        "hawkop app list",
-        "/api/v2/org",
-        "/api/v1/scan",
-        "hawk_api GET",
-    ],
-}
-
-INVOCATION_SIGNALS = {
-    "hawkscan": [
-        "hawkscan:hawkscan`: yes", "hawkscan:hawkscan` — yes",
-        "hawkscan:hawkscan**: yes", "hawkscan:hawkscan** — yes",
-        "hawkscan:hawkscan: yes",  "hawkscan:hawkscan — yes",
-        "hawkscan:hawkscan - yes", "hawkscan:hawkscan - **yes",
-        "hawkscan** - yes", "hawkscan** — yes",
-        "hawkscan**: yes",  "hawkscan: yes",
-        "hawkscan — yes",   "hawkscan - yes",
-        "autonomous security scan",
-        "dast scan after code", "dast scan triggered", "dast scan required",
-        "security scan required", "security scan after",
-        "run the security scan",  "running the hawkscan",
-    ],
-    "api": [
-        # Claude Code evaluation-format signals (if model uses that format)
-        "stackhawk-api:api`: yes", "stackhawk-api:api` — yes",
-        "stackhawk-api:api**: yes","stackhawk-api:api** — yes",
-        "stackhawk-api:api: yes",  "stackhawk-api:api — yes",
-        "stackhawk-api:api - yes",
-        "stackhawk-api**: yes",    "stackhawk-api** — yes",
-        "stackhawk-api: yes",      "stackhawk-api — yes",
-        "stackhawk-api - yes",
-        # Cursor narrative-style signals — agent says these instead of evaluating
-        "stackhawk api skill",          # "I'll use the StackHawk API skill"
-        "stackhawk api",                # "using the StackHawk API"
-        "api skill to",                 # "api skill to pull your org..."
-        "security posture",             # "pull your org's security posture"
-        "untriaged findings",           # "untriaged findings across all apps"
-        "scan history",                 # "scan history for"
-        "findings across",              # "findings across all apps"
-    ],
-}
-
-# ---------------------------------------------------------------------------
-# Stream-json parsing
-# Cursor events: system / user / thinking / assistant / tool_call / result
-# ---------------------------------------------------------------------------
-
-def parse_stream(jsonl: str) -> dict:
-    bash_commands: list[str] = []
-    output_text = ""
-    files_written: list[str] = []
-    usage: dict = {}
-    error = None
-
-    for line in jsonl.splitlines():
-        line = line.strip()
-        if not line:
-            continue
-        try:
-            event = json.loads(line)
-        except json.JSONDecodeError:
-            continue
-
-        etype = event.get("type", "")
-
-        if etype == "assistant":
-            for block in event.get("message", {}).get("content", []):
-                if block.get("type") == "text":
-                    output_text += block.get("text", "") + "\n"
-
-        elif etype == "tool_call" and event.get("subtype") == "started":
-            tc = event.get("tool_call", {})
-            # Shell command
-            shell = tc.get("shellToolCall", {})
-            if shell:
-                cmd = shell.get("args", {}).get("command", "")
-                if cmd:
-                    bash_commands.append(cmd)
-            # File write
-            write = tc.get("writeToolCall", {})
-            if write:
-                path = write.get("args", {}).get("path", "")
-                if path:
-                    files_written.append(path)
-
-        elif etype == "result":
-            usage = event.get("usage", {})
-            if event.get("is_error"):
-                error = event.get("result", "unknown error")
-
-    return {
-        "bash_commands": bash_commands,
-        "files_written": files_written,
-        "output_text": output_text.strip(),
-        "usage": usage,
-        "error": error,
-    }
-
-
-# ---------------------------------------------------------------------------
-# Trigger detection — same split-signal approach as Claude Code harness
-# ---------------------------------------------------------------------------
-
-def detect_trigger(parsed: dict, skill: str) -> bool:
-    cli_haystack = " ".join(parsed["bash_commands"]).lower()
-    if any(s.lower() in cli_haystack for s in CLI_SIGNALS.get(skill, [])):
-        return True
-    text_haystack = parsed["output_text"].lower()
-    return any(s.lower() in text_haystack for s in INVOCATION_SIGNALS.get(skill, []))
-
-
-# ---------------------------------------------------------------------------
-# Process checks — shared with Claude Code harness
-# ---------------------------------------------------------------------------
-
-def run_process_checks(parsed: dict, checks: list) -> list[dict]:
-    haystack = " ".join([
-        *parsed["bash_commands"],
-        parsed["output_text"],
-    ]).lower()
-    all_files = " ".join(parsed["files_written"]).lower()
-
-    results = []
-    for check in checks:
-        ctype = check.get("type", "command_executed")
-        signals = [s.lower() for s in check.get("signals", [])]
-        antis   = [a.lower() for a in check.get("anti_patterns", [])]
-
-        signal_hit = next((s for s in signals if s in haystack), None)
-        anti_hit   = next((a for a in antis   if a in haystack), None)
-
-        if ctype in ("command_negative", "file_content_negative", "output_negative"):
-            passed = anti_hit is None
-        elif ctype == "file_absent":
-            target = check.get("target_file", "").lower()
-            passed = target not in all_files
-        elif ctype == "conditional_command":
-            import re as _re
-            m = _re.search(r"'([^']+)'", check.get("condition", ""))
-            condition_keyword = m.group(1).lower() if m else None
-            if condition_keyword and condition_keyword not in haystack:
-                passed = True
-            else:
-                passed = signal_hit is not None
-        elif ctype == "command_preference":
-            preferred = [p.lower() for p in check.get("preferred", [])]
-            passed = any(p in haystack for p in preferred) and anti_hit is None
-        else:
-            passed = signal_hit is not None
-            if antis:
-                passed = passed and anti_hit is None
-
-        results.append({
-            "id":           check["id"],
-            "pass":         passed,
-            "severity":     check.get("severity", "warning"),
-            "signal_found": signal_hit,
-            "anti_found":   anti_hit,
-        })
-    return results
-
-
-def score_checks(results: list[dict]) -> dict:
-    blocking_failed = sum(1 for r in results if not r["pass"] and r["severity"] == "blocking")
-    warning_failed  = sum(1 for r in results if not r["pass"] and r["severity"] == "warning")
-    return {
-        "total":           len(results),
-        "passed":          sum(1 for r in results if r["pass"]),
-        "blocking_failed": blocking_failed,
-        "warning_failed":  warning_failed,
-        "score":           max(0, 100 - blocking_failed * 15 - warning_failed * 5),
-    }
-
-
-# ---------------------------------------------------------------------------
-# Run agent
-# ---------------------------------------------------------------------------
-
-def _setup_workspace(skill: str, target_dir: Path) -> None:
-    """Copy cursor/.cursor/rules/ into a fresh workspace so alwaysApply rules load."""
-    dst = target_dir / ".cursor" / "rules"
-    dst.mkdir(parents=True, exist_ok=True)
-    for mdc in CURSOR_RULES_DIR.glob("*.mdc"):
-        shutil.copy2(mdc, dst / mdc.name)
-
-
-def run_cursor(
-    prompt: str,
-    skill: str,
-    run_id: str,
-    full_auto: bool = False,
-    model: str | None = None,
-) -> tuple[dict, int]:
-    tmpdir = Path(tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_"))
-    try:
-        _setup_workspace(skill, tmpdir)
-
-        api_key = os.environ.get("CURSOR_API_KEY", "")
-        cmd = [
-            "agent", "-p", prompt,
-            "--output-format", "stream-json",
-            "--print",
-            "--trust",
-        ]
-        if api_key:
-            cmd += ["--api-key", api_key]
-        if model:
-            cmd += ["--model", model]
-        if full_auto:
-            cmd.append("--force")
-
-        proc = subprocess.run(
-            cmd,
-            capture_output=True,
-            text=True,
-            timeout=300,
-            cwd=str(tmpdir),
-        )
-
-        trace_dir = RESULTS_DIR / skill
-        trace_dir.mkdir(parents=True, exist_ok=True)
-        (trace_dir / f"{run_id}.jsonl").write_text(proc.stdout)
-
-        return parse_stream(proc.stdout), proc.returncode
-
-    except subprocess.TimeoutExpired:
-        return {"bash_commands": [], "files_written": [], "output_text": "",
-                "usage": {}, "error": "timeout"}, 1
-    except FileNotFoundError:
-        print("ERROR: 'agent' CLI not found. Install Cursor and ensure it is in PATH.",
-              file=sys.stderr)
-        sys.exit(1)
-    finally:
-        shutil.rmtree(tmpdir, ignore_errors=True)
-
-
-# ---------------------------------------------------------------------------
-# Main
-# ---------------------------------------------------------------------------
-
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Cursor Agent eval harness for StackHawk agent skills",
-    )
-    parser.add_argument("--skill", required=True, choices=["hawkscan", "api"])
-    parser.add_argument("--id", dest="prompt_id", metavar="RUN_ID")
-    parser.add_argument("--dry-run", action="store_true")
-    parser.add_argument("--full-auto", action="store_true",
-                        help="Pass --force so the agent can execute commands")
-    parser.add_argument("--model", metavar="MODEL_ID",
-                        help="Model override (e.g. gpt-5.5, sonnet-4)")
-    args = parser.parse_args()
-
-    skill = args.skill
-    prompts_path = EVALS_DIR / skill / "prompts.csv"
-    checks_path  = EVALS_DIR / skill / "process-checks.json"
-
-    with open(prompts_path) as f:
-        all_prompts = list(csv.DictReader(f))
-    checks = json.loads(checks_path.read_text())["checks"]
-
-    if args.prompt_id:
-        prompts = [p for p in all_prompts if p["id"] == args.prompt_id]
-        if not prompts:
-            print(f"ERROR: No prompt with id '{args.prompt_id}'", file=sys.stderr)
-            sys.exit(1)
-    else:
-        prompts = all_prompts
-
-    if not CURSOR_RULES_DIR.exists():
-        print(f"ERROR: {CURSOR_RULES_DIR} not found. Run scripts/generate-cursor-rules.sh first.",
-              file=sys.stderr)
-        sys.exit(1)
-
-    mode = "full-auto" if args.full_auto else "observe"
-    model_label = f"  |  Model: {args.model}" if args.model else ""
-    print(f"\nSkill: {skill}  |  Platform: cursor  |  Mode: {mode}{model_label}  |  Prompts: {len(prompts)}")
-    if args.dry_run:
-        print("[dry-run — no agent calls]")
-    print("─" * 68)
-
-    all_results = []
-    total_tokens = {"input": 0, "output": 0}
-
-    for row in prompts:
-        run_id         = row["id"]
-        prompt         = row["prompt"]
-        should_trigger = row["should_trigger"].lower() == "true"
-        itype          = row.get("invocation_type", "")
-
-        print(f"\n[{run_id}] {itype:<12}  trigger={'Y' if should_trigger else 'N'}")
-        print(f"  {prompt[:92]}{'…' if len(prompt) > 92 else ''}")
-
-        if args.dry_run:
-            print("  → skipped")
-            continue
-
-        parsed, _exit = run_cursor(prompt, skill, run_id, full_auto=args.full_auto, model=args.model)
-        u = parsed.get("usage", {})
-        total_tokens["input"]  += u.get("inputTokens", 0)
-        total_tokens["output"] += u.get("outputTokens", 0)
-
-        if parsed.get("error"):
-            print(f"  ERROR: {parsed['error']}")
-
-        did_trigger = detect_trigger(parsed, skill)
-        trigger_ok  = did_trigger == should_trigger
-
-        process_results: list[dict] = []
-        scoring = {"total": 0, "passed": 0, "blocking_failed": 0, "warning_failed": 0, "score": 0}
-        if should_trigger and did_trigger:
-            process_results = run_process_checks(parsed, checks)
-            scoring = score_checks(process_results)
-
-        result = {
-            "platform":        "cursor",
-            "skill":           skill,
-            "run_id":          run_id,
-            "prompt":          prompt,
-            "should_trigger":  should_trigger,
-            "did_trigger":     did_trigger,
-            "trigger_correct": trigger_ok,
-            "bash_commands":   parsed["bash_commands"],
-            "files_written":   parsed["files_written"],
-            "process_checks":  process_results,
-            "scoring":         scoring,
-            "usage":           u,
-            "timestamp":       datetime.now(timezone.utc).isoformat(),
-        }
-        all_results.append(result)
-
-        out_dir = RESULTS_DIR / skill
-        out_dir.mkdir(parents=True, exist_ok=True)
-        (out_dir / f"{run_id}.result.json").write_text(json.dumps(result, indent=2))
-
-        t_icon    = "✓" if trigger_ok else "✗"
-        score_str = f"score={scoring['score']}/100" if process_results else "—"
-        print(f"  {t_icon} did_trigger={did_trigger}  {score_str}")
-        for pr in process_results:
-            if not pr["pass"] and pr["severity"] == "blocking":
-                print(f"    BLOCKING FAIL: {pr['id']}")
-
-    if args.dry_run or not all_results:
-        return
-
-    trigger_correct = sum(1 for r in all_results if r["trigger_correct"])
-    total = len(all_results)
-    false_pos = [r for r in all_results if not r["should_trigger"] and r["did_trigger"]]
-    false_neg = [r for r in all_results if r["should_trigger"] and not r["did_trigger"]]
-    proc_runs = [r for r in all_results if r["process_checks"]]
-    avg_score = (sum(r["scoring"]["score"] for r in proc_runs) // len(proc_runs)
-                 if proc_runs else None)
-    total_blocking = sum(r["scoring"]["blocking_failed"] for r in proc_runs) if proc_runs else 0
-
-    print("\n" + "═" * 68)
-    print(f"SUMMARY  skill={skill}  platform=cursor")
-    print(f"  Trigger accuracy : {trigger_correct}/{total} ({100 * trigger_correct // total}%)")
-    if false_pos:
-        print(f"  False positives  : {', '.join(r['run_id'] for r in false_pos)}")
-    if false_neg:
-        print(f"  False negatives  : {', '.join(r['run_id'] for r in false_neg)}")
-    if avg_score is not None:
-        print(f"  Process avg score: {avg_score}/100  (blocking failures: {total_blocking})")
-    print(f"  Total tokens     : {total_tokens['input']} in / {total_tokens['output']} out")
-    print(f"  Results in       : {RESULTS_DIR / skill}/")
-
-    summary = {
-        "skill": skill, "platform": "cursor",
-        "timestamp": datetime.now(timezone.utc).isoformat(),
-        "trigger_accuracy": {"correct": trigger_correct, "total": total},
-        "false_positives":  [r["run_id"] for r in false_pos],
-        "false_negatives":  [r["run_id"] for r in false_neg],
-        "process_avg_score": avg_score,
-        "total_blocking_failures": total_blocking,
-        "total_tokens": total_tokens,
-        "runs": [{"run_id": r["run_id"], "trigger_correct": r["trigger_correct"],
-                  "score": r["scoring"]["score"]} for r in all_results],
-    }
-    (RESULTS_DIR / skill / "summary.json").write_text(json.dumps(summary, indent=2))
-
-    if false_pos or false_neg or total_blocking > 0:
-        sys.exit(1)
-
+from evals.cli import main
 
 if __name__ == "__main__":
+    if "--harness" not in sys.argv:
+        sys.argv += ["--harness", "cursor"]
     main()

From 5a7f80f48e7eefef5e3afa7de8ee8bbf843baac8 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 12:58:39 -0600
Subject: [PATCH 14/61] ci(evals): tiered runs (validate on PR, cheap PR
 matrix, full main) + uv

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/skill-evals.yml | 100 ++++++++++++++++--------------
 1 file changed, 52 insertions(+), 48 deletions(-)

diff --git a/.github/workflows/skill-evals.yml b/.github/workflows/skill-evals.yml
index 5cc3162..6be4b28 100644
--- a/.github/workflows/skill-evals.yml
+++ b/.github/workflows/skill-evals.yml
@@ -1,6 +1,12 @@
 name: Skill Evals
 
 on:
+  pull_request:
+    paths:
+      - "plugins/**"
+      - "evals/**"
+  push:
+    branches: [main]
   workflow_dispatch:
     inputs:
       skill:
@@ -35,26 +41,35 @@ permissions:
 
 jobs:
 
+  # ── Config validation (no API keys; runs on every PR including forks) ──────
+  validate-config:
+    name: validate eval config
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v5
+      - name: Validate prompts.yaml + process-checks.json
+        run: uv run validate
+
   # ── Claude Code ──────────────────────────────────────────────────────────
   eval-claude-code:
-    name: claude-code / ${{ matrix.skill }}
+    name: claude-code / ${{ matrix.skill }} / ${{ matrix.model }}
     runs-on: ubuntu-latest
+    needs: validate-config
     if: |
-      github.event_name != 'workflow_dispatch' ||
+      github.event_name == 'pull_request' ||
+      github.event_name == 'push' ||
       inputs.platform == 'all' ||
       inputs.platform == 'claude-code'
     strategy:
       fail-fast: false
       matrix:
         skill: [hawkscan, api]
+        model: ${{ github.event_name == 'pull_request' && fromJSON('["claude-haiku-4-5-20251001"]') || fromJSON('["claude-sonnet-4-6","claude-opus-4-7","claude-haiku-4-5-20251001"]') }}
 
     steps:
       - uses: actions/checkout@v4
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-
+      - uses: astral-sh/setup-uv@v5
       - uses: actions/setup-node@v4
         with:
           node-version: "20"
@@ -65,7 +80,7 @@ jobs:
       - name: Verify claude CLI
         run: claude --version
 
-      - name: Run ${{ matrix.skill }} evals
+      - name: Run ${{ matrix.skill }} evals (${{ matrix.model }})
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
         run: |
@@ -73,17 +88,14 @@ jobs:
           if [ "${{ inputs.rubric }}" = "true" ]; then
             RUBRIC_FLAG="--rubric"
           fi
-          python3 evals/harnesses/claude-code/run-evals.py \
-            --skill ${{ matrix.skill }} \
-            --bare \
-            --max-budget 0.15 \
-            $RUBRIC_FLAG
+          uv run evals --harness claude-code --skill ${{ matrix.skill }} \
+            --model ${{ matrix.model }} --bare --max-budget 0.15 $RUBRIC_FLAG
 
       - name: Upload results
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: eval-claude-code-${{ matrix.skill }}
+          name: eval-claude-code-${{ matrix.skill }}-${{ matrix.model }}
           path: evals/harnesses/claude-code/results/${{ matrix.skill }}/
           retention-days: 30
 
@@ -91,10 +103,10 @@ jobs:
   eval-codex:
     name: codex / ${{ matrix.skill }}
     runs-on: ubuntu-latest
+    needs: validate-config
     if: |
-      github.event_name != 'workflow_dispatch' ||
-      inputs.platform == 'all' ||
-      inputs.platform == 'codex'
+      github.event_name == 'push' ||
+      github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' || inputs.platform == 'codex')
     strategy:
       fail-fast: false
       matrix:
@@ -102,11 +114,7 @@ jobs:
 
     steps:
       - uses: actions/checkout@v4
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-
+      - uses: astral-sh/setup-uv@v5
       - uses: actions/setup-node@v4
         with:
           node-version: "20"
@@ -129,8 +137,7 @@ jobs:
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
         run: |
-          python3 evals/harnesses/codex/run-evals.py \
-            --skill ${{ matrix.skill }}
+          uv run evals --harness codex --skill ${{ matrix.skill }}
 
       - name: Upload results
         if: always()
@@ -144,10 +151,10 @@ jobs:
   eval-agy:
     name: agy / ${{ matrix.skill }}
     runs-on: ubuntu-latest
+    needs: validate-config
     if: |
-      github.event_name != 'workflow_dispatch' ||
-      inputs.platform == 'all' ||
-      inputs.platform == 'agy'
+      github.event_name == 'push' ||
+      github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' || inputs.platform == 'agy')
     strategy:
       fail-fast: false
       matrix:
@@ -155,10 +162,7 @@ jobs:
 
     steps:
       - uses: actions/checkout@v4
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
+      - uses: astral-sh/setup-uv@v5
 
       - name: Install agy CLI
         run: curl -fsSL https://antigravity.google/install-cli | bash
@@ -177,9 +181,7 @@ jobs:
         env:
           AGY_API_KEY: ${{ secrets.AGY_API_KEY }}
         run: |
-          python3 evals/harnesses/agy/run-evals.py \
-            --skill ${{ matrix.skill }} \
-            --print-timeout 240s
+          uv run evals --harness agy --skill ${{ matrix.skill }}
 
       - name: Upload results
         if: always()
@@ -193,10 +195,10 @@ jobs:
   eval-cursor:
     name: cursor / ${{ matrix.skill }}
     runs-on: ubuntu-latest
+    needs: validate-config
     if: |
-      github.event_name != 'workflow_dispatch' ||
-      inputs.platform == 'all' ||
-      inputs.platform == 'cursor'
+      github.event_name == 'push' ||
+      github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' || inputs.platform == 'cursor')
     strategy:
       fail-fast: false
       matrix:
@@ -204,11 +206,7 @@ jobs:
 
     steps:
       - uses: actions/checkout@v4
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-
+      - uses: astral-sh/setup-uv@v5
       - uses: actions/setup-node@v4
         with:
           node-version: "20"
@@ -224,8 +222,7 @@ jobs:
         env:
           CURSOR_API_KEY: ${{ secrets.CURSOR_API_KEY }}
         run: |
-          python3 evals/harnesses/cursor/run-evals.py \
-            --skill ${{ matrix.skill }}
+          uv run evals --harness cursor --skill ${{ matrix.skill }}
 
       - name: Upload results
         if: always()
@@ -238,7 +235,7 @@ jobs:
   # ── PR comment ────────────────────────────────────────────────────────────
   comment:
     name: Post PR summary
-    needs: [eval-claude-code, eval-codex, eval-agy, eval-cursor]
+    needs: [validate-config, eval-claude-code, eval-codex, eval-agy, eval-cursor]
     if: always() && github.event_name == 'pull_request'
     runs-on: ubuntu-latest
     permissions:
@@ -270,9 +267,16 @@ jobs:
             for (const platform of platforms) {
               body += `### Platform: \`${platform}\`\n\n`;
               for (const skill of skills) {
-                const summaryPath = path.join(
-                  'results', `eval-${platform}-${skill}`, 'summary.json'
-                );
+                let summaryPath;
+                if (platform === 'claude-code') {
+                  summaryPath = path.join(
+                    'results', `eval-claude-code-${skill}-claude-haiku-4-5-20251001`, 'summary.json'
+                  );
+                } else {
+                  summaryPath = path.join(
+                    'results', `eval-${platform}-${skill}`, 'summary.json'
+                  );
+                }
 
                 if (!fs.existsSync(summaryPath)) {
                   body += `**\`${skill}\`**: ⚠️ No results\n`;

From fc9c55102bcad8c382c89a36da44889b3c1d9e35 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 13:02:06 -0600
Subject: [PATCH 15/61] docs(evals): document uv CLI, prompts.yaml,
 compare/regrade, PASS-SLOW

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/README.md           | 39 +++++++++++++++++++----------
 evals/harnesses/README.md | 52 ++++++++++++++++++++++++---------------
 2 files changed, 58 insertions(+), 33 deletions(-)

diff --git a/evals/README.md b/evals/README.md
index 69d82e9..dfa653c 100644
--- a/evals/README.md
+++ b/evals/README.md
@@ -7,25 +7,27 @@ Evaluation assets for the `hawkscan` and `api` skills. The structure follows the
 ```
 evals/
   hawkscan/
-    prompts.csv          # 20 trigger/no-trigger test cases for the hawkscan skill
+    prompts.yaml         # 20 trigger/no-trigger test cases for the hawkscan skill
     process-checks.json  # Deterministic checks: commands, files, and patterns that must (or must not) appear
     rubric-items.json    # Qualitative rubric check definitions for style and correctness grading
   api/
-    prompts.csv          # 16 trigger/no-trigger test cases for the api skill
+    prompts.yaml         # 16 trigger/no-trigger test cases for the api skill
     process-checks.json  # Deterministic checks
     rubric-items.json    # Qualitative rubric check definitions
   rubric-schema.json     # Shared JSON Schema — constrains rubric grader output format
+  lib/                   # Shared library: models, config, grading, harness, replay, compare, reporting
+  cli.py                 # Unified CLI entrypoints (evals, compare, regrade, validate)
   harnesses/
-    README.md            # How to build platform-specific harnesses (Codex, Claude, Gemini, etc.)
+    README.md            # How to build platform-specific harnesses (Codex, Claude, etc.)
 ```
 
 ## Three layers of evaluation
 
-### 1. Trigger evals (`prompts.csv`)
+### 1. Trigger evals (`prompts.yaml`)
 
-Each row is a prompt with a `should_trigger` flag. Run the prompt through an agent and record whether the skill was invoked.
+Each entry is a prompt with a `should_trigger` flag. Run the prompt through an agent and record whether the skill was invoked. Each prompt may also set a `budget` (cost_usd / bash_commands / output_tokens / wall_seconds) and an `expected` list (each item has exactly one of: signal / anti_pattern / check_id).
 
-Columns: `id`, `should_trigger`, `invocation_type`, `prompt`, `notes`
+Fields: `id`, `should_trigger`, `invocation_type`, `prompt`, `notes`, and optionally `budget`, `expected`
 
 Invocation types:
 - `explicit` — skill named directly (e.g. `$hawkscan` or `$api`)
@@ -46,19 +48,30 @@ A second, read-only grader pass over the agent's output and generated files. The
 
 ## Running evals
 
-Harnesses are platform-specific. See `harnesses/README.md` for the contract and planned implementations.
+This is a uv project. All commands go through `uv run`.
 
-**Manual checklist:**
-1. Run the prompt in the target agent
-2. Check the output and any generated files against `process-checks.json` — look for `signals` (must appear) and `anti_patterns` (must not appear)
-3. Run a grader with the `grader_prompt` from `rubric-items.json` against the output; require JSON output conforming to `rubric-schema.json`
-4. Record results per check; track scores over time to detect regressions
+| Task | Command |
+|---|---|
+| Validate config (no keys) | `uv run validate` |
+| Run a skill | `uv run evals --harness claude-code --skill hawkscan` |
+| Single prompt | `uv run evals --harness claude-code --skill hawkscan --id hw-07` |
+| Compare with/without skill | `uv run compare --harness claude-code --skill hawkscan` |
+| Regrade a saved trace (free) | `uv run regrade <trace.jsonl> --skill hawkscan` |
+
+Per-prompt config lives in `evals/<skill>/prompts.yaml`. Each prompt may set a
+`budget` (cost_usd / bash_commands / output_tokens / wall_seconds) and an
+`expected` list (each item has exactly one of: signal / anti_pattern / check_id).
+A correct run that breaches a budget grades as PASS-SLOW. A process-check in
+`process-checks.json` may carry `applies_to: [<prompt id>]` to scope it to
+specific prompts (absent = applies to all).
+
+See `harnesses/README.md` for per-platform instructions and CI setup.
 
 ## Adding test cases
 
 When a skill bug or regression is discovered:
 
-1. Add a new row to the relevant `prompts.csv` capturing the prompt that exposed the bug
+1. Add a new entry to the relevant `prompts.yaml` capturing the prompt that exposed the bug
 2. If the bug was a missing process step, add a check to `process-checks.json`
 3. If the bug was a style or qualitative issue, add a check to the relevant `rubric-items.json`
 
diff --git a/evals/harnesses/README.md b/evals/harnesses/README.md
index 16d2370..52b3f2f 100644
--- a/evals/harnesses/README.md
+++ b/evals/harnesses/README.md
@@ -16,6 +16,8 @@ Each harness connects the platform-agnostic test cases in `evals/` to a specific
 
 ### Prerequisites
 
+Install [uv](https://docs.astral.sh/uv/) if you don't have it — `uv run` handles dependency installation automatically, so no separate `uv sync` step is needed before running evals.
+
 Install the CLI for whichever platform you want to test:
 
 ```bash
@@ -30,18 +32,18 @@ curl -fsSL https://antigravity.google/install-cli | bash  # Antigravity (agy)
 
 ```bash
 # Requires: ANTHROPIC_API_KEY
-python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan
-python3 evals/harnesses/claude-code/run-evals.py --skill api
+uv run evals --harness claude-code --skill hawkscan
+uv run evals --harness claude-code --skill api
 
 # Override model (default: claude's configured default)
-python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --model claude-opus-4-7
-python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --model claude-haiku-4-5-20251001
+uv run evals --harness claude-code --skill hawkscan --model claude-opus-4-7
+uv run evals --harness claude-code --skill hawkscan --model claude-haiku-4-5-20251001
 
 # Single prompt
-python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --id hw-07
+uv run evals --harness claude-code --skill hawkscan --id hw-07
 
 # Dry run (no API calls)
-python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --dry-run
+uv run evals --harness claude-code --skill hawkscan --dry-run
 ```
 
 ### Codex
@@ -55,20 +57,20 @@ codex plugin add stackhawk-api@stackhawk
 
 ```bash
 # Requires: OPENAI_API_KEY
-python3 evals/harnesses/codex/run-evals.py --skill hawkscan
-python3 evals/harnesses/codex/run-evals.py --skill api
+uv run evals --harness codex --skill hawkscan
+uv run evals --harness codex --skill api
 
 # Override model
-python3 evals/harnesses/codex/run-evals.py --skill hawkscan --model gpt-5.5
-python3 evals/harnesses/codex/run-evals.py --skill hawkscan --model o3
+uv run evals --harness codex --skill hawkscan --model gpt-5.5
+uv run evals --harness codex --skill hawkscan --model o3
 ```
 
 ### Cursor
 
 ```bash
 # Requires: Cursor Pro account
-python3 evals/harnesses/cursor/run-evals.py --skill hawkscan
-python3 evals/harnesses/cursor/run-evals.py --skill api
+uv run evals --harness cursor --skill hawkscan
+uv run evals --harness cursor --skill api
 ```
 
 ### Copilot
@@ -76,9 +78,9 @@ python3 evals/harnesses/cursor/run-evals.py --skill api
 ```bash
 # Requires: GitHub Copilot account (gh copilot or copilot CLI)
 # No plugin setup needed — loads directly via --plugin-dir
-python3 evals/harnesses/copilot/run-evals.py --skill hawkscan
-python3 evals/harnesses/copilot/run-evals.py --skill api
-python3 evals/harnesses/copilot/run-evals.py --skill hawkscan --model gpt-5.3-codex
+uv run evals --harness copilot --skill hawkscan
+uv run evals --harness copilot --skill api
+uv run evals --harness copilot --skill hawkscan --model gpt-5.3-codex
 ```
 
 > **Best trigger detection**: Copilot emits an explicit `skill` tool call
@@ -95,16 +97,23 @@ agy plugin install /path/to/agent-skills/plugins/api
 
 ```bash
 # Run with your main agy session idle (background tasks bleed in otherwise)
-python3 evals/harnesses/agy/run-evals.py --skill hawkscan
-python3 evals/harnesses/agy/run-evals.py --skill api
+uv run evals --harness agy --skill hawkscan
+uv run evals --harness agy --skill api
 
 # Longer timeout for slow prompts
-python3 evals/harnesses/agy/run-evals.py --skill hawkscan --print-timeout 300s
+uv run evals --harness agy --skill hawkscan --print-timeout 300s
 ```
 
+> **Shims vs adapters**: The per-platform `run-evals.py` scripts are back-compat
+> shims that forward to `uv run evals`. Full stream-parsing adapter logic lives in
+> `evals/harnesses/<platform>/adapter.py`; currently only **claude-code** has a
+> full adapter. The other platforms (codex, cursor, copilot, agy) forward through
+> the same CLI path and will gain dedicated adapters as output formats are
+> stabilised.
+
 ## How it works
 
-For each row in `evals/<skill>/prompts.csv`, each harness:
+For each entry in `evals/<skill>/prompts.yaml`, each harness:
 
 1. Runs `agent -p "<prompt>"` in a fresh isolated directory
 2. Captures bash commands executed and text output
@@ -122,7 +131,10 @@ For each row in `evals/<skill>/prompts.csv`, each harness:
 
 ## CI
 
-The `.github/workflows/skill-evals.yml` workflow runs Claude Code + Codex + Gemini + Cursor on every PR that touches `plugins/` or `evals/`.
+The `.github/workflows/skill-evals.yml` workflow is tiered:
+
+- **Every PR that touches `plugins/` or `evals/`**: runs `uv run validate` (no API keys required) + a cheap claude-code / Haiku run
+- **Merge to main + manual dispatch**: runs the full model matrix across all platforms
 
 Required GitHub secrets:
 - `ANTHROPIC_API_KEY` — Claude Code

From 46ed9e803a6a8628ef9aea57a8696bd89d632741 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 13:11:10 -0600
Subject: [PATCH 16/61] fix(evals): gate process checks on
 should_trigger+did_trigger (C1); drop unused --rubric

Process checks, ad-hoc expectations, and budget checks now only run when
prompt.should_trigger and did_trigger are both true. Correct non-triggers,
false positives, and false negatives are graded purely on trigger accuracy,
fixing the critical bug where a 100%-correct run would exit non-zero in CI.
Also removes the parsed-but-never-read --rubric flag from _common_args.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/cli.py              |  1 -
 evals/lib/grading.py      | 17 ++++++++++++++++-
 tests/lib/test_grading.py | 37 +++++++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/evals/cli.py b/evals/cli.py
index bb32b34..3bff8b7 100644
--- a/evals/cli.py
+++ b/evals/cli.py
@@ -25,7 +25,6 @@ def _common_args(p: argparse.ArgumentParser) -> None:
     p.add_argument("--max-budget", type=float, default=0.20)
     p.add_argument("--bare", action="store_true")
     p.add_argument("--full-auto", action="store_true")
-    p.add_argument("--rubric", action="store_true")
 
 
 def main() -> None:
diff --git a/evals/lib/grading.py b/evals/lib/grading.py
index a3876cc..3ab2c0f 100644
--- a/evals/lib/grading.py
+++ b/evals/lib/grading.py
@@ -108,6 +108,21 @@ def _score(checks: list[ProcessCheckResult]) -> int:
 
 def grade(prompt: PromptConfig, run: ParsedRun, checks: list[dict], *,
           platform: str, skill: str, did_trigger: bool) -> EvalResult:
+    trigger_correct = (did_trigger == prompt.should_trigger)
+
+    # Process checks, ad-hoc expectations, and budgets only apply when the skill
+    # should have fired AND did. For correct non-triggers, false positives, and
+    # false negatives, the verdict is purely the trigger outcome (no process grading).
+    if not (prompt.should_trigger and did_trigger):
+        return EvalResult(
+            platform=platform, skill=skill, run_id=prompt.id,
+            should_trigger=prompt.should_trigger, did_trigger=did_trigger,
+            trigger_correct=trigger_correct,
+            verdict=Verdict.PASS if trigger_correct else Verdict.FAIL,
+            budget_breaches=[], process_checks=[],
+            score=100 if trigger_correct else 0, cost_usd=run.cost_usd,
+        )
+
     proc = run_process_checks(run, applicable_checks(checks, prompt.id))
     proc += run_adhoc_expected(run, prompt.expected)
 
@@ -123,7 +138,7 @@ def grade(prompt: PromptConfig, run: ParsedRun, checks: list[dict], *,
     return EvalResult(
         platform=platform, skill=skill, run_id=prompt.id,
         should_trigger=prompt.should_trigger, did_trigger=did_trigger,
-        trigger_correct=(did_trigger == prompt.should_trigger),
+        trigger_correct=trigger_correct,
         verdict=verdict, budget_breaches=breaches, process_checks=proc,
         score=_score(proc), cost_usd=run.cost_usd,
     )
diff --git a/tests/lib/test_grading.py b/tests/lib/test_grading.py
index eab61de..a368d2c 100644
--- a/tests/lib/test_grading.py
+++ b/tests/lib/test_grading.py
@@ -164,3 +164,40 @@ def pc(passed, sev): return ProcessCheckResult(id="x", passed=passed, severity=s
     assert _score([pc(False, "warning")]) == 95
     assert _score([pc(False, "blocking"), pc(False, "warning")]) == 80
     assert _score([pc(False, "blocking")] * 8) == 0  # floored
+
+
+def test_grade_correct_negative_passes_without_process_checks():
+    # should_trigger=False, did_trigger=False -> correct -> PASS, no process checks run
+    run = ParsedRun(bash_commands=["echo not relevant"])
+    checks = [{"id": "c1", "type": "command_executed", "signals": ["hawk scan"],
+               "severity": "blocking"}]
+    p = _prompt(should_trigger=False)
+    res = grade(p, run, checks, platform="claude-code", skill="demo", did_trigger=False)
+    assert res.verdict == Verdict.PASS
+    assert res.trigger_correct is True
+    assert res.process_checks == []
+    assert res.score == 100
+
+
+def test_grade_false_negative_fails():
+    # should_trigger=True but did_trigger=False -> incorrect -> FAIL, no process checks
+    run = ParsedRun(bash_commands=["echo nothing"])
+    checks = [{"id": "c1", "type": "command_executed", "signals": ["hawk scan"],
+               "severity": "blocking"}]
+    p = _prompt(should_trigger=True)
+    res = grade(p, run, checks, platform="claude-code", skill="demo", did_trigger=False)
+    assert res.verdict == Verdict.FAIL
+    assert res.trigger_correct is False
+    assert res.process_checks == []
+
+
+def test_grade_false_positive_fails_without_process_checks():
+    # should_trigger=False but did_trigger=True -> incorrect -> FAIL, no process checks
+    run = ParsedRun(bash_commands=["hawk scan"])
+    checks = [{"id": "c1", "type": "command_executed", "signals": ["hawk scan"],
+               "severity": "blocking"}]
+    p = _prompt(should_trigger=False)
+    res = grade(p, run, checks, platform="claude-code", skill="demo", did_trigger=True)
+    assert res.verdict == Verdict.FAIL
+    assert res.trigger_correct is False
+    assert res.process_checks == []

From 413a748b55d7c55a316cacdb2af2815ed192e4ff Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 13:14:13 -0600
Subject: [PATCH 17/61] ci+docs(evals): dispatch-only non-claude jobs (C2), add
 pytest job, drop rubric plumbing, refresh stale docs

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/skill-evals.yml     |  36 ++++----
 evals/harnesses/claude-code/README.md | 126 +++++++++++---------------
 evals/harnesses/gemini/run-evals.py   |   2 +
 3 files changed, 74 insertions(+), 90 deletions(-)

diff --git a/.github/workflows/skill-evals.yml b/.github/workflows/skill-evals.yml
index 6be4b28..cf565f8 100644
--- a/.github/workflows/skill-evals.yml
+++ b/.github/workflows/skill-evals.yml
@@ -21,11 +21,6 @@ on:
         default: "all"
         type: choice
         options: [all, claude-code, codex, agy, cursor]
-      rubric:
-        description: "Run qualitative rubric grader (slower, ~$0.10 extra per run)"
-        required: false
-        default: false
-        type: boolean
 
 permissions:
   contents: read
@@ -51,6 +46,16 @@ jobs:
       - name: Validate prompts.yaml + process-checks.json
         run: uv run validate
 
+  # ── Unit tests (no API keys; runs on every PR + push) ─────────────────────
+  pytest:
+    name: pytest (lib)
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v5
+      - name: Run lib tests
+        run: uv run pytest -q
+
   # ── Claude Code ──────────────────────────────────────────────────────────
   eval-claude-code:
     name: claude-code / ${{ matrix.skill }} / ${{ matrix.model }}
@@ -84,12 +89,8 @@ jobs:
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
         run: |
-          RUBRIC_FLAG=""
-          if [ "${{ inputs.rubric }}" = "true" ]; then
-            RUBRIC_FLAG="--rubric"
-          fi
           uv run evals --harness claude-code --skill ${{ matrix.skill }} \
-            --model ${{ matrix.model }} --bare --max-budget 0.15 $RUBRIC_FLAG
+            --model ${{ matrix.model }} --bare --max-budget 0.15
 
       - name: Upload results
         if: always()
@@ -99,14 +100,15 @@ jobs:
           path: evals/harnesses/claude-code/results/${{ matrix.skill }}/
           retention-days: 30
 
+  # NOTE: dispatch-only until evals/harnesses/codex/adapter.py exists (see harnesses/README.md).
   # ── Codex ─────────────────────────────────────────────────────────────────
   eval-codex:
     name: codex / ${{ matrix.skill }}
     runs-on: ubuntu-latest
     needs: validate-config
     if: |
-      github.event_name == 'push' ||
-      github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' || inputs.platform == 'codex')
+      github.event_name == 'workflow_dispatch' &&
+      (inputs.platform == 'all' || inputs.platform == 'codex')
     strategy:
       fail-fast: false
       matrix:
@@ -147,14 +149,15 @@ jobs:
           path: evals/harnesses/codex/results/${{ matrix.skill }}/
           retention-days: 30
 
+  # NOTE: dispatch-only until evals/harnesses/agy/adapter.py exists (see harnesses/README.md).
   # ── Antigravity (agy) — replaces Gemini ───────────────────────────────────
   eval-agy:
     name: agy / ${{ matrix.skill }}
     runs-on: ubuntu-latest
     needs: validate-config
     if: |
-      github.event_name == 'push' ||
-      github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' || inputs.platform == 'agy')
+      github.event_name == 'workflow_dispatch' &&
+      (inputs.platform == 'all' || inputs.platform == 'agy')
     strategy:
       fail-fast: false
       matrix:
@@ -191,14 +194,15 @@ jobs:
           path: evals/harnesses/agy/results/${{ matrix.skill }}/
           retention-days: 30
 
+  # NOTE: dispatch-only until evals/harnesses/cursor/adapter.py exists (see harnesses/README.md).
   # ── Cursor ────────────────────────────────────────────────────────────────
   eval-cursor:
     name: cursor / ${{ matrix.skill }}
     runs-on: ubuntu-latest
     needs: validate-config
     if: |
-      github.event_name == 'push' ||
-      github.event_name == 'workflow_dispatch' && (inputs.platform == 'all' || inputs.platform == 'cursor')
+      github.event_name == 'workflow_dispatch' &&
+      (inputs.platform == 'all' || inputs.platform == 'cursor')
     strategy:
       fail-fast: false
       matrix:
diff --git a/evals/harnesses/claude-code/README.md b/evals/harnesses/claude-code/README.md
index e84b0c3..b0246ae 100644
--- a/evals/harnesses/claude-code/README.md
+++ b/evals/harnesses/claude-code/README.md
@@ -5,71 +5,65 @@ Runs the StackHawk skill eval suite against Claude Code's non-interactive CLI (`
 ## Prerequisites
 
 - **Claude Code CLI** installed and authenticated: `claude --version`
-- **Python 3.11+**: `python3 --version`
+- **Python 3.11+** with `uv`: `uv run evals --help`
 - Run from the **agent-skills repo root** (plugin dirs are auto-detected)
 
-## How it works
+## Invocation
 
-For each row in `evals/<skill>/prompts.csv`:
+```bash
+# Run all prompts for a skill (preferred)
+uv run evals --harness claude-code --skill hawkscan
+uv run evals --harness claude-code --skill api
 
-1. Runs `claude -p "<prompt>" --output-format stream-json --plugin-dir plugins/<skill>`
-   in a fresh temp directory (isolated, no state leakage between runs)
-2. Parses the JSONL event stream to extract bash commands, files written, and output text
-3. Detects whether the skill triggered (skill-specific command patterns in the trace)
-4. If the skill should have triggered and did: runs deterministic checks from
-   `evals/<skill>/process-checks.json` against the captured trace
-5. Saves `results/<skill>/<run-id>.jsonl` (raw trace) and `results/<skill>/<run-id>.result.json` (scored)
+# Run a specific model
+uv run evals --harness claude-code --skill hawkscan --model claude-haiku-4-5-20251001
 
-Optionally, `--rubric` runs a second `claude -p` call as a qualitative grader, using
-`evals/<skill>/rubric-items.json` and enforcing `evals/rubric-schema.json` via `--json-schema`.
+# Cap spend per run (default: $0.20)
+uv run evals --harness claude-code --skill hawkscan --max-budget 0.10
 
-## Usage
+# Full-auto mode: agent executes commands (--dangerously-skip-permissions)
+uv run evals --harness claude-code --skill hawkscan --full-auto
 
-```bash
-# Run all prompts for a skill
-python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan
-python3 evals/harnesses/claude-code/run-evals.py --skill api
+# Suppress progress UI (used in CI)
+uv run evals --harness claude-code --skill hawkscan --bare
+```
 
-# Run a single prompt by ID
-python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --id hw-07
+`run-evals.py` in this directory is a back-compat shim that forwards to `uv run evals --harness claude-code`. Use the `uv run evals` form going forward.
 
-# Dry run — print prompts without calling claude
-python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --dry-run
+## Config source
 
-# Full-auto mode: agent can actually execute commands (--dangerously-skip-permissions)
-python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --full-auto
+Prompts and trigger labels are loaded from `evals/<skill>/prompts.yaml` (not prompts.csv — the CSV was removed during the YAML migration). Process checks come from `evals/<skill>/process-checks.json`.
 
-# Also run the qualitative rubric grader (extra cost + ~30s per run)
-python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --rubric
+## How it works
 
-# Cap spend per run (default: $0.20)
-python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan --max-budget 0.10
-```
+For each prompt in `evals/<skill>/prompts.yaml`:
+
+1. `ClaudeCodeAdapter.launch()` runs `claude -p "<prompt>" --output-format stream-json --plugin-dir plugins/<skill>` in a fresh temp directory (isolated, no state leakage between runs). The raw stdout is parsed in-memory; no raw `.jsonl` file is persisted.
+2. `parse_stream()` extracts bash commands, files written/edited, output text, and cost from the JSONL event stream.
+3. `detect_trigger()` checks whether the skill triggered using CLI command signals (e.g. `hawk scan`) and invocation-phrase signals in the output text.
+4. If the skill should have triggered and did, process checks from `process-checks.json` are run against the captured trace.
+5. A verdict (`pass`, `pass-slow`, or `fail`) is assigned and an `EvalResult` is written to `results/<skill>/<run-id>.result.json`.
 
 ## Two modes
 
 ### Observe mode (default)
 
-The agent runs normally but permissions are not bypassed. It will plan and narrate what
-it would do — including bash commands it intends to execute — without necessarily
-running them. Trigger detection and most process checks work because the agent names
-the commands in its output even when execution is blocked.
+Permissions are not bypassed. The agent plans and narrates what it would do — including bash commands it intends to run — without necessarily executing them. Trigger detection and most process checks still work because the agent names the commands in its output.
 
-**Use for:** trigger accuracy checks, output quality checks, rubric grading.
+**Use for:** trigger accuracy checks, output quality checks, CI.
 
 ### Full-auto mode (`--full-auto`)
 
-Passes `--dangerously-skip-permissions` so the agent can actually execute bash commands,
-write files, and run `hawk` CLI calls. Results are more accurate for process checks that
-require real execution (e.g. `hawk validate config` was actually run and passed).
+Passes `--dangerously-skip-permissions` so the agent can actually execute bash commands, write files, and run `hawk` CLI calls. Results are more accurate for process checks that require real execution.
 
-**Use for:** end-to-end process verification when `hawk` CLI is installed and a target app
-is available. Run in a trusted, isolated environment — not on a production machine.
+**Use for:** end-to-end process verification when `hawk` CLI is installed and a target app is available. Run in a trusted, isolated environment.
 
 ## Understanding results
 
 ### Per-run result file (`results/<skill>/<run-id>.result.json`)
 
+Conforms to the `EvalResult` Pydantic model (`evals/lib/models.py`):
+
 ```json
 {
   "platform": "claude-code",
@@ -78,67 +72,51 @@ is available. Run in a trusted, isolated environment — not on a production mac
   "should_trigger": true,
   "did_trigger": true,
   "trigger_correct": true,
-  "bash_commands": ["hawk version", "hawkop app list", "hawk validate config stackhawk.yml", "hawk scan --json-output"],
-  "files_written": ["stackhawk.yml"],
+  "verdict": "pass",
+  "budget_breaches": [],
   "process_checks": [
-    { "id": "preflight_version_check", "pass": true, "severity": "blocking", "signal_found": "hawk version" },
-    { "id": "step2_no_local_yml_created", "pass": true, "severity": "blocking", "signal_found": null }
+    { "id": "preflight_version_check", "passed": true, "severity": "blocking", "signal_found": "hawk version", "anti_found": null },
+    { "id": "step2_no_local_yml_created", "passed": true, "severity": "blocking", "signal_found": null, "anti_found": null }
   ],
-  "scoring": {
-    "total": 22,
-    "passed": 20,
-    "blocking_failed": 1,
-    "warning_failed": 1,
-    "score": 80
-  },
-  "rubric_result": null,
+  "score": 100,
   "cost_usd": 0.048
 }
 ```
 
 ### Summary file (`results/<skill>/summary.json`)
 
-Written after a full run. Tracks trigger accuracy, process score, false positives/negatives,
-and per-run scores — useful for comparing skill versions over time.
+Written after a full run. Tracks trigger accuracy, process score, false positives/negatives, and per-run scores.
 
 ### Scoring
 
-| Check type | Deduction per failure |
+| Check type  | Deduction per failure |
 |---|---|
-| `blocking` | −15 points |
-| `warning` | −5 points |
+| `blocking`  | −15 points |
+| `warning`   | −5 points |
 
-`overall_pass` in rubric results requires score ≥ 70 and zero blocking failures.
+Verdict is `pass` if trigger is correct and score ≥ 70 with zero blocking failures; `pass-slow` if correct but over budget; `fail` otherwise.
 
 ### Process checks only run when the skill should have triggered and did
 
-If `should_trigger=false` and the skill correctly did not fire, no process checks run —
-there is no workflow to grade. The run scores as a trigger-accuracy pass only.
+If `should_trigger=false` and the skill correctly did not fire, no process checks run — there is no workflow to grade.
 
-## Raw traces
+## adapter.py
 
-Each run saves the raw `claude --output-format stream-json` JSONL to
-`results/<skill>/<run-id>.jsonl`. Open it to debug false negatives or unexpected behavior:
+`ClaudeCodeAdapter` (`adapter.py`) implements the `HarnessAdapter` protocol for this platform:
 
-```bash
-# See all bash commands the agent attempted
-jq -r 'select(.type=="assistant") | .message.content[] | select(.type=="tool_use" and .name=="Bash") | .input.command' \
-  results/hawkscan/hw-07.jsonl
-```
+- `parse_stream(raw)` — parses `claude --output-format stream-json` JSONL into a `ParsedRun`
+- `detect_trigger(run, skill)` — checks CLI command signals and invocation-phrase signals
+- `launch(prompt, skill, run_id, ...)` — spawns `claude -p` in a temp directory, captures stdout in-memory, and returns a `ParsedRun`
 
 ## CI usage
 
-The harness exits non-zero if trigger accuracy falls below 100% or any blocking check
-fails. Wire it into CI after bumping a skill version to catch regressions:
-
 ```yaml
 - name: Run skill evals
-  run: |
-    python3 evals/harnesses/claude-code/run-evals.py --skill hawkscan
-    python3 evals/harnesses/claude-code/run-evals.py --skill api
   env:
     ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+  run: |
+    uv run evals --harness claude-code --skill hawkscan --bare --max-budget 0.15
+    uv run evals --harness claude-code --skill api --bare --max-budget 0.15
 ```
 
-Note: CI runs are in observe mode by default (no `--full-auto`), which avoids needing
-a live `hawk` CLI or running application. Add `--full-auto` only in a dedicated sandbox.
+CI runs use observe mode by default (no `--full-auto`), which avoids needing a live `hawk` CLI or running application.
diff --git a/evals/harnesses/gemini/run-evals.py b/evals/harnesses/gemini/run-evals.py
index d00c8c5..00fce99 100644
--- a/evals/harnesses/gemini/run-evals.py
+++ b/evals/harnesses/gemini/run-evals.py
@@ -1,5 +1,7 @@
 #!/usr/bin/env python3
 """
+FROZEN/LEGACY: superseded by the agy harness and the unified 'uv run evals' CLI. Not wired into CI. References the removed prompts.csv and will not run as-is. Kept for historical reference only.
+
 Gemini CLI eval harness for StackHawk agent skills.
 
 Uses `gemini -p --output-format stream-json` (Gemini's headless CLI).

From 7cc0f7795ba02ac02718080d450c6e01b1f7bf7f Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 14:53:25 -0600
Subject: [PATCH 18/61] feat(evals): real codex adapter (ports pre-shim stream
 parsing)

Implements CodexAdapter with CLI_SIGNALS, INVOCATION_SIGNALS, parse_stream
(item.started/item.completed/turn.completed), detect_trigger, and launch
(codex exec --json --sandbox workspace-write --skip-git-repo-check), resolving
the C2 defect where uv run evals --harness codex raised ValueError.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/harnesses/codex/adapter.py | 146 +++++++++++++++++++++++++++++++
 tests/fixtures/streams/codex.txt |   4 +
 tests/lib/test_adapters.py       |  22 +++++
 3 files changed, 172 insertions(+)
 create mode 100644 evals/harnesses/codex/adapter.py
 create mode 100644 tests/fixtures/streams/codex.txt
 create mode 100644 tests/lib/test_adapters.py

diff --git a/evals/harnesses/codex/adapter.py b/evals/harnesses/codex/adapter.py
new file mode 100644
index 0000000..09d991d
--- /dev/null
+++ b/evals/harnesses/codex/adapter.py
@@ -0,0 +1,146 @@
+"""codex Harness adapter. Parsing + signals ported from pre-shim run-evals.py."""
+from __future__ import annotations
+import json
+import shutil
+import subprocess
+import tempfile
+
+from evals.lib.models import ParsedRun
+
+# CLI signals — checked against bash_commands only (prevents documentation content
+# from creating false positives when the agent writes README/guides about HawkScan).
+CLI_SIGNALS = {
+    "hawkscan": [
+        "hawk scan",
+        "hawk validate",
+        "hawk rescan",
+        # "hawk version" excluded: running 'hawk version' alone is common for
+        # installation-check tasks and would cause false positives. The preflight
+        # workflow always also runs 'hawk config --help', so 'hawk config' below suffices.
+        "hawk config",
+        "hawk create app",
+        "hawk init",
+        "hawk perch",
+    ],
+    # Signals specific to the api reporting workflow — avoids false positives
+    # from hawkop status/app/env commands that the hawkscan skill also runs.
+    "api": [
+        "hawkop scan get",     # api Step 4: app deep dive
+        "hawkop org get",      # api Step 1: establish orgId
+        "hawkop org set",      # api Step 1: switch org
+        "/api/v2/org",         # api Step 3: org posture endpoint (hawkop doesn't wrap it)
+        "/api/v1/scan",        # api Step 4: raw scan drill-down
+        "hawk_api GET",        # api raw API helper function
+    ],
+}
+
+# Invocation signals — checked against output_text only. In full-auto mode these are
+# belt-and-suspenders: the agent usually runs CLI commands directly. They catch
+# contextual prompts where the skill fires but the agent finds an empty working dir
+# and stops before reaching the CLI (same as observe mode in Claude Code harness).
+INVOCATION_SIGNALS = {
+    "hawkscan": [
+        # All markdown formatting variants the model uses around `: YES` or ` — YES`
+        "hawkscan:hawkscan`: yes",   # backtick + colon
+        "hawkscan:hawkscan` — yes",  # backtick + dash
+        "hawkscan:hawkscan**: yes",  # bold + colon
+        "hawkscan:hawkscan** — yes", # bold + dash
+        "hawkscan:hawkscan: yes",    # plain colon
+        "hawkscan:hawkscan — yes",   # plain dash
+        # Specific action-intent phrases
+        "autonomous security scan",
+        "dast scan after code",
+        "dast scan triggered",
+        "dast scan required",
+        "security scan required",
+        "security scan after",
+        "run the security scan",
+        "running the hawkscan",
+    ],
+    "api": [
+        "stackhawk-api:api`: yes",
+        "stackhawk-api:api` — yes",
+        "stackhawk-api:api: yes",
+        "stackhawk-api:api — yes",
+    ],
+}
+
+
+def parse_stream(raw: str) -> ParsedRun:
+    """Parse `codex exec --json` JSONL output into a ParsedRun.
+
+    Collects unique command_execution commands (item.started), agent message
+    text (item.completed), summed output tokens (turn.completed) and the last
+    error message. Blank, non-JSON and non-object lines, and null or ill-typed
+    fields, are skipped instead of raising; empty text adds no blank lines.
+    """
+    cmds, texts, otok, err, seen = [], [], 0, None, set()
+    for line in raw.splitlines():
+        try:
+            ev = json.loads(line.strip())
+        except json.JSONDecodeError:  # blank lines land here too
+            continue
+        if not isinstance(ev, dict):  # valid JSON, but not an event object
+            continue
+        t, it = ev.get("type", ""), ev.get("item")
+        it = it if isinstance(it, dict) else {}
+        if t == "item.started" and it.get("type") == "command_execution":
+            c = it.get("command", "")
+            if isinstance(c, str) and c and c not in seen:
+                cmds.append(c)
+                seen.add(c)
+        elif t == "item.completed" and it.get("type") in ("message", "agent_message"):
+            content = it.get("content")
+            found = [it.get("text"), content]  # content may be a str or blocks
+            if isinstance(content, list):
+                found += [b.get("text") for b in content
+                          if isinstance(b, dict) and b.get("type") == "text"]
+            texts += [s for s in found if isinstance(s, str) and s]
+        elif t == "turn.completed":
+            usage = ev.get("usage")
+            otok += (usage.get("output_tokens") or 0) if isinstance(usage, dict) else 0
+        elif t == "error":
+            err = ev.get("message", "unknown error")
+    return ParsedRun(bash_commands=cmds, output_text="\n".join(texts).strip(),
+                     output_tokens=otok or None, error=err)
+
+
+class CodexAdapter:
+    """Harness adapter for the Codex CLI: trigger signals, parsing, launch."""
+    platform = "codex"
+
+    def cli_signals(self, skill): return CLI_SIGNALS.get(skill, [])
+    def invocation_signals(self, skill): return INVOCATION_SIGNALS.get(skill, [])
+    def parse_stream(self, raw): return parse_stream(raw)
+
+    def detect_trigger(self, run: ParsedRun, skill: str) -> bool:
+        """True if a command holds a CLI signal or the output holds a phrase."""
+        cmds = [c.lower() for c in run.bash_commands]  # matched one at a time
+        text = run.output_text.lower()
+        return (any(s.lower() in c for s in self.cli_signals(skill) for c in cmds)
+                or any(s.lower() in text for s in self.invocation_signals(skill)))
+
+    def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
+               max_budget, bare, full_auto) -> ParsedRun:
+        """Run `codex exec --json` on prompt in a throwaway dir; parse stdout.
+
+        Exactly one --sandbox flag is passed (workspace-write if full_auto, else
+        read-only). A run exceeding 300 s yields ParsedRun(error="timeout")."""
+        sandbox = "workspace-write" if full_auto else "read-only"
+        cmd = ["codex", "exec", "--json", "--sandbox", sandbox,
+               "--skip-git-repo-check"]
+        if model:
+            cmd += ["-m", model]
+        cmd.append(prompt)
+        tmpdir = tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_")
+        try:
+            proc = subprocess.run(cmd, capture_output=True, text=True,
+                                  timeout=300, cwd=tmpdir)
+            return parse_stream(proc.stdout)
+        except subprocess.TimeoutExpired:
+            return ParsedRun(error="timeout")
+        finally:
+            shutil.rmtree(tmpdir, ignore_errors=True)
+
+
+ADAPTER = CodexAdapter()
diff --git a/tests/fixtures/streams/codex.txt b/tests/fixtures/streams/codex.txt
new file mode 100644
index 0000000..048da79
--- /dev/null
+++ b/tests/fixtures/streams/codex.txt
@@ -0,0 +1,4 @@
+{"type":"item.started","item":{"type":"command_execution","command":"hawk validate config stackhawk.yml"}}
+{"type":"item.started","item":{"type":"command_execution","command":"hawk scan --env Development"}}
+{"type":"item.completed","item":{"type":"agent_message","text":"Running the security scan; app reachable on localhost:8080."}}
+{"type":"turn.completed","usage":{"input_tokens":1200,"output_tokens":340}}
diff --git a/tests/lib/test_adapters.py b/tests/lib/test_adapters.py
new file mode 100644
index 0000000..94d1a01
--- /dev/null
+++ b/tests/lib/test_adapters.py
@@ -0,0 +1,22 @@
+from pathlib import Path
+from evals.lib.harness import get_adapter
+from evals.lib.models import ParsedRun
+
+FIX = Path(__file__).parent.parent / "fixtures" / "streams"
+
+
+def test_codex_parse_stream():
+    """The codex fixture yields both commands, the message text and 340 tokens."""
+    parsed = get_adapter("codex").parse_stream((FIX / "codex.txt").read_text())
+    assert isinstance(parsed, ParsedRun)
+    assert "hawk validate config stackhawk.yml" in parsed.bash_commands
+    assert "hawk scan --env Development" in parsed.bash_commands
+    assert "localhost:8080" in parsed.output_text
+    assert parsed.output_tokens == 340
+
+
+def test_codex_detect_trigger():
+    cx = get_adapter("codex")
+    run = ParsedRun(bash_commands=["hawk scan --env Development"])
+    assert cx.detect_trigger(run, "hawkscan") is True
+    assert cx.detect_trigger(ParsedRun(bash_commands=["echo hi"]), "hawkscan") is False

From 7250ae250d7e3bb39156f28ce81f8dddb71b50f4 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 14:56:11 -0600
Subject: [PATCH 19/61] feat(evals): real cursor adapter (ports pre-shim stream
 parsing)

Implements CursorAdapter with cursor-specific stream-json event keys
(tool_call/subtype:started/shellToolCall, not claude-code's tool_use
blocks), the full CLI_SIGNALS and INVOCATION_SIGNALS from pre-shim,
launch flags matching the pre-shim invocation, and tests backed by a
minimal cursor.txt fixture.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/harnesses/cursor/adapter.py | 183 ++++++++++++++++++++++++++++++
 tests/fixtures/streams/cursor.txt |   3 +
 tests/lib/test_adapters.py        |  12 ++
 3 files changed, 198 insertions(+)
 create mode 100644 evals/harnesses/cursor/adapter.py
 create mode 100644 tests/fixtures/streams/cursor.txt

diff --git a/evals/harnesses/cursor/adapter.py b/evals/harnesses/cursor/adapter.py
new file mode 100644
index 0000000..02d7e10
--- /dev/null
+++ b/evals/harnesses/cursor/adapter.py
@@ -0,0 +1,183 @@
+"""cursor Harness adapter. Parsing + signals ported from pre-shim run-evals.py."""
+from __future__ import annotations
+import json
+import shutil
+import subprocess
+import tempfile
+
+from evals.lib.models import ParsedRun
+
+# CLI signals — checked against bash_commands only.
+# Cursor goes directly into execution, so CLI signals are the primary trigger
+# indicator. Invocation signals cover narrative phrases the agent uses when
+# kicking off a skill workflow without immediately running commands.
+CLI_SIGNALS = {
+    "hawkscan": [
+        "hawk scan",
+        "hawk validate",
+        "hawk rescan",
+        "hawk config",
+        "hawk create app",
+        "hawk init",
+        "hawk perch",
+    ],
+    # Cursor api: agent runs hawkop status as its first step, then deeper
+    # hawkop commands. Broader hawkop signals are included because Cursor does
+    # not share the false-positive risk seen in Codex full-auto mode.
+    "api": [
+        "hawkop status",
+        "hawkop scan get",
+        "hawkop org get",
+        "hawkop org set",
+        "hawkop app list",
+        "/api/v2/org",
+        "/api/v1/scan",
+        "hawk_api GET",
+    ],
+}
+
+# Invocation signals — checked against output_text only.
+# Cursor doesn't use the Claude Code "EVALUATE: YES/NO" evaluation step, so
+# these focus on narrative phrases the agent uses when kicking off a skill workflow.
+INVOCATION_SIGNALS = {
+    "hawkscan": [
+        "hawkscan:hawkscan`: yes", "hawkscan:hawkscan` — yes",
+        "hawkscan:hawkscan**: yes", "hawkscan:hawkscan** — yes",
+        "hawkscan:hawkscan: yes",  "hawkscan:hawkscan — yes",
+        "hawkscan:hawkscan - yes", "hawkscan:hawkscan - **yes",
+        "hawkscan** - yes", "hawkscan** — yes",
+        "hawkscan**: yes",  "hawkscan: yes",
+        "hawkscan — yes",   "hawkscan - yes",
+        "autonomous security scan",
+        "dast scan after code", "dast scan triggered", "dast scan required",
+        "security scan required", "security scan after",
+        "run the security scan",  "running the hawkscan",
+    ],
+    "api": [
+        # Claude Code evaluation-format signals (if model uses that format)
+        "stackhawk-api:api`: yes", "stackhawk-api:api` — yes",
+        "stackhawk-api:api**: yes", "stackhawk-api:api** — yes",
+        "stackhawk-api:api: yes",  "stackhawk-api:api — yes",
+        "stackhawk-api:api - yes",
+        "stackhawk-api**: yes",    "stackhawk-api** — yes",
+        "stackhawk-api: yes",      "stackhawk-api — yes",
+        "stackhawk-api - yes",
+        # Cursor narrative-style signals
+        "stackhawk api skill",
+        "stackhawk api",
+        "api skill to",
+        "security posture",
+        "untriaged findings",
+        "scan history",
+        "findings across",
+    ],
+}
+
+
+def parse_stream(raw: str) -> ParsedRun:
+    """Parse cursor stream-json output.
+
+    Cursor event shapes (from pre-shim run-evals.py):
+      - type="assistant":  message.content[] with blocks of type="text"
+      - type="tool_call" subtype="started":
+            tool_call.shellToolCall.args.command  -> bash_commands
+            tool_call.writeToolCall.args.path     -> files_written
+      - type="result":  usage.outputTokens, is_error, result
+
+    Lines that are blank, not JSON, or not JSON objects are skipped, and
+    null or ill-typed fields are treated as absent rather than raising.
+    """
+    def child(parent: dict, key: str) -> dict:  # parent[key] if a dict, else {}
+        value = parent.get(key)
+        return value if isinstance(value, dict) else {}
+
+    bash_commands: list[str] = []
+    files_written: list[str] = []
+    texts: list[str] = []
+    output_tokens = 0
+    error = None
+
+    for line in raw.splitlines():
+        try:
+            event = json.loads(line.strip())
+        except json.JSONDecodeError:  # blank lines land here too
+            continue
+        if not isinstance(event, dict):  # valid JSON, but not an event object
+            continue
+        etype = event.get("type", "")
+
+        if etype == "assistant":
+            blocks = child(event, "message").get("content")
+            for block in blocks if isinstance(blocks, list) else []:
+                if isinstance(block, dict) and block.get("type") == "text":
+                    text = block.get("text", "")
+                    texts.append(text if isinstance(text, str) else "")
+
+        elif etype == "tool_call" and event.get("subtype") == "started":
+            tc = child(event, "tool_call")
+            cmd = child(child(tc, "shellToolCall"), "args").get("command", "")
+            if cmd:
+                bash_commands.append(cmd)
+            path = child(child(tc, "writeToolCall"), "args").get("path", "")
+            if path:
+                files_written.append(path)
+
+        elif etype == "result":
+            otok = child(event, "usage").get("outputTokens")
+            if otok is not None:
+                output_tokens += int(otok)
+            if event.get("is_error"):
+                error = event.get("result", "unknown error")
+
+    return ParsedRun(
+        bash_commands=bash_commands,
+        files_written=files_written,
+        output_text="".join(t + "\n" for t in texts).strip(),
+        output_tokens=output_tokens or None,
+        error=error,
+    )
+
+
+class CursorAdapter:
+    """Harness adapter for the Cursor `agent` CLI: signals, parsing, launch."""
+    platform = "cursor"
+
+    def cli_signals(self, skill): return CLI_SIGNALS.get(skill, [])
+    def invocation_signals(self, skill): return INVOCATION_SIGNALS.get(skill, [])
+    def parse_stream(self, raw): return parse_stream(raw)
+
+    def detect_trigger(self, run: ParsedRun, skill: str) -> bool:
+        """True if a command holds a CLI signal or the output holds a phrase."""
+        cmds = [c.lower() for c in run.bash_commands]  # matched one at a time
+        text = run.output_text.lower()
+        return (any(s.lower() in c for s in self.cli_signals(skill) for c in cmds)
+                or any(s.lower() in text for s in self.invocation_signals(skill)))
+
+    # launch() runs `agent -p <prompt> --output-format stream-json` in a fresh
+    # temp directory and returns the parsed stdout; --force is full_auto only.
+    def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
+               max_budget, bare, full_auto) -> ParsedRun:
+        tmpdir = tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_")
+        try:
+            cmd = [
+                "agent", "-p", prompt,
+                "--output-format", "stream-json",
+                "--print",
+            ]
+            if model:
+                cmd += ["--model", model]
+            if full_auto:
+                cmd.append("--force")
+            # A run that exceeds the 300 s timeout is reported as a ParsedRun
+            # error rather than raised; the temp workspace is always removed.
+            try:
+                proc = subprocess.run(cmd, capture_output=True, text=True,
+                                      timeout=300, cwd=tmpdir)
+            except subprocess.TimeoutExpired:
+                return ParsedRun(error="timeout")
+            return parse_stream(proc.stdout)
+        finally:
+            shutil.rmtree(tmpdir, ignore_errors=True)
+
+
+ADAPTER = CursorAdapter()
diff --git a/tests/fixtures/streams/cursor.txt b/tests/fixtures/streams/cursor.txt
new file mode 100644
index 0000000..2dfe9ee
--- /dev/null
+++ b/tests/fixtures/streams/cursor.txt
@@ -0,0 +1,3 @@
+{"type":"tool_call","subtype":"started","tool_call":{"shellToolCall":{"args":{"command":"hawk scan --env Development"}}}}
+{"type":"assistant","message":{"content":[{"type":"text","text":"Running HawkScan against the app on localhost:8080."}]}}
+{"type":"result","usage":{"inputTokens":950,"outputTokens":210},"is_error":false}
diff --git a/tests/lib/test_adapters.py b/tests/lib/test_adapters.py
index 94d1a01..43bd879 100644
--- a/tests/lib/test_adapters.py
+++ b/tests/lib/test_adapters.py
@@ -20,3 +20,15 @@ def test_codex_detect_trigger():
     run = ParsedRun(bash_commands=["hawk scan --env Development"])
     assert cx.detect_trigger(run, "hawkscan") is True
     assert cx.detect_trigger(ParsedRun(bash_commands=["echo hi"]), "hawkscan") is False
+
+
+def test_cursor_parse_stream():
+    run = get_adapter("cursor").parse_stream((FIX / "cursor.txt").read_text())
+    assert "hawk scan --env Development" in run.bash_commands
+    assert "localhost:8080" in run.output_text
+    assert run.output_tokens == 210  # outputTokens in the fixture's result event
+
+
+def test_cursor_detect_trigger():
+    cu = get_adapter("cursor")
+    assert cu.detect_trigger(ParsedRun(bash_commands=["hawk scan x"]), "hawkscan") is True

From ff59637251411c09890bdc25dd7067d8d6ec9b79 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 14:58:23 -0600
Subject: [PATCH 20/61] fix(evals): cursor adapter launch restores --trust +
 skill-loading (live-run fidelity)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/harnesses/cursor/adapter.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/evals/harnesses/cursor/adapter.py b/evals/harnesses/cursor/adapter.py
index 02d7e10..54d18b1 100644
--- a/evals/harnesses/cursor/adapter.py
+++ b/evals/harnesses/cursor/adapter.py
@@ -1,12 +1,28 @@
 """cursor Harness adapter. Parsing + signals ported from pre-shim run-evals.py."""
 from __future__ import annotations
 import json
+import os
 import shutil
 import subprocess
 import tempfile
+from pathlib import Path
 
 from evals.lib.models import ParsedRun
 
+# adapter.py -> cursor -> harnesses -> evals -> repo root
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent
+# cursor/.cursor/rules/ holds the alwaysApply .mdc skill rules (pre-shim path).
+CURSOR_RULES_DIR = REPO_ROOT / "cursor" / ".cursor" / "rules"
+
+
+def _setup_skill(target_dir: str) -> None:
+    """Copy cursor/.cursor/rules/*.mdc into the run's workspace so alwaysApply
+    rules load. Mirrors the pre-shim run-evals.py _setup_workspace()."""
+    dst = Path(target_dir) / ".cursor" / "rules"
+    dst.mkdir(parents=True, exist_ok=True)
+    for mdc in CURSOR_RULES_DIR.glob("*.mdc"):
+        shutil.copy2(mdc, dst / mdc.name)
+
 # CLI signals — checked against bash_commands only.
 # Cursor goes directly into execution, so CLI signals are the primary trigger
 # indicator. Invocation signals cover narrative phrases the agent uses when
@@ -156,11 +172,19 @@ def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
                max_budget, bare, full_auto) -> ParsedRun:
         tmpdir = tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_")
         try:
+            # With/without-skill switch: only install the cursor rules when the
+            # skill should be loaded (pre-shim always installed them).
+            if load_skill:
+                _setup_skill(tmpdir)
+            api_key = os.environ.get("CURSOR_API_KEY", "")
             cmd = [
                 "agent", "-p", prompt,
                 "--output-format", "stream-json",
                 "--print",
+                "--trust",
             ]
+            if api_key:
+                cmd += ["--api-key", api_key]
             if model:
                 cmd += ["--model", model]
             if full_auto:

From 5610376ade0909740715ef4456c462c681517086 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 15:01:08 -0600
Subject: [PATCH 21/61] feat(evals): real agy adapter (plain-text parsing)

Adds AgyAdapter with plain-text parse_stream (wraps full stdout in
output_text, bash_commands always empty), INVOCATION_SIGNALS recovered
verbatim from pre-shim ALL_SIGNALS plus evaluation-format backtick
variants, and launch() mirroring the pre-shim agy -p / --print-timeout
invocation.  CLI_SIGNALS is empty (agy has no shell commands to scan).
Skills are installed globally in CI via agy plugin install; load_skill
is a no-op.  AGY_API_KEY flows through os.environ as before.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/harnesses/agy/adapter.py | 131 +++++++++++++++++++++++++++++++++
 tests/fixtures/streams/agy.txt |   2 +
 tests/lib/test_adapters.py     |  13 ++++
 3 files changed, 146 insertions(+)
 create mode 100644 evals/harnesses/agy/adapter.py
 create mode 100644 tests/fixtures/streams/agy.txt

diff --git a/evals/harnesses/agy/adapter.py b/evals/harnesses/agy/adapter.py
new file mode 100644
index 0000000..32eea4d
--- /dev/null
+++ b/evals/harnesses/agy/adapter.py
@@ -0,0 +1,131 @@
+"""agy Harness adapter. Plain-text output (no structured stream).
+
+Pre-shim (5472ed2~1:evals/harnesses/agy/run-evals.py) notes:
+- agy outputs plain text — no --output-format flag available.
+- Trigger detection scans output_text only; no bash_commands ever populated.
+- Skills installed globally via `agy plugin install` (done in CI); load_skill
+  toggling is a no-op here.
+- AGY_API_KEY passed via os.environ (CI sets it); no special env handling needed.
+- Launch: agy -p <prompt> --print-timeout <timeout> [--model M]
+- The pre-shim used a unified ALL_SIGNALS dict (no CLI/INVOCATION split) with
+  SKILL: prefix signals.  Those are carried in INVOCATION_SIGNALS below alongside
+  the backtick-evaluation-format signals shared by codex/cursor adapters.
+"""
+from __future__ import annotations
+import shutil
+import subprocess
+import tempfile
+
+from evals.lib.models import ParsedRun
+
+# CLI_SIGNALS: agy emits plain text — there are no shell commands to scan.
+CLI_SIGNALS: dict[str, list[str]] = {
+    "hawkscan": [],
+    "api": [],
+}
+
+# INVOCATION_SIGNALS: checked against output_text.
+# Combines the pre-shim ALL_SIGNALS (SKILL: prefix variants) with the
+# evaluation-format backtick signals used by the shared skill prompts.
+INVOCATION_SIGNALS: dict[str, list[str]] = {
+    "hawkscan": [
+        # Pre-shim ALL_SIGNALS (verbatim from 5472ed2~1:evals/harnesses/agy/run-evals.py)
+        "skill: hawkscan",
+        "skill:hawkscan",
+        # Evaluation-format variants emitted by the shared skill evaluation suffix
+        "hawkscan:hawkscan`: yes",
+        "hawkscan:hawkscan` — yes",
+        "hawkscan:hawkscan**: yes",
+        "hawkscan:hawkscan** — yes",
+        "hawkscan:hawkscan: yes",
+        "hawkscan:hawkscan — yes",
+        # Action-intent phrases
+        "autonomous security scan",
+        "dast scan after code",
+        "dast scan triggered",
+        "dast scan required",
+        "security scan required",
+        "security scan after",
+        "run the security scan",
+        "running the hawkscan",
+        "running the security scan",
+    ],
+    "api": [
+        # Pre-shim ALL_SIGNALS (verbatim)
+        "skill: api",
+        "skill:api",
+        "skill: stackhawk-api",
+        # Evaluation-format variants
+        "stackhawk-api:api`: yes",
+        "stackhawk-api:api` — yes",
+        "stackhawk-api:api: yes",
+        "stackhawk-api:api — yes",
+    ],
+}
+
+# Matches pre-shim default --print-timeout (180s); bumped slightly for safety.
+PRINT_TIMEOUT = "240s"
+
+
+def parse_stream(raw: str) -> ParsedRun:
+    """agy outputs plain text — wrap entirely in output_text; no commands to parse."""
+    return ParsedRun(output_text=raw.strip())
+
+
+class AgyAdapter:
+    platform = "agy"
+
+    def cli_signals(self, skill: str) -> list[str]:
+        return CLI_SIGNALS.get(skill, [])
+
+    def invocation_signals(self, skill: str) -> list[str]:
+        return INVOCATION_SIGNALS.get(skill, [])
+
+    def parse_stream(self, raw: str) -> ParsedRun:
+        return parse_stream(raw)
+
+    def detect_trigger(self, run: ParsedRun, skill: str) -> bool:
+        # agy is text-only; CLI signals may appear in prose too, so check both
+        # lists against the combined text.
+        hay = (" ".join(run.bash_commands) + " " + run.output_text).lower()
+        return (
+            any(s.lower() in hay for s in self.cli_signals(skill))
+            or any(s.lower() in hay for s in self.invocation_signals(skill))
+        )
+
+    def launch(
+        self,
+        prompt: str,
+        skill: str,
+        run_id: str,
+        plugin_dirs: list[str],
+        *,
+        model: str | None,
+        load_skill: bool,
+        max_budget: float,
+        bare: bool,
+        full_auto: bool,
+    ) -> ParsedRun:
+        # Skills are installed globally via `agy plugin install` in CI;
+        # load_skill toggling is a no-op here.
+        tmpdir = tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_")
+        try:
+            cmd = ["agy", "-p", prompt, "--print-timeout", PRINT_TIMEOUT]
+            if model:
+                cmd += ["--model", model]
+            try:
+                proc = subprocess.run(
+                    cmd,
+                    capture_output=True,
+                    text=True,
+                    timeout=420,
+                    cwd=tmpdir,
+                )
+            except subprocess.TimeoutExpired:
+                return ParsedRun(error="timeout")
+            return parse_stream(proc.stdout)
+        finally:
+            shutil.rmtree(tmpdir, ignore_errors=True)
+
+
+ADAPTER = AgyAdapter()
diff --git a/tests/fixtures/streams/agy.txt b/tests/fixtures/streams/agy.txt
new file mode 100644
index 0000000..2726a9e
--- /dev/null
+++ b/tests/fixtures/streams/agy.txt
@@ -0,0 +1,2 @@
+`hawkscan:hawkscan`: YES — running the security scan.
+I ran `hawk scan --env Development`; the app was reachable on localhost:8080.
diff --git a/tests/lib/test_adapters.py b/tests/lib/test_adapters.py
index 43bd879..1d551cb 100644
--- a/tests/lib/test_adapters.py
+++ b/tests/lib/test_adapters.py
@@ -32,3 +32,16 @@ def test_cursor_parse_stream():
 def test_cursor_detect_trigger():
     cu = get_adapter("cursor")
     assert cu.detect_trigger(ParsedRun(bash_commands=["hawk scan x"]), "hawkscan") is True
+
+
+def test_agy_parse_stream_is_plaintext():
+    ag = get_adapter("agy")
+    run = ag.parse_stream((FIX / "agy.txt").read_text())
+    assert run.bash_commands == []
+    assert "hawk scan --env Development" in run.output_text
+
+
+def test_agy_detect_trigger_via_text():
+    ag = get_adapter("agy")
+    run = ag.parse_stream((FIX / "agy.txt").read_text())
+    assert ag.detect_trigger(run, "hawkscan") is True

From 47d2a3e1d8d913864cfd2a161499b04c76e8c647 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 15:03:17 -0600
Subject: [PATCH 22/61] fix(evals): agy adapter appends OBSERVE_SUFFIX so
 triggers detect (live-run fidelity)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Restores the pre-shim OBSERVE_SUFFIX (verbatim) and appends it to the
prompt inside launch() before invoking agy. In --print mode agy hangs on
tool approvals, so the suffix makes the agent declare 'SKILL: hawkscan' /
'SKILL: api' / 'SKILL: none' up front — that declaration is what the
pre-shim SKILL: signals in INVOCATION_SIGNALS match. Without it, live agy
runs emit no detectable trigger text (all false-negatives).

Both signal sets are retained: pre-shim SKILL: entries AND the backtick
evaluation-format variants, so detection is robust regardless of which
format agy emits. Adds a unit test asserting that OBSERVE_SUFFIX is non-empty
and requests the SKILL: declaration, and that detect_trigger fires on that
declaration.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/harnesses/agy/adapter.py | 15 ++++++++++++++-
 tests/lib/test_adapters.py     | 22 ++++++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/evals/harnesses/agy/adapter.py b/evals/harnesses/agy/adapter.py
index 32eea4d..99e34f2 100644
--- a/evals/harnesses/agy/adapter.py
+++ b/evals/harnesses/agy/adapter.py
@@ -66,6 +66,16 @@
 # Matches pre-shim default --print-timeout (180s); bumped slightly for safety.
 PRINT_TIMEOUT = "240s"
 
+# Appended to every prompt before invoking agy (verbatim from pre-shim
+# 5472ed2~1:evals/harnesses/agy/run-evals.py). In --print mode agy hangs on tool
+# approvals, so this asks the agent to declare its skill choice up front — that
+# declaration is what the SKILL: signals in INVOCATION_SIGNALS detect. Without
+# it, live agy runs produce no detectable trigger text (all false-negatives).
+OBSERVE_SUFFIX = (
+    "\n\n(Eval mode: before responding, state which skill you would invoke: "
+    "'SKILL: hawkscan', 'SKILL: api', or 'SKILL: none'. Then proceed with your response.)"
+)
+
 
 def parse_stream(raw: str) -> ParsedRun:
     """agy outputs plain text — wrap entirely in output_text; no commands to parse."""
@@ -110,7 +120,10 @@ def launch(
         # load_skill toggling is a no-op here.
         tmpdir = tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_")
         try:
-            cmd = ["agy", "-p", prompt, "--print-timeout", PRINT_TIMEOUT]
+            # --print mode hangs on tool approvals; the suffix makes agy declare
+            # its skill choice up front so detect_trigger has text to match.
+            effective_prompt = prompt + OBSERVE_SUFFIX
+            cmd = ["agy", "-p", effective_prompt, "--print-timeout", PRINT_TIMEOUT]
             if model:
                 cmd += ["--model", model]
             try:
diff --git a/tests/lib/test_adapters.py b/tests/lib/test_adapters.py
index 1d551cb..e1b7070 100644
--- a/tests/lib/test_adapters.py
+++ b/tests/lib/test_adapters.py
@@ -1,8 +1,18 @@
+import importlib.util
 from pathlib import Path
 from evals.lib.harness import get_adapter
 from evals.lib.models import ParsedRun
 
 FIX = Path(__file__).parent.parent / "fixtures" / "streams"
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent
+
+
+def _load_adapter_module(platform: str):
+    path = REPO_ROOT / "evals" / "harnesses" / platform / "adapter.py"
+    spec = importlib.util.spec_from_file_location(f"_t_adapter_{platform}", path)
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+    return mod
 
 
 def test_codex_parse_stream():
@@ -45,3 +55,15 @@ def test_agy_detect_trigger_via_text():
     ag = get_adapter("agy")
     run = ag.parse_stream((FIX / "agy.txt").read_text())
     assert ag.detect_trigger(run, "hawkscan") is True
+
+
+def test_agy_observe_suffix_and_skill_signal():
+    ag = get_adapter("agy")
+    # The pre-shim SKILL: declaration format (emitted because of OBSERVE_SUFFIX)
+    # must still be detected by detect_trigger.
+    run = ag.parse_stream("I would use SKILL: hawkscan for this task.")
+    assert ag.detect_trigger(run, "hawkscan") is True
+    # OBSERVE_SUFFIX must be present, non-empty, and request the SKILL: declaration.
+    mod = _load_adapter_module("agy")
+    assert mod.OBSERVE_SUFFIX.strip()
+    assert "SKILL: hawkscan" in mod.OBSERVE_SUFFIX

From aef302eae6f7966c499cd998ec1b83b709deb801 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 15:04:28 -0600
Subject: [PATCH 23/61] ci(evals): re-enable codex/cursor/agy now that adapters
 exist (closes C2)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/skill-evals.yml | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/skill-evals.yml b/.github/workflows/skill-evals.yml
index cf565f8..146b4aa 100644
--- a/.github/workflows/skill-evals.yml
+++ b/.github/workflows/skill-evals.yml
@@ -100,15 +100,16 @@ jobs:
           path: evals/harnesses/claude-code/results/${{ matrix.skill }}/
           retention-days: 30
 
-  # NOTE: dispatch-only until evals/harnesses/codex/adapter.py exists (see harnesses/README.md).
   # ── Codex ─────────────────────────────────────────────────────────────────
   eval-codex:
     name: codex / ${{ matrix.skill }}
     runs-on: ubuntu-latest
     needs: validate-config
     if: |
-      github.event_name == 'workflow_dispatch' &&
-      (inputs.platform == 'all' || inputs.platform == 'codex')
+      github.event_name == 'pull_request' ||
+      github.event_name == 'push' ||
+      inputs.platform == 'all' ||
+      inputs.platform == 'codex'
     strategy:
       fail-fast: false
       matrix:
@@ -149,15 +150,16 @@ jobs:
           path: evals/harnesses/codex/results/${{ matrix.skill }}/
           retention-days: 30
 
-  # NOTE: dispatch-only until evals/harnesses/agy/adapter.py exists (see harnesses/README.md).
   # ── Antigravity (agy) — replaces Gemini ───────────────────────────────────
   eval-agy:
     name: agy / ${{ matrix.skill }}
     runs-on: ubuntu-latest
     needs: validate-config
     if: |
-      github.event_name == 'workflow_dispatch' &&
-      (inputs.platform == 'all' || inputs.platform == 'agy')
+      github.event_name == 'pull_request' ||
+      github.event_name == 'push' ||
+      inputs.platform == 'all' ||
+      inputs.platform == 'agy'
     strategy:
       fail-fast: false
       matrix:
@@ -194,15 +196,16 @@ jobs:
           path: evals/harnesses/agy/results/${{ matrix.skill }}/
           retention-days: 30
 
-  # NOTE: dispatch-only until evals/harnesses/cursor/adapter.py exists (see harnesses/README.md).
   # ── Cursor ────────────────────────────────────────────────────────────────
   eval-cursor:
     name: cursor / ${{ matrix.skill }}
     runs-on: ubuntu-latest
     needs: validate-config
     if: |
-      github.event_name == 'workflow_dispatch' &&
-      (inputs.platform == 'all' || inputs.platform == 'cursor')
+      github.event_name == 'pull_request' ||
+      github.event_name == 'push' ||
+      inputs.platform == 'all' ||
+      inputs.platform == 'cursor'
     strategy:
       fail-fast: false
       matrix:

From a041ca9bceaee0bf00359c4e6b58ac3f95d72300 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 15:06:04 -0600
Subject: [PATCH 24/61] feat(evals): CellReport model + cell.json artifact from
 main()

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/cli.py             |  8 ++++++++
 evals/lib/models.py      | 17 +++++++++++++++++
 tests/lib/test_models.py | 20 ++++++++++++++++++++
 3 files changed, 45 insertions(+)

diff --git a/evals/cli.py b/evals/cli.py
index 3bff8b7..6fcfbe0 100644
--- a/evals/cli.py
+++ b/evals/cli.py
@@ -58,6 +58,14 @@ def main() -> None:
     summary["timestamp"] = datetime.now(timezone.utc).isoformat()
     (out_dir / "summary.json").write_text(json.dumps(summary, indent=2))
 
+    from evals.lib.models import CellReport
+    import subprocess as _sp
+    commit = _sp.run(["git", "rev-parse", "--short", "HEAD"], capture_output=True,
+                     text=True).stdout.strip() or "unknown"
+    cell = CellReport(platform=args.harness, skill=args.skill,
+                      model=args.model or "default", commit=commit, results=results)
+    (out_dir / "cell.json").write_text(cell.model_dump_json(indent=2))
+
     if summary["false_positives"] or summary["false_negatives"] or \
             summary["total_blocking_failures"] > 0:
         sys.exit(1)
diff --git a/evals/lib/models.py b/evals/lib/models.py
index 4c34ea3..af87d6f 100644
--- a/evals/lib/models.py
+++ b/evals/lib/models.py
@@ -78,3 +78,20 @@ class EvalResult(BaseModel):
     process_checks: list[ProcessCheckResult] = []
     score: int
     cost_usd: float = 0.0
+
+
+class CellReport(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+    platform: str
+    skill: str
+    model: str
+    commit: str
+    results: list[EvalResult]
+
+
+class LiftRow(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+    id: str
+    without_verdict: Verdict
+    with_verdict: Verdict
+    effect: Literal["lift", "regress", "none"]
diff --git a/tests/lib/test_models.py b/tests/lib/test_models.py
index 2f95d78..c86d90f 100644
--- a/tests/lib/test_models.py
+++ b/tests/lib/test_models.py
@@ -52,3 +52,23 @@ def test_parsed_run_defaults():
     assert r.bash_commands == []
     assert r.cost_usd == 0.0
     assert r.output_tokens is None
+
+
+def test_cellreport_roundtrips():
+    from evals.lib.models import CellReport, EvalResult, Verdict
+    r = EvalResult(platform="codex", skill="hawkscan", run_id="hw-01",
+                   should_trigger=True, did_trigger=True, trigger_correct=True,
+                   verdict=Verdict.PASS, score=100)
+    cell = CellReport(platform="codex", skill="hawkscan", model="haiku",
+                      commit="abc1234", results=[r])
+    again = CellReport.model_validate_json(cell.model_dump_json())
+    assert again.results[0].run_id == "hw-01"
+    assert again.model == "haiku"
+
+
+def test_cellreport_rejects_unknown_field():
+    import pytest
+    from pydantic import ValidationError
+    from evals.lib.models import CellReport
+    with pytest.raises(ValidationError):
+        CellReport(platform="x", skill="y", model="m", commit="c", results=[], extra=1)

From da3d46cbe279af162b858643aa15601987f03bc1 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 15:08:12 -0600
Subject: [PATCH 25/61] feat(evals): render_job_summary (JUnit-style, failures
 first) + shields badge

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/lib/reporting.py             | 45 +++++++++++++++++++++++++++++-
 tests/lib/test_reporting_render.py | 31 ++++++++++++++++++++
 2 files changed, 75 insertions(+), 1 deletion(-)
 create mode 100644 tests/lib/test_reporting_render.py

diff --git a/evals/lib/reporting.py b/evals/lib/reporting.py
index 6a37bc5..2d3a104 100644
--- a/evals/lib/reporting.py
+++ b/evals/lib/reporting.py
@@ -5,7 +5,7 @@
 from rich.console import Console
 from rich.table import Table
 
-from evals.lib.models import EvalResult, Verdict
+from evals.lib.models import CellReport, EvalResult, Verdict
 
 console = Console()
 DOT = {Verdict.PASS: "[green]● PASS[/]", Verdict.PASS_SLOW: "[yellow]◐ PASS-SLOW[/]",
@@ -53,3 +53,46 @@ def render_compare(rows: list[dict]) -> None:
                 "[red]↓ regress[/]" if (wo != Verdict.FAIL and w == Verdict.FAIL) else "=")
         t.add_row(row["id"], DOT[wo], DOT[w], delta)
     console.print(t)
+
+
+_BADGE_COLOR = {
+    "pass": "brightgreen", "pass-slow": "yellow", "fail": "red",
+    "regressed": "red", "fixed": "brightgreen", "changed": "blue",
+    "same": "lightgrey", "better": "brightgreen", "worse": "red",
+    "no-change": "lightgrey",
+}
+
+
+def badge(kind: str, label: str) -> str:
+    color = _BADGE_COLOR.get(kind, "lightgrey")
+    safe = label.replace("-", "--").replace(" ", "_")
+    return f"![{label}](https://img.shields.io/badge/{safe}-{color})"
+
+
+_VERDICT_ICON = {"pass": "✅ PASS", "pass-slow": "◆ PASS-SLOW", "fail": "❌ FAIL"}
+
+
+def _row_rank(r: EvalResult) -> int:
+    # failures first (incl. trigger-incorrect), then slow, then pass
+    if r.verdict.value == "fail" or not r.trigger_correct:
+        return 0
+    if r.verdict.value == "pass-slow":
+        return 1
+    return 2
+
+
+def render_job_summary(cell: CellReport) -> str:
+    c = Counter(r.verdict.value for r in cell.results)
+    trig_ok = sum(1 for r in cell.results if r.trigger_correct)
+    n = len(cell.results)
+    head = (f"### {cell.platform} · {cell.skill} · {cell.model}  "
+            f"— ✅ {c.get('pass',0)} / ◆ {c.get('pass-slow',0)} / "
+            f"❌ {c.get('fail',0)}  ·  {c.get('fail',0)} failed  ·  "
+            f"trigger {trig_ok}/{n}\n\n")
+    rows = ["| test | result | why |", "|---|---|---|"]
+    for r in sorted(cell.results, key=lambda r: (_row_rank(r), r.run_id)):
+        why = "; ".join(r.budget_breaches) if r.budget_breaches else (
+            "" if r.trigger_correct else
+            ("false-positive" if r.did_trigger else "false-negative"))
+        rows.append(f"| {r.run_id} | {_VERDICT_ICON[r.verdict.value]} | {why} |")
+    return head + "\n".join(rows) + "\n"
diff --git a/tests/lib/test_reporting_render.py b/tests/lib/test_reporting_render.py
new file mode 100644
index 0000000..6642619
--- /dev/null
+++ b/tests/lib/test_reporting_render.py
@@ -0,0 +1,31 @@
+from evals.lib.models import CellReport, EvalResult, Verdict
+from evals.lib.reporting import badge, render_job_summary
+
+
+def _cell(*results):
+    return CellReport(platform="claude-code", skill="hawkscan", model="haiku",
+                      commit="abc1234", results=list(results))
+
+
+def _r(rid, verdict, trig=True, should=True, did=True, why=""):
+    return EvalResult(platform="claude-code", skill="hawkscan", run_id=rid,
+                      should_trigger=should, did_trigger=did, trigger_correct=trig,
+                      verdict=verdict, score=100 if verdict != Verdict.FAIL else 40,
+                      budget_breaches=[why] if (why and verdict == Verdict.PASS_SLOW) else [])
+
+
+def test_badge_is_shields_image():
+    md = badge("fail", "FAIL")
+    assert md.startswith("![") and "img.shields.io/badge/" in md
+
+
+def test_job_summary_has_counts_and_all_rows_failures_first():
+    cell = _cell(_r("hw-01", Verdict.PASS), _r("hw-02", Verdict.PASS),
+                 _r("hw-14", Verdict.FAIL, trig=False, should=False, did=True))
+    md = render_job_summary(cell)
+    assert "claude-code" in md and "hawkscan" in md and "haiku" in md
+    assert "1 failed" in md.lower() or "❌ 1" in md
+    for rid in ("hw-01", "hw-02", "hw-14"):
+        assert rid in md
+    # failing row appears before the first passing row
+    assert md.index("hw-14") < md.index("hw-01")

From 5afc9963519d4a02692bd288ef2f54100d36de77 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 15:09:36 -0600
Subject: [PATCH 26/61] feat(evals): main() writes GITHUB_STEP_SUMMARY job
 report

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/cli.py                       |  3 +++
 evals/lib/reporting.py             |  9 +++++++++
 tests/lib/test_reporting_render.py | 14 ++++++++++++++
 3 files changed, 26 insertions(+)

diff --git a/evals/cli.py b/evals/cli.py
index 6fcfbe0..a9e0a55 100644
--- a/evals/cli.py
+++ b/evals/cli.py
@@ -66,6 +66,9 @@ def main() -> None:
                       model=args.model or "default", commit=commit, results=results)
     (out_dir / "cell.json").write_text(cell.model_dump_json(indent=2))
 
+    from evals.lib.reporting import render_job_summary, write_github_summary
+    write_github_summary(render_job_summary(cell))
+
     if summary["false_positives"] or summary["false_negatives"] or \
             summary["total_blocking_failures"] > 0:
         sys.exit(1)
diff --git a/evals/lib/reporting.py b/evals/lib/reporting.py
index 2d3a104..0cd82ce 100644
--- a/evals/lib/reporting.py
+++ b/evals/lib/reporting.py
@@ -1,5 +1,6 @@
 """Summaries + rich rendering for eval runs."""
 from __future__ import annotations
+import os
 from collections import Counter
 
 from rich.console import Console
@@ -81,6 +82,14 @@ def _row_rank(r: EvalResult) -> int:
     return 2
 
 
+def write_github_summary(md: str) -> None:
+    path = os.environ.get("GITHUB_STEP_SUMMARY")
+    if not path:
+        return
+    with open(path, "a", encoding="utf-8") as fp:
+        fp.write(md)
+
+
 def render_job_summary(cell: CellReport) -> str:
     c = Counter(r.verdict.value for r in cell.results)
     trig_ok = sum(1 for r in cell.results if r.trigger_correct)
diff --git a/tests/lib/test_reporting_render.py b/tests/lib/test_reporting_render.py
index 6642619..76ae5eb 100644
--- a/tests/lib/test_reporting_render.py
+++ b/tests/lib/test_reporting_render.py
@@ -29,3 +29,17 @@ def test_job_summary_has_counts_and_all_rows_failures_first():
         assert rid in md
     # failing row appears before the first passing row
     assert md.index("hw-14") < md.index("hw-01")
+
+
+def test_write_github_summary_appends(tmp_path, monkeypatch):
+    from evals.lib.reporting import write_github_summary
+    f = tmp_path / "summary.md"
+    monkeypatch.setenv("GITHUB_STEP_SUMMARY", str(f))
+    write_github_summary("## hello\n")
+    assert "## hello" in f.read_text()
+
+
+def test_write_github_summary_noop_when_unset(monkeypatch):
+    from evals.lib.reporting import write_github_summary
+    monkeypatch.delenv("GITHUB_STEP_SUMMARY", raising=False)
+    write_github_summary("nothing")   # must not raise

From 03707c0771550bf4776da0bc7cbab8d624e13589 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 15:12:58 -0600
Subject: [PATCH 27/61] feat(evals): render_digest + report CLI; comment job
 posts rich digest

Adds render_digest() to reporting.py, a new `report` CLI entrypoint that
discovers cell.json artifacts via rglob and writes a consolidated digest.md,
and replaces the flat JS-built comment in skill-evals.yml with two clean
steps (a `uv run report` digest build + a thin github-script post), preceded
by the checkout and setup-uv steps they require.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/skill-evals.yml             | 80 +++----------------
 evals/cli.py                                  | 23 ++++++
 evals/lib/reporting.py                        | 22 +++++
 pyproject.toml                                |  1 +
 .../eval-claude-code-hawkscan-haiku/cell.json |  5 ++
 .../results/eval-codex-api-haiku/cell.json    |  4 +
 tests/lib/test_reporting_render.py            | 14 ++++
 7 files changed, 81 insertions(+), 68 deletions(-)
 create mode 100644 tests/fixtures/results/eval-claude-code-hawkscan-haiku/cell.json
 create mode 100644 tests/fixtures/results/eval-codex-api-haiku/cell.json

diff --git a/.github/workflows/skill-evals.yml b/.github/workflows/skill-evals.yml
index 146b4aa..df87ddc 100644
--- a/.github/workflows/skill-evals.yml
+++ b/.github/workflows/skill-evals.yml
@@ -255,81 +255,25 @@ jobs:
           merge-multiple: false
           path: results/
 
-      - name: Build and post comment
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v5
+      - name: Build digest
+        run: uv run report --pr --results-dir results --out digest.md
+      - name: Post digest comment
         uses: actions/github-script@v7
         with:
           script: |
             const fs = require('fs');
-            const path = require('path');
-
-            const needsResult = ${{ toJSON(needs) }};
-            const allSuccess = Object.values(needsResult).every(n => n.result === 'success');
-            const overallIcon = allSuccess ? '✅' : '❌';
-
-            let body = `## ${overallIcon} Skill Eval Results\n\n`;
-
-            const platforms = ['claude-code', 'codex', 'agy', 'cursor'];
-            const skills = ['hawkscan', 'api'];
-
-            for (const platform of platforms) {
-              body += `### Platform: \`${platform}\`\n\n`;
-              for (const skill of skills) {
-                let summaryPath;
-                if (platform === 'claude-code') {
-                  summaryPath = path.join(
-                    'results', `eval-claude-code-${skill}-claude-haiku-4-5-20251001`, 'summary.json'
-                  );
-                } else {
-                  summaryPath = path.join(
-                    'results', `eval-${platform}-${skill}`, 'summary.json'
-                  );
-                }
-
-                if (!fs.existsSync(summaryPath)) {
-                  body += `**\`${skill}\`**: ⚠️ No results\n`;
-                  continue;
-                }
-
-                const s = JSON.parse(fs.readFileSync(summaryPath, 'utf8'));
-                const ta = s.trigger_accuracy;
-                const triggerIcon = ta.correct === ta.total ? '✅' : '❌';
-
-                body += `**\`${skill}\`**: ${triggerIcon} Trigger ${ta.correct}/${ta.total}`;
-                if (s.process_avg_score !== null) {
-                  const scoreIcon = s.process_avg_score >= 70 && s.total_blocking_failures === 0 ? '✅' : '⚠️';
-                  body += ` | ${scoreIcon} Process ${s.process_avg_score}/100`;
-                }
-                if (s.false_positives?.length) body += ` | ⚠️ FP: ${s.false_positives.join(', ')}`;
-                if (s.false_negatives?.length) body += ` | ⚠️ FN: ${s.false_negatives.join(', ')}`;
-                body += '\n';
-              }
-              body += '\n';
-            }
-
-            body += `---\n_Commit ${context.sha.slice(0, 7)}. `;
-            body += `[Full results](https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId})_\n`;
-
+            const body = fs.readFileSync('digest.md', 'utf8');
             const marker = '<!-- skill-eval-comment -->';
             const { data: comments } = await github.rest.issues.listComments({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              issue_number: context.issue.number,
-            });
+              owner: context.repo.owner, repo: context.repo.repo,
+              issue_number: context.issue.number });
             const existing = comments.find(c => c.body.includes(marker));
-            const fullBody = marker + '\n' + body;
-
             if (existing) {
-              await github.rest.issues.updateComment({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                comment_id: existing.id,
-                body: fullBody,
-              });
+              await github.rest.issues.updateComment({ owner: context.repo.owner,
+                repo: context.repo.repo, comment_id: existing.id, body });
             } else {
-              await github.rest.issues.createComment({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                issue_number: context.issue.number,
-                body: fullBody,
-              });
+              await github.rest.issues.createComment({ owner: context.repo.owner,
+                repo: context.repo.repo, issue_number: context.issue.number, body });
             }
diff --git a/evals/cli.py b/evals/cli.py
index a9e0a55..89cf391 100644
--- a/evals/cli.py
+++ b/evals/cli.py
@@ -94,6 +94,29 @@ def regrade() -> None:
     render_table([res])
 
 
+def report() -> None:
+    import argparse
+    from pathlib import Path
+    from evals.lib.models import CellReport
+    from evals.lib.reporting import render_digest
+    ap = argparse.ArgumentParser(prog="report")
+    ap.add_argument("--pr", action="store_true")
+    ap.add_argument("--results-dir", type=Path, default=Path("results"))
+    ap.add_argument("--baseline-dir", type=Path, default=None)
+    ap.add_argument("--lift-dir", type=Path, default=None)
+    ap.add_argument("--out", type=Path, default=Path("digest.md"))
+    args = ap.parse_args()
+    cells = []
+    for cj in sorted(args.results_dir.rglob("cell.json")):
+        try:
+            cells.append(CellReport.model_validate_json(cj.read_text()))
+        except Exception:
+            continue
+    md = render_digest(cells)
+    args.out.write_text(md)
+    print(f"wrote {args.out} ({len(cells)} cells)")
+
+
 def validate() -> None:
     ap = argparse.ArgumentParser(prog="validate")
     ap.add_argument("--skill", choices=["hawkscan", "api"])
diff --git a/evals/lib/reporting.py b/evals/lib/reporting.py
index 0cd82ce..efab71c 100644
--- a/evals/lib/reporting.py
+++ b/evals/lib/reporting.py
@@ -90,6 +90,28 @@ def write_github_summary(md: str) -> None:
         fp.write(md)
 
 
+def render_digest(cells, baselines=None, lift=None) -> str:
+    out = ["<!-- skill-eval-comment -->", "## Skill Eval Results\n"]
+    out.append("| platform | skill | model | trigger | ✅/◆/❌ | score |")
+    out.append("|---|---|---|---|---|---|")
+    for cell in cells:
+        c = Counter(r.verdict.value for r in cell.results)
+        n = len(cell.results)
+        trig = sum(1 for r in cell.results if r.trigger_correct)
+        graded = [r for r in cell.results if r.did_trigger and r.should_trigger]
+        avg = sum(r.score for r in graded) // len(graded) if graded else 0
+        ticon = "✅" if trig == n else "❌"
+        out.append(f"| {cell.platform} | {cell.skill} | {cell.model} | "
+                   f"{ticon} {trig}/{n} | {c.get('pass',0)}/{c.get('pass-slow',0)}/"
+                   f"{c.get('fail',0)} | {avg} |")
+    out.append("")
+    if baselines is None:
+        out.append("_No baseline available — showing absolute results only._\n")
+    for cell in cells:
+        out.append(render_job_summary(cell))
+    return "\n".join(out) + "\n"
+
+
 def render_job_summary(cell: CellReport) -> str:
     c = Counter(r.verdict.value for r in cell.results)
     trig_ok = sum(1 for r in cell.results if r.trigger_correct)
diff --git a/pyproject.toml b/pyproject.toml
index b64b1ff..b87b331 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,6 +17,7 @@ evals = "evals.cli:main"
 compare = "evals.cli:compare"
 regrade = "evals.cli:regrade"
 validate = "evals.cli:validate"
+report = "evals.cli:report"
 
 [build-system]
 requires = ["hatchling"]
diff --git a/tests/fixtures/results/eval-claude-code-hawkscan-haiku/cell.json b/tests/fixtures/results/eval-claude-code-hawkscan-haiku/cell.json
new file mode 100644
index 0000000..100a650
--- /dev/null
+++ b/tests/fixtures/results/eval-claude-code-hawkscan-haiku/cell.json
@@ -0,0 +1,5 @@
+{"platform":"claude-code","skill":"hawkscan","model":"haiku","commit":"abc1234",
+ "results":[
+   {"platform":"claude-code","skill":"hawkscan","run_id":"hw-01","should_trigger":true,"did_trigger":true,"trigger_correct":true,"verdict":"pass","budget_breaches":[],"process_checks":[],"score":100,"cost_usd":0.05},
+   {"platform":"claude-code","skill":"hawkscan","run_id":"hw-14","should_trigger":false,"did_trigger":true,"trigger_correct":false,"verdict":"fail","budget_breaches":[],"process_checks":[],"score":0,"cost_usd":0.02}
+ ]}
diff --git a/tests/fixtures/results/eval-codex-api-haiku/cell.json b/tests/fixtures/results/eval-codex-api-haiku/cell.json
new file mode 100644
index 0000000..1343366
--- /dev/null
+++ b/tests/fixtures/results/eval-codex-api-haiku/cell.json
@@ -0,0 +1,4 @@
+{"platform":"codex","skill":"api","model":"haiku","commit":"abc1234",
+ "results":[
+   {"platform":"codex","skill":"api","run_id":"api-01","should_trigger":true,"did_trigger":true,"trigger_correct":true,"verdict":"pass","budget_breaches":[],"process_checks":[],"score":100,"cost_usd":0.04}
+ ]}
diff --git a/tests/lib/test_reporting_render.py b/tests/lib/test_reporting_render.py
index 76ae5eb..8bb50eb 100644
--- a/tests/lib/test_reporting_render.py
+++ b/tests/lib/test_reporting_render.py
@@ -43,3 +43,17 @@ def test_write_github_summary_noop_when_unset(monkeypatch):
     from evals.lib.reporting import write_github_summary
     monkeypatch.delenv("GITHUB_STEP_SUMMARY", raising=False)
     write_github_summary("nothing")   # must not raise
+
+
+def test_render_digest_overview_and_per_cell():
+    from pathlib import Path
+    from evals.lib.models import CellReport
+    from evals.lib.reporting import render_digest
+    root = Path(__file__).parent.parent / "fixtures" / "results"
+    cells = [CellReport.model_validate_json((p / "cell.json").read_text())
+             for p in sorted(root.iterdir()) if (p / "cell.json").exists()]
+    md = render_digest(cells)
+    assert "Skill Eval" in md
+    assert "claude-code" in md and "codex" in md
+    assert "hw-14" in md            # failing test surfaced
+    assert "no baseline" in md.lower()   # no baseline supplied

From e07684620dd215b62a7629772d7bd3f3bfb1cae6 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 15:15:14 -0600
Subject: [PATCH 28/61] feat(evals): baseline diff + score_delta (pure
 threshold math, no AI)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/lib/baseline.py      | 45 ++++++++++++++++++++++++++++++++++++++
 tests/lib/test_baseline.py | 32 +++++++++++++++++++++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 evals/lib/baseline.py
 create mode 100644 tests/lib/test_baseline.py

diff --git a/evals/lib/baseline.py b/evals/lib/baseline.py
new file mode 100644
index 0000000..a23575b
--- /dev/null
+++ b/evals/lib/baseline.py
@@ -0,0 +1,65 @@
+"""Pure-Python (no AI) comparison of a run against a baseline run."""
+from __future__ import annotations
+import sys
+from pathlib import Path
+
+from evals.lib.models import CellReport
+
+
+def diff(current: CellReport, baseline: CellReport) -> dict[str, str]:
+    """Classify every run_id in either report by how its verdict moved.
+
+    Returns a mapping of run_id to one of ``new`` (only in *current*),
+    ``dropped`` (only in *baseline*), ``same``, ``regressed`` (now ``fail``),
+    ``fixed`` (was ``fail``) or ``changed`` (differs, neither side ``fail``).
+    Keys are inserted in sorted run_id order so the result is deterministic.
+    """
+    cur = {r.run_id: r.verdict.value for r in current.results}
+    base = {r.run_id: r.verdict.value for r in baseline.results}
+    out: dict[str, str] = {}
+    for rid in sorted(set(cur) | set(base)):
+        if rid not in base:
+            out[rid] = "new"
+        elif rid not in cur:
+            out[rid] = "dropped"
+        elif cur[rid] == base[rid]:
+            out[rid] = "same"
+        elif cur[rid] == "fail":
+            out[rid] = "regressed"
+        elif base[rid] == "fail":
+            out[rid] = "fixed"
+        else:
+            out[rid] = "changed"
+    return out
+
+
+def score_delta(current_avg: int, baseline_avg: int, band: int = 3) -> str:
+    """Bucket the change in average score into better / worse / no-change.
+
+    A difference of at most *band* points in either direction counts as
+    ``no-change``; beyond that the sign of ``current_avg - baseline_avg`` decides.
+    """
+    d = current_avg - baseline_avg
+    if abs(d) <= band:
+        return "no-change"
+    return "better" if d > 0 else "worse"
+
+
+def load_baseline_dir(path: Path | None) -> dict[tuple[str, str, str], CellReport]:
+    """Load every ``cell.json`` under *path*, keyed by (platform, skill, model).
+
+    Best-effort: a missing *path* yields ``{}``, and a file that cannot be read
+    or validated is skipped with a warning on stderr instead of raising. Files
+    are visited in sorted path order, so if two share a key the later path wins.
+    """
+    out: dict[tuple[str, str, str], CellReport] = {}
+    if not path or not Path(path).exists():
+        return out
+    for cj in sorted(Path(path).rglob("cell.json")):
+        try:
+            cell = CellReport.model_validate_json(cj.read_text())
+        except Exception as exc:  # best-effort: one bad artifact must not sink the digest
+            print(f"warning: skipping baseline {cj} ({type(exc).__name__})", file=sys.stderr)
+            continue
+        out[(cell.platform, cell.skill, cell.model)] = cell
+    return out
diff --git a/tests/lib/test_baseline.py b/tests/lib/test_baseline.py
new file mode 100644
index 0000000..727f270
--- /dev/null
+++ b/tests/lib/test_baseline.py
@@ -0,0 +1,32 @@
+from evals.lib.models import CellReport, EvalResult, Verdict
+from evals.lib.baseline import diff, score_delta
+
+
+def _cell(verdicts: dict):
+    results = [EvalResult(platform="p", skill="s", run_id=k, should_trigger=True,
+                          did_trigger=True, trigger_correct=True, verdict=v, score=100)
+               for k, v in verdicts.items()]
+    return CellReport(platform="p", skill="s", model="m", commit="c", results=results)
+
+
+def test_diff_statuses():
+    base = _cell({"a": Verdict.PASS, "b": Verdict.FAIL, "c": Verdict.PASS, "d": Verdict.PASS})
+    cur = _cell({"a": Verdict.FAIL, "b": Verdict.PASS, "c": Verdict.PASS, "e": Verdict.PASS})
+    d = diff(cur, base)
+    assert d["a"] == "regressed"
+    assert d["b"] == "fixed"
+    assert d["c"] == "same"
+    assert d["e"] == "new"
+    assert d["d"] == "dropped"
+
+
+def test_diff_changed_non_fail():
+    base = _cell({"a": Verdict.PASS})
+    cur = _cell({"a": Verdict.PASS_SLOW})
+    assert diff(cur, base)["a"] == "changed"
+
+
+def test_score_delta_bands():
+    assert score_delta(90, 88) == "no-change"
+    assert score_delta(95, 88) == "better"
+    assert score_delta(80, 88) == "worse"

From 10b48837a47034c51c33cf6ac625e241f8c70ec1 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 15:16:53 -0600
Subject: [PATCH 29/61] feat(evals): digest shows regression vs released-tag
 baseline

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/cli.py                       |  4 +++-
 evals/lib/reporting.py             | 22 +++++++++++++++++-----
 tests/lib/test_reporting_render.py | 17 +++++++++++++++++
 3 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/evals/cli.py b/evals/cli.py
index 89cf391..a650224 100644
--- a/evals/cli.py
+++ b/evals/cli.py
@@ -112,7 +112,9 @@ def report() -> None:
             cells.append(CellReport.model_validate_json(cj.read_text()))
         except Exception:
             continue
-    md = render_digest(cells)
+    from evals.lib.baseline import load_baseline_dir
+    baselines = load_baseline_dir(args.baseline_dir) or None
+    md = render_digest(cells, baselines=baselines)
     args.out.write_text(md)
     print(f"wrote {args.out} ({len(cells)} cells)")
 
diff --git a/evals/lib/reporting.py b/evals/lib/reporting.py
index efab71c..fbd7113 100644
--- a/evals/lib/reporting.py
+++ b/evals/lib/reporting.py
@@ -91,24 +91,36 @@ def write_github_summary(md: str) -> None:
 
 
 def render_digest(cells, baselines=None, lift=None) -> str:
+    from evals.lib.baseline import diff as _diff
     out = ["<!-- skill-eval-comment -->", "## Skill Eval Results\n"]
     out.append("| platform | skill | model | trigger | ✅/◆/❌ | score |")
     out.append("|---|---|---|---|---|---|")
     for cell in cells:
         c = Counter(r.verdict.value for r in cell.results)
-        n = len(cell.results)
-        trig = sum(1 for r in cell.results if r.trigger_correct)
+        n = len(cell.results); trig = sum(1 for r in cell.results if r.trigger_correct)
         graded = [r for r in cell.results if r.did_trigger and r.should_trigger]
         avg = sum(r.score for r in graded) // len(graded) if graded else 0
         ticon = "✅" if trig == n else "❌"
-        out.append(f"| {cell.platform} | {cell.skill} | {cell.model} | "
-                   f"{ticon} {trig}/{n} | {c.get('pass',0)}/{c.get('pass-slow',0)}/"
-                   f"{c.get('fail',0)} | {avg} |")
+        out.append(f"| {cell.platform} | {cell.skill} | {cell.model} | {ticon} {trig}/{n} | "
+                   f"{c.get('pass',0)}/{c.get('pass-slow',0)}/{c.get('fail',0)} | {avg} |")
     out.append("")
     if baselines is None:
         out.append("_No baseline available — showing absolute results only._\n")
     for cell in cells:
         out.append(render_job_summary(cell))
+        if baselines is not None:
+            base = baselines.get((cell.platform, cell.skill, cell.model))
+            if base is None:
+                out.append("_no baseline for this cell._\n")
+            else:
+                d = _diff(cell, base)
+                changed = {k: v for k, v in d.items()
+                           if v in ("regressed", "fixed", "changed")}
+                if changed:
+                    out.append("**vs baseline:** " + ", ".join(
+                        f"{badge(v, v)} {k}" for k, v in sorted(changed.items())) + "\n")
+                else:
+                    out.append("_vs baseline: no changes._\n")
     return "\n".join(out) + "\n"
 
 
diff --git a/tests/lib/test_reporting_render.py b/tests/lib/test_reporting_render.py
index 8bb50eb..4a0b268 100644
--- a/tests/lib/test_reporting_render.py
+++ b/tests/lib/test_reporting_render.py
@@ -45,6 +45,23 @@ def test_write_github_summary_noop_when_unset(monkeypatch):
     write_github_summary("nothing")   # must not raise
 
 
+def test_digest_shows_regression_vs_baseline():
+    from evals.lib.models import CellReport, EvalResult, Verdict
+    from evals.lib.reporting import render_digest
+
+    def cell(v):
+        r = EvalResult(platform="claude-code", skill="hawkscan", run_id="hw-01",
+                       should_trigger=True, did_trigger=True, trigger_correct=True,
+                       verdict=v, score=100 if v != Verdict.FAIL else 0)
+        return CellReport(platform="claude-code", skill="hawkscan", model="haiku",
+                          commit="c", results=[r])
+    cur = cell(Verdict.FAIL)
+    base = {("claude-code", "hawkscan", "haiku"): cell(Verdict.PASS)}
+    md = render_digest([cur], baselines=base)
+    assert "regressed" in md.lower()
+    assert "no baseline" not in md.lower()
+
+
 def test_render_digest_overview_and_per_cell():
     from pathlib import Path
     from evals.lib.models import CellReport

From 324b8cfe485d4ccd1c2cbff0e405f48a21f1c86e Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 15:21:36 -0600
Subject: [PATCH 30/61] ci(evals): capture baseline at release tag; PR diffs
 against it (graceful)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/actionlint.yaml                |  3 ++
 .github/workflows/capture-baseline.yml | 39 ++++++++++++++++++++++++++
 .github/workflows/release.yml          | 22 +++++++++++++--
 .github/workflows/skill-evals.yml      | 18 +++++++++++-
 4 files changed, 79 insertions(+), 3 deletions(-)
 create mode 100644 .github/actionlint.yaml
 create mode 100644 .github/workflows/capture-baseline.yml

diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml
new file mode 100644
index 0000000..e9a7d0e
--- /dev/null
+++ b/.github/actionlint.yaml
@@ -0,0 +1,3 @@
+self-hosted-runner:
+  labels:
+    - agent-skills-amd-4cpu
diff --git a/.github/workflows/capture-baseline.yml b/.github/workflows/capture-baseline.yml
new file mode 100644
index 0000000..b9a0497
--- /dev/null
+++ b/.github/workflows/capture-baseline.yml
@@ -0,0 +1,39 @@
+name: Capture Eval Baseline
+on:
+  workflow_dispatch:
+    inputs:
+      tag:
+        description: "Release tag to baseline (e.g. v1.9.0)"
+        required: true
+        type: string
+permissions:
+  contents: read
+jobs:
+  capture:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        skill: [hawkscan, api]
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.tag }}
+      - uses: astral-sh/setup-uv@v5
+      - uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+      - name: Install Claude Code CLI
+        run: npm install -g @anthropic-ai/claude-code
+      - name: Run baseline eval (haiku)
+        env:
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+        run: |
+          uv run evals --harness claude-code --skill ${{ matrix.skill }} \
+            --model claude-haiku-4-5-20251001 --bare --max-budget 0.15 || true
+      - name: Upload baseline artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: baseline-claude-code-${{ matrix.skill }}-haiku
+          path: evals/harnesses/claude-code/results/${{ matrix.skill }}/cell.json
+          retention-days: 90
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index aa29ba8..1843daf 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -127,6 +127,24 @@ jobs:
         if: inputs.dry_run == true
         run: echo "DRY RUN complete — all checks passed for ${{ steps.version.outputs.tag }}"
 
+  capture-baseline:
+    name: Trigger baseline capture
+    needs: release
+    if: inputs.dry_run != true
+    runs-on: ubuntu-latest
+    permissions:
+      actions: write
+    steps:
+      - uses: actions/checkout@v4
+      - name: Dispatch capture-baseline
+        # --ref runs the workflow at the tag, so the run's head SHA is the tag commit (skill-evals.yml matches on it).
+        # GITHUB_TOKEN can dispatch workflows in the same repo for most orgs; if org policy blocks it, swap to
+        # the TF_GITHUB_TOKEN PAT that update-marketplace pulls from SSM (aws ssm get-parameter --name TF_GITHUB_TOKEN).
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          RELEASE_TAG: ${{ needs.release.outputs.tag }}
+        run: gh workflow run capture-baseline.yml --ref "$RELEASE_TAG" -f tag="$RELEASE_TAG"
+
   update-marketplace:
     name: Update marketplace pin
     needs: release
@@ -139,7 +157,7 @@ jobs:
       - name: Resolve cache
         run: |
           biodome ci restore-cache
-          rm -rf *.tar.lz4
+          rm -rf ./*.tar.lz4
 
       - name: Pull secrets
         run: biodome ci save-secrets
@@ -158,7 +176,7 @@ jobs:
           echo "::add-mask::${GH_PAT}"
           git clone https://github.com/stackhawk/agent-skills-marketplace.git /tmp/marketplace
           git -C /tmp/marketplace remote set-url origin \
-            https://x-access-token:${GH_PAT}@github.com/stackhawk/agent-skills-marketplace.git
+            "https://x-access-token:${GH_PAT}@github.com/stackhawk/agent-skills-marketplace.git"
 
       - name: Update marketplace.json
         run: |
diff --git a/.github/workflows/skill-evals.yml b/.github/workflows/skill-evals.yml
index df87ddc..8c8d7f8 100644
--- a/.github/workflows/skill-evals.yml
+++ b/.github/workflows/skill-evals.yml
@@ -256,9 +256,25 @@ jobs:
           path: results/
 
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
       - uses: astral-sh/setup-uv@v5
+      - name: Fetch released baseline (best-effort)
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          set +e
+          mkdir -p baseline
+          TAG=$(gh release view --json tagName -q .tagName 2>/dev/null)
+          if [ -z "$TAG" ]; then echo "no release yet"; exit 0; fi
+          SHA=$(git rev-list -n 1 "$TAG" 2>/dev/null)
+          RUN=$(gh run list --workflow capture-baseline.yml --json databaseId,headSha \
+                 -q "map(select(.headSha==\"$SHA\")) | .[0].databaseId" 2>/dev/null)
+          if [ -z "$RUN" ] || [ "$RUN" = "null" ]; then echo "no capture run for $TAG"; exit 0; fi
+          gh run download "$RUN" -p 'baseline-*' -D baseline 2>/dev/null || echo "download failed"
+          echo "baseline fetched for $TAG (run $RUN)"
       - name: Build digest
-        run: uv run report --pr --results-dir results --out digest.md
+        run: uv run report --pr --results-dir results --baseline-dir baseline --out digest.md
       - name: Post digest comment
         uses: actions/github-script@v7
         with:

From bd52c2c7da1a6f9e3de62e3890303eaa99368a6f Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 15:23:59 -0600
Subject: [PATCH 31/61] feat(evals): compare emits lift effect + writes
 lift.json

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/cli.py              |  7 +++++++
 evals/lib/compare.py      | 14 ++++++++++++--
 tests/lib/test_compare.py | 22 ++++++++++++++++++++++
 3 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/evals/cli.py b/evals/cli.py
index a650224..fd04113 100644
--- a/evals/cli.py
+++ b/evals/cli.py
@@ -81,6 +81,13 @@ def compare() -> None:
     rows = compare_skill(args.skill, args.harness, model=args.model,
                          max_budget=args.max_budget, bare=args.bare,
                          full_auto=args.full_auto, only_id=args.prompt_id)
+    import json
+    from pathlib import Path
+    out_dir = Path(__file__).resolve().parent / "harnesses" / args.harness / "results" / args.skill
+    out_dir.mkdir(parents=True, exist_ok=True)
+    (out_dir / "lift.json").write_text(json.dumps(
+        [{**r, "with_verdict": r["with_verdict"].value,
+          "without_verdict": r["without_verdict"].value} for r in rows], indent=2))
     render_compare(rows)
 
 
diff --git a/evals/lib/compare.py b/evals/lib/compare.py
index b48316c..5f00856 100644
--- a/evals/lib/compare.py
+++ b/evals/lib/compare.py
@@ -5,6 +5,7 @@
 from evals.lib.config import load_skill
 from evals.lib.grading import grade
 from evals.lib.harness import get_adapter
+from evals.lib.models import Verdict
 
 
 def compare_skill(skill: str, platform: str, *, model: str | None = None,
@@ -26,11 +27,20 @@ def compare_skill(skill: str, platform: str, *, model: str | None = None,
             did = adapter.detect_trigger(run, skill)
             graded[load] = grade(p, run, cfg.checks, platform=platform, skill=skill,
                                  did_trigger=did)
+        wv = graded[True].verdict
+        wo = graded[False].verdict
+        if wo == Verdict.FAIL and wv != Verdict.FAIL:
+            effect = "lift"
+        elif wo != Verdict.FAIL and wv == Verdict.FAIL:
+            effect = "regress"
+        else:
+            effect = "none"
         rows.append({
             "id": p.id,
-            "with_verdict": graded[True].verdict,
-            "without_verdict": graded[False].verdict,
+            "with_verdict": wv,
+            "without_verdict": wo,
             "with_cost": graded[True].cost_usd,
             "without_cost": graded[False].cost_usd,
+            "effect": effect,
         })
     return rows
diff --git a/tests/lib/test_compare.py b/tests/lib/test_compare.py
index fbe6fd7..4adb5cf 100644
--- a/tests/lib/test_compare.py
+++ b/tests/lib/test_compare.py
@@ -41,3 +41,25 @@ def test_compare_shows_lift(monkeypatch):
     assert row["without_verdict"] == Verdict.FAIL          # no skill -> blocking checks fail
     assert row["with_verdict"] in (Verdict.PASS, Verdict.PASS_SLOW)  # skill -> workflow satisfied
     assert row["with_cost"] == 0.05 and row["without_cost"] == 0.02
+
+
+def test_compare_skill_returns_lift_effect(monkeypatch):
+    from evals.lib.models import ParsedRun, Verdict
+    from evals.lib import compare as compare_mod
+
+    class Stub:
+        platform = "stub"
+        def cli_signals(self, s): return ["hawk scan"]
+        def invocation_signals(self, s): return []
+        def parse_stream(self, raw): return ParsedRun()
+        def detect_trigger(self, run, s): return any("hawk scan" in c for c in run.bash_commands)
+        def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
+                   max_budget, bare, full_auto):
+            return (ParsedRun(bash_commands=["hawk version","hawk config --help",
+                    "hawkop app list","hawkop env list","hawk init",
+                    "hawk validate config stackhawk.yml","hawk scan"],
+                    output_text="reachable on localhost:8080") if load_skill
+                    else ParsedRun(bash_commands=["echo idk"]))
+    monkeypatch.setattr(compare_mod, "get_adapter", lambda p: Stub())
+    rows = compare_mod.compare_skill("hawkscan", "stub", only_id="hw-01")
+    assert rows[0]["effect"] == "lift"

From 53721037c42fa2fc32c86ceb3c4ebe3cce1877de Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 15:26:10 -0600
Subject: [PATCH 32/61] feat(evals): render skill-lift section; PR runs compare
 for lift

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/skill-evals.yml  | 10 +++++++++-
 evals/cli.py                       | 12 +++++++++++-
 evals/lib/reporting.py             | 15 +++++++++++++++
 tests/lib/test_reporting_render.py | 15 +++++++++++++++
 4 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/skill-evals.yml b/.github/workflows/skill-evals.yml
index 8c8d7f8..c6d998a 100644
--- a/.github/workflows/skill-evals.yml
+++ b/.github/workflows/skill-evals.yml
@@ -92,6 +92,14 @@ jobs:
           uv run evals --harness claude-code --skill ${{ matrix.skill }} \
             --model ${{ matrix.model }} --bare --max-budget 0.15
 
+      - name: Skill lift (compare with/without)
+        if: github.event_name == 'pull_request'
+        env:
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+        run: |
+          uv run compare --harness claude-code --skill ${{ matrix.skill }} \
+            --model ${{ matrix.model }} --bare --max-budget 0.15 || true
+
       - name: Upload results
         if: always()
         uses: actions/upload-artifact@v4
@@ -274,7 +282,7 @@ jobs:
           gh run download "$RUN" -p 'baseline-*' -D baseline 2>/dev/null || echo "download failed"
           echo "baseline fetched for $TAG (run $RUN)"
       - name: Build digest
-        run: uv run report --pr --results-dir results --baseline-dir baseline --out digest.md
+        run: uv run report --pr --results-dir results --baseline-dir baseline --lift-dir results --out digest.md
       - name: Post digest comment
         uses: actions/github-script@v7
         with:
diff --git a/evals/cli.py b/evals/cli.py
index fd04113..e51a5df 100644
--- a/evals/cli.py
+++ b/evals/cli.py
@@ -121,7 +121,24 @@ def report() -> None:
             continue
     from evals.lib.baseline import load_baseline_dir
     baselines = load_baseline_dir(args.baseline_dir) or None
-    md = render_digest(cells, baselines=baselines)
+    # Skill-lift rows: each lift.json is keyed by the cell.json written next to it.
+    import json  # local import, as compare() does; harmless if also imported at module level
+    lift = None
+    if args.lift_dir and args.lift_dir.exists():
+        lift = {}
+        for lj in sorted(args.lift_dir.rglob("lift.json")):
+            sib = lj.parent / "cell.json"
+            if not sib.exists():
+                continue
+            try:
+                cell = CellReport.model_validate_json(sib.read_text())
+                rows = json.loads(lj.read_text())
+            except Exception:
+                # Same best-effort policy as the cell.json loop above: skip bad artifacts.
+                continue
+            lift[(cell.platform, cell.skill, cell.model)] = rows
+        lift = lift or None
+    md = render_digest(cells, baselines=baselines, lift=lift)
     args.out.write_text(md)
     print(f"wrote {args.out} ({len(cells)} cells)")
 
diff --git a/evals/lib/reporting.py b/evals/lib/reporting.py
index fbd7113..4a95ec1 100644
--- a/evals/lib/reporting.py
+++ b/evals/lib/reporting.py
@@ -121,6 +121,21 @@ def render_digest(cells, baselines=None, lift=None) -> str:
                         f"{badge(v, v)} {k}" for k, v in sorted(changed.items())) + "\n")
                 else:
                     out.append("_vs baseline: no changes._\n")
+    if lift:
+        out.append("\n### Skill lift (with vs without)\n")
+        for key, rows in lift.items():
+            lifted = sum(1 for r in rows if r["effect"] == "lift")
+            out.append(f"**{key[0]} · {key[1]} · {key[2]}** — "
+                       f"{lifted}/{len(rows)} prompts lifted FAIL→PASS\n")
+            out.append("| test | without | with | |")
+            out.append("|---|---|---|---|")
+            for r in rows:
+                eff = {"lift": badge('fixed', '↑ lift'),
+                       "regress": badge('regressed', '↓ regress'),
+                       "none": ""}[r["effect"]]
+                out.append(f"| {r['id']} | {r['without_verdict']} | "
+                           f"{r['with_verdict']} | {eff} |")
+            out.append("")
     return "\n".join(out) + "\n"
 
 
diff --git a/tests/lib/test_reporting_render.py b/tests/lib/test_reporting_render.py
index 4a0b268..9a999ae 100644
--- a/tests/lib/test_reporting_render.py
+++ b/tests/lib/test_reporting_render.py
@@ -74,3 +74,18 @@ def test_render_digest_overview_and_per_cell():
     assert "claude-code" in md and "codex" in md
     assert "hw-14" in md            # failing test surfaced
     assert "no baseline" in md.lower()   # no baseline supplied
+
+
+def test_digest_renders_lift_section():
+    from evals.lib.models import CellReport, EvalResult, Verdict
+    from evals.lib.reporting import render_digest
+    r = EvalResult(platform="claude-code", skill="hawkscan", run_id="hw-01",
+                   should_trigger=True, did_trigger=True, trigger_correct=True,
+                   verdict=Verdict.PASS, score=100)
+    cell = CellReport(platform="claude-code", skill="hawkscan", model="haiku",
+                      commit="c", results=[r])
+    lift = {("claude-code", "hawkscan", "haiku"): [
+        {"id": "hw-01", "without_verdict": "fail", "with_verdict": "pass", "effect": "lift"}]}
+    md = render_digest([cell], lift=lift)
+    assert "lift" in md.lower() and "hw-01" in md
+    assert "1/1" in md or "1 of 1" in md.lower()

From 690dc5a41861950941cbce7b1071bf926c7a64df Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 15:29:05 -0600
Subject: [PATCH 33/61] docs(evals): document JUnit-style report, comparisons,
 four real adapters

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/README.md           | 33 +++++++++++++++++++++++++++++++++
 evals/harnesses/README.md | 19 +++++++++++++------
 2 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/evals/README.md b/evals/README.md
index dfa653c..3b3ff68 100644
--- a/evals/README.md
+++ b/evals/README.md
@@ -67,6 +67,39 @@ specific prompts (absent = applies to all).
 
 See `harnesses/README.md` for per-platform instructions and CI setup.
 
+### Reports
+
+**Per-job summaries.** Each `uv run evals` run writes a JUnit-style table to
+`$GITHUB_STEP_SUMMARY`: one row per test, failures-first ordering,
+`✅ PASS / ◆ PASS-SLOW / ❌ FAIL` verdicts. It also writes a `cell.json`
+artifact in the results directory so downstream steps can aggregate across
+jobs.
+
+**PR digest comment.** On each pull request run, the `comment` CI job collects all
+`cell.json` artifacts and runs:
+
+```
+uv run report --pr [--results-dir DIR] [--baseline-dir DIR] [--lift-dir DIR] [--out FILE]
+```
+
+This produces a consolidated Markdown digest posted as a sticky PR comment.
+The digest contains:
+
+- **Matrix overview** — one row per (platform × skill × model) cell showing
+  trigger accuracy, ✅/◆/❌ verdict mix, and aggregate score.
+- **Per-cell tables** — the same failures-first rows from each job summary.
+- **Regression vs released-tag baseline** — the `comment` job fetches the
+  baseline from the most recent release's `capture-baseline.yml` run
+  (best-effort; missing baseline degrades gracefully to "no baseline
+  available"). Comparison is pure deterministic threshold math: per-test
+  verdict-flips (fixed / regressed) and aggregate score deltas with a ±3
+  band → better / worse / no-change. No AI or LLM calls are used.
+- **Skill lift section** — with-skill vs without-skill verdict comparison
+  showing how many prompts move from FAIL→PASS when the skill is active.
+
+Baselines are captured at release tags by `capture-baseline.yml`, which is
+triggered automatically from `release.yml`.
+
 ## Adding test cases
 
 When a skill bug or regression is discovered:
diff --git a/evals/harnesses/README.md b/evals/harnesses/README.md
index 52b3f2f..04d8b2a 100644
--- a/evals/harnesses/README.md
+++ b/evals/harnesses/README.md
@@ -106,10 +106,10 @@ uv run evals --harness agy --skill hawkscan --print-timeout 300s
 
 > **Shims vs adapters**: The per-platform `run-evals.py` scripts are back-compat
 > shims that forward to `uv run evals`. Full stream-parsing adapter logic lives in
-> `evals/harnesses/<platform>/adapter.py`; currently only **claude-code** has a
-> full adapter. The other platforms (codex, cursor, copilot, agy) forward through
-> the same CLI path and will gain dedicated adapters as output formats are
-> stabilised.
+> `evals/harnesses/<platform>/adapter.py`; **claude-code, codex, cursor, and agy**
+> all have real `adapter.py` implementations. Copilot and Gemini use the legacy
+> shim path (Gemini is frozen). The per-platform `run-evals.py` files remain thin
+> forwarding shims for back-compat.
 
 ## How it works
 
@@ -133,8 +133,15 @@ For each entry in `evals/<skill>/prompts.yaml`, each harness:
 
 The `.github/workflows/skill-evals.yml` workflow is tiered:
 
-- **Every PR**: runs `uv run validate` (no API keys required) + a cheap claude-code / Haiku run
-- **Merge to main + manual dispatch**: runs the full model matrix across all platforms
+- **Every PR + push**: runs `uv run validate` (no API keys required), then runs
+  **all four platforms** (claude-code, codex, agy, cursor). On PRs, claude-code
+  uses the Haiku model to stay within budget; the other platforms run their
+  default model.
+- **Merge to main + manual dispatch**: runs the full multi-model matrix across
+  all platforms.
+- **PR comment job**: collects `cell.json` artifacts from all platform jobs,
+  fetches the released-tag baseline (best-effort), and posts a consolidated
+  digest comment via `uv run report --pr`.
 
 Required GitHub secrets:
 - `ANTHROPIC_API_KEY` — Claude Code

From dca5e67bd45c4d6ace1b5640bfc6b08691a3bdb3 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Fri, 29 May 2026 15:35:49 -0600
Subject: [PATCH 34/61] fix(evals): comment job checkout-before-download
 (empty-digest bug); cursor best-effort; wire score_delta into overview

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/skill-evals.yml  |  9 +++++----
 evals/lib/reporting.py             | 16 ++++++++++++----
 tests/lib/test_reporting_render.py | 16 ++++++++++++++++
 3 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/skill-evals.yml b/.github/workflows/skill-evals.yml
index c6d998a..280f253 100644
--- a/.github/workflows/skill-evals.yml
+++ b/.github/workflows/skill-evals.yml
@@ -232,12 +232,14 @@ jobs:
 
       - name: Verify agent CLI
         run: agent --version
+        continue-on-error: true  # CLI package name TBD; skip if unavailable
 
       - name: Run ${{ matrix.skill }} evals
         env:
           CURSOR_API_KEY: ${{ secrets.CURSOR_API_KEY }}
         run: |
           uv run evals --harness cursor --skill ${{ matrix.skill }}
+        continue-on-error: true  # best-effort; digest degrades gracefully
 
       - name: Upload results
         if: always()
@@ -257,15 +259,14 @@ jobs:
       pull-requests: write
 
     steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
       - uses: actions/download-artifact@v4
         with:
           pattern: eval-*
           merge-multiple: false
           path: results/
-
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
       - uses: astral-sh/setup-uv@v5
       - name: Fetch released baseline (best-effort)
         env:
diff --git a/evals/lib/reporting.py b/evals/lib/reporting.py
index 4a95ec1..9a31e66 100644
--- a/evals/lib/reporting.py
+++ b/evals/lib/reporting.py
@@ -91,18 +91,26 @@ def write_github_summary(md: str) -> None:
 
 
 def render_digest(cells, baselines=None, lift=None) -> str:
-    from evals.lib.baseline import diff as _diff
+    from evals.lib.baseline import diff as _diff, score_delta
     out = ["<!-- skill-eval-comment -->", "## Skill Eval Results\n"]
-    out.append("| platform | skill | model | trigger | ✅/◆/❌ | score |")
-    out.append("|---|---|---|---|---|---|")
+    out.append("| platform | skill | model | trigger | ✅/◆/❌ | score | vs base |")
+    out.append("|---|---|---|---|---|---|---|")
     for cell in cells:
         c = Counter(r.verdict.value for r in cell.results)
         n = len(cell.results); trig = sum(1 for r in cell.results if r.trigger_correct)
         graded = [r for r in cell.results if r.did_trigger and r.should_trigger]
         avg = sum(r.score for r in graded) // len(graded) if graded else 0
         ticon = "✅" if trig == n else "❌"
+        vs = "—"
+        if baselines is not None:
+            b = baselines.get((cell.platform, cell.skill, cell.model))
+            if b is not None:
+                bg = [r for r in b.results if r.did_trigger and r.should_trigger]
+                bavg = sum(r.score for r in bg) // len(bg) if bg else 0
+                delta = score_delta(avg, bavg)
+                vs = f"{badge(delta, delta)}"
         out.append(f"| {cell.platform} | {cell.skill} | {cell.model} | {ticon} {trig}/{n} | "
-                   f"{c.get('pass',0)}/{c.get('pass-slow',0)}/{c.get('fail',0)} | {avg} |")
+                   f"{c.get('pass',0)}/{c.get('pass-slow',0)}/{c.get('fail',0)} | {avg} | {vs} |")
     out.append("")
     if baselines is None:
         out.append("_No baseline available — showing absolute results only._\n")
diff --git a/tests/lib/test_reporting_render.py b/tests/lib/test_reporting_render.py
index 9a999ae..ed5f9c6 100644
--- a/tests/lib/test_reporting_render.py
+++ b/tests/lib/test_reporting_render.py
@@ -76,6 +76,22 @@ def test_render_digest_overview_and_per_cell():
     assert "no baseline" in md.lower()   # no baseline supplied
 
 
+def test_digest_overview_shows_score_delta_vs_baseline():  # overview row reports the score delta vs. the baseline cell
+    from evals.lib.models import CellReport, EvalResult, Verdict
+    from evals.lib.reporting import render_digest
+
+    def cell(score):  # single-result cell; the result is triggered, so it counts toward the cell average
+        r = EvalResult(platform="claude-code", skill="hawkscan", run_id="hw-01",
+                       should_trigger=True, did_trigger=True, trigger_correct=True,
+                       verdict=Verdict.PASS, score=score)
+        return CellReport(platform="claude-code", skill="hawkscan", model="haiku",
+                          commit="c", results=[r])
+    cur = cell(70)
+    base = {("claude-code", "hawkscan", "haiku"): cell(90)}  # keyed by (platform, skill, model), as render_digest looks it up
+    md = render_digest([cur], baselines=base)
+    assert "worse" in md.lower()   # 70 vs 90 -> worse
+
+
 def test_digest_renders_lift_section():
     from evals.lib.models import CellReport, EvalResult, Verdict
     from evals.lib.reporting import render_digest

From 7947bc32d991761e9b97651130a7ece387184936 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Sun, 31 May 2026 20:45:41 -0600
Subject: [PATCH 35/61] feat(evals): capture stderr+returncode, surface harness
 errors in report, fix total_cost_usd
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- ParsedRun gains returncode + stderr_tail fields; EvalResult gains note field
- grade() propagates run.error → EvalResult.note on both return paths
- render_job_summary() appends note to the "why" column when present
- All four adapters (claude-code, codex, cursor, agy) now capture proc.returncode
  and proc.stderr after subprocess.run, set run.error on non-zero exit or empty output
- claude-code adapter parse_stream reads total_cost_usd (new key) before cost_usd
  (legacy key) so cost stops showing $0.00

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/harnesses/agy/adapter.py         |  9 ++++++++-
 evals/harnesses/claude-code/adapter.py | 11 +++++++++--
 evals/harnesses/codex/adapter.py       |  9 ++++++++-
 evals/harnesses/cursor/adapter.py      |  9 ++++++++-
 evals/lib/grading.py                   |  2 ++
 evals/lib/models.py                    |  3 +++
 evals/lib/reporting.py                 |  2 ++
 tests/lib/test_adapters.py             | 11 +++++++++++
 tests/lib/test_grading.py              | 10 ++++++++++
 tests/lib/test_models.py               | 20 ++++++++++++++++++++
 tests/lib/test_reporting_render.py     | 12 ++++++++++++
 11 files changed, 93 insertions(+), 5 deletions(-)

diff --git a/evals/harnesses/agy/adapter.py b/evals/harnesses/agy/adapter.py
index 99e34f2..6af16c3 100644
--- a/evals/harnesses/agy/adapter.py
+++ b/evals/harnesses/agy/adapter.py
@@ -136,7 +136,14 @@ def launch(
                 )
             except subprocess.TimeoutExpired:
                 return ParsedRun(error="timeout")
-            return parse_stream(proc.stdout)
+            run = parse_stream(proc.stdout)
+            run.returncode = proc.returncode
+            run.stderr_tail = (proc.stderr or "")[-2000:]
+            if proc.returncode != 0 and not run.error:
+                run.error = f"exit {proc.returncode}: {run.stderr_tail[-300:].strip()}"
+            elif not run.output_text and not run.bash_commands and not run.error:
+                run.error = f"empty output (exit {proc.returncode})"
+            return run
         finally:
             shutil.rmtree(tmpdir, ignore_errors=True)
 
diff --git a/evals/harnesses/claude-code/adapter.py b/evals/harnesses/claude-code/adapter.py
index c6d2a92..3787a06 100644
--- a/evals/harnesses/claude-code/adapter.py
+++ b/evals/harnesses/claude-code/adapter.py
@@ -59,7 +59,7 @@ def parse_stream(raw: str) -> ParsedRun:
                     elif name == "Edit" and inp.get("file_path"):
                         edited.append(inp["file_path"])
         elif etype == "result":
-            cost = event.get("cost_usd") or 0.0
+            cost = event.get("total_cost_usd") or event.get("cost_usd") or 0.0
             text += event.get("result", "")
             if event.get("subtype") == "error_during_execution":
                 err = event.get("result", "unknown error")
@@ -102,7 +102,14 @@ def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
                                       timeout=300, cwd=tmpdir)
             except subprocess.TimeoutExpired:
                 return ParsedRun(error="timeout")
-            return parse_stream(proc.stdout)
+            run = parse_stream(proc.stdout)
+            run.returncode = proc.returncode
+            run.stderr_tail = (proc.stderr or "")[-2000:]
+            if proc.returncode != 0 and not run.error:
+                run.error = f"exit {proc.returncode}: {run.stderr_tail[-300:].strip()}"
+            elif not run.output_text and not run.bash_commands and not run.error:
+                run.error = f"empty output (exit {proc.returncode})"
+            return run
         finally:
             shutil.rmtree(tmpdir, ignore_errors=True)
 
diff --git a/evals/harnesses/codex/adapter.py b/evals/harnesses/codex/adapter.py
index 09d991d..6a263c1 100644
--- a/evals/harnesses/codex/adapter.py
+++ b/evals/harnesses/codex/adapter.py
@@ -138,7 +138,14 @@ def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
                                       timeout=300, cwd=tmpdir)
             except subprocess.TimeoutExpired:
                 return ParsedRun(error="timeout")
-            return parse_stream(proc.stdout)
+            run = parse_stream(proc.stdout)
+            run.returncode = proc.returncode
+            run.stderr_tail = (proc.stderr or "")[-2000:]
+            if proc.returncode != 0 and not run.error:
+                run.error = f"exit {proc.returncode}: {run.stderr_tail[-300:].strip()}"
+            elif not run.output_text and not run.bash_commands and not run.error:
+                run.error = f"empty output (exit {proc.returncode})"
+            return run
         finally:
             shutil.rmtree(tmpdir, ignore_errors=True)
 
diff --git a/evals/harnesses/cursor/adapter.py b/evals/harnesses/cursor/adapter.py
index 54d18b1..698ed51 100644
--- a/evals/harnesses/cursor/adapter.py
+++ b/evals/harnesses/cursor/adapter.py
@@ -199,7 +199,14 @@ def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
                 )
             except subprocess.TimeoutExpired:
                 return ParsedRun(error="timeout")
-            return parse_stream(proc.stdout)
+            run = parse_stream(proc.stdout)
+            run.returncode = proc.returncode
+            run.stderr_tail = (proc.stderr or "")[-2000:]
+            if proc.returncode != 0 and not run.error:
+                run.error = f"exit {proc.returncode}: {run.stderr_tail[-300:].strip()}"
+            elif not run.output_text and not run.bash_commands and not run.error:
+                run.error = f"empty output (exit {proc.returncode})"
+            return run
         finally:
             shutil.rmtree(tmpdir, ignore_errors=True)
 
diff --git a/evals/lib/grading.py b/evals/lib/grading.py
index 3ab2c0f..9f9d1fa 100644
--- a/evals/lib/grading.py
+++ b/evals/lib/grading.py
@@ -121,6 +121,7 @@ def grade(prompt: PromptConfig, run: ParsedRun, checks: list[dict], *,
             verdict=Verdict.PASS if trigger_correct else Verdict.FAIL,
             budget_breaches=[], process_checks=[],
             score=100 if trigger_correct else 0, cost_usd=run.cost_usd,
+            note=(run.error or ""),
         )
 
     proc = run_process_checks(run, applicable_checks(checks, prompt.id))
@@ -141,4 +142,5 @@ def grade(prompt: PromptConfig, run: ParsedRun, checks: list[dict], *,
         trigger_correct=trigger_correct,
         verdict=verdict, budget_breaches=breaches, process_checks=proc,
         score=_score(proc), cost_usd=run.cost_usd,
+        note=(run.error or ""),
     )
diff --git a/evals/lib/models.py b/evals/lib/models.py
index af87d6f..3b05e23 100644
--- a/evals/lib/models.py
+++ b/evals/lib/models.py
@@ -56,6 +56,8 @@ class ParsedRun(BaseModel):
     output_tokens: int | None = None
     wall_seconds: float | None = None
     error: str | None = None
+    returncode: int | None = None
+    stderr_tail: str = ""
 
 
 class ProcessCheckResult(BaseModel):
@@ -78,6 +80,7 @@ class EvalResult(BaseModel):
     process_checks: list[ProcessCheckResult] = []
     score: int
     cost_usd: float = 0.0
+    note: str = ""
 
 
 class CellReport(BaseModel):
diff --git a/evals/lib/reporting.py b/evals/lib/reporting.py
index 9a31e66..fd4bba4 100644
--- a/evals/lib/reporting.py
+++ b/evals/lib/reporting.py
@@ -160,5 +160,7 @@ def render_job_summary(cell: CellReport) -> str:
         why = "; ".join(r.budget_breaches) if r.budget_breaches else (
             "" if r.trigger_correct else
             ("false-positive" if r.did_trigger else "false-negative"))
+        if r.note:
+            why = f"{why} — {r.note}" if why else r.note
         rows.append(f"| {r.run_id} | {_VERDICT_ICON[r.verdict.value]} | {why} |")
     return head + "\n".join(rows) + "\n"
diff --git a/tests/lib/test_adapters.py b/tests/lib/test_adapters.py
index e1b7070..3cb5e49 100644
--- a/tests/lib/test_adapters.py
+++ b/tests/lib/test_adapters.py
@@ -57,6 +57,17 @@ def test_agy_detect_trigger_via_text():
     assert ag.detect_trigger(run, "hawkscan") is True
 
 
+def test_claude_code_parses_total_cost_usd():  # regression: cost is read from the result event's "total_cost_usd" key
+    import json
+    cc = get_adapter("claude-code")
+    lines = [  # newline-delimited JSON events; the result event carries only total_cost_usd (no legacy cost_usd)
+        json.dumps({"type":"assistant","message":{"content":[{"type":"text","text":"hi"}]}}),
+        json.dumps({"type":"result","result":"done","total_cost_usd":0.123,"subtype":"success"}),
+    ]
+    run = cc.parse_stream("\n".join(lines))
+    assert abs(run.cost_usd - 0.123) < 1e-9  # tolerance-based float comparison
+
+
 def test_agy_observe_suffix_and_skill_signal():
     ag = get_adapter("agy")
     # The pre-shim SKILL: declaration format (emitted because of OBSERVE_SUFFIX)
diff --git a/tests/lib/test_grading.py b/tests/lib/test_grading.py
index a368d2c..67fc371 100644
--- a/tests/lib/test_grading.py
+++ b/tests/lib/test_grading.py
@@ -201,3 +201,13 @@ def test_grade_false_positive_fails_without_process_checks():
     assert res.verdict == Verdict.FAIL
     assert res.trigger_correct is False
     assert res.process_checks == []
+
+
+def test_grade_propagates_harness_error_to_note():  # grade() must copy run.error into EvalResult.note
+    from evals.lib.models import ParsedRun, Verdict
+    from evals.lib.grading import grade
+    p = _prompt(should_trigger=True)   # _prompt: shared helper defined earlier in this module
+    run = ParsedRun(returncode=1, stderr_tail="agent: command not found", error="exit 1: agent: command not found")
+    res = grade(p, run, [], platform="cursor", skill="hawkscan", did_trigger=False)
+    assert res.verdict == Verdict.FAIL          # should_trigger=True but did_trigger=False -> false negative
+    assert "command not found" in res.note      # run.error text is carried into the note
diff --git a/tests/lib/test_models.py b/tests/lib/test_models.py
index c86d90f..ff84e20 100644
--- a/tests/lib/test_models.py
+++ b/tests/lib/test_models.py
@@ -72,3 +72,23 @@ def test_cellreport_rejects_unknown_field():
     from evals.lib.models import CellReport
     with pytest.raises(ValidationError):
         CellReport(platform="x", skill="y", model="m", commit="c", results=[], extra=1)
+
+
+def test_parsedrun_has_diagnostic_fields():  # ParsedRun exposes returncode and stderr_tail
+    from evals.lib.models import ParsedRun
+    r = ParsedRun()  # no arguments: both diagnostic fields take their defaults
+    assert r.returncode is None
+    assert r.stderr_tail == ""
+    r2 = ParsedRun(returncode=1, stderr_tail="boom")  # explicit values are stored as given
+    assert r2.returncode == 1 and r2.stderr_tail == "boom"
+
+
+def test_evalresult_has_note_field():  # EvalResult exposes an optional free-text note
+    from evals.lib.models import EvalResult, Verdict
+    e = EvalResult(platform="p", skill="s", run_id="r", should_trigger=True,
+                   did_trigger=True, trigger_correct=True, verdict=Verdict.PASS, score=100)
+    assert e.note == ""  # note defaults to the empty string when omitted
+    e2 = EvalResult(platform="p", skill="s", run_id="r", should_trigger=True,
+                    did_trigger=False, trigger_correct=False, verdict=Verdict.FAIL,
+                    score=0, note="harness error: agent: command not found")
+    assert "command not found" in e2.note  # an explicit note is stored as given
diff --git a/tests/lib/test_reporting_render.py b/tests/lib/test_reporting_render.py
index ed5f9c6..f2e27c6 100644
--- a/tests/lib/test_reporting_render.py
+++ b/tests/lib/test_reporting_render.py
@@ -92,6 +92,18 @@ def cell(score):
     assert "worse" in md.lower()   # 70 vs 90 -> worse
 
 
+def test_job_summary_shows_note():  # render_job_summary must surface EvalResult.note in the rendered table
+    from evals.lib.models import CellReport, EvalResult, Verdict
+    from evals.lib.reporting import render_job_summary
+    r = EvalResult(platform="cursor", skill="hawkscan", run_id="hw-01",
+                   should_trigger=True, did_trigger=False, trigger_correct=False,
+                   verdict=Verdict.FAIL, score=0, note="harness error: agent not found")  # false negative plus a harness note
+    cell = CellReport(platform="cursor", skill="hawkscan", model="default",
+                      commit="c", results=[r])
+    md = render_job_summary(cell)
+    assert "agent not found" in md  # the note is appended to the row's "why" column
+
+
 def test_digest_renders_lift_section():
     from evals.lib.models import CellReport, EvalResult, Verdict
     from evals.lib.reporting import render_digest

From 05c7bc7915a0ea64b1dcb09867b9a6423c4c057d Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Sun, 31 May 2026 20:48:21 -0600
Subject: [PATCH 36/61] fix(evals): main() resilient to per-prompt launch
 crashes; always write cell+summary+trace

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/cli.py                     | 32 +++++++++++++++++++------
 tests/lib/test_cli_resilience.py | 41 ++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 7 deletions(-)
 create mode 100644 tests/lib/test_cli_resilience.py

diff --git a/evals/cli.py b/evals/cli.py
index e51a5df..b15f747 100644
--- a/evals/cli.py
+++ b/evals/cli.py
@@ -39,17 +39,35 @@ def main() -> None:
     if not prompts:
         print(f"no prompt '{args.prompt_id}'", file=sys.stderr); sys.exit(1)
 
+    from evals.lib.models import EvalResult, Verdict
     results = []
     out_dir = RESULTS_ROOT / args.harness / "results" / args.skill
     out_dir.mkdir(parents=True, exist_ok=True)
     for p in prompts:
-        run = adapter.launch(p.prompt, args.skill, p.id, plugin_dirs,
-                             model=args.model, load_skill=True,
-                             max_budget=args.max_budget, bare=args.bare,
-                             full_auto=args.full_auto)
-        did = adapter.detect_trigger(run, args.skill)
-        res = grade(p, run, cfg.checks, platform=args.harness, skill=args.skill,
-                    did_trigger=did)
+        try:
+            run = adapter.launch(p.prompt, args.skill, p.id, plugin_dirs,
+                                 model=args.model, load_skill=True,
+                                 max_budget=args.max_budget, bare=args.bare,
+                                 full_auto=args.full_auto)
+            did = adapter.detect_trigger(run, args.skill)
+            res = grade(p, run, cfg.checks, platform=args.harness, skill=args.skill,
+                        did_trigger=did)
+            # persist a trace for visibility (uploaded with the artifact)
+            trace = (f"# {p.id} (returncode={run.returncode})\n"
+                     f"## error\n{run.error or ''}\n"
+                     f"## stderr_tail\n{run.stderr_tail}\n"
+                     f"## output_text\n{run.output_text}\n"
+                     f"## bash_commands\n" + "\n".join(run.bash_commands) + "\n")
+            (out_dir / f"{p.id}.trace.txt").write_text(trace)
+        except Exception as e:  # noqa: BLE001 — never let one prompt abort the cell
+            res = EvalResult(platform=args.harness, skill=args.skill, run_id=p.id,
+                             should_trigger=p.should_trigger, did_trigger=False,
+                             trigger_correct=(not p.should_trigger),
+                             verdict=Verdict.FAIL if p.should_trigger else Verdict.PASS,
+                             score=0 if p.should_trigger else 100,
+                             note=f"harness exception: {type(e).__name__}: {e}")
+            (out_dir / f"{p.id}.trace.txt").write_text(
+                f"# {p.id}\n## harness exception\n{type(e).__name__}: {e}\n")
         results.append(res)
         (out_dir / f"{p.id}.result.json").write_text(res.model_dump_json(indent=2))
 
diff --git a/tests/lib/test_cli_resilience.py b/tests/lib/test_cli_resilience.py
new file mode 100644
index 0000000..9668e4f
--- /dev/null
+++ b/tests/lib/test_cli_resilience.py
@@ -0,0 +1,41 @@
+import json
+from pathlib import Path
+import pytest
+import evals.cli as cli_mod
+
+
+class BoomAdapter:  # stub adapter whose launch() always raises FileNotFoundError
+    platform = "boom"
+
+    def cli_signals(self, s):  # stub: no signals
+        return []
+
+    def invocation_signals(self, s):  # stub: no signals
+        return []
+
+    def parse_stream(self, raw):  # stub: ignores raw and returns an empty ParsedRun
+        from evals.lib.models import ParsedRun
+        return ParsedRun()
+
+    def detect_trigger(self, run, s):  # stub: never reports a trigger
+        return False
+
+    def launch(self, *a, **k):  # accepts any call signature; always fails
+        raise FileNotFoundError("agent: command not found")
+
+
+def test_main_survives_launch_crash(monkeypatch, tmp_path):  # main() must finish the cell even if every launch raises
+    # Point results at a temp dir and force the boom adapter; the prompt set itself is not stubbed.
+    monkeypatch.setattr(cli_mod, "get_adapter", lambda p: BoomAdapter())
+    monkeypatch.setattr(cli_mod, "RESULTS_ROOT", tmp_path)
+    monkeypatch.setattr("sys.argv", ["evals", "--harness", "claude-code", "--skill", "hawkscan"])
+    with pytest.raises(SystemExit):   # FP/FN cause sys.exit(1) — that's fine
+        cli_mod.main()
+    # The cell + summary were still written despite every launch crashing:
+    out = tmp_path / "claude-code" / "results" / "hawkscan"
+    assert (out / "cell.json").exists()
+    assert (out / "summary.json").exists()
+    cell = json.loads((out / "cell.json").read_text())
+    assert len(cell["results"]) == 20            # NOTE(review): hard-coded; presumably the hawkscan prompt count — keep in sync
+    # each crashed prompt is recorded with a "harness exception: ..." note; one matching note is enough here
+    assert any("command not found" in r.get("note", "") for r in cell["results"])

From 06e063bf4e6ec497c6dcd735bc9b7f627b2c48e7 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Sun, 31 May 2026 20:52:30 -0600
Subject: [PATCH 37/61] =?UTF-8?q?ci(evals):=20full=20tool=C3=97model=20mat?=
 =?UTF-8?q?rix=20on=20PR+dispatch=20(drop=20push);=20digest=20to=20run=20s?=
 =?UTF-8?q?ummary;=20capture-baseline=20full=20matrix?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/capture-baseline.yml | 135 ++++++++++++++++++++++++-
 .github/workflows/skill-evals.yml      |  43 ++++----
 2 files changed, 155 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/capture-baseline.yml b/.github/workflows/capture-baseline.yml
index b9a0497..0f25b26 100644
--- a/.github/workflows/capture-baseline.yml
+++ b/.github/workflows/capture-baseline.yml
@@ -8,13 +8,18 @@ on:
         type: string
 permissions:
   contents: read
+
 jobs:
-  capture:
+
+  # ── Claude Code — 3 models ─────────────────────────────────────────────────
+  capture-claude-code:
+    name: baseline / claude-code / ${{ matrix.skill }} / ${{ matrix.model }}
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
       matrix:
         skill: [hawkscan, api]
+        model: [claude-sonnet-4-6, claude-opus-4-7, claude-haiku-4-5-20251001]
     steps:
       - uses: actions/checkout@v4
         with:
@@ -25,15 +30,137 @@ jobs:
           node-version: "20"
       - name: Install Claude Code CLI
         run: npm install -g @anthropic-ai/claude-code
-      - name: Run baseline eval (haiku)
+      - name: Verify claude CLI
+        run: claude --version
+      - name: Run baseline eval (${{ matrix.model }})
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
         run: |
           uv run evals --harness claude-code --skill ${{ matrix.skill }} \
-            --model claude-haiku-4-5-20251001 --bare --max-budget 0.15 || true
+            --model ${{ matrix.model }} --bare --max-budget 0.15 || true
       - name: Upload baseline artifact
+        if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: baseline-claude-code-${{ matrix.skill }}-haiku
+          name: baseline-claude-code-${{ matrix.skill }}-${{ matrix.model }}
           path: evals/harnesses/claude-code/results/${{ matrix.skill }}/cell.json
           retention-days: 90
+
+  # ── Codex — 2 models ──────────────────────────────────────────────────────
+  capture-codex:
+    name: baseline / codex / ${{ matrix.skill }} / ${{ matrix.model }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        skill: [hawkscan, api]
+        model: [gpt-5.5, o3]
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.tag }}
+      - uses: astral-sh/setup-uv@v5
+      - uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+      - name: Install Codex CLI
+        run: npm install -g @openai/codex
+      - name: Verify codex CLI
+        run: codex --version
+      - name: Install StackHawk skills (hawkscan + api)
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        run: |
+          codex plugin marketplace add .
+          echo y | codex plugin add hawkscan@stackhawk
+          echo y | codex plugin add stackhawk-api@stackhawk
+      - name: Run baseline eval (${{ matrix.model }})
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        run: |
+          uv run evals --harness codex --skill ${{ matrix.skill }} --model ${{ matrix.model }} || true
+      - name: Upload baseline artifact
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: baseline-codex-${{ matrix.skill }}-${{ matrix.model }}
+          path: evals/harnesses/codex/results/${{ matrix.skill }}/cell.json
+          retention-days: 90
+
+  # ── Antigravity (agy) — default model ─────────────────────────────────────
+  capture-agy:
+    name: baseline / agy / ${{ matrix.skill }} / ${{ matrix.model }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        skill: [hawkscan, api]
+        model: [default]
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.tag }}
+      - uses: astral-sh/setup-uv@v5
+      - name: Install agy CLI
+        run: curl -fsSL https://antigravity.google/install-cli | bash
+      - name: Verify agy CLI
+        run: agy --version
+      - name: Install StackHawk plugins
+        env:
+          AGY_API_KEY: ${{ secrets.AGY_API_KEY }}
+        run: |
+          echo y | agy plugin install plugins/hawkscan
+          echo y | agy plugin install plugins/api
+      - name: Run baseline eval
+        env:
+          AGY_API_KEY: ${{ secrets.AGY_API_KEY }}
+        run: |
+          MODEL_ARGS=()
+          if [ "${{ matrix.model }}" != "default" ]; then MODEL_ARGS=(--model "${{ matrix.model }}"); fi
+          uv run evals --harness agy --skill ${{ matrix.skill }} "${MODEL_ARGS[@]}" || true
+      - name: Upload baseline artifact
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: baseline-agy-${{ matrix.skill }}-${{ matrix.model }}
+          path: evals/harnesses/agy/results/${{ matrix.skill }}/cell.json
+          retention-days: 90
+
+  # ── Cursor — default model ─────────────────────────────────────────────────
+  capture-cursor:
+    name: baseline / cursor / ${{ matrix.skill }} / ${{ matrix.model }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        skill: [hawkscan, api]
+        model: [default]
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.tag }}
+      - uses: astral-sh/setup-uv@v5
+      - uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+      - name: Install Cursor CLI
+        run: npm install -g @cursor/cli || npm install -g cursor-agent
+        continue-on-error: true  # package name TBD; update when stable
+      - name: Verify agent CLI
+        run: agent --version
+        continue-on-error: true  # CLI package name TBD; skip if unavailable
+      - name: Run baseline eval
+        env:
+          CURSOR_API_KEY: ${{ secrets.CURSOR_API_KEY }}
+        run: |
+          MODEL_ARGS=()
+          if [ "${{ matrix.model }}" != "default" ]; then MODEL_ARGS=(--model "${{ matrix.model }}"); fi
+          uv run evals --harness cursor --skill ${{ matrix.skill }} "${MODEL_ARGS[@]}" || true
+        continue-on-error: true  # best-effort
+      - name: Upload baseline artifact
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: baseline-cursor-${{ matrix.skill }}-${{ matrix.model }}
+          path: evals/harnesses/cursor/results/${{ matrix.skill }}/cell.json
+          retention-days: 90
diff --git a/.github/workflows/skill-evals.yml b/.github/workflows/skill-evals.yml
index 280f253..3510bff 100644
--- a/.github/workflows/skill-evals.yml
+++ b/.github/workflows/skill-evals.yml
@@ -5,8 +5,6 @@ on:
     paths:
       - "plugins/**"
       - "evals/**"
-  push:
-    branches: [main]
   workflow_dispatch:
     inputs:
       skill:
@@ -16,7 +14,7 @@ on:
         type: choice
         options: [hawkscan, api, both]
       platform:
-        description: "Platform to run (all = claude-code + codex + agy + cursor)"
+        description: "Platform to run"
         required: true
         default: "all"
         type: choice
@@ -46,7 +44,7 @@ jobs:
       - name: Validate prompts.yaml + process-checks.json
         run: uv run validate
 
-  # ── Unit tests (no API keys; runs on every PR + push) ─────────────────────
+  # ── Unit tests (no API keys; runs on every PR) ────────────────────────────
   pytest:
     name: pytest (lib)
     runs-on: ubuntu-latest
@@ -63,14 +61,13 @@ jobs:
     needs: validate-config
     if: |
       github.event_name == 'pull_request' ||
-      github.event_name == 'push' ||
       inputs.platform == 'all' ||
       inputs.platform == 'claude-code'
     strategy:
       fail-fast: false
       matrix:
         skill: [hawkscan, api]
-        model: ${{ github.event_name == 'pull_request' && fromJSON('["claude-haiku-4-5-20251001"]') || fromJSON('["claude-sonnet-4-6","claude-opus-4-7","claude-haiku-4-5-20251001"]') }}
+        model: [claude-sonnet-4-6, claude-opus-4-7, claude-haiku-4-5-20251001]
 
     steps:
       - uses: actions/checkout@v4
@@ -110,18 +107,18 @@ jobs:
 
   # ── Codex ─────────────────────────────────────────────────────────────────
   eval-codex:
-    name: codex / ${{ matrix.skill }}
+    name: codex / ${{ matrix.skill }} / ${{ matrix.model }}
     runs-on: ubuntu-latest
     needs: validate-config
     if: |
       github.event_name == 'pull_request' ||
-      github.event_name == 'push' ||
       inputs.platform == 'all' ||
       inputs.platform == 'codex'
     strategy:
       fail-fast: false
       matrix:
         skill: [hawkscan, api]
+        model: [gpt-5.5, o3]
 
     steps:
       - uses: actions/checkout@v4
@@ -144,34 +141,34 @@ jobs:
           echo y | codex plugin add hawkscan@stackhawk
           echo y | codex plugin add stackhawk-api@stackhawk
 
-      - name: Run ${{ matrix.skill }} evals
+      - name: Run ${{ matrix.skill }} evals (${{ matrix.model }})
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
         run: |
-          uv run evals --harness codex --skill ${{ matrix.skill }}
+          uv run evals --harness codex --skill ${{ matrix.skill }} --model ${{ matrix.model }}
 
       - name: Upload results
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: eval-codex-${{ matrix.skill }}
+          name: eval-codex-${{ matrix.skill }}-${{ matrix.model }}
           path: evals/harnesses/codex/results/${{ matrix.skill }}/
           retention-days: 30
 
   # ── Antigravity (agy) — replaces Gemini ───────────────────────────────────
   eval-agy:
-    name: agy / ${{ matrix.skill }}
+    name: agy / ${{ matrix.skill }} / ${{ matrix.model }}
     runs-on: ubuntu-latest
     needs: validate-config
     if: |
       github.event_name == 'pull_request' ||
-      github.event_name == 'push' ||
       inputs.platform == 'all' ||
       inputs.platform == 'agy'
     strategy:
       fail-fast: false
       matrix:
         skill: [hawkscan, api]
+        model: [default]
 
     steps:
       - uses: actions/checkout@v4
@@ -194,30 +191,32 @@ jobs:
         env:
           AGY_API_KEY: ${{ secrets.AGY_API_KEY }}
         run: |
-          uv run evals --harness agy --skill ${{ matrix.skill }}
+          MODEL_ARGS=()
+          if [ "${{ matrix.model }}" != "default" ]; then MODEL_ARGS=(--model "${{ matrix.model }}"); fi
+          uv run evals --harness agy --skill ${{ matrix.skill }} "${MODEL_ARGS[@]}"
 
       - name: Upload results
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: eval-agy-${{ matrix.skill }}
+          name: eval-agy-${{ matrix.skill }}-${{ matrix.model }}
           path: evals/harnesses/agy/results/${{ matrix.skill }}/
           retention-days: 30
 
   # ── Cursor ────────────────────────────────────────────────────────────────
   eval-cursor:
-    name: cursor / ${{ matrix.skill }}
+    name: cursor / ${{ matrix.skill }} / ${{ matrix.model }}
     runs-on: ubuntu-latest
     needs: validate-config
     if: |
       github.event_name == 'pull_request' ||
-      github.event_name == 'push' ||
       inputs.platform == 'all' ||
       inputs.platform == 'cursor'
     strategy:
       fail-fast: false
       matrix:
         skill: [hawkscan, api]
+        model: [default]
 
     steps:
       - uses: actions/checkout@v4
@@ -238,14 +237,16 @@ jobs:
         env:
           CURSOR_API_KEY: ${{ secrets.CURSOR_API_KEY }}
         run: |
-          uv run evals --harness cursor --skill ${{ matrix.skill }}
+          MODEL_ARGS=()
+          if [ "${{ matrix.model }}" != "default" ]; then MODEL_ARGS=(--model "${{ matrix.model }}"); fi
+          uv run evals --harness cursor --skill ${{ matrix.skill }} "${MODEL_ARGS[@]}"
         continue-on-error: true  # best-effort; digest degrades gracefully
 
       - name: Upload results
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: eval-cursor-${{ matrix.skill }}
+          name: eval-cursor-${{ matrix.skill }}-${{ matrix.model }}
           path: evals/harnesses/cursor/results/${{ matrix.skill }}/
           retention-days: 30
 
@@ -284,7 +285,11 @@ jobs:
           echo "baseline fetched for $TAG (run $RUN)"
       - name: Build digest
         run: uv run report --pr --results-dir results --baseline-dir baseline --lift-dir results --out digest.md
+      - name: Write digest to run summary
+        if: always()
+        run: cat digest.md >> "$GITHUB_STEP_SUMMARY"
       - name: Post digest comment
+        if: always()
         uses: actions/github-script@v7
         with:
           script: |

From 327a7708f07783e495b0c5fde0a267306f7e7e85 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Mon, 1 Jun 2026 07:57:28 -0600
Subject: [PATCH 38/61] ci(evals): revert to workflow_dispatch-only (match
 origin/main); digest to run summary on dispatch

Evals run real agents against tool CLIs and were never an automatic PR gate
(origin/main commit c860e47 deliberately removed the pull_request trigger).
Auto-PR runs surfaced environment gaps (CLIs not installed, skills not loading
under --bare) because CI was never provisioned for these evals. Restore manual
dispatch; the report job
now writes the consolidated digest to GITHUB_STEP_SUMMARY on dispatch and only
posts a PR comment when a PR context exists.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/skill-evals.yml | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/skill-evals.yml b/.github/workflows/skill-evals.yml
index 3510bff..31e9889 100644
--- a/.github/workflows/skill-evals.yml
+++ b/.github/workflows/skill-evals.yml
@@ -1,10 +1,9 @@
 name: Skill Evals
 
 on:
-  pull_request:
-    paths:
-      - "plugins/**"
-      - "evals/**"
+  # Manual, on-demand only — matches origin/main's deliberate design (commit c860e47
+  # "ci: remove pull_request trigger — evals run on workflow_dispatch only"). These
+  # evals drive real agents against tool CLIs and were never an automatic PR gate.
   workflow_dispatch:
     inputs:
       skill:
@@ -60,7 +59,6 @@ jobs:
     runs-on: ubuntu-latest
     needs: validate-config
     if: |
-      github.event_name == 'pull_request' ||
       inputs.platform == 'all' ||
       inputs.platform == 'claude-code'
     strategy:
@@ -111,7 +109,6 @@ jobs:
     runs-on: ubuntu-latest
     needs: validate-config
     if: |
-      github.event_name == 'pull_request' ||
       inputs.platform == 'all' ||
       inputs.platform == 'codex'
     strategy:
@@ -161,7 +158,6 @@ jobs:
     runs-on: ubuntu-latest
     needs: validate-config
     if: |
-      github.event_name == 'pull_request' ||
       inputs.platform == 'all' ||
       inputs.platform == 'agy'
     strategy:
@@ -209,7 +205,6 @@ jobs:
     runs-on: ubuntu-latest
     needs: validate-config
     if: |
-      github.event_name == 'pull_request' ||
       inputs.platform == 'all' ||
       inputs.platform == 'cursor'
     strategy:
@@ -251,10 +246,10 @@ jobs:
           retention-days: 30
 
   # ── PR comment ────────────────────────────────────────────────────────────
-  comment:
-    name: Post PR summary
+  report:
+    name: Eval report (run summary + PR comment)
     needs: [validate-config, eval-claude-code, eval-codex, eval-agy, eval-cursor]
-    if: always() && github.event_name == 'pull_request'
+    if: always()
     runs-on: ubuntu-latest
     permissions:
       pull-requests: write
@@ -289,7 +284,7 @@ jobs:
         if: always()
         run: cat digest.md >> "$GITHUB_STEP_SUMMARY"
       - name: Post digest comment
-        if: always()
+        if: github.event_name == 'pull_request'
         uses: actions/github-script@v7
         with:
           script: |

From cbe638f35704396e0b0b1e8a0dee801a6cd2d022 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Mon, 1 Jun 2026 08:51:56 -0600
Subject: [PATCH 39/61] fix(evals): unblock codex/cursor/agy harness execution
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- codex: pick --sandbox value once (workspace-write vs read-only). Passing
  both made codex exit 2 ("--sandbox cannot be used multiple times"), failing
  every non-full-auto run before the agent started.
- cursor: pass CURSOR_API_KEY via the child environment instead of --api-key
  on the command line (the flag leaked the secret into process listings/logs;
  the agent CLI reads it from the env directly).
- agy: mark CLI install/verify/plugin-install/run steps continue-on-error so a
  flaky preview installer no longer aborts the job before evals run — the eval
  CLI records the launch failure and uploads a result the digest can surface
  (matches cursor's best-effort treatment).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/skill-evals.yml | 4 ++++
 evals/harnesses/codex/adapter.py  | 8 +++++---
 evals/harnesses/cursor/adapter.py | 8 +++++---
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/skill-evals.yml b/.github/workflows/skill-evals.yml
index 31e9889..34b2d5f 100644
--- a/.github/workflows/skill-evals.yml
+++ b/.github/workflows/skill-evals.yml
@@ -172,9 +172,11 @@ jobs:
 
       - name: Install agy CLI
         run: curl -fsSL https://antigravity.google/install-cli | bash
+        continue-on-error: true  # preview installer; don't abort the job — evals records the launch failure
 
       - name: Verify agy CLI
         run: agy --version
+        continue-on-error: true  # if unavailable, the eval run captures it as a per-prompt error
 
       - name: Install StackHawk plugins
         env:
@@ -182,6 +184,7 @@ jobs:
         run: |
           echo y | agy plugin install plugins/hawkscan
           echo y | agy plugin install plugins/api
+        continue-on-error: true  # depends on agy CLI; best-effort so evals still runs
 
       - name: Run ${{ matrix.skill }} evals
         env:
@@ -190,6 +193,7 @@ jobs:
           MODEL_ARGS=()
           if [ "${{ matrix.model }}" != "default" ]; then MODEL_ARGS=(--model "${{ matrix.model }}"); fi
           uv run evals --harness agy --skill ${{ matrix.skill }} "${MODEL_ARGS[@]}"
+        continue-on-error: true  # best-effort; digest degrades gracefully (matches cursor)
 
       - name: Upload results
         if: always()
diff --git a/evals/harnesses/codex/adapter.py b/evals/harnesses/codex/adapter.py
index 6a263c1..55507e5 100644
--- a/evals/harnesses/codex/adapter.py
+++ b/evals/harnesses/codex/adapter.py
@@ -123,15 +123,17 @@ def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
                max_budget, bare, full_auto) -> ParsedRun:
         tmpdir = tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_")
         try:
+            # Pick the sandbox once: full-auto needs write access for the agent
+            # to run the skill workflow; otherwise read-only. Passing --sandbox
+            # twice makes codex exit 2 ("cannot be used multiple times").
+            sandbox = "workspace-write" if full_auto else "read-only"
             cmd = [
                 "codex", "exec", "--json",
-                "--sandbox", "workspace-write",
+                "--sandbox", sandbox,
                 "--skip-git-repo-check",
             ]
             if model:
                 cmd += ["-m", model]
-            if not full_auto:
-                cmd += ["--sandbox", "read-only"]
             cmd.append(prompt)
             try:
                 proc = subprocess.run(cmd, capture_output=True, text=True,
diff --git a/evals/harnesses/cursor/adapter.py b/evals/harnesses/cursor/adapter.py
index 698ed51..fc2f2c6 100644
--- a/evals/harnesses/cursor/adapter.py
+++ b/evals/harnesses/cursor/adapter.py
@@ -176,19 +176,20 @@ def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
             # skill should be loaded (pre-shim always installed them).
             if load_skill:
                 _setup_skill(tmpdir)
-            api_key = os.environ.get("CURSOR_API_KEY", "")
             cmd = [
                 "agent", "-p", prompt,
                 "--output-format", "stream-json",
                 "--print",
                 "--trust",
             ]
-            if api_key:
-                cmd += ["--api-key", api_key]
             if model:
                 cmd += ["--model", model]
             if full_auto:
                 cmd.append("--force")
+            # Pass CURSOR_API_KEY via the environment, never on the command line
+            # (a CLI arg leaks the secret into process listings and logs). The
+            # agent CLI reads CURSOR_API_KEY from the environment directly.
+            env = dict(os.environ)
             try:
                 proc = subprocess.run(
                     cmd,
@@ -196,6 +197,7 @@ def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
                     text=True,
                     timeout=300,
                     cwd=tmpdir,
+                    env=env,
                 )
             except subprocess.TimeoutExpired:
                 return ParsedRun(error="timeout")

From 7c382907f86878eff1677714fa763c6a372d08f7 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Mon, 1 Jun 2026 10:57:50 -0600
Subject: [PATCH 40/61] ci(evals): install latest hawk CLI in the claude-code
 job
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The hawkscan skill grades on whether the agent runs the documented hawk
commands (hawk version/config/validate/scan) — all command_executed checks.
With no hawk on the runner, the agent improvised (docker) and never emitted a
hawk* trigger signal, so every triggering prompt scored FN.

Add JDK 17 (hawk is a Java app) and install the latest hawk via the repo's
own documented method: resolve the version from
api.stackhawk.com/hawkscan/version, download the Linux ZIP, unzip it, and add
it to PATH. The install and verify steps are continue-on-error so a flaky
download still lets evals run and record state.

Auth (HAWK_API_KEY) and hawkop are not wired here; the api skill and live-app
checks remain blocked until those land. This isolates "does installing hawk
flip the hawkscan trigger + preflight/validate/scan checks green".

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/skill-evals.yml | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/.github/workflows/skill-evals.yml b/.github/workflows/skill-evals.yml
index 34b2d5f..7b27a47 100644
--- a/.github/workflows/skill-evals.yml
+++ b/.github/workflows/skill-evals.yml
@@ -80,6 +80,31 @@ jobs:
       - name: Verify claude CLI
         run: claude --version
 
+      # hawk CLI is a Java app; the Linux ZIP needs a JDK 17+ on PATH.
+      - uses: actions/setup-java@v4
+        with:
+          distribution: temurin
+          java-version: "17"
+
+      # Install the latest hawk so the hawkscan skill can follow its documented
+      # CLI path (hawk version/config/validate/scan). Without it the agent
+      # improvises (docker rabbit hole) and never emits a hawk* trigger signal.
+      # Version + URL pattern per the repo's own install reference
+      # (cursor/.cursor/rules/stackhawk-hawkscan-install.mdc).
+      - name: Install latest hawk CLI
+        run: |
+          set -euo pipefail
+          HAWK_VERSION="$(curl -fsSL https://api.stackhawk.com/hawkscan/version)"
+          echo "Installing hawk ${HAWK_VERSION}"
+          curl -fLo /tmp/hawk.zip "https://download.stackhawk.com/hawk/cli/hawk-${HAWK_VERSION}.zip"
+          unzip -q /tmp/hawk.zip -d "${HOME}"
+          echo "${HOME}/hawk-${HAWK_VERSION}" >> "${GITHUB_PATH}"
+        continue-on-error: true  # if version/download endpoint hiccups, evals still runs and records it
+
+      - name: Verify hawk CLI
+        run: hawk version
+        continue-on-error: true  # absence is captured per-prompt in the eval traces
+
       - name: Run ${{ matrix.skill }} evals (${{ matrix.model }})
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}

From 7b79cf9d7b9c93e6d39680f5ef1077b86ba06362 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Mon, 1 Jun 2026 11:01:31 -0600
Subject: [PATCH 41/61] ci(evals): install hawk via official hawkscan-action
 (install-only)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the hand-rolled curl/unzip with stackhawk/hawkscan-action@v2.5.0 using
installCLIOnly: true — the maintained, canonical install path (resolves latest,
handles the download/PATH). Keep setup-java@17 (hawk is a Java app; the action
ships the CLI, not a JRE) and the post-install `hawk version` verify.

apiKey is passed from the (currently empty) HAWK_API_KEY secret; install-only
performs no scan so the key is unused. Step is continue-on-error so a missing
key can't abort the job — evals still runs and records hawk availability.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/skill-evals.yml | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/skill-evals.yml b/.github/workflows/skill-evals.yml
index 7b27a47..88082f0 100644
--- a/.github/workflows/skill-evals.yml
+++ b/.github/workflows/skill-evals.yml
@@ -80,26 +80,23 @@ jobs:
       - name: Verify claude CLI
         run: claude --version
 
-      # hawk CLI is a Java app; the Linux ZIP needs a JDK 17+ on PATH.
+      # hawk CLI is a Java app; ensure a JDK 17+ is on PATH for it.
       - uses: actions/setup-java@v4
         with:
           distribution: temurin
           java-version: "17"
 
-      # Install the latest hawk so the hawkscan skill can follow its documented
-      # CLI path (hawk version/config/validate/scan). Without it the agent
-      # improvises (docker rabbit hole) and never emits a hawk* trigger signal.
-      # Version + URL pattern per the repo's own install reference
-      # (cursor/.cursor/rules/stackhawk-hawkscan-install.mdc).
+      # Install the latest hawk via StackHawk's official action in install-only
+      # mode (no scan). It downloads the CLI and adds it to PATH so the hawkscan
+      # skill can follow its documented CLI path (hawk version/config/validate/
+      # scan). Without hawk the agent improvises and never emits a hawk* signal.
       - name: Install latest hawk CLI
-        run: |
-          set -euo pipefail
-          HAWK_VERSION="$(curl -fsSL https://api.stackhawk.com/hawkscan/version)"
-          echo "Installing hawk ${HAWK_VERSION}"
-          curl -fLo /tmp/hawk.zip "https://download.stackhawk.com/hawk/cli/hawk-${HAWK_VERSION}.zip"
-          unzip -q /tmp/hawk.zip -d "${HOME}"
-          echo "${HOME}/hawk-${HAWK_VERSION}" >> "${GITHUB_PATH}"
-        continue-on-error: true  # if version/download endpoint hiccups, evals still runs and records it
+        uses: stackhawk/hawkscan-action@v2.5.0
+        with:
+          apiKey: ${{ secrets.HAWK_API_KEY }}
+          version: latest
+          installCLIOnly: true
+        continue-on-error: true  # install-only shouldn't need apiKey; don't let a check abort the job
 
       - name: Verify hawk CLI
         run: hawk version

From f7d47fce969008b2692c4ef88bfd51358d2b67c1 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Mon, 1 Jun 2026 11:06:08 -0600
Subject: [PATCH 42/61] ci(evals): install hawk in every harness job, not just
 claude-code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All four AI harnesses (claude-code, codex, agy, cursor) drive an agent that
runs the hawkscan skill, which needs the hawk CLI on PATH. Previously only
claude-code installed it. Add the same install-only step (setup-java@17 +
stackhawk/hawkscan-action@v2.5.0 installCLIOnly + hawk version verify) to the
codex, agy, and cursor jobs.

Note: the api skill prefers hawkop (separate CLI) and codex/agy/cursor still
have their own tool-CLI provisioning blockers (codex auth, agy/cursor install)
— tracked separately. This change covers hawk specifically, for every job.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/skill-evals.yml | 48 +++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/.github/workflows/skill-evals.yml b/.github/workflows/skill-evals.yml
index 88082f0..cbaea4e 100644
--- a/.github/workflows/skill-evals.yml
+++ b/.github/workflows/skill-evals.yml
@@ -160,6 +160,22 @@ jobs:
           echo y | codex plugin add hawkscan@stackhawk
           echo y | codex plugin add stackhawk-api@stackhawk
 
+      # hawk CLI (Java app) — the hawkscan skill needs it on PATH; install-only, no scan.
+      - uses: actions/setup-java@v4
+        with:
+          distribution: temurin
+          java-version: "17"
+      - name: Install latest hawk CLI
+        uses: stackhawk/hawkscan-action@v2.5.0
+        with:
+          apiKey: ${{ secrets.HAWK_API_KEY }}
+          version: latest
+          installCLIOnly: true
+        continue-on-error: true  # install-only shouldn't need apiKey; don't let a check abort the job
+      - name: Verify hawk CLI
+        run: hawk version
+        continue-on-error: true  # absence is captured per-prompt in the eval traces
+
       - name: Run ${{ matrix.skill }} evals (${{ matrix.model }})
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
@@ -208,6 +224,22 @@ jobs:
           echo y | agy plugin install plugins/api
         continue-on-error: true  # depends on agy CLI; best-effort so evals still runs
 
+      # hawk CLI (Java app) — the hawkscan skill needs it on PATH; install-only, no scan.
+      - uses: actions/setup-java@v4
+        with:
+          distribution: temurin
+          java-version: "17"
+      - name: Install latest hawk CLI
+        uses: stackhawk/hawkscan-action@v2.5.0
+        with:
+          apiKey: ${{ secrets.HAWK_API_KEY }}
+          version: latest
+          installCLIOnly: true
+        continue-on-error: true  # install-only shouldn't need apiKey; don't let a check abort the job
+      - name: Verify hawk CLI
+        run: hawk version
+        continue-on-error: true  # absence is captured per-prompt in the eval traces
+
       - name: Run ${{ matrix.skill }} evals
         env:
           AGY_API_KEY: ${{ secrets.AGY_API_KEY }}
@@ -254,6 +286,22 @@ jobs:
         run: agent --version
         continue-on-error: true  # CLI package name TBD; skip if unavailable
 
+      # hawk CLI (Java app) — the hawkscan skill needs it on PATH; install-only, no scan.
+      - uses: actions/setup-java@v4
+        with:
+          distribution: temurin
+          java-version: "17"
+      - name: Install latest hawk CLI
+        uses: stackhawk/hawkscan-action@v2.5.0
+        with:
+          apiKey: ${{ secrets.HAWK_API_KEY }}
+          version: latest
+          installCLIOnly: true
+        continue-on-error: true  # install-only shouldn't need apiKey; don't let a check abort the job
+      - name: Verify hawk CLI
+        run: hawk version
+        continue-on-error: true  # absence is captured per-prompt in the eval traces
+
       - name: Run ${{ matrix.skill }} evals
         env:
           CURSOR_API_KEY: ${{ secrets.CURSOR_API_KEY }}

From 7a789ec288cb3a66dffbb7e656cc5d796173c864 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Mon, 1 Jun 2026 11:36:07 -0600
Subject: [PATCH 43/61] ci(evals): fix agent-CLI plumbing for codex, agy,
 cursor
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Root causes found from the run-26769783222 step logs + traces:

- codex: 401 "Missing bearer" — `codex exec` reads stored credentials, not
  OPENAI_API_KEY. Add `printenv OPENAI_API_KEY | codex login --with-api-key`
  before the eval run (pipe via stdin, never as an arg).
- agy: `https://antigravity.google/install-cli` returns the site's HTML landing
  page, so `| bash` died with a syntax error and `agy` never installed. Use the
  real bootstrapper `/cli/install.sh`, add ~/.local/bin to PATH, and set
  ANTIGRAVITY_API_KEY (the env var agy actually reads) from the AGY_API_KEY secret.
- cursor: `@cursor/cli` 404s and the `cursor-agent` npm package ships no `agent`
  binary. Use the official installer `curl https://cursor.com/install | bash`,
  which symlinks `agent` into ~/.local/bin; add that to PATH.

claude-code plumbing already works (hw-05 trace: agent runs `hawk version` +
`hawk scan --help`); no change there. Installer URLs verified to serve real
shell scripts (application/x-sh) before wiring them in.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/skill-evals.yml | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/skill-evals.yml b/.github/workflows/skill-evals.yml
index cbaea4e..af0f0e9 100644
--- a/.github/workflows/skill-evals.yml
+++ b/.github/workflows/skill-evals.yml
@@ -152,6 +152,13 @@ jobs:
       - name: Verify codex CLI
         run: codex --version
 
+      # codex exec reads stored credentials, not OPENAI_API_KEY directly — without
+      # this it 401s ("Missing bearer"). Pipe the key via stdin (never as an arg).
+      - name: Authenticate codex CLI
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        run: printenv OPENAI_API_KEY | codex login --with-api-key
+
       - name: Install StackHawk skills (hawkscan + api)
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
@@ -209,8 +216,12 @@ jobs:
       - uses: astral-sh/setup-uv@v5
 
       - name: Install agy CLI
-        run: curl -fsSL https://antigravity.google/install-cli | bash
-        continue-on-error: true  # preview installer; don't abort the job — evals records the launch failure
+        run: |
+          # /cli/install.sh is the real bootstrapper; /install-cli returns the
+          # site's HTML landing page (piping that into bash is what broke before).
+          curl -fsSL https://antigravity.google/cli/install.sh | bash
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"   # installer drops `agy` here
+        continue-on-error: true  # don't abort the job — evals records any launch failure
 
       - name: Verify agy CLI
         run: agy --version
@@ -218,7 +229,7 @@ jobs:
 
       - name: Install StackHawk plugins
         env:
-          AGY_API_KEY: ${{ secrets.AGY_API_KEY }}
+          ANTIGRAVITY_API_KEY: ${{ secrets.AGY_API_KEY }}  # agy reads ANTIGRAVITY_API_KEY, not AGY_API_KEY
         run: |
           echo y | agy plugin install plugins/hawkscan
           echo y | agy plugin install plugins/api
@@ -242,7 +253,7 @@ jobs:
 
       - name: Run ${{ matrix.skill }} evals
         env:
-          AGY_API_KEY: ${{ secrets.AGY_API_KEY }}
+          ANTIGRAVITY_API_KEY: ${{ secrets.AGY_API_KEY }}  # agy reads ANTIGRAVITY_API_KEY, not AGY_API_KEY
         run: |
           MODEL_ARGS=()
           if [ "${{ matrix.model }}" != "default" ]; then MODEL_ARGS=(--model "${{ matrix.model }}"); fi
@@ -279,12 +290,16 @@ jobs:
           node-version: "20"
 
       - name: Install Cursor CLI
-        run: npm install -g @cursor/cli || npm install -g cursor-agent
-        continue-on-error: true  # package name TBD; update when stable
+        run: |
+          # Official installer; symlinks the `agent` binary into ~/.local/bin.
+          # (@cursor/cli / cursor-agent npm packages don't exist — they 404'd.)
+          curl https://cursor.com/install -fsS | bash
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+        continue-on-error: true  # best-effort; evals records any launch failure
 
       - name: Verify agent CLI
         run: agent --version
-        continue-on-error: true  # CLI package name TBD; skip if unavailable
+        continue-on-error: true  # absence is captured per-prompt in the eval traces
 
       # hawk CLI (Java app) — the hawkscan skill needs it on PATH; install-only, no scan.
       - uses: actions/setup-java@v4

From f7e0a3eb7e3238ac93554e3e91b3fe24838296bc Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Mon, 1 Jun 2026 12:17:52 -0600
Subject: [PATCH 44/61] fix(evals): codex bypasses bwrap sandbox in CI so the
 agent can run hawk
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After auth was fixed, codex still couldn't reach hawk: on Ubuntu-24.04 runners
the bubblewrap sandbox fails to initialize (unprivileged user namespaces are
gated by AppArmor) — "bwrap: loopback: Failed RTM_NEWADDR: Operation not
permitted" — so codex exits at sandbox startup before running any command
(33 occurrences across cells; codex issue #16334).

When CI is set, launch with --dangerously-bypass-approvals-and-sandbox instead
of --sandbox <mode>. Safe on an ephemeral runner in a throwaway tmpdir, and the
agent needs write+exec to run the hawkscan workflow anyway. Local runs keep the
real sandbox (workspace-write for full-auto, else read-only).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/harnesses/codex/adapter.py | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/evals/harnesses/codex/adapter.py b/evals/harnesses/codex/adapter.py
index 55507e5..ee27284 100644
--- a/evals/harnesses/codex/adapter.py
+++ b/evals/harnesses/codex/adapter.py
@@ -1,6 +1,7 @@
 """codex Harness adapter. Parsing + signals ported from pre-shim run-evals.py."""
 from __future__ import annotations
 import json
+import os
 import shutil
 import subprocess
 import tempfile
@@ -123,15 +124,26 @@ def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
                max_budget, bare, full_auto) -> ParsedRun:
         tmpdir = tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_")
         try:
-            # Pick the sandbox once: full-auto needs write access for the agent
-            # to run the skill workflow; otherwise read-only. Passing --sandbox
-            # twice makes codex exit 2 ("cannot be used multiple times").
-            sandbox = "workspace-write" if full_auto else "read-only"
-            cmd = [
-                "codex", "exec", "--json",
-                "--sandbox", sandbox,
-                "--skip-git-repo-check",
-            ]
+            # In CI the bubblewrap sandbox can't initialize (Ubuntu 24.04 blocks
+            # unprivileged user namespaces), so codex exits at sandbox startup
+            # before running any command — the agent can't reach hawk. Bypass the
+            # sandbox there; it's safe on an ephemeral runner in a throwaway tmpdir,
+            # and the agent needs write+exec to run the skill workflow anyway.
+            # Locally, keep the real sandbox (workspace-write for full-auto,
+            # else read-only). Passing --sandbox twice makes codex exit 2.
+            if os.environ.get("CI"):
+                cmd = [
+                    "codex", "exec", "--json",
+                    "--dangerously-bypass-approvals-and-sandbox",
+                    "--skip-git-repo-check",
+                ]
+            else:
+                sandbox = "workspace-write" if full_auto else "read-only"
+                cmd = [
+                    "codex", "exec", "--json",
+                    "--sandbox", sandbox,
+                    "--skip-git-repo-check",
+                ]
             if model:
                 cmd += ["-m", model]
             cmd.append(prompt)

From 755dd7bc42ecd4ca519ace025b2afa476aedcc91 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Mon, 1 Jun 2026 12:19:03 -0600
Subject: [PATCH 45/61] evals(agy): label the OAuth-only auth blocker
 distinctly

agy has no non-interactive auth (OAuth-only; upstream antigravity-cli#78 is open
and unimplemented), so in a browser-less CI runner it prints an auth URL and
times out. Detect that and set a clear error note so the digest attributes it to
the upstream limitation rather than a plumbing/eval failure on our side.

Boundary A (eval finds agy) and hawk-on-PATH are fixed; agy is the one remaining
harness that cannot run headlessly until upstream adds API-key auth.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/harnesses/agy/adapter.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/evals/harnesses/agy/adapter.py b/evals/harnesses/agy/adapter.py
index 6af16c3..44bc0ed 100644
--- a/evals/harnesses/agy/adapter.py
+++ b/evals/harnesses/agy/adapter.py
@@ -139,7 +139,14 @@ def launch(
             run = parse_stream(proc.stdout)
             run.returncode = proc.returncode
             run.stderr_tail = (proc.stderr or "")[-2000:]
-            if proc.returncode != 0 and not run.error:
+            # agy has no non-interactive auth (relies on OAuth; see upstream
+            # google-antigravity/antigravity-cli#78). In a browser-less CI runner
+            # it prints an auth URL and times out. Label that distinctly so the
+            # digest doesn't read it as an eval/plumbing failure on our side.
+            blob = (run.output_text + " " + run.stderr_tail).lower()
+            if "authentication required" in blob or "authentication timed out" in blob:
+                run.error = "agy: no headless auth (upstream antigravity-cli#78) — not runnable in CI"
+            elif proc.returncode != 0 and not run.error:
                 run.error = f"exit {proc.returncode}: {run.stderr_tail[-300:].strip()}"
             elif not run.output_text and not run.bash_commands and not run.error:
                 run.error = f"empty output (exit {proc.returncode})"

From 4b7008c946289c3a882ad1bf3c63c0ed05f107f8 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Mon, 1 Jun 2026 13:17:04 -0600
Subject: [PATCH 46/61] report(evals): collapse matrix into one pivot table in
 the run summary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Actions run summary showed ~14 tables (one per matrix cell), because every
eval job wrote its own render_job_summary to GITHUB_STEP_SUMMARY. Stop that; the
`report` job now aggregates all cell.json into a single pivot table:

  test | claude-code-haiku-4-5 | claude-code-sonnet-4-6 | codex-gpt-5.5 | ...
  hawkscan/hw-01 | ✅ | ❌ — false-negative | ❌ — blocking check failed | ...

Rows are skill/test, columns are platform-model (date stamp + redundant
"claude-" prefix trimmed), cells are a verdict emoji + a terse reason on
non-pass outcomes (`·` = that harness/model didn't run the test). Baseline/lift
extras kept as compact notes below the table, not as more tables.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/cli.py           |   6 +-
 evals/lib/reporting.py | 147 ++++++++++++++++++++++++++++-------------
 2 files changed, 105 insertions(+), 48 deletions(-)

diff --git a/evals/cli.py b/evals/cli.py
index b15f747..ff777a5 100644
--- a/evals/cli.py
+++ b/evals/cli.py
@@ -83,9 +83,9 @@ def main() -> None:
     cell = CellReport(platform=args.harness, skill=args.skill,
                       model=args.model or "default", commit=commit, results=results)
     (out_dir / "cell.json").write_text(cell.model_dump_json(indent=2))
-
-    from evals.lib.reporting import render_job_summary, write_github_summary
-    write_github_summary(render_job_summary(cell))
+    # Note: individual cells no longer write to GITHUB_STEP_SUMMARY — the `report`
+    # job aggregates every cell.json into one pivot table (render_digest), so the
+    # run summary holds a single table instead of one per matrix cell.
 
     if summary["false_positives"] or summary["false_negatives"] or \
             summary["total_blocking_failures"] > 0:
diff --git a/evals/lib/reporting.py b/evals/lib/reporting.py
index fd4bba4..be48088 100644
--- a/evals/lib/reporting.py
+++ b/evals/lib/reporting.py
@@ -1,6 +1,7 @@
 """Summaries + rich rendering for eval runs."""
 from __future__ import annotations
 import os
+import re
 from collections import Counter
 
 from rich.console import Console
@@ -90,60 +91,116 @@ def write_github_summary(md: str) -> None:
         fp.write(md)
 
 
+_PLATFORM_ORDER = {p: i for i, p in  # platform -> left-to-right column rank
+                   enumerate(["claude-code", "codex", "cursor", "agy", "copilot"])}
+_PIVOT_ICON = {"pass": "✅", "pass-slow": "◆", "fail": "❌"}  # verdict value -> cell icon
+
+
+def _short_model(model: str) -> str:
+    """Return a compact column label for *model*: strip a trailing date stamp
+    and a leading 'claude-'. 'claude-haiku-4-5-20251001' -> 'haiku-4-5'."""
+    undated = re.sub(r"-\d{6,}$", "", model)
+    # removeprefix leaves the string unchanged when the prefix is absent.
+    label = undated.removeprefix("claude-")
+    return label or model
+
+
+def _id_sort_key(run_id: str):
+    found = re.search(r"(\d+)", run_id)  # first digit run, compared as an int
+    return (0 if found is None else int(found[1]), run_id)
+
+
+def _fail_reason(r: EvalResult) -> str:
+    """Table-safe failure reason: the result's note if set, else a derived cause."""
+    text = (r.note or "").strip()
+    if not text and not r.trigger_correct:
+        text = "false-positive" if r.did_trigger else "false-negative"
+    elif not text and r.budget_breaches:
+        text = "; ".join(r.budget_breaches)
+    elif not text:
+        text = "blocking check failed"
+    text = text.replace("|", "/").replace("\n", " ").strip()  # keep the table row intact
+    return text if len(text) <= 70 else text[:69] + "…"
+
+
+def _pivot_cell(r: EvalResult | None) -> str:
+    """One matrix cell: emoji, plus a terse, table-safe reason on non-pass outcomes."""
+    if r is None:
+        return "·"   # this harness/model didn't run this test
+    if r.verdict.value == "pass":
+        return _PIVOT_ICON["pass"]
+    if r.verdict.value == "pass-slow":
+        # Sanitise like _fail_reason: a raw '|' or newline would split the table row.
+        why = ("; ".join(r.budget_breaches) or "slow").replace("|", "/").replace("\n", " ")
+        return f"{_PIVOT_ICON['pass-slow']} — {why}"[:74]
+    return f"{_PIVOT_ICON['fail']} — {_fail_reason(r)}"
+
+
 def render_digest(cells, baselines=None, lift=None) -> str:
-    from evals.lib.baseline import diff as _diff, score_delta
+    """One aggregated pivot table for the whole matrix.
+
+    Rows are tests (skill/id); columns are platform-model combos; each cell is a
+    verdict emoji plus a short reason on non-pass outcomes. `baselines` (keyed by
+    (platform, skill, model)) and `lift` only add compact notes below the table.
+    """
     out = ["<!-- skill-eval-comment -->", "## Skill Eval Results\n"]
-    out.append("| platform | skill | model | trigger | ✅/◆/❌ | score | vs base |")
-    out.append("|---|---|---|---|---|---|---|")
-    for cell in cells:
-        c = Counter(r.verdict.value for r in cell.results)
-        n = len(cell.results); trig = sum(1 for r in cell.results if r.trigger_correct)
-        graded = [r for r in cell.results if r.did_trigger and r.should_trigger]
-        avg = sum(r.score for r in graded) // len(graded) if graded else 0
-        ticon = "✅" if trig == n else "❌"
-        vs = "—"
-        if baselines is not None:
-            b = baselines.get((cell.platform, cell.skill, cell.model))
-            if b is not None:
-                bg = [r for r in b.results if r.did_trigger and r.should_trigger]
-                bavg = sum(r.score for r in bg) // len(bg) if bg else 0
-                delta = score_delta(avg, bavg)
-                vs = f"{badge(delta, delta)}"
-        out.append(f"| {cell.platform} | {cell.skill} | {cell.model} | {ticon} {trig}/{n} | "
-                   f"{c.get('pass',0)}/{c.get('pass-slow',0)}/{c.get('fail',0)} | {avg} | {vs} |")
+    if not cells:
+        out.append("_No results._\n")
+        return "\n".join(out) + "\n"
+
+    cols = sorted({(c.platform, c.model) for c in cells},
+                  key=lambda pm: (_PLATFORM_ORDER.get(pm[0], 99), pm[1]))
+    col_label = {pm: f"{pm[0]}-{_short_model(pm[1])}" for pm in cols}
+
+    lookup: dict[tuple, EvalResult] = {}
+    row_keys: dict[tuple, bool] = {}
+    for c in cells:
+        for r in c.results:
+            lookup[(c.platform, c.model, c.skill, r.run_id)] = r
+            row_keys[(c.skill, r.run_id)] = True
+    skill_rank = {"hawkscan": 0, "api": 1}  # other skills follow, grouped by name
+    rows = sorted(row_keys, key=lambda sr: (skill_rank.get(sr[0], 9), sr[0], *_id_sort_key(sr[1])))
+
+    out.append("| test | " + " | ".join(col_label[pm] for pm in cols) + " |")
+    out.append("|---" * (len(cols) + 1) + "|")
+    for skill, rid in rows:
+        line = " | ".join(_pivot_cell(lookup.get((pm[0], pm[1], skill, rid)))
+                          for pm in cols)
+        out.append(f"| {skill}/{rid} | {line} |")
     out.append("")
+    out.append("_Legend: ✅ pass · ◆ pass-slow · ❌ fail — reason follows the icon "
+               "on non-pass cells; `·` = not run._\n")
+
+    # Optional, compact extras (kept off the main table to avoid the old sprawl).
     if baselines is None:
         out.append("_No baseline available — showing absolute results only._\n")
-    for cell in cells:
-        out.append(render_job_summary(cell))
-        if baselines is not None:
-            base = baselines.get((cell.platform, cell.skill, cell.model))
+    else:
+        from evals.lib.baseline import diff as _diff, score_delta
+        notes = []
+        for c in cells:
+            base = baselines.get((c.platform, c.skill, c.model))
             if base is None:
-                out.append("_no baseline for this cell._\n")
-            else:
-                d = _diff(cell, base)
-                changed = {k: v for k, v in d.items()
-                           if v in ("regressed", "fixed", "changed")}
-                if changed:
-                    out.append("**vs baseline:** " + ", ".join(
-                        f"{badge(v, v)} {k}" for k, v in sorted(changed.items())) + "\n")
-                else:
-                    out.append("_vs baseline: no changes._\n")
+                continue
+            tag = f"{c.platform}-{_short_model(c.model)}/{c.skill}"
+            for k, v in sorted(_diff(c, base).items()):
+                if v in ("regressed", "fixed", "changed"):
+                    notes.append(f"{badge(v, v)} {tag}:{k}")
+            g = [r for r in c.results if r.did_trigger and r.should_trigger]
+            bg = [r for r in base.results if r.did_trigger and r.should_trigger]
+            avg = sum(r.score for r in g) // len(g) if g else 0
+            bavg = sum(r.score for r in bg) // len(bg) if bg else 0
+            delta = score_delta(avg, bavg)
+            if delta in ("better", "worse"):
+                notes.append(f"{badge(delta, delta)} {tag}")
+        out.append(("**vs baseline:** " + ", ".join(notes) + "\n") if notes
+                   else "_vs baseline: no changes._\n")
+
     if lift:
         out.append("\n### Skill lift (with vs without)\n")
-        for key, rows in lift.items():
-            lifted = sum(1 for r in rows if r["effect"] == "lift")
+        for key, rws in lift.items():
+            lifted = sum(1 for r in rws if r["effect"] == "lift")
             out.append(f"**{key[0]} · {key[1]} · {key[2]}** — "
-                       f"{lifted}/{len(rows)} prompts lifted FAIL→PASS\n")
-            out.append("| test | without | with | |")
-            out.append("|---|---|---|---|")
-            for r in rows:
-                eff = {"lift": badge('fixed', '↑ lift'),
-                       "regress": badge('regressed', '↓ regress'),
-                       "none": ""}[r["effect"]]
-                out.append(f"| {r['id']} | {r['without_verdict']} | "
-                           f"{r['with_verdict']} | {eff} |")
-            out.append("")
+                       f"{lifted}/{len(rws)} prompts lifted FAIL→PASS\n")
     return "\n".join(out) + "\n"
 
 
From 221d47d8ea37b504d0a9df57f747f3457fcaf4bf Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Mon, 1 Jun 2026 14:11:53 -0600
Subject: [PATCH 47/61] ci(evals): drop --bare so claude-code skills
 auto-trigger
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

--bare is "minimal mode": per `claude --help` it skips hooks/LSP/plugins and
"skills still resolve via /skill-name" — i.e. skills do NOT auto-trigger from
their description. The eval prompts are natural language, so in bare mode the
skill never fired (the agent ran as a vanilla model and gave generic DAST advice,
even naming ZAP). That produced ~all false-negatives on positive prompts.

Run in full plugin mode instead (also the realistic user experience). Isolated
change — measuring trigger rate and whether negative controls now over-trigger
before layering in HAWK_API_KEY / hawkop.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/skill-evals.yml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/skill-evals.yml b/.github/workflows/skill-evals.yml
index af0f0e9..b8c41f6 100644
--- a/.github/workflows/skill-evals.yml
+++ b/.github/workflows/skill-evals.yml
@@ -102,12 +102,16 @@ jobs:
         run: hawk version
         continue-on-error: true  # absence is captured per-prompt in the eval traces
 
+      # No --bare: --bare is "minimal mode" where skills only resolve via an
+      # explicit /skill-name and do NOT auto-trigger from their description, so
+      # natural-language prompts never fire the skill (all false-negatives).
+      # Full plugin mode is also the realistic user experience (hooks + skill).
       - name: Run ${{ matrix.skill }} evals (${{ matrix.model }})
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
         run: |
           uv run evals --harness claude-code --skill ${{ matrix.skill }} \
-            --model ${{ matrix.model }} --bare --max-budget 0.15
+            --model ${{ matrix.model }} --max-budget 0.15
 
       - name: Skill lift (compare with/without)
         if: github.event_name == 'pull_request'
@@ -115,7 +119,7 @@ jobs:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
         run: |
           uv run compare --harness claude-code --skill ${{ matrix.skill }} \
-            --model ${{ matrix.model }} --bare --max-budget 0.15 || true
+            --model ${{ matrix.model }} --max-budget 0.15 || true
 
       - name: Upload results
         if: always()

From ee2d17c16e8daf2a0b0a8d86ecd9fdd7085291ab Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Mon, 1 Jun 2026 15:07:02 -0600
Subject: [PATCH 48/61] evals(claude-code): observe-mode suffix so skill
 triggering can be gauged
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The CI sandbox has no running app/credentials, so a triggered agent correctly
stops and asks for a target instead of completing a scan — leaving the
workflow process-checks (which scan bash_commands + output_text) unsatisfied.

In observe mode (default, no --bare, not full-auto) append a suffix asking the
agent to (1) declare the StackHawk skill it would invoke in a signal-matching
format and (2) outline the CLI commands that skill's workflow runs. The
declaration drives trigger detection; the outline satisfies the workflow checks
via output_text — reproducing origin/main's observe-mode intent (gauge that the
right skill triggers and the agent knows its workflow) without a real target.

The commands are intentionally NOT listed in the suffix — producing them is the
skill's job. Full-auto/extended (real target) keeps the original prompt with no
suffix appended (unrelated to the --bare flag).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/harnesses/claude-code/adapter.py | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/evals/harnesses/claude-code/adapter.py b/evals/harnesses/claude-code/adapter.py
index 3787a06..ec4b53b 100644
--- a/evals/harnesses/claude-code/adapter.py
+++ b/evals/harnesses/claude-code/adapter.py
@@ -33,6 +33,23 @@
     ],
 }
 
+# Observe mode: the CI sandbox has no running app / credentials, so the agent
+# can't execute a full scan — it would stop and ask for a target. We're gauging
+# whether the right skill TRIGGERS and whether the agent knows its WORKFLOW, so
+# we ask it to declare the skill and outline the commands it would run. The
+# declaration matches INVOCATION_SIGNALS; the outlined commands match the
+# process-check signals (which scan bash_commands + output_text). We deliberately
+# do NOT list the commands here — producing them is the skill's job, i.e. the test.
+# Appended only in observe mode (not full-auto / extended, which uses a real target).
+OBSERVE_SUFFIX = (
+    "\n\n---\n"
+    "(Eval harness — observe mode. Before doing anything else, output:\n"
+    "1. A decision line naming the StackHawk skill this request should invoke, "
+    "written exactly as `hawkscan:hawkscan: YES`, `stackhawk-api:api: YES`, or `none: NO`.\n"
+    "2. If a skill applies, the specific CLI commands that skill's documented "
+    "workflow would run, in order. Then proceed as normal.)"
+)
+
 
 def parse_stream(raw: str) -> ParsedRun:
     bash, written, edited, text, cost, err = [], [], [], "", 0.0, None
@@ -85,7 +102,11 @@ def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
                max_budget, bare, full_auto) -> ParsedRun:
         tmpdir = tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_")
         try:
-            cmd = ["claude", "-p", prompt, "--output-format", "stream-json",
+            # Observe mode (default): ask the agent to declare + outline its
+            # workflow. Full-auto/extended runs against a real target execute for
+            # real, so they use the original prompt with no suffix appended.
+            effective_prompt = prompt if full_auto else prompt + OBSERVE_SUFFIX
+            cmd = ["claude", "-p", effective_prompt, "--output-format", "stream-json",
                    "--verbose", "--no-session-persistence",
                    "--max-budget-usd", str(max_budget)]
             if model:

From 482206b79d47e94900b880c821821f7f46ad9211 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Mon, 1 Jun 2026 15:24:36 -0600
Subject: [PATCH 49/61] ci(evals): install hawkop CLI in every harness job
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The hawkscan Step-1 dedup checks (hawkop app list / env list) and the entire
api skill require hawkop — a separate native CLI we never installed, so agents
couldn't run/narrate those steps. No official GitHub Action exists for hawkop,
so install the native Linux binary directly (download.stackhawk.com/hawkop,
latest-version.txt + x86_64-unknown-linux-gnu tarball) into /usr/local/bin,
right beside the hawk install in all four jobs. continue-on-error so a flaky
download never aborts the job. No runtime deps (native binary; no JDK needed).

URL/version per the repo's own api skill reference (hawkop-shortcuts.md);
tarball verified to contain a top-level `hawkop` binary before wiring it in.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/skill-evals.yml | 76 +++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)

diff --git a/.github/workflows/skill-evals.yml b/.github/workflows/skill-evals.yml
index b8c41f6..9692a5e 100644
--- a/.github/workflows/skill-evals.yml
+++ b/.github/workflows/skill-evals.yml
@@ -102,6 +102,25 @@ jobs:
         run: hawk version
         continue-on-error: true  # absence is captured per-prompt in the eval traces
 
+      # hawkop (StackHawk platform/API CLI) — needed for the hawkscan Step-1 dedup
+      # checks (hawkop app list / env list) and the entire api skill. No official
+      # action exists, so install the native Linux binary straight into
+      # /usr/local/bin (already on PATH). Version + URL per the repo's api skill
+      # reference (plugins/api/skills/api/references/hawkop-shortcuts.md).
+      - name: Install hawkop CLI
+        run: |
+          set -euo pipefail
+          HAWKOP_VERSION="$(curl -fsSL https://download.stackhawk.com/hawkop/latest-version.txt)"
+          echo "Installing hawkop v${HAWKOP_VERSION}"
+          curl -fLo /tmp/hawkop.tar.gz \
+            "https://download.stackhawk.com/hawkop/cli/hawkop-v${HAWKOP_VERSION}-x86_64-unknown-linux-gnu.tar.gz"
+          tar -xzf /tmp/hawkop.tar.gz -C /tmp
+          sudo mv /tmp/hawkop /usr/local/bin/hawkop
+        continue-on-error: true  # don't abort the job — evals records any absence
+      - name: Verify hawkop CLI
+        run: hawkop --version
+        continue-on-error: true  # absence is captured per-prompt in the eval traces
+
       # No --bare: --bare is "minimal mode" where skills only resolve via an
       # explicit /skill-name and do NOT auto-trigger from their description, so
       # natural-language prompts never fire the skill (all false-negatives).
@@ -187,6 +206,25 @@ jobs:
         run: hawk version
         continue-on-error: true  # absence is captured per-prompt in the eval traces
 
+      # hawkop (StackHawk platform/API CLI) — needed for the hawkscan Step-1 dedup
+      # checks (hawkop app list / env list) and the entire api skill. No official
+      # action exists, so install the native Linux binary straight into
+      # /usr/local/bin (already on PATH). Version + URL per the repo's api skill
+      # reference (plugins/api/skills/api/references/hawkop-shortcuts.md).
+      - name: Install hawkop CLI
+        run: |
+          set -euo pipefail
+          HAWKOP_VERSION="$(curl -fsSL https://download.stackhawk.com/hawkop/latest-version.txt)"
+          echo "Installing hawkop v${HAWKOP_VERSION}"
+          curl -fLo /tmp/hawkop.tar.gz \
+            "https://download.stackhawk.com/hawkop/cli/hawkop-v${HAWKOP_VERSION}-x86_64-unknown-linux-gnu.tar.gz"
+          tar -xzf /tmp/hawkop.tar.gz -C /tmp
+          sudo mv /tmp/hawkop /usr/local/bin/hawkop
+        continue-on-error: true  # don't abort the job — evals records any absence
+      - name: Verify hawkop CLI
+        run: hawkop --version
+        continue-on-error: true  # absence is captured per-prompt in the eval traces
+
       - name: Run ${{ matrix.skill }} evals (${{ matrix.model }})
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
@@ -255,6 +293,25 @@ jobs:
         run: hawk version
         continue-on-error: true  # absence is captured per-prompt in the eval traces
 
+      # hawkop (StackHawk platform/API CLI) — needed for the hawkscan Step-1 dedup
+      # checks (hawkop app list / env list) and the entire api skill. No official
+      # action exists, so install the native Linux binary straight into
+      # /usr/local/bin (already on PATH). Version + URL per the repo's api skill
+      # reference (plugins/api/skills/api/references/hawkop-shortcuts.md).
+      - name: Install hawkop CLI
+        run: |
+          set -euo pipefail
+          HAWKOP_VERSION="$(curl -fsSL https://download.stackhawk.com/hawkop/latest-version.txt)"
+          echo "Installing hawkop v${HAWKOP_VERSION}"
+          curl -fLo /tmp/hawkop.tar.gz \
+            "https://download.stackhawk.com/hawkop/cli/hawkop-v${HAWKOP_VERSION}-x86_64-unknown-linux-gnu.tar.gz"
+          tar -xzf /tmp/hawkop.tar.gz -C /tmp
+          sudo mv /tmp/hawkop /usr/local/bin/hawkop
+        continue-on-error: true  # don't abort the job — evals records any absence
+      - name: Verify hawkop CLI
+        run: hawkop --version
+        continue-on-error: true  # absence is captured per-prompt in the eval traces
+
       - name: Run ${{ matrix.skill }} evals
         env:
           ANTIGRAVITY_API_KEY: ${{ secrets.AGY_API_KEY }}  # agy reads ANTIGRAVITY_API_KEY, not AGY_API_KEY
@@ -321,6 +378,25 @@ jobs:
         run: hawk version
         continue-on-error: true  # absence is captured per-prompt in the eval traces
 
+      # hawkop (StackHawk platform/API CLI) — needed for the hawkscan Step-1 dedup
+      # checks (hawkop app list / env list) and the entire api skill. No official
+      # action exists, so install the native Linux binary straight into
+      # /usr/local/bin (already on PATH). Version + URL per the repo's api skill
+      # reference (plugins/api/skills/api/references/hawkop-shortcuts.md).
+      - name: Install hawkop CLI
+        run: |
+          set -euo pipefail
+          HAWKOP_VERSION="$(curl -fsSL https://download.stackhawk.com/hawkop/latest-version.txt)"
+          echo "Installing hawkop v${HAWKOP_VERSION}"
+          curl -fLo /tmp/hawkop.tar.gz \
+            "https://download.stackhawk.com/hawkop/cli/hawkop-v${HAWKOP_VERSION}-x86_64-unknown-linux-gnu.tar.gz"
+          tar -xzf /tmp/hawkop.tar.gz -C /tmp
+          sudo mv /tmp/hawkop /usr/local/bin/hawkop
+        continue-on-error: true  # don't abort the job — evals records any absence
+      - name: Verify hawkop CLI
+        run: hawkop --version
+        continue-on-error: true  # absence is captured per-prompt in the eval traces
+
       - name: Run ${{ matrix.skill }} evals
         env:
           CURSOR_API_KEY: ${{ secrets.CURSOR_API_KEY }}

From 9c5719fa22116a9bcd2ec9957e9fc6beddbb5a36 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Tue, 2 Jun 2026 09:51:54 -0600
Subject: [PATCH 50/61] evals: integrate origin/main's stackhawk-data-seed
 suite into the new world
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Merged origin/main (clean — the bootstrap→stackhawk-data-seed rename + new eval
suite touched disjoint files from our restructure). Accounting for evals:

- Verified the CSV→YAML migration lost nothing: hawkscan 20=20, api 16=16.
- Brought the new stackhawk-data-seed suite into the new format: converted
  prompts.csv (16 trigger/no-trigger cases) → prompts.yaml, removed the CSV
  (new world is yaml-only, matching hawkscan/api). process-checks.json +
  rubric-items.json carried over as-is; `uv run validate` passes for all three
  (data-seed: 16 prompts, 17 checks).
- Registered stackhawk-data-seed as a skill: CLI --skill choices + validate
  default list, the CI matrix (all 4 harness jobs), and the codex/agy
  plugin-install steps. claude-code (dynamic --plugin-dir) and cursor (copies
  all .mdc rules) already cover it.

Not yet wired: stackhawk-data-seed trigger signals in the 4 adapters (+ the
observe-suffix declaration option) — without them detect_trigger is always
False for data-seed. Deferred because it inherits the same observe-vs-extended
grading decision still open for hawkscan.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/skill-evals.yml             |  10 ++-
 evals/__pycache__/__init__.cpython-314.pyc    | Bin 0 -> 154 bytes
 evals/__pycache__/cli.cpython-314.pyc         | Bin 0 -> 13163 bytes
 evals/cli.py                                  |   8 +-
 .../lib/__pycache__/__init__.cpython-314.pyc  | Bin 0 -> 158 bytes
 .../lib/__pycache__/baseline.cpython-314.pyc  | Bin 0 -> 2953 bytes
 evals/lib/__pycache__/compare.cpython-314.pyc | Bin 0 -> 2922 bytes
 evals/lib/__pycache__/config.cpython-314.pyc  | Bin 0 -> 2722 bytes
 evals/lib/__pycache__/grading.cpython-314.pyc | Bin 0 -> 11213 bytes
 evals/lib/__pycache__/harness.cpython-314.pyc | Bin 0 -> 3801 bytes
 evals/lib/__pycache__/models.cpython-314.pyc  | Bin 0 -> 5475 bytes
 evals/lib/__pycache__/replay.cpython-314.pyc  | Bin 0 -> 2276 bytes
 .../lib/__pycache__/reporting.cpython-314.pyc | Bin 0 -> 19859 bytes
 evals/stackhawk-data-seed/prompts.csv         |  17 ----
 evals/stackhawk-data-seed/prompts.yaml        |  80 ++++++++++++++++++
 tests/__pycache__/__init__.cpython-314.pyc    | Bin 0 -> 154 bytes
 .../lib/__pycache__/__init__.cpython-314.pyc  | Bin 0 -> 158 bytes
 ...test_adapters.cpython-314-pytest-9.0.3.pyc | Bin 0 -> 19301 bytes
 ...test_baseline.cpython-314-pytest-9.0.3.pyc | Bin 0 -> 7215 bytes
 ...li_resilience.cpython-314-pytest-9.0.3.pyc | Bin 0 -> 5746 bytes
 .../test_compare.cpython-314-pytest-9.0.3.pyc | Bin 0 -> 8487 bytes
 .../__pycache__/test_compare.cpython-314.pyc  | Bin 0 -> 3199 bytes
 .../test_config.cpython-314-pytest-9.0.3.pyc  | Bin 0 -> 8231 bytes
 .../test_grading.cpython-314-pytest-9.0.3.pyc | Bin 0 -> 39560 bytes
 .../test_harness.cpython-314-pytest-9.0.3.pyc | Bin 0 -> 6910 bytes
 .../test_models.cpython-314-pytest-9.0.3.pyc  | Bin 0 -> 16811 bytes
 .../test_replay.cpython-314-pytest-9.0.3.pyc  | Bin 0 -> 5190 bytes
 ...est_reporting.cpython-314-pytest-9.0.3.pyc | Bin 0 -> 3235 bytes
 ...orting_render.cpython-314-pytest-9.0.3.pyc | Bin 0 -> 23278 bytes
 29 files changed, 90 insertions(+), 25 deletions(-)
 create mode 100644 evals/__pycache__/__init__.cpython-314.pyc
 create mode 100644 evals/__pycache__/cli.cpython-314.pyc
 create mode 100644 evals/lib/__pycache__/__init__.cpython-314.pyc
 create mode 100644 evals/lib/__pycache__/baseline.cpython-314.pyc
 create mode 100644 evals/lib/__pycache__/compare.cpython-314.pyc
 create mode 100644 evals/lib/__pycache__/config.cpython-314.pyc
 create mode 100644 evals/lib/__pycache__/grading.cpython-314.pyc
 create mode 100644 evals/lib/__pycache__/harness.cpython-314.pyc
 create mode 100644 evals/lib/__pycache__/models.cpython-314.pyc
 create mode 100644 evals/lib/__pycache__/replay.cpython-314.pyc
 create mode 100644 evals/lib/__pycache__/reporting.cpython-314.pyc
 delete mode 100644 evals/stackhawk-data-seed/prompts.csv
 create mode 100644 evals/stackhawk-data-seed/prompts.yaml
 create mode 100644 tests/__pycache__/__init__.cpython-314.pyc
 create mode 100644 tests/lib/__pycache__/__init__.cpython-314.pyc
 create mode 100644 tests/lib/__pycache__/test_adapters.cpython-314-pytest-9.0.3.pyc
 create mode 100644 tests/lib/__pycache__/test_baseline.cpython-314-pytest-9.0.3.pyc
 create mode 100644 tests/lib/__pycache__/test_cli_resilience.cpython-314-pytest-9.0.3.pyc
 create mode 100644 tests/lib/__pycache__/test_compare.cpython-314-pytest-9.0.3.pyc
 create mode 100644 tests/lib/__pycache__/test_compare.cpython-314.pyc
 create mode 100644 tests/lib/__pycache__/test_config.cpython-314-pytest-9.0.3.pyc
 create mode 100644 tests/lib/__pycache__/test_grading.cpython-314-pytest-9.0.3.pyc
 create mode 100644 tests/lib/__pycache__/test_harness.cpython-314-pytest-9.0.3.pyc
 create mode 100644 tests/lib/__pycache__/test_models.cpython-314-pytest-9.0.3.pyc
 create mode 100644 tests/lib/__pycache__/test_replay.cpython-314-pytest-9.0.3.pyc
 create mode 100644 tests/lib/__pycache__/test_reporting.cpython-314-pytest-9.0.3.pyc
 create mode 100644 tests/lib/__pycache__/test_reporting_render.cpython-314-pytest-9.0.3.pyc

diff --git a/.github/workflows/skill-evals.yml b/.github/workflows/skill-evals.yml
index 9692a5e..298b841 100644
--- a/.github/workflows/skill-evals.yml
+++ b/.github/workflows/skill-evals.yml
@@ -64,7 +64,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        skill: [hawkscan, api]
+        skill: [hawkscan, api, stackhawk-data-seed]
         model: [claude-sonnet-4-6, claude-opus-4-7, claude-haiku-4-5-20251001]
 
     steps:
@@ -159,7 +159,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        skill: [hawkscan, api]
+        skill: [hawkscan, api, stackhawk-data-seed]
         model: [gpt-5.5, o3]
 
     steps:
@@ -189,6 +189,7 @@ jobs:
           codex plugin marketplace add .
           echo y | codex plugin add hawkscan@stackhawk
           echo y | codex plugin add stackhawk-api@stackhawk
+          echo y | codex plugin add stackhawk-data-seed@stackhawk
 
       # hawk CLI (Java app) — the hawkscan skill needs it on PATH; install-only, no scan.
       - uses: actions/setup-java@v4
@@ -250,7 +251,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        skill: [hawkscan, api]
+        skill: [hawkscan, api, stackhawk-data-seed]
         model: [default]
 
     steps:
@@ -275,6 +276,7 @@ jobs:
         run: |
           echo y | agy plugin install plugins/hawkscan
           echo y | agy plugin install plugins/api
+          echo y | agy plugin install plugins/stackhawk-data-seed
         continue-on-error: true  # depends on agy CLI; best-effort so evals still runs
 
       # hawk CLI (Java app) — the hawkscan skill needs it on PATH; install-only, no scan.
@@ -340,7 +342,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        skill: [hawkscan, api]
+        skill: [hawkscan, api, stackhawk-data-seed]
         model: [default]
 
     steps:
diff --git a/evals/__pycache__/__init__.cpython-314.pyc b/evals/__pycache__/__init__.cpython-314.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1990f70eedd4cd32a352f0cbd78158fb52cf523d
GIT binary patch
literal 154
zcmdPq<K<!ig2^`}vq1D?5CH>>P{wCAAftgHh(Vb_lhJP_LlF~@{~08C%UnOSIJKx)
zKdC4&FC{-uuRO6RMc+9;B~?E$JvFaHw>Ud9C#P6HwJZ@x#>Z#oWtPOp>lIYq;;_lh
cPbtkwwJTx;ngg<_7{vI*%*e=C#0+Es0ESB=fdBvi

literal 0
HcmV?d00001

diff --git a/evals/__pycache__/cli.cpython-314.pyc b/evals/__pycache__/cli.cpython-314.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..238dc5be17a18175947cc896b1ad6df7cd8a1bba
GIT binary patch
literal 13163
zcmd5jZE#c9mG4PU-(R+5<1dV5Fc@LW24iCb27<vx=EKMjBNn3I2um1&kRR_o1<)oL
zXWD6;W~Z1oyO_4SnC|XicGJa7I}@|b4z$fq$gfJ+j>L<ZY`g7r`VRuxw)D@QbDy3i
zj9BP)JG0mJz2}~L?)SO(o{#4`tHnq`IQ54m$N#O8qW%*zN>IxH{ji;;s9}nwMCvfb
z(%rO2C90|rR&}dIHAxwfA*n{xK&tN6dbFa}qZ4(6j_KBW45ER+nr@@VB$^1U?KXQX
zqJ_Y^ZmY*8+6b)gF7OnJb^;r^9iAew$Wts9dlrd{JSAcY!5h0vJ!N8971cnorF&_y
z{1rnr^-4uEj1_}-y`1L8&6$jMLoQZ^SyR0!ixro!<`Rmvlu(fZv5K_<Y$LFfEdaQX
zz}2iBU<ZK<*&=|83A~hD1aJv~m$9m4l&dV>u|FC<8V&`Wq1OTt=eFJ54bIM}$e(p`
zL*b|>Y#?}{-Z?OIk_+&mdM6(m<O4wffIb`yh@l1-Etvw*=#Ur?!$VP_$EA|=SRjT^
zh9o`4@u6twDlIYIfOrh3jgg^1uwOV4jzl1*A?;x!gCVg$5Dai)h=;5$)02ylEJtGD
z2-Jw3JQ?86N@hM34TkuBF>oXjlB}6rW`vS%00uD>2|*K<3{LJ#G8_x=(U2g7gaO=)
z%!W`5z`#=ULni>kl#e>BYWpfh1p&91>ZOm-EX}G|HOn;Ol4*XU0y2r_;w1ro5Y|Kp
zHEiVvV<$sV(Th1=(()lO#z!Tl2j@+}K(t4P_>%!qz%G|cO^y43OlUm92cp5DXv66M
zA8gz<6bv;62B8%<Ss9@bcSL9$h=dzBm~(%BW{pGr{d^g;Qv`oP1w_NtZz&z!`T(Ou
zQQdt^CmE7`2*nhcECLv`@JA-Q0>*-q^(xCiisguf(*)dW-wP9O&78KXXVJpK_;E={
zU^UOB*RncR-<qlM%vKG`d3h#<(a+5HYGTdL?bpIuS=)2_EqIcOErcz1Jl478P3gM|
zyv>JB#ln0j2%L6oC_E4n9vj^53~N0$4MYO5V8}fH+X=+FVCZNd77_U>7;BtyyTd`r
z!0~XH#Qty)ID-j>1ThZnpM(mLxEX-J8TXM`5DvrOw=ZJ&WD7LJh-bMFzYJT^y4^?M
zC`v{_9O6U$A|J!Ur40<=@r#?>?xV3t#2tu<Lp@j3lKxoW^a)`g5S7#cE*y6VVqo9|
z;@ogX18yM{3cA#iIS>r?13wcsmJHtATl>29vU~PPnxk-7MHkIGpliavEBI0Xfg#~=
zog9i{ErDMHD8i620EVeFqj|gMt)9{K<Kmr`{^6b(=2Zo?al-Q9S;TZJn04bO%-1XV
z^)pPR0&jj_Gbv(eyMk#O-;Mb-N`BMC;^Cf8nRR?Q^!&uRMiwKizL<<D|G93Ymr+;M
zyd6k*oB&^p(ISXo&Lxn@XO_&F96J&%!2(<nJbRnzq6Zj_1W6(f1j^<^0YGFRkKs)l
zru_2--uZJgkD{vKJ!9$C%&FukzeXwZQBRx+or2G%$~(f*euIMb(O&YBDtS3oTH;iG
zi&6rVRt3(c^OS0Tp@Q|}a*^Q0HCqs>DP<h2D);TmKayMurLP8>V|*$v?!tmlK<(GW
zR1H*2)wLF+gI+K4;(}0{$p>MV7E?Zr7dc}=fU)GM+4dI{H3&PknA!yl?bGzAcC}EH
zfm%XwT2ABBDlHGbgfxDTOXR?+^S5WFSf)wsM{Xg(>&yT-3EY_TcVQ0Y(j1ZuSWO+K
zl@%hAhPK=8ad0|M5vNCKgERP7DYf$NW$s$Dy4>9J_l7h2j9$0Wel}(G@MFl%!KW>t
zI_a*zq9{&NLHX3)#w@J@!Je#E;B1<I4A0m<2B@2|brp!!172Z21qmr&*X3k0AYj{`
z#<IrN%<17wF#e5CQyjRS_e^E>ku&=@DRmCK41By<$z@aMInP-GY@V~~Gqa{ve77NR
z7S^19MS#~VK9lz)LiR)iB~~cucmhc>8|itDCR|?y(j-t90QL5#siCA0O1kov*qcnO
z!`uCI8PwSMG*;o``kW%xk(=YJ2mUeN$%RoAH9c{!c=zNqk^y~Ir3a874BnUXP*}1Q
z&+Mt<1$wf0UwOLq!zy;Ma))R8>7;j^lkXFMzf#ASBkA}9^Vs~Hke_P|&9HkIxB~wX
z83H{cs6g4Yn(`l2a^7PKlueb9dQW6=3Zxt=UjbXW0+2o<yCnAp$+rvsh*H}dRiJEI
zP5BDQI%Up{(mT1^s^D&G;|d8}-o2i)`|MtnEfxfjQdfeMig%DxkpR7~Eg*ud_w3U+
zWkz{UK{e;-D}?doU+-t_O`iKGPuoj<4xfQt)|z?OxguD}n&+*gujs|O40dRFea@RV
zf5zqauz+2Wx70>n%L2AGxAZX8_8zE0pl`r?mwm^UKE0YxzD{iUG7A4<ppVB?oF+^;
zDSwYrOG^bGVT+utPghO#<y~dK=D7>|^1kMP^(t*=Q?@uiURjS-)c;%_1j%SI)u;Ee
z*)|nuj|uR7N-mo|r$uth9;WuthlA`Q{{f|5pH1O?FGwH@1KfBeeQ+-~F>LAl%VatD
zC)|##8OE8zAxe+XQ`sLoD%R1YQ$}A!Rl=@wwk@_!)$R{vC`^Z`jp|A&OuJl?hUm1Q
ztuUYi4N5vL5*rLhg}5m?<dn4{XRSonqF!_~3@W@zGIoL%%!UNep(LFz#0SFz0PqAM
zpgN{QvjkWwBz@>iSP=V%PPjCZhG<i~4kjiUL80oG6)K4#ijx5qq<&P8K=aXak${Nm
zxuhX_w`2;2gP>G}2M0rZT(!&@w>VeI3O(v{o8l&yaoI9wi06lRqeR6EFeiv$k3fS&
zIAWABLotzyiTz^ejA)G8k>*H1I7W0PQ2GU<L>uGA29Xa8gc`&%VprUe(IB0nGXo(G
zRk;n$$EpoZmx1quY4G^$f@L8V1-((S3de?GU=hg-liz`bwoDE>;rS3Q(`%Xa(g=VG
zNetL%Lh(g7n5Q~3_QcH%^0GD@7lxv!QHH_F5)~x_ns7jEJjun)854{`V%!#r!0vEE
zya;<`9RS-+3{+Bi@}RTA>?K_+dLlY>I!csRmnJ(Kek++0s*z!lN0I5VVF21AjZ3zp
zVEqX7b3;N{48Il<6ht&M2zDk!l!#z*iu50e3=NzJM+f_l!s<is0vL?cgJCgl;6tyu
ziGc?7YgjQ}lo(Iv*6prMl3uahNY*EfH$;<_jHH^Z+9DH?)#p(vlysS;lhlI9^IAly
z2Tp@AK;~b8uR)XwHc7+rU_t}Bvx20RIYeTBNkyrE9D{~P_(%g`RDnk!lW)hGX0~(R
z{@s21`q{mE`+$*81jAra0;587;FzS9J0UB_qd>=R!w7{n$r=oau<P?Yp+}^4;8<wj
zgdiES91MpLc~>&!Fs)?D;a7<{4z@oqy5%ygWIWA>MX)yDmKr)09_4pqFHG0g?%iPC
z+q!QbkGHF&@9&QWPJ*Qh%n}?2);h^XCfXm2o#gtFzj!my@o3rPQFxKmz#=?DpqMxy
z8N$MlY%_z=0J)^WngYKcDU3qw2rL9xUIa-ECxWalOaM+W90AT%EU5yV#GqjU_U0(u
zGa0L*WRfqOeppk1STuN)hq9|rxKPqTTS&rxjf@BG1Ly`APD75XM293@<{Zf5QTGcR
ze-LRI<X-_z2+n*#wwy301K}wDH$a3Bs?ZIy9;W6Dg(-vcj=?!TIAdrS?wDf=M(s09
zQQBlVx9Oct?`|31{!3%gXy1iH=MPOAox|JatYu?0m+CLpPg`q;cg#5!y(eBc^UX8g
zijQ~CmbfRG4~*B1vn3m*9UBw&_TgRW^2$qf->#ds`YzUu_Fg-D?eN6msZ%$_8{#eT
z<2Qf&W^&8^bEig5U9214k+AwcH`(X(mQmB#_Dek%dlD6O6KfJy_Yb#yu=D!PS^bue
z93Pkdr1W<6M-{(jn$y+-=-*mC)_2K&(LZZ-O?1y%Tc((s#v8_2Ysc`8G-G;u&s%%O
zY9@+?_slR2bC#kJ`o5~f&_k!~WfvOGH;flvDZN}e(R^*g)eV!UJ`k^qQ!l4$>#oEv
z$ESkn`qkI=Ufp}E=4027UFo{UYn!ibzOSPk)ekAPW8=uS`+CYzp0ck@+E-3Et{uO6
zeA>Q#WZRs*WQ-nHUol=bPWFA^zwW<X^t19$%F{KqkJPlo1=X}vY57?7rJ9R1V~fV#
znAmsi@YTbK`u6)2y=iHB>GCVC%dRVp<Bb!1qPhu?YgeXM)?Kq*wIyoXAiI7+cGZ0y
zv#1Wn($IA6;MIeZLaNo1Z1qgLyT3Fri+0jbvWM=aG1E(rblx{o*5XXx@kGh0S<9-F
zd0o=HZZbAw-gLi+G8CM%zGEE~X7m+lL&1gM`QWH;tUcjaIcr#%(l;md&6E35EuLhH
zC*kc)w0Ndl*pz<XgBED^Q5&Tx94(n)%I2&Uu!B1<?wnw*nXj5B2dAx@VXrEdU26Y!
z`}=ytO;@y}Dz+pmwoF%S9o{jrE6J3mHRh2wXEl{6#+77T@7G*g^-rrN58UdVS+!$^
z@%)OZ`;1wfwiKK@@Xmod#qGEI(&bC;tD)bA6m4~W?p$$MdnI-`_V=fMtER2EoYwYq
z(?+1sn0{_59uqIUe)09m<_|Vp-;iGH#KEN&uT3moJK>t@y1fH7$Go2&-u@pjhQFNE
zl&6>#NoK{kn5x~FtlfBPO`>+=bZzGh(*+Btw<T=Ol&*S4S3Q0p;oh8DzGY_lmbBTC
zD00u58zu{9%uRE8(>eXu^`nK~Fg;LHmbJhC;sj0E4$!}Qw3;d?dqnAh3Gdn$j|Z<r
zE=Q(nZmznq>UQtX`abDP*Sk~o?aBJ~+so5UYg0`f$)=8(hK_X0x>QRi{3cpD#}8cb
zf6srr^=Iv$w5M0PQY+iwH?gwq@|vlEbWQ7r%vA4>bg4E^vdxod?Y?clz4i9m#L~Tw
zOd1EUz_$D5XRtupUNqW!q3?X(Xjh`V>4(iz^bcB7YqurWZc8+6ztwT;)UAQr_C&#M
zSYJyI{RM^bAMV?5Dvt%U(EMa+LvJhd(_$^e|K8fE>TOpg+fDm7Xzu8?uHWCHnW48f
zKr-8+0o?!2WXgl}?fGU>RC4Fdq*l7@0R_=0SW6*!+Dr=KwrnQ-B`AddKg^_{0OC8f
zAUsnoL__a_0JQc6D98e~C@(gT$rD^KD1t!XRK$g8=FqjtbXz=Djsf!q18V)eYNm+d
z-fFo{Zrq&7c=M=ltR`0l%|Gv43Rat&1NuD5S}p~cPc`1<xucO`a0Wf;4$`oO+<IP=
zCl&-S%B@_0f-I=(DliXgGv;A1J964a%BS5%vqqnmHRh_d-c?W6XH7t5dLC7yY8wq^
zVK5J)Sr}AXEtoLOiuqGn{VK|j2ayDyDvM>+^D416Ot~!RtR#L_9<7rg9HU6QP_WYt
zMI&cH>hIyvWWXbn5jmxg-wfbmLnM4ulx1e~>9BaLUy=H4m_tz}gW!9C+*QKAiVXxX
zf>}`f5sX0j&j`5#2;~_Dk!$&{Kq;zR;xQ0jW$71WJ=E1e)<d;yDDqke#8je_@o4nn
z(elE773*n0m5YUVJm)SQ{|@56j(CP2IxUc<NsIh95J~d_vVBI3lKn3iu?j}yVyUH{
z6r92{hShlmr#VyUSka~Oi{;~eSNxa#Q$;t+Z-6AQ^2*-Jd*RM^-EgIs)n3_rdGl2B
z&CNG9KhmhoyWkCasHHRo_n5_Zt@iiY-i@ZLHA!pDv~@+w>P}kSlcrhg#;M@V$c@OX
zb?5DavsPc?;47(v1IdE}v(|y9#U6ZCbDy*;@$bcd-qH0?O&dJ)-HxuQ-ur6WxPD|S
zJeiQZId*D%$!yi?RMom<)w;B?Vyrh&wR*O4O{%grS=kB?rExvfdO)hDy}dB%mHX&N
zluFY_KZ21vMw>rnN`L=_2Ada9Px)v`OQ+lX=e+MZ=w<%C*U_cugD%DU9+Ic;dmIzU
z{`A8hvXOtBTMQi683P)$G9Or#A6q4XRV$~GxRCQvGpq&-h-NS#Y8B(*Gil+L@}o*6
zw{3WWKIa1sg+jS&!4-%6P6FJLRt3(cfPLY216&iREkOXs9jX~5_*4x!jv>|rE^7;q
z_c6Iu`11gPGZQaT9$le*JUXs;Cq~s6;d9_p$vSQ)z&(5&VoW(VMFY~|((-Vx%6A4R
zd6_RXNDp##a(m#D=~8D8I6n#q0S6AJSw6{RbQlqqL4$3s3M8N8G^2fc?^}CEcl<-m
z@ZK4wR>^lyFn89yil~5s+BH%6kpVF~bM&P-qkXjb!iMu3;MgC!d<af@{f+t=;||cB
zK)G5rTKp+f@+WwWUrYpw<J1*Tv6F6oKtVJLUuQt_^pQXYBS+#323G{0dAb{x2vQY(
z-Nezv9<J_Idl*js(uwKTc(k0Fux%dZ0&lVgm>Uil2Josf-a;A9jqwS7!iSfi%6Om<
z+A}H7*%RMf<ga1z;m0$%-)7{ev%Vmmjf|k+86&`DUJ&MwAslp0n{#9|`Mk^CvIW&4
z%N8oVDQN-K1(tqJ2i{Gs7eByR5WwSy2W&y`>Ad(2$bz7ZPib38tEgR2gdzs)y}i58
zV@DsaPY*(l#=BxcBal_<<?%ProPnjmd#lMOG%3<m@E64TpO?mqx50v-jC;QRPq2O?
z^{jsKuB<~DHLU1+u%agL71l542(oNpu0Y6{edZS#QO1uC7uPfP!~7!q!CCxx^^uUz
zXAS#ddXaVxd_SMJ+Gok@tG49xRj*a!w(kP75pqSJG4D<red=oJKo~qqMe`4Z6?Q(4
zmph+dJJ)NP^<lHc`C9;Q!6N^b%;LaVP<9jj6-^o?2R>zb83o=Sr$PzdA9zU?Q|K)K
zkLUzN&3gz+e}hCBLAMV;@B~;g-}eQ>gJ6ZmkL%FfOrAFUpo-&pR0en#L~)DTE!(gK
zH`?#erp)8A#jW_wln{!9qaiGb8{KXcn+dFOyFp5h8{`feP6qiyPz8D}B;P42^@v6A
zJ6NKU3<^?^7;Go5J<Wq{-5FOoSH^WtVg+=$obm(ucQ^`|F~vtPA|^~n)&eZg2MpN!
zKVxwKBl6s$U=F5Dm@Is2X3ES9M<HqW!N|}N{!Jus$c9zrYfNHXmB923Us$58md6K3
zvS+(fM$Ug93(QKEw5!Lt5J*d$DFS~Ph-9H(gNq?7wBD;><zxx}7&fPlgrY8+JZ&;#
zoUA;rthlcs%Oj}<j*~I)|AI8vu(~R89L(jQSo}4~oeT=7mt<4`C6NV)_puU2<bnM)
zJaEI*XN&{H^?OX+J@`Cq-ky4F#?Uz2p-2i{<LxLXtXA@SCu*jcse?bVA=Z<@CW>k%
znx~2owOJwDJuXi6A!dVu*)d)-$xOaHy?QI6J8~qCwAnh`IcEiPZK`;AvUqu_xGq^-
zH?eEFxb0Kx`VrNAmD*68HW!}T{m$;O@>E%CvaEGVb<=#q{3Gjh*^W=mo(CGrw&6=7
zrD?faP(5CgTDms5bnSFObE=>{QP4ic40oqZw&Cq*o$2j2-hN~F4bX2YFIJ|?+{rR`
zs;u!&S!3E(IKrgui!Qj&yHob%N&E8g6QA0fMmoT}bisArHFjdw4rb7_V-bo-7uvts
zez%}B?WjyS)+8NkCYOESzV7}(<Fun=WcytEOE-7^>&{#J$FKkR^`E{u-R>LNj?Io2
zT`9j@KGApW(A7hU!nSGi`UfV;u@($l1x4pxA9;OL9AWMn7LCTHm>I(++ynjFp0_+H
zrX<OfjHzariaEwG(gi|V+xfQ9Rbw6B6~<e>6CaCA)FzBgpE7HI_o$j!>1adoXXYYM
zj||0+1vIDpw6LLjh3b<PrtWo`Pg-b*|D0;g-$IY`GMt`X|A2yMw1y;4%W$X+5E<?a
zVaop-wFcA_1E?uasx+t{F9^!J%+a7$pftH4%)i%G^4{mAX`uJ0<^DJonpC>rBIeY7
zoEHf`wQ{#}jNhza^Qkr<Eiyheya$@>w}T*o<}rvVEds6f3ECp1{cM^;tIPF0@AAoS
zBdZ4m$SCQRD^b>hQ1byT2xYJsOrCmqkPn5Tul(Y>-+H<66(_$FP;q0%2`$v4*H?C(
z$=|s@OC`cPTGo7UGk9#6$D5G<5Tm0Q@eqM!Cwu>C<?p-YYVSg+fVwHU-sHROBdmrI
zVZ8GI<SIwUXAEny_gVLy+FrbV*xc)<vij1YONYh|jRemff9Lo+{yXji!(E>hmW*iT
ztfgbc)7GjH^_<>z&hmB3m~}ihJ~+YtP(8WrgI(8mO|O0_(YW<i`CQrJv3-{gT|D%i
zszj-4q7n{YN!x!>v|;^24P}?Ld6-o-3{KLZWBd&G8e=@XR%p}W!jU2Hd<F*3x@sg{
ze}8aj0PI6XuoT6JgSekp;T&JZ=qnh7F+vTI_yGC&Co|Vo3g7h~4G+pr{le^nhW|c*
zuP1W5l!MRsX=Sf%j>i?=+eqN7tRG*&9w#wE?v{O$sIHTZ%xz3D(-AuvQGqWZ**Hez
zjXa6a8jL_flZ{m5c_RBrz9%L(1=(Op-wuy<EE3wxCxIBQ2}(Ex5v(yy-=#eFsHLA#
zw$CWjXO!`Cs`egL`YVi=-J@LhsMViSebZFmJ*w^(=9h*IcWsVQ&%1A?Y|E3j<<qv>
xl&v9YYnZH<wY3kMztqCN0(@Bn-zI<Ap#q!xmrKDA^JNjJp1<v);SM81`frvl(~|%I

literal 0
HcmV?d00001

diff --git a/evals/cli.py b/evals/cli.py
index ff777a5..b28b1e9 100644
--- a/evals/cli.py
+++ b/evals/cli.py
@@ -18,7 +18,7 @@
 
 
 def _common_args(p: argparse.ArgumentParser) -> None:
-    p.add_argument("--skill", required=True, choices=["hawkscan", "api"])
+    p.add_argument("--skill", required=True, choices=["hawkscan", "api", "stackhawk-data-seed"])
     p.add_argument("--harness", default="claude-code", choices=PLATFORMS)
     p.add_argument("--id", dest="prompt_id")
     p.add_argument("--model")
@@ -112,7 +112,7 @@ def compare() -> None:
 def regrade() -> None:
     ap = argparse.ArgumentParser(prog="regrade")
     ap.add_argument("trace", type=Path)
-    ap.add_argument("--skill", required=True, choices=["hawkscan", "api"])
+    ap.add_argument("--skill", required=True, choices=["hawkscan", "api", "stackhawk-data-seed"])
     ap.add_argument("--harness", default="claude-code", choices=PLATFORMS)
     args = ap.parse_args()
     res = _regrade(args.trace, skill=args.skill, platform=args.harness)
@@ -156,9 +156,9 @@ def report() -> None:
 
 def validate() -> None:
     ap = argparse.ArgumentParser(prog="validate")
-    ap.add_argument("--skill", choices=["hawkscan", "api"])
+    ap.add_argument("--skill", choices=["hawkscan", "api", "stackhawk-data-seed"])
     args = ap.parse_args()
-    skills = [args.skill] if args.skill else ["hawkscan", "api"]
+    skills = [args.skill] if args.skill else ["hawkscan", "api", "stackhawk-data-seed"]
     for skill in skills:
         cfg = load_skill(skill)   # raises on any validation error
         console.print(f"[green]✓[/] {skill}: {len(cfg.prompts)} prompts, "
diff --git a/evals/lib/__pycache__/__init__.cpython-314.pyc b/evals/lib/__pycache__/__init__.cpython-314.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0eec099b1401da63bec7e7e626a7771eec53f33d
GIT binary patch
literal 158
zcmdPq<K<!ig2^`}vq1D?5CH>>P{wCAAftgHh(Vb_lhJP_LlF~@{~08C%UVCQIJKx)
zKdC4&FC{-uuRO6RMc+9;B~?E$JvFaHw>Ud9C#P6HwJZ@x=42-6$H!;pWtPOp>lIYq
g;;_lhPbtkwwJTx;ngz107{vI*%*e=C#0+Es04|Co>i_@%

literal 0
HcmV?d00001

diff --git a/evals/lib/__pycache__/baseline.cpython-314.pyc b/evals/lib/__pycache__/baseline.cpython-314.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a9616b2a5c6f6ab1342061726c363072a8402882
GIT binary patch
literal 2953
zcmcgu-ES1v6+iQ_-|Jn2NjG*39=ybO$l{fYlR72B5EEi>n{1}ZI<(;FcxP;e3^UU^
zv*5T@v{Lh6L=Z+rVyh%d9w9Fw>O=e1B!7W1G~SGeTB+(o-b|r#Q}v<e-r2RywrSsb
zr2BR5xgY1;bAIPOmFY+V(!*HK+22P1{!9lg62cDmpp2awDB#01DBN*wg7^78+UJi8
z6QVD&zHmG;5%r^epolq8A~~=+{g{$cqBx4tk;VCOC61#6n@cFkJjm&Vu~L;7`%7>7
zXB=A^v>oZlaapQ3=Uk1L9<rQSNs~y`mb5v|v^^h3Gn!{urfpF6kjw=s&9)t1^G(P0
za442E{|vUtal^6{!*z&X;ZOxyC`GZLZrp=C*r`Doruosg0d(Z`!YkYvPT>?@5wM6#
zq$uDmqhIqV8pJA9LJZp{A|1tB*9fA-@T<gL$mnLp|B2_5f`rxSn}C;y&N}3r=6m$;
z{@`HoHP0YkafWEN?$|>YG@=*B9o;BubGY$-?+w$kyrOYl!`d=uimfw-+&6=?sx}oh
zRF!n#hH)(3Zmeps4%@iweOfK;Tcgd5Bx`QH#h4FJ2r5*I7U64$R`@4gjas49n%UBJ
z!fSk$FG7_+mIwG*e`)IuO}R2(qV{N`618$0&G3^?0wiD;xPr@<x&P~lqKkUCkyDA*
zrOLuc(<8R<#1^j*Ysmq0!m()pH#((1iz{@T>BVT6Di5H@WENnmvW3qz`A;DKnL^Lz
zE>p+0jz(YK*okkh7-?Uj2wSAYrqqil^~g?-NJW}bKL${u&HhhAo0lVbe8)a4E^lt>
z$+1ANjSE3cCywhHdLVk*IU^8fHPZ?b#F!(7=OH^XYtCa6tDMp7IYXBtK@4ZAmhWKz
zRBhG7x$_30F%U%1fvZL!c!n>>2sI_4z8Y{O5HN@Wk@|HYI8{Foh^c#2Vqid`MAB4H
z)XmvhLS38r_!rozL1XCYkDZU4JIO;gog3S(j9wgFdgc1bt0%9kSJjo&o|{f>d^vfj
zQT*O&acreHb|-cCc5&>+?kf`)CziD9=2i2$bJbZ%KZ)Gh(dE?PMo({T{9bnZ(t%6!
z@8#?IyYgx_znsl43XN3eov}u)cX9mEM5Ckgo#{qSYW6R`Uf=P4_uY>EMsDXO>pOQl
z`Wm_Zkg@IZ$@;+E4ylpbMHznz*WT}j&OK{Akj~aleDl}ckQ)0YNFDZ4sM-~Id_?>#
zE{{JeK7v$KcZO7hO#lCiRJ;;VqD2wWDn@AR1*s-lsoMFuKrn4TNVm|0vG5ISozQT=
zck&6Li9qHEy-$c%>?Ih^u&S3%gcn%{k=u8lR`nmP(dL%Gn%^B@&8HrlBN<##!TWCz
z?K6FTfSnplaRrur!g*IX)r?1g&DbpSqu*~~vjTaXnTFA88yqf=SqOC&7816zKXlC?
zI%D|0L4wEyhj<2pu%OL&vPklH5J3b2LL)cebdLhp06JdibSbam5JS}s%h$+$oMI}=
z*g@Zmcl{yWUpsOymRubE<@u!@Z@=|rwx^MMygq!bx;VNl${+5<D%6?k(b9z;!E~0`
zKHvI=Vv*_*3Exh1F@-IpXd;t-A-&|-279Q50hVF>s%se=c_vIY1!*(eo&KM1NiE?a
zLZM>okfezNtNP>`ZI%lP8*ZwCa*U{3F;&W_Tn!LTxiai+PU;f1*On)Q7L=B7O*<jh
zF>0-zLz!ME<8%JU9cZ#@BV}WMgy$b3Jo8V$DSAUjB^o+yv>Py0QTJ}6)?Hfnb%m3|
zuL>pVy={ct&Ndbfk-Pohxd#O5WRov9!51}?;1tC+Mp4jFr(WoToiM2Zm~c?o2_`3V
z3yBJTTZYc!-$zaoYPuk5%$pv<Y=q2N&P@3G=sJhL21`YpHFeE5R4Px=TGLhEnD>Lk
z3-cAjrC*{T?pm5pzac?{{V#$DTZ;iv#NA0BJ%X>C4Mciy7!n&eyv=fO#|_-jO+pT$
zd>)H;87mBo?~c^Q|Cq{N^Xo@{wXm8Syqz0d&Fx#s?Q3MWFDCD0wl9rbI=h<Lbvv`G
z{_`&~Pc8CmypVY8Uf0gmuHlug;amN`Eqqe=&Cs2$Ba27ZqA>d0;;UCqUp!qu`oW1`
zpSU4@oV=M_P9L}vAGx1`Y+)_U5cKa4W6;_C5V*u+L3+oUfbBm#`V5~X_dMUtf8L#X
zelYU+UJmP*WbSKfM>!e9R8@B>sv0C!b+(G1Hu{qa|8_Lu@W&Neyp#RFhHyN@Y~?Yd
zvwXyWff<2C8H<t-KH|@zY}GOjkt28-h6s2PR(Lm#`x0LG3i`i-$NvI*{}#pX+4>;D
L=6VL>jqUI+IzeS=

literal 0
HcmV?d00001

diff --git a/evals/lib/__pycache__/compare.cpython-314.pyc b/evals/lib/__pycache__/compare.cpython-314.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b46e281f18bd3c88fa9014db789515624223dca0
GIT binary patch
literal 2922
zcma(TTWlN0agUEX@_3~9kRnT#Es_#!Gm^<dk!80woF=kqv5F1mAfMt!$>Gd9K5CkG
z**jW}P@w$ihm0bqoEnfEAgJ;cy9i+WD=yHFq(5!{EZByajnx_``q4kSHemFxvv;JV
z$Oha2c6VlWc6N7mc4kIGK^_4-<NAMbh(qX~l<0sxgAnud5^x$wL>BrGiA;`}a#^lH
zio0^|DUao`ad*x;#agV5dve^A&+^&0H|L+?E#Ai2TwqGDgh3>7qEGaTyciG#F$gUr
z`Z8Xzs~3skUQ`WNT~<)^4WU$I;fz?X#Wkf+j+<q@UR7laS4t%f#|^AsG_3f1#VW^@
znrb6@-HKafEpA?@RIBl-uBe)NG>$bx$5y;rDO%|iLjp>zrdx_t(Q77<p1fj}VdSZ}
zY_9~$Tf(rIN|!WCR#e5XGz=ssX;`fkEaJVWt6KF8;V&we<@0qF#7duCr|`EoiRZk6
zwJh-$>(#2P)GeKGdae4gTu}=Q7?U;%VQ6k@;Uj=Fkc2*T9r-mvD)0u-yG)r88PO%W
zp?O5F$ZlHA4|elX%VnsILjQKba^kg*PTZ!2iO;AiR#C?nu@9yec;Ll<89$@fH0-x!
zi*Q*B(-(%b>UtFi0L22OS*%%gTwCC)71Mg3-t~i<2yV_&eTrq$!zam4GG|Q<o0;<%
zY_HeS^9oiolVFmJQqpSHLHjUfM!Td`%}lj&K2w0#0;8mjkBK15?I+M=8Sespbayic
zT?0KwLrnMy?N(wfI^1yE!|eroXzm?qxq;gpo3O{ndlD@#(2vKDJLAu0o>2NJC_@Em
z7z$o!UC?Y}{|WjAn(I6Qn0>sB8{8bsIUq5cT;CjBZ^Inh!Sd9yJ0O9^+Z+*%(mS<5
z`T^C^n`R8Z#O2%HtRqXuFgnZEU1?NzWkI3AOMIR_%?{Y#rc1WFO!p!ru^pb;O<f4q
zg?20gQeuz1hfsUf7|{F~*YpHJezX@E0fSE?Ly!Vtbdt&bl<uBEMo^4|QMLk9XvUY1
zyhJCTkG{w~_tO0ZB)X;0=1J|-i=NYr=oQeId-hW)G&?SVyC0^m3Se|ezI=~^Z|`SY
z<L&n~!gFwY2WD@9O{q`qfZ4DV7P-Ui_nZCuKA4}r;M`gYip*&L)`I*tyu&pjAmw_c
zR8)%24RqGz_ja(Yk=kDEvtK9j3dwnp^iWCBXT(ltUa6VN|9_h4863@i(CiX<BPI#?
z#MUW0m>77u4~c>l5ra}x4835xWM}QxD0aPIyO&9YrC5gT{Ek8Jzv;>L`v^nu7Y3mj
zd<CVVgfpu3Ql)0jWC?SScxVnF9PJ?iq`Y>}!*D-jM>{*r*#R!)#?*H)4Fx2iRyLCf
z@zCTzIINYhW}3uPgHTCa6_p57t9+>)^t;-7Y%=`zbU`;QM-ujzZHm>3MXg|^Tr=5J
z)Hw@vJxoJBra^|d3-c;m#Li+S3H>m;;UOx*R+W0KP$sM+MMA1(fznP2#561swoujz
z7fiyn4~Zd<LsZYcm3xP<ieYFqmH6nH<hrS*y6ww5W}p#lhlQLJ;4TzPgli`~5^#(y
zs}*b##=x)J`msK564uU=Dwq#)CvoF?jkv+A#5I2jnvM^Es0pD6?b|m@A-)|j{j((4
zP7F@w#cu-7&IW&gnA||mSQO}4I(whr-I#n7>RUN*KQz=h^(gk*HEa3ur<Z@Xu$p_=
zd+@gRVeg^)v59-p!;R@?|DNBEd^YkFF<m+4`pD9|&1h^rntT{dK69gR<R7DN-2Uim
z>yCAI{?4zSQ_kW?*GC$sTDE+PMM9`?;!&`Bk!eO^i|qH^iNzDmXy0=BYI-&D<*qMw
z-8l61%$=Fr*}I9qsDI4f%b&Y<^yl}+-ft#Gzfqf`6MsF|@}gMcDRRZ$V-_btVXS{W
zdSESj;706zbbN90(eC(a<VNgP|IPlp<>tX3J##YyV~g3}yz?Z0dXlXW3Ux1?_+)N9
zm{<!Y7G2F?Z0Y1DAFKzHYr!Otf#9{?<$<dM4+48ulV5SSgqy;H<ao1Z_mylj*mdpj
z^0BMO9s~zh-~MX<t<jsK4~8e2J$tWYANTBD%r<?(Zv$5XO(DFLSRTGQ+!Xp(-fjwe
zR&Z13U3qQQ^ZCGfUwW-CO((Lehdw{OJ}|a6FxC{hmx`?**TX+YoImhn2!(oDV@L=$
zrvLk&qtwnOjkw=NyHE9bz8&Fc|3(P<KeH_KsSx30S=9@&On6x?)}g+jnV&{)_yJIU
zL4n|Wim+WUq=8Ns^jfh}vcC?AY_X}wRca;2_qXxO3a-J|#q^hvZEEZ?&9py-?Od2<
z5)5GyIsWH61`_qEb{wAq9xVjSY3N#RhGD)#GaD$mfp+~1?cYE{8z{AbhFdI@7vG$4
fs*!!KJJraKk2}@KaLTDd-a7f5Rv~sBV{i69@fVTu

literal 0
HcmV?d00001

diff --git a/evals/lib/__pycache__/config.cpython-314.pyc b/evals/lib/__pycache__/config.cpython-314.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a07684f594f1cfa99d3a27318c12dd7ce24e0d61
GIT binary patch
literal 2722
zcmb7GO>7&-6`tk(@NY>`KNh9fF_*MsnXpZ|hSJ7z8Z?$AOOd0{>v)wqk{2s-C2d6Q
zu4ji<MYsTg7O2q#uu`OmgrbO#_AN&P>7fa7@F52Zj3asDph(jm3iRMq1_IyuW|wQT
zMlKzIGjHC^yr1vAeUC?yi5P;>a0lN>@(6uLH^G2C9n9|MFgK8l2)c}9W|El*kx+)x
zp-Gmou(Fff1W))jPap#DxXJKDghV_ZKN+2fkyr-J@mVw%A7(tks5I!Wo%3*eQ+lqG
z>(f8RBgs7|i)0~<<nUZ%nDuXxE;%MgfEP`pYO2bRlpH#M^6~oX=Pa!vX=X*bs#T4O
zMs!J&oMoe09daZccv9Ii7mP*eu#K%1n>d9vZKWz5kziZa9p_kiNiQ!ug?Ajw%op;E
z8`n(JA{sF))0u`nSJKE5tkKh&qrYxd^eSw`k|%7`6J73ftZgr|`%~aH&>SKR;2gS~
zB<w-JdXAGrhu}nJf#N)h2TJg$FyZA0colWy7pMomc$akuc6raIdQa7G$lIOT=iG4H
zvwF-stQci7=L)`Gx6yQ7a3iW}YAd>`x-nH<u`0DHrA1Y}Qq!vKlP*<VFt9_ahN+tt
z90Q{R3SjI}RfhsoRy9JfF<&FPqpDa0cXlZR?#bdsN5@Wa9z#4Vv#_e+N^uk*D{70n
zNsf6gI7J#2r&u-Si+*Sd_8Jbudv8ht;q=1vJG2u|{^qrvM5e`McEtW|t{+prcdJYj
zuAOv902*Ltb`Jv4Knl7Xdg-?aRe-k-ooAL9nUO;>D|0e03q=+H4*x4eB}M5Jfys@`
zLvYoKf$MP^Zt0pdZJD|&V4c*k>03wrgwP8M7GBYaL$&8{AAJo#wvhr@G0k(zc4$l+
z=m*rxoMFEA=JoPJZ@u}pzuuYh`JQ8CYEv+%X`WAC0JnkOWKJ<d?XyL|Y9Gp=+2Ag+
zLo?FR_>(QN+1V$uO59U^KxL7_%Bb^2iQ@mv2tM54<L<D)LpFm7$mSF-g)Z{7Pyy9K
z;}ATXpQW~YLyi}b!k=b1G##2sArwXXkuBJ~BEXyQ(>GUEOJ30m-uY|Cn}{7&BJd_!
z3mvC=_91j-qm=mH`*f#718XlcFMWW}qG#z8ghYuhho<(oZHU=XB^KaW*-P=9?FyvH
zjDkccg|poOM~MYeMRbA-=p|;hC%93f0e@zAD6{`B$})GfqfpxM*>oUQi3FN$QHiI}
z_$fd;p(K8+=*oNn4YzXuZsy=-a^~MBA!pl&K(=&vEGP*%+^HA*k0?o)qbK-4>k>ob
zH}!KL!&yMhA2Wgymt#Kzr0F;S@yWo?=8b>~KcS)q%8PZeU9Y@KTz{@ov#Um#7H*#}
z8I{9MUNTIn9vYGAy}*68P-B|xZcMY`CDb5d)x%!7lZM=oQK=sulAaz(SYObwZh{P{
zW-go7su?I6a+#q#>z4tVfSy>dp5>EnbWC~m+y(W_r0jCEWw=pCThP@iv>P{yb*-Wj
z{Z|Cjo{FER=I~{!W)e51Xw{lNhOvcR!LcyWD=tU7i_6ou4wQ49xa^`%yt0z-^7Yd&
zyBsY;pokWKT8-Rz8!8oRt1gSJRZQ)48QWzI5XBfS5Af-Z4~Sq?R4SY)y8_WLsBmaj
zA37v;4!>r_s4=Ri3BL%Jyo&V;NRS59-QMlE)Qn3XtToQ;r276SHYOkR^nY0RL*Zur
zmU%Dt@^<#@R`zTwJK5@aZM&z`>?z%yZ%o}UoNOgWZZF?--n+KdmD?zP*_FG!+&J4z
zj(j^PwPM-L^!lqehTj|BOgBcGvF!bUy)E(hCcFNVkJdKVZgv0Z+AVD>Q@nBh`uR<^
zG1e51|1B&&NDegjzWA~J@ywmmf4TZs{gYqcv6_R^t>hcc$Q%Dibl<r2`%4>x%@;=Q
z4UTLNjBO2!wFb_%5)*d^?j_{L_=7}hz4zU>8slFldapA({pk<&_w|i)w{l<gAG<!j
z)3xWTt^*JHrJergH?*7FpBFcN**r9T_v+oHrZUs)dux|v(wCTri0Qt>?D8mG_?|_{
zzDFVw(;FA>jDHy)ZE~ZJ|C2#oryl=3mVNAs$DIK{^V6Qbu{8JT(Y~>L+-E|1EX#eC
z6JX9W)A?>Utg02O?04gZ8gv-iHt5%iY0H3q4lM}UBc`7emMAM?uT?Y?S}&&M+)a5c
zv;cPs(9QL#<E0^O^9zR8+QV3Ptm;+hV73OiPjH5cqD9^dwwL36H>GiRBQN`Z3SPml
zz?OD4rv%eOmSLFt=)yNB_ce-tg9g5%^ZtLL{I_Ciec;{cZ85hc=33&xZSlyKcw}2V
Zu_c~pL>~zN+j{;Hg5?MCY5Vgu{ug)=Z#Dn`

literal 0
HcmV?d00001

diff --git a/evals/lib/__pycache__/grading.cpython-314.pyc b/evals/lib/__pycache__/grading.cpython-314.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d45e90940f5720b7715dda0efd3aa186a6017c6b
GIT binary patch
literal 11213
zcmd5iYj9IndiU!6ep<F=8!Th6EnBvEV+@!W;}^l;TpNLdK_jGVTS1n@xiSRrw$Ub;
zsY$jA&SqkTY=h}`1~c7EoM}37W;zqMe>R=|k&v=d?=nN$Zl|;V%tFetKX&?^b9E)#
z$grK=&h(6Q&bjCDopZkPo$qx{lR>YcARKvI_1cjNiuw=yAqI(%p&xw!nHa@TQR+O!
z(EW6uBr2)Kc}c%CDuq<qFYA*><$a2%qE8uB_Nk((K6O;xr-^F%v{7v}#mLGiMqWk*
zO;H`AfV`6A^^6MgYLYiF8pvx&-pJ@6uP1pEV}QJo<jsr;@@A5^Fp^rzX_<bv7kT}G
z(0IEeg2FzQ;~c&T)_0L}G(^HE%K9B+C_L$iPOuJN&^zU4n|)zF>zMGO5VUo=91#{Z
zL(j=bRO9tGPlSCA_Vozsi+ZDhaENm`E=>8y*{Fl_g%OP6a(F|20xdd$ShksicC6zv
zi~Ip!w8csDT5l*6&UHTFRPyRUFXC7~GZo@>gJhVlaA+(r&TEbkz(X(=ukR*9vi>eS
zbzbQf;PDpdftN&z8J3$0MtM#5Wp7Z(`)Jr0{Kr87e<^%E+6tK%<)+R{+TNxpKa^Eb
zuh0`TLo*Ua%E%ZwBWaZa?i9bqJIpHteEm)n3~<r2cpc|>wSW;%z|Tv$DB=}}jZUG^
zT?$sH7z?9GZ<NCWZ{gcpPjM{bS}!0#OgPkX#f$u{U4Y_N?>HNZHggvP!64Vl!lb#@
zVBkXQxUgL<k*mDU<H=!*^>|PTz|+B>tAQv+J)xAe@ga`ludU!@afl>Kuad$?(1`No
zNEGKO8T?_f@<%bq#Hbc3B5_kDs%H;HMWk-&pol1_6$3->&Xq=FZdrY<*ssPYJ6*CL
zzQ+mrUfxPkZpk3z*9W&$giuYrypN*P)MjW2$dyd@cM6o~@J1rR0LwW7V~#)+D35c5
z!eXH(8g?{{2g4V<L8l{(93k%{w2V#!I84q!l#YPEh1cXjKx?OjmjXTV3MgVjey1E^
zoFM~5yfi!&<!K+UjCv8Y;W#;z*3~&24rImkW&+0Ny%1zQ!lt85P)?AJ7aOC#)#zef
zztmf2=&Z#$w{3P?T$i$Jn^8V6md2gGsA$RRjMtA{J9gvJj|LtpDMQJMiqaWl$B+$r
zuc5}=X3&ou1n$Pv*ou)QM_OpcOVLyaVI(6Hj*K`mL*%591rmYc$WjQgkf;<_Y=#I^
z3{7{@4@rDu{|ZUxt-^O<hR~FR{0t-TiMd-b0%?JBG`O0`a-C1oP>vz23n%gfHK<%y
z1)sZfMX+L;ryWiiFMBN<2=V$0UT(tU3r|kM8RB>?Y-409>WQ+iM|pWLe1%1TfokYX
zFi<CKrrI;%y$bBdcM;V=6~RL*WMWiSYq;Katu0Zq|AsG9TAwVfhwOs&)9Sm`kV$FV
zGuoGu+Luz=BMI4&0%FAT@xNgt3c^T~Wc$brh3z9GsfWpsjr?6X;VjScvU4xoT@XnW
zw}q_g3DrQuW<nJ2U%|=3G)WR2Zc-{NWc6NQngVK~A03Bt;I+o6QE`c|tKzW@TVdym
z9qgj}3%icdvo5$+eT#4hfr+dS;{Hb@z?`IR+C+6rdb)u<0rMHevt1W3$KIV=QAEB$
z8)*wAwlPts6jK0uO40-8<<i*m=5RvtGB8}st|H2aVguN+HLz8yV9)%4u+<UOC?*6F
z+}2$*(E?O%r73?J7=<|hh-O_|tR<-2>Y{D6ZaGtdxt1SJS{3EiGKynVMAuCZuu!9F
zI|DVq(V?a|Zq<xN9QS!^iRe$?+YfX@Ah!WzErI-_=RnpKAV&<Nm@|_wh?&~DPy<?y
z+aS&(VssnwLI`*o%-uK!N1%!t#r&HD#^9DQMi)L9xuByMliL8Xd8b^Yz6siG%tLYr
zkOnc6UKh?6_|<dhb%16R5tG}LhX^_&%*`}p#h4K^48}4bb!%V`=!kCMZvanLf~U7h
z%i=pwvWB|MIqHIyM$B&W^Y#R27uG^(*8*)V`FZsbTwzv7!U6gf*)hrwoU@9mrU>=f
z2t`>66x9Q~O6;E#_HYt6Q?Q;AJOWcXV5-@muZZmfv<SfuoQ8~*qsD=1kuC=+1njJ8
z3-XP$Rj=(W^Z=Y&B05l8?BZ$)>1d^xUr!5e-3HWN#*~XxcKUA_`#!80KoD0@4V>s@
zz#dUwKsTHM{eDUkQ3S-CDxWh4C}$<AfzG@vdNsoGrkvpNu&=YeDbToiB^MYEfxgG<
zy`gBp6Y)l)EDCW%K_J6Igjwz`%$(aGbCBKw8x*64sUASeCG(JEmF@-G;ig}wVsg;1
zKpP;NLe)}xqyhRaeL@f+QeM)sou@foB??6j4^Zcb<%H!yP)0$oM;-h&P%=EATVcIA
zrv<ef42FF$-;S1ivs&oQl|l5|U&Fi-6^Cz+&zlz~KHGF_$ITr-9e?^r1$A$!D(D-U
z3i`yKeq{)Y^4!|ccF6Oz7wv>}&AJd^kV06jY{gZD`IaBJ-5tNQ*l_pUE%QzDPrIKY
z(1IymI~D-d&wGIb&79W?TA~M+@f2_Mg+u-THhR24Q5WIW`NC;+eZ$%F_2-(L_0yI4
z(n8amu3OJb0{&@3eY+#y#!)}Lxn5LzIk2IEZ1bh5FgDFDvRAK!k$;%Ch%^*I>==vK
zkdMVS2+kwcUmU$y>aHRj;9nx)<cI|bXEm`DL22Z9C<Sev;xOM8NT90Yv;Kuk^9>)L
z`@8<9oD9m}GMCUjyozJN{0l^{@=CCMLa;tiXAeO&uZVa#F#XUG{Gk)hgl8-~74m~Z
zj}HdPJEesa6k^B0?7YnKHnLHG!YJ&NCtqyLX@+8f(?E1Tgf%2ykNa?*D=-)6#p{Gm
z7WNe8unk(Ea@<S?=OD^U5X&n#){A@-yvFSfPO;qxg%K|wN8zamFZG75BFuRZrgLYB
zpxa@!j#qnwK|(VeuSCQo<#;)<CLqBK0qwy3@a`atLSBy7%JG^U0ww}cUPYD<Idl}a
zPzaRB>+>|@!A2)N$t!adf;us{!#E;`08|&jumRg<Rd`7_?UrYcI64PCh&DL_3e|68
z)O}-l-1_dH$NI7+>zry<6}M*WjY)gs!ceC9P_p^ZH}*p*Q%A;hG-*1TGWEpzvki^0
z-d~x^X5?9uZB9L_j@Ko&>`s~XKtWy8oxttDqIzj(x~?N-Ih?HPm^aVuo89-pe#p)!
z6PCl-t<F3BxBC~uODMhdNXpce+<GK_X|8Ox>;pSwXXFV}*L|BkjX6!K;_z}w%GNWZ
zxo<5?<FlM9Kd>~MvL2aHeXBJkN`}+g;jC@bd`ZH#?VX;i-g09yt*_16D&|IJN8*3B
za3N*ek+JPd+V-Vv`)7Kxw#uZ<MH(hbo6`EGY*o#z{tx@-FD~v(RqdS_xNow|sozt_
zT?_h@Y41$u17p>^`LFHwOC8zL9gEwRWPjhZxCsoZ!{yoXZ8zRX)E!-s(zYJ@kyKLF
zORrE8b1(fwK{dDB@Xn3Tj>o+}el1g4mn^MImo_Z!p6PwCdD}w$QhB<ub9VS^P2$z_
ziN5pK@VpbHjcI*j*0tl){=5B)7ngUYT|KGt-lVH%{!-k0tL($FkL~a=)1N5sePAqq
zV6Xbfn#LzFRn>9Jddqq<|CO?LWb8u;`_OXR*U?1R(5&KHd(HfC!tR8et$IG1ur+0D
zrxLbPKR>ltzBHWN(U}+=p6U5kWxQ^<W=NMgv&NDe!+-g``Mn|@rx*68Y<sfy+Q&*+
zx#kHaQ)?d{qAcYrJv61W#!fu_O&3jBU!tG#rmm;IsiW+DG>7fff80}ktW5DwwdKdE
z6hF7vkJTs$0|Qlwgg@TC;#Go+iuF%hiD|XQeRNK+o*Q7W7ur`~d{7*gb=Z2gVEYOp
zY+pf=*l|=y`@N+2Spy5Z4nfSyAN?NIR@}T2Or23Ij7f0k6;ANXd`nSR;54Fcx4f7!
z%|fp#3affhU&QQU-MGGZ-SgB(H_<L4ieYInqpM(oFcP;6)N1KY>9B&4xfOXe4Gfx%
z#s#$vOq_jKJU}R@S1-R#R*^RbgnAs039FphsQyp562(qwVHJ(wlr4Pu#WfX~Az72M
z$eRAWpfEaB(?1-{ds5f9P+J@V=WkuD3_84sbzA_o>mmru1A_7Z-bM$Ad=A%XK(vH|
zmqACka8+>Lg4kh^!#nN`gt(~a>una?Si~s{o<?wq`5hCWKeU{fR*Prk?6m4!yD$Y_
zB@XG75gAS74-wD^ABAbfxpq9uXKQ<))~O=WmRGZ(sYw=r<Cj;F@v)m!A&ejh+PviC
zb*mTcR0?v`1HgzJLqm|`=>RW<$?{SVh@28ih|5lN+7V&q^@C#?6m0C625BCEM&#)J
z8)RbCV+Cch=cE~yQui&D@xEVJwp~+z9}k>C2d^Exr>*^0T?zhZzt(<FTa&fe<{D=k
zGnOq$%a)X7>w+q2**?>i)mg8fxOO64*_^d)j$exV=F1b-#>HLB6%U$P;zPGyz4_|m
z(Z_O0Rr`Na68(V{1!XX2^qZ3UO=-OoU<&XumWHIIA!TV?m`Gao7K2)vur@3XeO|WQ
z{-AYdd{?Hj>0V{ik_<!cz>p6=v{BZ&l?uXV|1Vrryh8BnBROD@FwAK+VWpsWyx1-E
zDs;-bsWo)TLkglBffbxARFGszO$wi7ns|fZ$kUI053Dcerltp>aD8C=37esc)wzrI
zXnP;5@fOmnsI!{EUd#2vz&9h&1Dh)j6W8sO6&M+`$}zXqfY=c{>L^|t3B|pmB9!7V
z$n3#P0oGc4_Od?|K4#j*$Iz``3s!H}1@@I3-iy=~targzfN#NIq#`ZJYCs9za5Zwk
zQIUSRxfiWjw-4mzZtdh*U??ymHCUu>gSi)J`9eBU6d!?6=IE#hOkB5S{@CxY0ygan
zb5YL}=XXp?TXv02OC24KX~P<aIDCs9@$f}9#5tz*SFjPnvDjXKPif2cF{c8UaSoJV
zxzqw`cxez!A^n=}yl!=Gr&Qp{7jbLV1;l!RrE`LnU6;Ez(Ft5W$1xDJBn;pX(R#8K
zRbWIY^?&ik58k+^+%nJI`Tp(iFOGc?{w$oX@6M`B*NxYVap!#3o#VHUFIp3|dlrY5
zO24Z9vifUlqHQ2iej=?p3H>w5%A~ULp0fUDU5nDp_Ri$?&XnuOvOCi~lI$KyyGBG%
zEkCnjBc&tR+>vq~PBgr<EM4x&96ggfdM42|k|=*Qtvb6Jp4d3k+?8nVTCQ2XoasHA
z>^+-yoy~(>ux6a?NoRYa;lR?arOTPaL&?KKiH_k!`Kh$ZO`ugIl@<4tbqie?*TJOg
zV8+#%aCI)zc!Fn=-DlEGXGCbt3(`zOTe6{T$(pD?v^2C_n&}))b`B>_ok_eTPEc!&
zoqU$DW7a~LqXROr=Uig0x;7Fnz#5<f;p4nC5Q@G~@g0Ze9Co8&#aBx2d5EKgwP^*X
zi&FAgm>#qvA+NgMgwNFqFGH2VWD8zJk<&%!!kq&{Z_q|kP>;`3=lyQ_ywQclQZBew
zohG;BynLfN(Z7c;AB%fTFy-bQZOzzUQcN1u=f)NMx-Ww_6*wHos`Y`Pu!lG2Gj&qb
zdC^G+t}Ehqp>s|;3GzbU6L%#AZ42_-p$y?2;4tjmQ6mm5-tNJ-?Q9XZ+yriq4>s}2
zS(p#U;^t5LKIwaZ$CCN0iZ3gcr<40%ncMN+j*rHkKEm#bH?8~Vn}&V#Cz^fq|0kq4
zC8!IgcLIT3K%uJ;J%a``4W(GqD=gSN_@O&af7<p*+xs<3^jDfMHOqly+sV0__i8?J
zKSkez#`u1io0{aMlit^Xw_zL*^poLo&chsAQwV5AZ$KMz9y%d|JtLMR#%#*Mq>N0g
z@4msD!~>_ygDJxxXqQ^U%%vN(an=0JztVqW*|wlutocOyjb&F#yE`G<T{tzs9!bC^
z1v%8%QTe<>4KEWEjFM3?YWQmyEu&-ftxBM0gK$5r5U#|L4B!;IZRAyvpf`%2Cy={i
zUQQk#@LGSs?}?(o_&AGr*@bX8C|Jm94CY1m%ahQE6A<%++D0oL;^>BJ1t;?(Bw5ms
zBwJFpB8AedqgF_F?iJDl-4AiOfEYjo2{;8`w}6<>;VVX+9xQeo+xp-_1PffD-<mjo
zRADLDA{D>KfuKMg6UU-Pc0+%RwRNF5)M5FsVwGhG7y?*%`@5LZGGJU)zlvRn!?nO`
zj8za}URA*8256!@VG>;zfZ-IeL!VV(@CE6<T>n~X6z?SoZn|(`cG^*F<s!2b(}>co
z90gB<5Kfy0^Os0s8Cdf<Cg=iNSqu^7DE6w7kh^ZG!mEQPPf}p_tFAdWyl9dQm<hX-
z59r}Ka9yvwAW^wh;_mv_??@4hui23z%49?hSI`=_dey!aX+^H^4&wCyrBsOPETk+k
zHb(Iy5)uG!MOuk?gYh&Qt^q&cSQjGdKZb%wZVoF)YebN4rHDzOtv?yX!4oP!jDXr9
za_Z2VIR6$z@LmconZPSx;3mRT!94dtZ{td1ju*i#3*2t_qP#NqevCH?DyBzNLwUp6
ziz?t~STG{Yl)#Y>cPPbWS_JCzg5LEaLElIfPL|&!Q16B!AGlHOdIk1=wU|d&V9@7?
zGrk<Z{3<5YG1|BWh=@?PGygpNHGOO1!-=1`G8J8kimrtnpSIs^U#dtpb)_r1{$=<n
zhe`F#%Dwc*<$LMj9;X>y!($VEj&EL&5+YtU*f})B%X&Kdj|&Ks!(|d+9Nz?xcTdEP
zf`~Kb6<mf*0EBfUc(eyyJwwM&o+iT4sYO4)b=5e+#8cR5vZLr7+)0M_3bruk)bfmZ
z;XcNX-o@=NLBzp*6u5Tkwr<@@t}KMR3b=3qte`m55mD0zQWM@`m2Jt?9Z15j>_F_~
ziiXl#GrAi1C3H1+F5WlV!TZ0s_ly0X?ax-$WGc5OE4MFQ%vLo$*t+Ze-nrxN9sh^2
zud2SR%2sV&Ko9H<v$YG}rP{AtU%Ig0$2seqYrWALM~{sv6UZ)=TJyMs(ru3W(zUx6
z-CvD-Ig+aFO=<gLN3$|j8Y;LuCvTrj>2`l3+Z{XlU#0dR`;vH@Qg&v}H|KjV|CK6b
zVkaMLZvNr;_24_f&-Z<C@Uw%l-fsm9$?$&toNLyVR&Ji(d*{&YL-%SA-nUlIOYfY!
zeJ*9)HKTf<wcan?B)nLf>HU?~mNnZKE`NIU?$vDBrc9YDS>{@5%9ht>O=Zhl?;F5d
z|88)8=+5Zv(X7FIJ#Z~>Jv<YR?|iIR>J;!)P%cPZr<GTFJbrMWd3cS-Gd2Ye8S!&N
z%*@dMM7)VSPHch4i7k`xa52aUkC?E$6x>5t_7R#Qw_yUEVV);kz7cu~;7?3DgkZ>a
z64inTePO3n2Lbw2knKP}fp)OX6o-9yaFasQ4<$6+P5&pg<(HKDiA+jMuu&qxR*2*~
l9lU+~t^_}A`%VSVp1yO_;GF-i5nN5*Re-VbL?`MN{{w;oiO>K5

literal 0
HcmV?d00001

diff --git a/evals/lib/__pycache__/harness.cpython-314.pyc b/evals/lib/__pycache__/harness.cpython-314.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..15f3b2012aeb47b55a9db4e8056fec107412f9ce
GIT binary patch
literal 3801
zcmbtXO>7&-6`ox#|CTf*N0w#9X}z`t39K!$c2qfVETM)ZRElbWT{o<XSl+J4mALI{
zmz^D2w%P-W7D(M5oLhX<QNAfR_m<vrB%vA-rWTN(2$0@nD1m{V`rhoW%qXsnqz~Ym
zH-9^CzV|ba=X1wY0`$?7{$J0>3HcXxB7#1WbnpmxljwwzTSS*;r5TyYXHb@BV@!cI
zHXENwu!I;>W|K21mJ)J&Ha(*<^$b}|oF$8yQAvD|l=_3VlaGd-((9e0f&8K|mg^;F
ziJs^qdU7!}8VgpkE<FV!=|19gvu<4;AbEBDiy4!;mhWqI>M^h4Iof&6teSOZQH@$F
zw$JF@f_BY49Q4*)U$bso^e(H}?utemF0)sy5x;I#>}9)hMXh;j8uK*AY`B%0#%h*k
zu2?R+poJ54llqosGhc&`TjuHo&2+2UTFr*lzP;j_j<4Bn)vlP#!tJX68{2WTWt&>n
zf;Rp9=ql<WKA}}S*Ke#^zBsOg2UPWj3x~iZScAHLAusWa>AD^>nGK8Qp|6xoR)aQG
z3hxeb`fzil6YCH{>EITOHpwDk5|~`Rm1D6%F#DpS%R?}u#{ghO036rjAcF}2?1Y{K
z8NdfQi3ZSBA*b{V$XOw$^<zxcb6}J%o(zq`V{kp5syilI_ULK_T_ki9qJRuqNe3DT
zO;RSe<hTArNEOE3AU~68k}m189@CW~82VxS-!d%YaRDVwp_~Yi`A^ybBfkd+Ej}aw
zKmt!PnUCk3<5!CB`WE$zOB8J6xrH^8R*REf)hY^`j)?F1Me8;gt?1ZG#aiG_h5B8d
zHH;8&%P{zf@J@x!(QY`ze+^iZ>?oJp%H@romU4NsAEW~qB^A^<BGd?Omp({<fFTF>
zlIAIX<DtwI-*T4eG3fZXX=sSzcX_7b*hc78yeHD3V^FA|z|to9Xp;7V@ZDji8Q3lq
z%1iV($N`E%6WW7wCxD5G=+YByWnwedQYN<2AO+{Y;?lG)I)$FXGdi#Pn(f~9AaKCN
z4{!Z?bn9=<2~~p2|Lx8H^YZyP&4M~SM$I+hVKj$&VxoK{B#6eLAB?q?u?>J^Y!e_E
z`<9U8Wr_)up2aVAB9j$q#|ZMpb8qY%6<UO@@B%c@?L@>4g7C74)8l%g7=uVoMu;@r
z00(r>=h$#%$@837f`g9Zqs=i$rrXN&M!uy?Z%qPwGzHxNA<*oQUJxnIohBa(5W;*F
z#0zd6iUKBAwU||52BY@MibeU!$a}s%E|dze`uo5)JqP_<VD)1VUV@dZL*_)h^rW8B
z)4Hl>^sIgi+8q44^zNbxI35phOu!qpT4w=(2qomOhHKbWo~b*H727qcHuY&2&cs(C
zNjd8(*jxL(Ht)F>R~^r+8bJme#x?4ydB<34R97s<<I6D1z#=BKczU_vIEL9^-nZ~!
zyt2OltVx~`IX>|e*@GMV*lhNQW~(BaPtu~<?H|~W!Q}469|i5?jA%dY?ca~X_@_fZ
z2-@OXPq81M92On~+n3u}4eWdY779Dl5tItZipYM5>>{!^kztYHW5ENiFf=rX6r|8Z
z@RIXf<OxAv+!wWIlFzP;3+03>&*w8dWf-oxifLap46sDQK^Y!|-!@Dqoar(Qc+!35
z*skSzFbxA8542%Wtaj-4kqscrBO3$8dku(VNOBd!WZ<Vu4QBbU9#eUs2SzeYMhx?i
z|2<&ulil3$KVIKcF7GK5d&=0JGW|>jWQD1Mow(_{KLBA4q-i`FU)jKageIun|9TcM
znlNqocxCjTq}2Bm*@J<7Y&M2PFYo}gdPMkT5odQ`v`KD8<M29Oz{e=0^4EI0)Hzp|
z%2Ekaz!5@!KpGAjvI3yKL2gF=2L*1KWTE0IOIdPBiOScSbQANWpb{c*gi0M+MN_&`
zCMAsLBZPTjG<*!<47n^ty}JC0Q!ob{p|Tw5SdW$E5}M-(mCy`Fs2q!Sogw8|37!53
zVV>vdT2S{`{<<2L&rkxb_q*O<;S6<^7FBZE@Ope$GnvLy;cA0RBU~~dcGA=3Yd^hV
zOwH;%Vcmh+%;##^bQ;z)r5@#Jd$sOSh?*s?G??x1KCHZrWk`(>PY`EL`2O=Gwcxyp
z#k<1@6l-ubPhcv9<estaz}{e*C$CLiEB$<0&nv=zxl)Jn80sqA5Q8<o0yiJhG%c*G
zAZ;3<_Ed){;lLk)&?HYXCqDSy<IK5@g~yqZt?c8>)#lW0rtALO_ujrg{+ID5xl^sv
zqgxAG?>w?Tp89yK)&I_RZmyM@gMIfW?oI4uF19llw`LwK?2KP;k6-_!(z;RJ9{)w_
z>O$+%FI(>!+nHZAr=F-i_Y3z5J8HhI<~Iko)napUS4q7;_lLP1<#by){b2GRing1{
z;jt~{q==sUah^1cs#ghe({cm;9YAiP=w>{3h;0F(r9Z*JB*Fl2Q;ZRwV0Y{Aw}&DY
zyj#@01(+-VK9=K)S01yUu!Mmo@IB|Y#S>6xz<(e*1>1tegoqu<SUhz#FvSGLK*l)t
z=YZ|UBuRQI%kb~=s|1k-{z-cG$f-SY`g1b$S+@IwQ-7V`$)0Ox&uwQ1ce3Z(+4DQu
bOYQ8XX8OxSN}Bj{?Mnj9vy+fW1P1>FhhaSh

literal 0
HcmV?d00001

diff --git a/evals/lib/__pycache__/models.cpython-314.pyc b/evals/lib/__pycache__/models.cpython-314.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..90d5bc9f676b4bca2947cbc37217ca203c28377c
GIT binary patch
literal 5475
zcma)AO>7&-72YM6|D>p2%W`Cik{qjQZLvulG)`PM{*7$Qw6jiCqzScHkxOag6<3)V
z+Om=ZQ3s6<J?NNAfgE~hZtbb<sn;HDFo4-X1Egq+Ah!e}TGXe0Z<drOg+aQI-oE!{
zcXsBz@BIva(3eSSxGuKF-}xNeKS=0Y(SXr+le(tWHB%GXvS#Yn^o5WJO%flv7G5xf
zq4@B%$U;;^6>nUNEyP89At4eANs*k?Mzkr-jErh#v=~1V4(1gprHw%w@6x7~HUVw2
zOPf*J6twALU(q;YwCBhweFplzuBZEzHVbWkmv%sD2cRA7(he%^5VXTx+99PKf%Zt3
zcGw)9(dLeBZQp)SwyVM|<;u3Ob0xnjm|YS)SMga+tT{R7p6%uM11_BP`JA(f(I3xZ
zY{f0l=GN`I4ySi2?rKsz*zkF7&1U7C=iBAe!d=JPjQNb`+$!F;%Q?T2<F4mah4<jJ
zyIS>`Q_fXf$1C&sIbEjgYSkCEaQ!NWXk4k*)^U$5y24?$Hy4(Pi#B&|`enzH$xG_9
zm)(+(*>zG`SgDI$_^j0NJKfv%)pu7Q)U~1}bof8CoD|^`TG4<vbu;uM;DRm@F~cHi
z8pW6yISCn39G;Guv9^>@QoJoC)sqP&B}K{%!=@>jyjUx*I^xcTQ<BlZR-foHqKsE~
zny$b+BKLl-;8z_PFZo<pHC}$2bgO&X%!<v|tdhUJZdc1(X8fAis0mB>cbzJi>HD_l
zS=@o)%ls2<E-mAhRkhb0%aTdUf-!0y@oCF?w`P0ok*sA^T*igxR-LMESuBeE$#h`4
zz=y1lRCut<5qPj4S~fsrkj6(Wi<6g1mMsKxS8BrHcxnh^yZG1S+5Atr!+3s$VfB7>
z{ys2~zXTWM?N#{xG{5V59?ui9c;0hY@@frye&Yd4;C)3XmiZZw54FDz9shXY)1lM#
z#pdy;`ogx?Fs7Qh6ZM-rFE@-6UEI{O^~If2PmE{TGo5LABViN6T}T-GzmY(=i0&bQ
zB!!D&5-5tBi4Ka)B%%!w?)B3S=mIv<BCYU|k(n!-fQE3&m)4xpT}E4FDUia`Bo}NS
z88m9SWf|p&72A_(3RG(YR(DucM(hgi6enc95&^gkVl4z7sY%}*2UFLUwB^uCA81+`
zx^eAweN89G=XHQJ{3Sgrqs$RCR^7_Z9$4C}aw<MSC|<V(rw>dsnw7xSMhQXf*n#)v
zyT``xG$(M>wf|_5$ne)hKJwK~IZ7=Z<LV+%Vws8E9WZt6?ar2vvq!O!+qB$67qrG9
z6+~!hu0z)g;eM^-lQZqPH}oZaS%2w143s7Vy-grgm#KH2S<{y^m@9N(tyrh_0-5=H
zV5YT6toIG=YFyLa9Wb@)`leQoOlmja-h?)R_0MQ8hFyJ5F9cQ%$<X`{rM}5&ODA)I
zRTJkBA<pIoJ1<WkGO4qPyzD)&kTvGdd437ObAEpB@ne{T4}*M(RXiHC9_N>TJN5DK
zuQQGO@_#wm=NJ8h`qtG0ljhcIT;#YTkl5AaIZRGTy3~eVE{}Xo*aOoIzPX4D^V&Mg
zVO};xWEw;!mu#=*T%nvl7m@~dyoyX<dStR%RUFu8+UnZRH{jDXb`%ry1(3S-&*9O>
zqt@??KV*KN`J?#rdw+WGFPEPz79Zm;zuXwL8pGChqL~{0NdMLOzYUImrRfu|>bsgg
zbU_E7xuAcZ9okL>$$<blmtkZvHbUeGkx?Q98%CcEwnzBg-y@5W_#zQ9xpHMNBSi(9
z=n%@3`~t{<{PoNv^4B{JW3oARync22dc!#0oSZ?{`hLTh`8&d5_S498D}(^q4QtUE
z#Rap|#3lUiUF1?+`tJLnUCC_f+mcxkWlg272)H1L#lW>o8Msavi<x0aDfMg`<u3!N
zkED4`B+M+Nel<2Al4cq~F(}iwnZLdvf{MJK&wvrZWC$6|Sn+*N_VG2p=9MiJjn!3$
zZJliw_P1t%c+V}lV)j&S)?MG{QI-VCCW;u1RGn3noqNt~QD)uhJ%j+Y3Kkj<M@Bc)
z5~L<0=ph_l*c#Zc!kv^FT~UR3YshoCc<VqY6lENXP+2($&(wEdL&4%zScQ?7*+n9~
z5b+WX5TP*nCL-dghp#n<k2Z{>&5=UGC^XaAhc}u7L-i}$FFd?O#qH{&^l_?lCr;KE
z9`#K<F-``Epg9j9LWcsEzDu##g@{ukZ0f==L!IVEpt*tHqB6GRurf7J#)i%LR)Xkh
z?)mqnQL$Z58KN*3?_~*T+`e$<j!Y1>?kwK=nKZ6mxVG5S1edWMM&uO!IW-c$)S9V*
z`sHT-GxclWKDZogiJm#ICCae4^xfZpc5Uf-0y*Gln0hf{hT7naDk<ESVoGX5UUhMi
zb{7{1j1zb-sbHKU7(>dmr4;(_h?#9mY4qI@bD%9{)RTisiike+f)R5_Ny8#*reM?&
zn8aq>DVw$G(}cnqf}p5+D4D6C`Q_Gq=AwjGWjYX@GWy#x`w6zl&VtBPr`-eezkotr
zGNSrM)uI(x7P_D%%MeG|hE0N?%cSfplR-f*p)-&vF3QkaqT|xJ{$3sp+B^m8n>1a4
z$Soq&74G+c%p~y+kvE9+0{smdAo4u^c?IM{tvNY`mhCJxZ~NRt4#{t>VdR>lWAz(5
zLk(lBd1MSp(jgA$Z#Q{CYu7NIZ5|y*XrS>Mr)Kc_PLk#e-hE<>1u&uy_l6NIy$eP^
zAs7KBhz;e$Ft|>RG&(sFQ4@zg$c`}_4oK=i2Mr*co|c2ifF*}>)zO&_uX#fGlD&*C
zvR6Qmp%7h8d23=X>#d-Q-*v02l-tlFSBXU);?6w>2zgKlJem$FuvI|~Ez6`T!-@~`
zRL|&S%8hT*w|`EgmyeccfCvGP)BfQjm5$Q=4;Pz#ry9no{o>TN_7h_&ura;Z+r}Sb
zq|3&y9kMZ5TZO2y`(bNGFd7`Shzn{{qGN9ae8fyaO4Di~MNylKpf;f{8GCgphGVg@
z7npJNr4i-fBM5zU@Ri9cIDT|_O2#)lTTq`XBkF*Hrnpu``>njk&f$wPRd)BPm&~@y
z7@8Z#P&ibElQH;?QYu@d7qUUYu~wL4qo?9qM^#4nCRmEH4+V@6WU1bZ%P@|ESP3R~
z*lo<kUMHeLksKxsDs$}&o$MLU3IpULR^EGcmAGDLc$)@@yoP^H-US-^1|HsQrurUU
z52{j7rUphD#z=Eyi~=+>+%SflQ#18jJ8z<cY384+7ak1*InM>J?T2#zVEyv;%!h>l
zJ@lai=rN$crSH;67kZYyvzwt_rF~Qf*FKzq3s4jXx8K)Lf==U?1IL5sH+&}cdzBzs
zi}*TQ29XhUCY4b-gS%p@Kj;|tAXAhvrlOn&F8HK(7)EJ6)k-R7^fJ6lj}Rf7x5KdO
z#Jlf;eLrN5KQWF6hNKsJ8}d-9v%b@i`)wg{-)=tq98(ddfB&%UUZ#y)29eqOu0W?J
ztadEm2MZ`V($@4|0`xEwB|qk@!kS$5csPL|2}Yetg<^`@^a5L_sfi6XJx-k8Ar=@u
z19G6#R*7t6oa)4&f@*sHi7_9%Pp=kGsbiK^_DjK!CY2fvz&Ob=I*m!g!B0;tOl%bG
zB@#0ugd#_@e=TDXh`~ug9SG>i7&QN?-#bep*+LaTm1R|MDMtrOKe!zkKL^0!XPfhE
z6R%M#$G1Vg3hTQ5bx7AQ>HpHEKhs7&(~f<iz4BOlr4^0nCwInL8g8xrv_8CZt)=1C
znuzPOJ7-%OZmlC>eQ>+9bEc(%X{D3;v7JOq!>x5JqECOM-kzqnhiE3;S`$%yf_U6o
QgAqNy?fx5I$ScGB56L<7;{X5v

literal 0
HcmV?d00001

diff --git a/evals/lib/__pycache__/replay.cpython-314.pyc b/evals/lib/__pycache__/replay.cpython-314.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d1665b9a2016a4e5733b6f84632c5adc3e260203
GIT binary patch
literal 2276
zcmZ`)&2JM&6rb_>D{IHZ<};Aug`~u$jSWpGRE?4%L`X_S(<L}9idZe)iM`3j>&|TW
z$e~)bz4TO6Rc)Y$M&bge=E#vhA(2}VjS{J9%Yhq+R?u7D%sK|r_DTEp&6_tf@4er=
zH*cicRRnZ<qjP2`jL`35ut}jnz)xpD@R5!@bP4Hr43CDqP*%vHF=aIDg{7>FMMk4u
zRLbG8*l65~OF1&uHmZ86l%r#b(WIBmB0aVn>G9pj>G0b1Hjq^*r}RWO$|vW)g9CL^
zp{7MNQ)A{8Vrd>V%S5|sdsVIGYG#GhJgsaxj`o}Xt>#sUwm_)cS9Y1F*&d;$XS+4c
zaou`Boq*Y3`LykjnmI=_=8-wgW^!Sjx^s0{WobE5s1&s7)xIOg3vxkWhPgFI>l+G`
z`9dD^gjuV(p4^|oFg$L0RrsrpYgz`IwH*gW5$U|Jt`N^KEwc_j39HjrOh+ed-to#9
zq7<JX$URbE_*4VIM<sMAbmB{dEYNkKk8l<1SP$un9xjF;Ly<=U8?H(pjXF3$!4UB%
zCEh%(afNyGE`sBt(=MGeJtmGh$lobmV1%;bWop(ew^q1nQmc5%wMbE>vrjtDiUeF_
zMaRBeq@?bc*9-OQJZTusu!&*N4%jXUAKMGeN6!$(`6q%cO*|IAJ7qa~3bROtDFi{n
zOBv<}2qMgJq1{5`!fFfYXrP&@OXy8;YLg%tnBzh|#KQtt9$|II_VSpDYc0b*M7M#+
zcNhW8hHV+s0C<qKT`(mB=>ow=Yl)p-4}UfMAfc@$w59Eh#Oq7uFNxerI7ic9@Cssa
zP{>ena_RpgrbqO!9@S&TFgP2hZD20F3m}Wgq*LJpk3+USff!8)qnKdrz#yCy`~nnZ
zI1&NL1QaZA;TdYjho1<x7=A2%cNF<sX7YD$LC(F11Xdz(kAz1ul8caHGw!3Q&07fo
zZKl1BrdqI+LM^RQQpO`<qgM%l72a>5f#$fiIy@!LmQ=zmZDdM>A81?WyYVTJia;Ch
z$2x+J5ido?MP{~82iiPI#{<nSR*#mV)7wQgO$jI^>7n!SB+``<1|EJZvgtt;LsVe9
zl_pQNc8g8z+?<`XzzcDaw$S8(QcRB=j|OVMKKidx=oc0L0T>M8gD49ZZ~~2VBJ^qc
zzsll#=)}heRRD!9bQVr8;cLi`WYO7u0>b^!pS#g9#m0Gj4r(G~TX}_t3P(7u^S0LA
zX9C1SnjaKJ+aR=2a;;8>_+A?H+rpsZLR)1+h1T3|Si(esJbwf(u4IO9&Mu~Znz%c$
z{K;zH(Dz5bIr_uw-%QlQ7nvS>OYOnu781~-n|-x+A=#_hHBDZ0t(W88yh0&LME3yW
z2@5LHqjseN4KgW<f{L8+WZk5U7|_ayImg?8gB`i%(Xd!65)L&FV;&0*Vq7hm&OA9y
zsY`jvA|5Gwn>$l+NqMwfCFNO`Zwm@goG(4$v1W(n3Y~}cS)Q)&Xs|2o7DicMOx)E&
zJw2MB_ksk*npvYon3JV=8-(XRT2Jr1G1EvN@ZVogbuQ)_sc!$wnwox~?q5~+FD^XD
z_OE998`)zG^=<#uAL@=97aHn5|I~U%&(g(4#}R*Qy*>SNW_Ypu<Ls^3MrQc_jz;^4
zKe85U|NOJ{)Skt6m##0(E_dA{_a^V3T<IKXq|U6w&-|L{e2~elW^#A7H!=hM+4V%n
zja{(E%*~nQ6Kk*K{#0=CFm&k1Vc7-F<+t&eVOVb2Ft}<M)AR6pi2g2Lmza20;1vk^
zREK;O3NS8|-P*KWkudP|3$=Lh>{=zr^h^4xNo$0$0MwVZnS+PNVL_J@Xv!SQekYv^
z5*!}{vw4RM(c@qu`V#8}_E^CfKM7&{A%2JsZlFC6QSS!o-az>c)caQyo}g!OXn=B~
Fe*hd?`Xc}U

literal 0
HcmV?d00001

diff --git a/evals/lib/__pycache__/reporting.cpython-314.pyc b/evals/lib/__pycache__/reporting.cpython-314.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..be353e00764fdd2e06d0b4f47ef7fef896fcdb0f
GIT binary patch
literal 19859
zcmc(HYjjiBndmusKP}7h8yg?_EgRcBjj;*f_yOi&%&{Yy#6crlvJn_doFidyX`^P^
z#BMUzqDf=XCXF(s6Vo&oai?8)*6mE8)86Jz=iZScjKv9ENz>jntN)-RlOb!?y7&9`
z(UD~%AZgc{yD|Ihv-dvx?DOsK`}X(Rs?Rd&C`jjp(qpj-iuxViNGVq=(04ziDJn>@
zl%G06v2+i;N9LE6VOiEA_shvu;a9*_-lN>3@~cRjqDQ?)<JXX~vPZi|=hu<4sz<-a
z;5Y0s`i*<C{8?qxGa4(kIr&b%>6xrPMw-C#l~qH1`SQI~Vl_}-Dcvn!S#6WtZ((%>
z6ss?w#<KmntO3eKQqE(upll-Ld^VdkvpKAV&1Li0eE44g_l0Z`Tg<LuOQ2i|b!*u*
zO*(%8TL#ZD<heq4&I-?!!*dnzt|GP)+NwxfF<TAg8d6^KOo^3hq}Z(o>CFH|Oxe^E
z)Dv_--b4jt-D--OkWmyzAI=_<?bA|}mRd^%<UZO*`Agv)weSubyrT}@v6iieao3S?
z%h(1eH<B{LHbHqkDO=eMP~J$&<!lSP3GO!I-5Bk!V7I_sGij@2Wfk!LvgErwlx^Fj
ztv_($gp>2Qd8VG>Ji|vB&OPpObDr^0X2i=e?o-Y&h6{}IjW$})JIBYpey88#9p`)D
zUftmhjQic3SRCiQV{Rxa2b@DtkPEsF_t+Tg_IWwKpzDHxST`RS^9yRbn{#=F{ec{S
z%+sTs+dclwN3%0bUweQ5(@oC=3bFbOJnNl+`Vgsa=-+?vFg9jmBj<KO%Zp5Rd(VET
z97ZT7Ax&wZEQi~>wNMCBcIt?%<?9sXg1Qpw5Pg(pX;#L{;lF}avMN^Hq^O{5nm@pb
zP=b;_?im{s<h-8~G`=yXA7F4oP;+GB{Abx?9^U`-@~1o_s5rMjz>NzE{Gf{ze7*`F
zar$|jPr0zY=?UJ=@l8XVbKK<}Z=7&)uBHyJ%iZK0b&vZS$ou#v1T)_><{4_@2)qFN
zjlMI2!Qn`P-0g61c`%|G{_{4-g4Cy!iB{dgENr=rmpM7P#G1Rw_1(>+@h)_xhT-c|
z$pR$;p*i^GAT=mK4ye6=vdgkllqq4aPmZ-uC<3xZ#NTF$I@#HW@LH8>lJ_F4_vKTR
zUD2ml)d_B;exqbf_SKPos{8x(%J#L0)ZtV56klu($*~_fhvZy8&m2@f#UN0tm{|G&
zuu`v>=Q;wbN5epz=R;tc4l965V-hYJfQv?gr>g>SJZ(K?Fh|Y(>p7^GzQV4N)_7go
zyARN;+^1y~c8XQnX`~UmoYkZ~C6k_NlCfI5lGWK&tbWj#8dWW|vj)4CHKxr7xNs&6
zYtke+lgy{^h&*al$z~!{F1-S0y9ud$KNQNTGQi^js=FAVZSIr$q+KNlEn~$I`ZN_(
zmQnjkL`br91+_))p)bn*fb`INkrqsT&NDjd<{ZObj&lQf%LG~DdV%J6e0w#+;btjl
zw*w)&Py4tX0y7lwjJX^<VX0>tw~u*;fvk5lrg|X5<B`4n0p=Ssx1K*7rk85BE!J+E
zwSLR`-NX0pBKLaEQb~VDS4salADoFiSA8&*{9w8lK1_W|<jH~(I8ngOmEug(K9ggh
zg0n(~T!;sB6~7_3`QqrDdG^V3PtH1Cb-X=#?=Ir{bL9`3J<#*=ase;Tyj;nd0=RH%
zWRo-HD&b*R7bOo1@~|luKtCQ~O~7odK7T4~of-Wvqu)K1oL*ru-OC{naEP-)wzSaB
z;o$)196lqclS?ir{jlMU38oQPNVmi1<vo7SDK{@wjJro+JH(2tBme<fU{B-)vybx*
zyLsN>JT>Yd6BY`{SdJk;i{rrH9V1T9SOB(`73+-PBZwV_LJF~WJqS|SR5!2!pUf`H
zwBi!lCz%|;1^VF|Rwb5d`4UAfvzQcdtQS{c-{UKZ#3mCo(r5I-PM;-x?C??d@Nr&H
z`J6oOb_p8Z4Pf#3&)DSLT6iZ{2AMz)^SB*VGa>~oqZUEB$Dj$<?KUN=X}wvnHe_9@
zZMk0C64{Wj<j%HyyX9j!Wz0%2*0*$T>gJ9uFfE^A#W&j*YFnn;U!J(fn_z_J%G-7H
z#Vp8vikVFfdwSr6ki{SM27pA93ohtgp5+@A$T<#=i^ItY@-g=~hr6v!w~Q%V7F2Km
z@Z^`sMtp*7WL!`s7hN^VIbHDGa#(a8h3x8T_)3yQW?@{xA-FE+LUwcu@SGrZE6coq
zOd)P=jc2t6ci$|jh--V#?+%?@s&8AYZ+ofsD*Y?tFO4tNPFZJaU#dO7JNS4^+nd&{
zX6s_j)(3REWz1bba*gLU$BoUw?wdtS+_?LEL8$$$13x(Mk~8XDYVKZa?w)Z@InNga
zyJN=PA8gzCQpExiP~2D@*H!<hVs7N^`Ww0}R|kR}A8JgYys$2^C!r}0KRKt47W}kg
ze)PYui<Rt7Xo|ucV`b~3<q1vUYn9<s-(450ZJX~;Xsj1D-AOL&T{$f9T}oEjdRwnC
zsBTjlrRv@v^_1=)eNWK#-uuY7n@1gC*3zX^Trxp+O`eT;k)=zgxVFIr`OlTvm^WCu
zOp1S0nlLX?LB2vFM1+SmrSCFO2tHUMR5XEnY*TW$aN<fMCUGcn3W{FuxSK<sVxu_=
zTtxYftSS-ANCx{a#jk-3r86y^cL%draouga&iTnLhrDwI$s;2o^ZPNug0$TdG%r@m
z%C;g#Lh{k94hSB$9UyqRFIg+LiMc(sGsDgXFGBWd2koQ#6?jH&muDjWl}v(_!|t!}
zfo$BTP9HfrzPt~2s8vbY?b~Q+ERmIzQLM@b5=380-dKJR=3-dBlS+2QAd(nK&;vog
zDy@S(UP^<w5F|lg+>BQxX<i_qfCSTrfM1nZjgi_Y+nkg=0cNj?FdM{8nxsBl&Q-~t
z((B+0L2j_=yL&In1uZa_35VM?>K5cf-qV8G=XAM%(Fbsok@TF`UPdhj0U~NKug${!
zXR>un0GIQl;41=MIVo@4Gy;MMzLsF4O(n>BItBHBD3EZt27<C5w?RSmcmUq$7Ze>{
z-fzp|l-Q<&H^WXe7C14^fq(<}d(7(@7feH>uVaXFJAqa6f*SjA-U)68c9M4<9N?;<
z#NpC#^;oAFhG886-i3Q2QSk(&57a}yO+{!#p#6dz-XyYi1u|AaCBBA7u!{Uo<bDQG
zw{-YXLF9&@wF3V0bC3b^(-_0b$mVGNbxljK{e4sSlsuu)&KRZ*q5d11qC0XbdpG?%
zW9}54D6d#5-vocL@=epv&X-Op66U<wqH{&z4NGg*Ev#7=H#aPqn`7qYXx@|x2hmPz
zL)O{abG6~l3l$OPTb?&PKWO}MMbvp^^z!K6uDhyA<mQE(v&YUIi&$cLRk57vf0NVN
z+B<40EAL|+r8K;7;A;m$)*Fi4W$HyK5!0W-T?7TdoS)PYbrLnlX9$<!cEfm&WHL?A
ziAO3xCZj_p16jEEHeTiw<ZgME(2XRdWk4|H-G5rmV^AB`vGOz?qyH0m%o`$)VHN2#
z3Yvp5{hk}JD+du~B=u#``cM>c0sWPL_V|dN6Ovxfstz<fa$M~F`NqW=0JKUdRt?-x
z<5K|-OKE=6+(DR<v)c4;8^oQ1B>ReQPOD!D8`b6{B!Kh0ndCevuGojW&#JV7^Wbh}
znCw7QYGuCcajjz}JpQAOq#!EZc?15H6~GmE|8g7NeWkIxm+OHy35N6m1QXt+4Y0yq
z^ZYqx%roLgW`<l?WM<F54CW&+=<qz$3-n`x48*B}&}>t1hu|`4JS+~$5WB~u4sXSB
z0F{mZ{fbRP7#L@VJCV&P(8$9*iT6)oHi+3XklECn3(H3^b3i7@0Tco~A;>1Yg3{$4
z^E-LO`=kga@}R6F4;uEK@Hs(?@InVdQ9p!2kox^9F4THI7i#;R$pW?6mQe3vc15H=
zW@}xnYMa`au+{%);?miRXRq7ZuF}6a{HwuhgV%Q*TF9<U)UErm`ikMQ;d<TntL9&f
z{%Y*n*!6aHA-gK}<lt>NrOo;<$)qYmu5a63>yPBTe)z)hob?AIbM70O&3Cj^c5Vhf
zBq&hM7W6~FbdUOpA?-6QRyFWoI}=o*4NXv>q5;4G!Bm@E9w;hDJg4EJ9zN<EABBQq
z2=qQd!8=d51?^aJz%#(h-F`6f3CamC$Ga!B<K71Gi4`Iik}(hreYcW8@u5aHS(7{^
zW(f+=gj{615tL)jA@`Wb2p=VAb;4_S+^LWU%IR+U4#`7c0mHj+hj@*2i1i{rL=2Ns
zk{{A{zX>&91C#2hj8YY-pqrCV6Uhtr*i{Lwic+7xOzm=3A+`H}C)#NPNJn-#<g{G|
zxq71-z@p*43<C>v!=$XCL7*9dc1+5wPfwOTQ+w3!_wlVwO`a2@jr>uMd(6c*db~|T
zPT2mN1lk6An8ERQd*|*hN5{ea2U$UmVgcAL!HzlX1`2!@M&^Pzyb?BSFNcdg;^q}l
zgKCOMA4)PF_aamhARmANY|e(lB}2)Ap(H#IX^R`0f?WxH_RM3`kA*E^S6pAQq_1Dl
z*UxoCEph#pU}wT;3Mr>1V)<L5uBd%}!~7HTyRUAD>APZzF0vK@lwdx3?Hk0lOXQ=E
zW}hcF#Lz1?zy1<2@UB2KoGd{!hRo}Mh_*tON3D~?6&GcBIY=*$+-32*Q@gB+Hr}Bi
z3&Ac6mx!F)-M`Bsj*!`X4+^Mh^ucv?%A}|#6>#9Bu<KC|lhQ@W`Upr)Utn~a?5Si&
zJ3?20MI10q#`$4p90wU_=SWLj4Q41VUZjaQM|7NO5ihhTCOW{1eVhwH4+5TMC<Lhw
z)tVR1eC^B&&wlOMaP@UHGlwQE_>0wUyJ^Z!q_x&xS64yRO}$|mbbpo@B|9PVjxQiR
z;l`3$S0zeMkcIni<0Udc?v^J9eMw4jix1&;1-D4&DPmtLBlS~~F5;u@tF$E4AO`Ub
zB!RsZy%P#a9&4i=bU+r6J5pA8V(iHzjilazS_%H!3@eUF)KVC#p28KzGz#-Bp*-LV
zDuq`3hqGvtmr@?urW7o@dj|GA@wlUZpsUZ(|HOd<?d+4i0_~hs2K*xpE#T$QxW|XR
zsN6trnHLlZ_xLFf=N;!>fYAko*XJIGEdxwH;0WTrMe2M`|50ui6jxY(gj~|gF$$7E
zU`Vo^a<4%@vQ*tr0RE#l&a_Uq&g_`p5i4$r>(>W6KhT?QYO-!=jZ>~rN2F#+TeqmK
zn;S}KbTgW1O(^Fj1K5Rh^-vFg?`i8k<59A@M&-1ZD6f_1+Z(i^#`W4a#Ov(y;`Q~H
z{v?JYBgO*-QP%|J3GncYiCPruu#e<6!_X;(l`1+P`AdEnb~Ct$+$R5iti)XgR#<r#
zl>0|wr4?~PW~U^aKs?Ndsd5QZC0ucV912{CUdB;2)rVEkV3io4@7MeXfJ51olP`6k
zoHgucL}ipAdweV7;=De_$@n>tOaO@(m(%ZNc)#-m)Q-EL1%#ErxXU^2*VPP<IRh?t
zLk;8O+#s_wGE&V^r{{Q}VROTlhK=htZrQMY{e~K*VFy!_s;_BdYP_4oO1L&oY8nPz
zUv6Ial}dq{R42#nwJ8J{=+R&w91?Ue2hPu<;LGg<r~;QnRxZ#d(v9J;&w{GY`5odi
z{{VWB1#Bnl@Ue>0>St8nR84(3{CGqe*Hj1Fkqe&v+SyRgl6viX>a_`j@ulIA?Zxpa
zZ_Kb3G+Mo3rgf?{q>L%@me*Ngjj3G)S@VPJS4}{_Kx)Kxh045$e2<c&#-JyU@;k~=
zGI|5p=I9M!3%M*$1lmbTA{>1;2)B=7gMe^A!6}g^jqSO>&I@j;l#m37m|}n8i!eWu
zGO|brX{R>9#&rZn#jA~;RMrl<>TMv|fbK*ID&FnnhL3W8g*VDk&KvN-@)1~EfT~ps
z2zFl{m+-jz42PmWfy5Rlpd~^#qy45fR2%M&Ybt{6ACxgKb%f|y^*MEzep&b8!MXgH
zetk@_KH`*0%dnCbO#wpzaegtQ<4Qs0^DW^?{I+Ftz)my(><sNibHL6B7+c}8JUi%1
zQfhWlnD2uGmHUaB%_tskP7b<aYXe>Fk)n*Kj8mD604e4JFa$N61~pvfqYtbHp(ld@
zGS!=47AwzaaLo9mgj+B+1OyPKXiR=i+64mzBuwth)UIYVsdkw0>a+}@L13n0T2|$c
zbhiKpcYz}S+3ZS_0;^jmJpk7VY5>Vx&qd(ovn=A2YLJ{#(!J!9`b^fwOy)^$8iu%L
z7*qv8*`v=2H0TUE?k{1W*4{sK!KF88f&3erMo>VUhZ`JiV9xUqiI79L3<q`*iaU(i
z6lO^diCBw)5KbNpgvhKU*QkNGxEx{&cLiDqvR;5fkV<5kW=5t*X2zz*!m=f!b-`$j
z8!Lj{ALz{qQ}&XHfxnoEN!aQxm0l{HD_sD)-1hy`-3epXl5s8k#f)ncc}3`Y44+&o
zTfb1YKAyK>y8FYtbyLvEGIM15NVp?ni5shyj13FMhB;TXBW`S2GVWL~?uZ+A1-oyR
zS4Bp?UmulSFW)%T9IE)Mws78}er-&#V_pT>GTHpuL_KL<4jN(3GQ^oc1Ot>w9w{As
z6*{Hd3Oc$KiMlmrsYDuYoda$Sm*vSqXOfcKT6woTN#i3)TKXcY15yMlBkBVf=7DP?
z=OBrdrJKHBOb8kv{QxU~VilQqdS>dCXfOl6QTk+QZ<MFMk*Lq0IB%4*>hyXLrkCfa
zNpHa?(t;RL;xdi_RHdD4KRE7YAa;lIoMwh0{-~94pYR^@tYds*0iFSI1tNLeAetbe
zGw?U=9dAH$3j;<nuqV0s#*Y#Ge}0AY!G)+++$AW1q7C+#PjEpPu<$cLaC@QLOEfUB
zqo<TFLGS46u^$|8^mH8T6$y=y9Fh)f?u5|jQ-XZr=$Vy5wBF(Kobvh|IBV`2^d>0t
z4JZ)(?71^HO_rn~e8x0wA_B=&_t(FC%Tye0UNn`*H02<oOzpd_E=XkM%y_0f5SUPC
zTPU=}v+9<zHpj9y&y7sUZ{?JRk1yub#mx0{TcVbzGG=Oy@{6Xnn5OMkR#CWmG0Pe=
zmPe|7WQZPm+Z3y5iRrh+6x)`OQ=+6VEYo~`YC+Ql2~a?prk$p(cSs)P#j7|MAca&y
zmJIjGyJ66(oW;&H&Jt&-b1g(a0_r!)z;Cv!H`Ng)E$@@QBQ*#Gk&qH1gEpneXh%x*
z02_JQlYp{_8ziB(4kNABW26;q>kw(BlG>K95LcC98{8y2E|cO}0uVpLDk1hywH09v
z2|RI17+V#Pktjqfj6Gsk9idi{h&qi#R7dFaR_XhYU?6g_LbO=Jn^fN7Cd#{}iSnt5
zx6ui2Q@|XNyRAy#Tj+&Wja}2f26}w%Z-c%m&*1Wqc1<Q9;U)!A6stMx{{qj*@R@ym
z;y3LZ7O$)pB3yMx=q8zWZQ2^C?Q1vDB+68i245}s7&GC8fi-5thGG~4;-SVgxGs{2
zAi+2tKUh=xoz$ZY{0|x5m!0MpRZ6{KC6~cQtxMq>z{#B6BK3#SwXjwoK4Yw^x71tm
zfUj6%FR_<ylCwDv`Vyb+0rRx<mjX-9?Jt4cXxBk*+L($Q1CPiqaSa~ubY9v_Gs7W2
ztp`1l`w+@4Snk;+J5UU3Uf84CqxV77vd>TkvAjMb;&m6@y+AOV&gd0>5H@GR+syQv
z;t#$odzQFe@7oTBkc=Y(ywYhk<A~CCA)GT)5YV#HXG+6uy_GuLn-XS0pkAF;V=7?D
zMiEPV**>$~%vNuMN0Pxrz+3IcOjtUIn;A)Vct(03%(KXDg1p$CjWd(x57+>&CD9BI
z;3`jOlF_Mv%qOS%nB?GT7fDG-&MY5hrpXjvzN#N^e7hzs;xY$#S5{`u_MbKf3(O%$
znrAZVvr76@b`8W>=G@Ow(zrs71lN^OpGPQPuHEv$aZ6xa%R|StJaF7RyTMA?bM1N3
zO8fHd`D_ir6cE=+fVTEs{v5ahPe~(4xUw-tHDx%;brDYnGV%nwG4*wTAt0ufvTHN)
zW@GA0B(TDD<*p4XX_?duxTP#L@>d?hX_)jU`W`}EKvke<A@B7S_zDM?SJw)wP<a0;
z_=+CDC(G>x_wDf&P`6@d2Uud=3XvY_D(yv(=cZv!p)`M}uiB`W?tnhQLra1+zo6tp
z&LgD9`l?d#znb)tt=4IP%@rkd=P0;3A@0)cVu;kska!YuT)}_T%}gAHa|lT>zEP)R
z!9C7^rx&Dm-p@0&#Pi(baoO6EHc7En3ifXxHa%!MG34dj7>tu4x&Y&We!8S@UDONU
zjd(@@rZIGeaWcdtn5G>_;hl{PoPj_W9M9nNz>0aw;|=gSpPOqyd0m`7&+uN--absu
zSKy%wCK*Z0z!$kt0pY<`3g1S)w^28_X1leaL3E!t;0b~TnDq%bRlqbf?3m22s9^f>
zGy{XGC(HrQUhulfEJv>wW=ZZE9Xjs60b;KRM?>uiOoH@XkkdeY7=&C*83_gKap@xB
zkzt8E!8Nx6#n5I4ekBmFz^W>k9DV!S2f7ck2OJ04&Mubt4ZyD;^PNcdI%&3#IZq6^
zoI8ZFq~G4<8HIIBi>*#}2*bGiTF8Rbg{GzQEeqvaqC@lMpSb7Qc=@iY<yQ~=vTDJ)
zhe-brLo7$T-*eD?K+qg@P8{clo#PO;>GP}r1xiT-6i1-2oeom{bf&!wK6ND!v=R^+
z=|MR>O44e8NVId1-lD3YxL-miuvfkS7h;eEAjx<REYi4!#PKHDR)WMI$GwaF{wHLA
z4gdLC$o@S@&F#I^d$D(Z<Gl0lw_LTx*LB9~pZ>+!*nmBD_)CiePsVpY^<Mqc1O$`X
zvxFeP81b1LvK60=Q{IFafDXRxbSerN1Q{RQ521(YE1rzi2L0^5SZ&d;kqrI)6^tcl
z!42RbK7geDn*16D8v!fv=#k^TC6)s0FF-H80J484f!#G-*A(;RbJ}S6CF2i`|ERb}
zT-kq_UqpW<W6n-0&NehOa8c;K;^WYBMn3K=6t>G#AGZNNZaMa{UzQQw>6I&Or&l6*
zzoK8hNg=JQbWMT?-bz3OCOkZ80E>;p&<odVa1|*)DaJ_?V5!Eu-s1ruL~cM74Mq&|
zh>z7wFqj544)#0hEo^Lsi~!~~MHkoItgH<WL8$ba>d44j$KE^^Esq}hN!5JLeEZ+$
zT~%J?V;lCxtM@MzABYtnc)zkX3A=6E0d{8@4z^;7i$E6eLgv`-h9z*dGU%}<29KX#
zVbEcpiZx-d0{r#J^N5KfX(SOjq{#R@5I?xWYJ$uUSnOZ?{K{W2kAX48k30`pQlJgr
z30!K($-BWl1RVbqjEg~oWNZwxM5mE+2u_>@z;Tr1W-$2J8F(Hj2aaQnj0k!XckdvP
zK*Z81(5)+Q0d`jr7h0f@!URC&FMKS412K_$z^Msbp?`q?%SRIOY;T^7SZ7CH9sT~v
zdmO4?|DUGyvlY|IufHgZY?z&Rb>eNAIIT%@UEL`@wGgd!bqt5D|H(Cu^lDf<urFgA
z5fAX6EZ0?}HXjTUV`|Z%#=sMJ1!I{_x8i$K{H7CzPAvln=qJtW3dYKkK!&6M55BF`
zIfcR`X~7;tNJ}oL+#@4!W=Bxq1bPW4ddTu1BPAAA!dMF1-|PsVocq!{ZS&O&^}F6M
zf5-g&j(a>>T%I#A^s73Co|M(qeWHL0#QcC73IH0k?FCA(63#=M@PV{}F9-PmL)bog
z{#$s!Q3rOsQ_zK*!0a?+K>x^!I1n2tn?+DLec(KI5$hlLw7^H@c9CN$#1qN=0K1|_
zDV|CI{am!e3ppqO@QrYWuu-BLPQcdpF?#_rK?&~o6FiCS5$*Zgu?eZoRt7OaNREP5
zdYI>ai_gemxj1Akf&zFq&k@d~!B`O3L3v`5Bw=_6=y7lv0QVjlA@3BF=tSTlunfpp
zko%pZ0)3oAj1%ZlK{hnX;c{d1sZnkmirla9Ud~Ox0TK~#TpX*HFiT*DOpuE~#-sMQ
ze7GWc#v*mug-MKCF|OEDxKH6ZVj%p#fOP)>r%bXagDqCq5z}=BJ3i27Cp7vQ-Lwvj
z>(@0^36o{!%=DSC<;AZ?6pN<Hn5Odgdh_L~cuvdZs<}h6d(Z8S&~K^VRL3o~m#Y?X
zT5jg=`B+XH_R_)5-|p&+XOj&oEDLXVT^X7PJ1-dK*eLzZ6LDr!tZZ{EZ_9h;E!Pwa
z*<Hb&536f~-HGzb;GXx5Ip7dB=gbzpQZ&_`*tu(}JJi1XsyXwef{O)F>y_HewJ}TE
zf?<1N-TJ9Lv!mxm!zbT3eg1STuQqP5-BnUKYanJi%W_*oSxUpsNZuSBGuH+8-qM>x
zwbz-auj>bcogXUHFYNu=Uhq^u@y1i<pSq4)YP_Z^UfQ*2>UtNxdH;3Afkd*?sn;yw
zf$tVY4!vHwXsVg3STr?US2Vt#Up%dwQcm#+Q|`><^yI%Kcb~g**oE#=bavg8;zJRt
zAvR3l5Rr!ik&Zb<<kTD+v#h_N*#ME&1#4iYN)4F~_x0Z7OJOg|-uf{`D_fvdV}4Iv
zm|CGjH#F?OYT6UoIaA78=6ryarDXQ8bB{%=Z`Hk7H|M-GdT}&*^2)^JiI{a~+_Gz`
z6V~&aXK!W~Ci3!UpE~zcXn%yhux{?;rHP9Z@$#)PYs)-6Z~a-#-_^_)#d12PI&KxN
zJs*f1`tDbk$~G;OZHkv|xl!1BP7$I*cor#Exb22zTOzk;cK^Bk7rN%GbBDgaFY;KF
zj#fnTqRx3b%Ezp2H*&Y%QBp-)Z>y-%wQux%rzf7h|9nsA<fYPgN?$S0Z&+&Ex7fCC
z)*LdQ@0n_kW$#avTHhG_&S2c!cYZKTUwY=9XI`<+pIq8`aB=6sS!>98esJpXn7QwR
ztu3!qEZ{~G&$GpIY(MIcn%}nH$k{aSoa*?Xb^9xu7Lc9C^Xuc5`XBMpjc-rfuryz_
zPIcbQEsMw_1M%FtsV-QvSYiE=sXmdNH)}X&xS))<=aTptZI2E|pNN&VEoE=NQ%dC&
z++Is*4M-ioa>sIO<|;4QF5049@w(Q<+}8Pl8@cU?!e&5!ZOf-BN@GZ^)3)oH)<j{+
z8@ltl?-{1|Om&1TxAF=Xkjcgi+v9od;I1>6X7*0+4d;Nvj()v6+!EP5C!cG7M-@3e
z=Z|*G(?98sw#;u$6f@z%i21^%2><<-NL_4Q&uuwfxQ_;uGw!Dm%vFFThU}Sb)7wJ*
zFYX9;F6x<>g1Ps{<1}U2cJHI?u6uv%rYZ9QIALV&<JSW-xTav(7R|MdHSA-GU$3?F
z%T&K^HbMT+Iu+)OrC+W3=N1#>|I46)oNQc@l@~Vv;3Hqi$q#%Koiu3p7Q@yISc1*$
zp(l&}1=>?y{%qQIhk`8p)NQ=XL6j?8M8}3SHRzA3i#WZX!X!bHOSO@M-c+VQ#0-W7
zdf7Ng4n2R~=VZ$PL*VyAQEOF_#A67Dg}|gf++J5DFfxGirBAo28g7#<2)k7YS`-jb
zf=-=PiB%p+#iKwk4C>F+OTD6(<^jFHF(4^<FqZCf_0p3-UjqsHHEy(kgC>hLfrCjo
zXqKj$8B%OEIC9LABL_@Fi=eEi?5aLQn^mbMdG7*21Poj`eRyDZRm%LNv>pd+&<@hq
z>XaH8un9)C9LR%yAbG7$tNP`Fp)3y^L#jS-XRS`DzTka<=m&M`9=;6k0k-*>pF}6r
z0e$8+c$+elag8m2{%9mvmEbLfgN#%OeafZs@-@>>j->0sI0l(-(F!L92rr`9nk~+_
z5-AfmY5`JWi_`DGH@#+syqr>7_QL}yc^ROaS#8Ngdr6fPaO8fiy$VLnEGr9DNones
zHx;UxW{Q#YbRwZ40ugE2)O@id(mty_r=2yuYI?im9*5S7Kj(uaBw4oPrxXF{gOWNn
zSA^fP=7Ar?{RcEH!2h&U`8#pnU252|*sz0~${)PPAv#PdP^sZQf?@y}5<|p&)QE|<
zpXdNg6H$wa=RZ{eYB6#22ax46s4p-9q{(C~j#z`|=HRCWM9q6rrqfLpqzp8odJ9Gx
zaQlN@!-#f`)c+N51i}>URk*XIGkgfdOEUk=3<iyJ<aj+OBO!QJ44a#QQkpf6*qnsQ
zhoV5z8i&$4k4!B{{kUj;!{3(7pZwYBzdOCKUNpdQ|Ag<jgjoc$cQE^}nEe#9o0#FM
zaKPs%VxhEI$gylu^FfV;1Rsfd5zz|1hjlMt_Iu2}gxO)t03#?-+xan;euo)qO@h&3
z?_xW9I+D>A=>8=U@Nfp>9+rvHqsE1Cc#jJ-{Km^sx6>t_IYRY=y9yam3Rj--&bc2z
z9E8bI|AO7W4F!xaG%c8#=W;KV{IDc$YL07~gYD$-_s;NuDE#e=XYCB`zNOJmb<XUc
z-XGT#LCi*0PG~bqxS^&<Zp4LFoycG`H|n}_?DDbbAc)G3(*=ow!r8#NK-eB>fBD&1
zeobTo1l+*IK=kRW=6F*_tg#dNb<q|)OuyyamT-QAetBmsry|lkXT4N)u_~&Ho_xCr
z&NM-{HW+Pf$P+#pF@wao`L<eav;{js#4=V#o{j0Z20L#VEul`7e;2CvEE#)1acOK?
zYTOBbvBsTKuJk~X>sfV)9LrKp1^mJLrXNpO%ik(}t29!&U@VPo?!2u8=@O)5gK?&L
zrg^IQgZjo$f2b>*2WrhJ_yvcf3pq8>_AA|&yQex58pDi!S|1t;o8KrrUkHcLD=wUX
z#-7VPR}Nl2xLDm8tLnPi|E}d-<-2Swx9^7L&>bU{wU<V9FFRB@YddEPcfQ=TsJBKy
zMy!n~YKim&0n7SiSTIDlgEou#X*k{=;6M=v`|v3FX$YTZ%<H$wdV6hEg4*G5d50Yi
zLFaIc1mL%T@Rxd!L36X1y$PA12hDj5j1l;|N#d^y81Sc-8j}ZVIR$pokt*?MEZX^m
z>}7?X_#}8yW>n*_C@YbZaDoDj3W64M2mVG9cMO}J!mJCkA<Vp(J&zgcz8nQeD3bYW
za4Trt<AD=y4jd}n|G?&dz>L@#eu$+jm|er{Q_K*R!NNv+1lmU|((BiYnzm*;OfxX%
z-ofQU7x<(2BFI5Rr|Fwi({HHq_bDy>{e~+0KNRyDs^nj(?F-cQ-|EaS+JfqYItwGD
zV|li@txNUW7wWgic6=#SSp%b}Z9(}b8u*=~Pb%Rw{U?QR-1T+|rPNOy2)E5?qSf<N
z2?hQlfjUwe)y_936uPN>pe{TXt(o6~Eo*LTmD#eO;gf7UB6izA<rIdCBTq-0t~S5h
z8q2l^ji0FCaL*?>cq$}^$}N4b^w4z+3mT1OfWB#AL7OhMhTB6MzO@<d{z$5WrVmYd
zp+NXJghRmuVp+R_dK@MHlN<=7{zM1Cx1SV4bl#`M7>M-g9vR5#_wEjpXUXyCpWBQb
Hb*lddq=@@p

literal 0
HcmV?d00001

diff --git a/evals/stackhawk-data-seed/prompts.csv b/evals/stackhawk-data-seed/prompts.csv
deleted file mode 100644
index afa2673..0000000
--- a/evals/stackhawk-data-seed/prompts.csv
+++ /dev/null
@@ -1,17 +0,0 @@
-id,should_trigger,invocation_type,prompt,notes
-ds-01,true,explicit,"Use the $stackhawk-data-seed skill to set up seed data for my repo","Direct skill reference by name"
-ds-02,true,implicit,"Set up data for HawkScan in this repo","Top-line trigger phrase from skill description"
-ds-03,true,implicit,"My HawkScan run has no data to hit — can you seed it?","'no data to hit' keyword from description"
-ds-04,true,implicit,"Seed this repo so HawkScan can authenticate and scan real paths","'seed this repo' phrasing"
-ds-05,true,implicit,"I'm setting up HawkScan for the first time on this service — generate the seed data","First-time-setup phrasing"
-ds-06,true,contextual,"HawkScan is logging in but every endpoint returns an empty list. Fix it.","Symptom-only prompt — the fix is seed data, not auth. Hardest trigger case: requires reasoning from symptom to root cause (empty data) instead of defaulting to scan/auth diagnosis. Failure here is diagnostic — likely means the skill description needs to surface the empty-results symptom explicitly."
-ds-07,true,contextual,"We added a new upstream auth-service repo. Refresh the test fixtures so HawkScan still works.","Augment existing seed when data shape changed"
-ds-08,true,contextual,"Generate idempotent SQL inserts so my scan has a logged-in user, an org, and at least one app to hit","Describes the artifact, not the skill"
-ds-09,false,negative,"Scan my API for vulnerabilities","Scan request → hawkscan skill"
-ds-10,false,negative,"Show me my untriaged findings","Reporting query → stackhawk-api skill"
-ds-11,false,negative,"Configure HawkScan in my GitHub Actions pipeline","CI config → hawkscan-ci skill"
-ds-12,false,negative,"I just fixed a bug in the payment endpoint","Routine code change — skill is NOT autonomous; must not fire on post-change hook"
-ds-13,false,negative,"Refactor the user service to use repositories instead of direct queries","Refactor — not autonomous; must not fire"
-ds-14,false,negative,"Bootstrap a new Spring Boot project for me","Generic 'bootstrap' meaning project scaffolding — not our skill's domain"
-ds-15,false,negative,"Write a database migration to add a 'role' column to users","DB migration is not seed data; not our skill's domain"
-ds-16,false,negative,"Start the docker-compose stack and run the integration tests","Environment startup — skill explicitly forbids running services"
diff --git a/evals/stackhawk-data-seed/prompts.yaml b/evals/stackhawk-data-seed/prompts.yaml
new file mode 100644
index 0000000..5717341
--- /dev/null
+++ b/evals/stackhawk-data-seed/prompts.yaml
@@ -0,0 +1,80 @@
+- id: ds-01
+  should_trigger: true
+  invocation_type: explicit
+  prompt: Use the $stackhawk-data-seed skill to set up seed data for my repo
+  notes: Direct skill reference by name
+- id: ds-02
+  should_trigger: true
+  invocation_type: implicit
+  prompt: Set up data for HawkScan in this repo
+  notes: Top-line trigger phrase from skill description
+- id: ds-03
+  should_trigger: true
+  invocation_type: implicit
+  prompt: My HawkScan run has no data to hit — can you seed it?
+  notes: '''no data to hit'' keyword from description'
+- id: ds-04
+  should_trigger: true
+  invocation_type: implicit
+  prompt: Seed this repo so HawkScan can authenticate and scan real paths
+  notes: '''seed this repo'' phrasing'
+- id: ds-05
+  should_trigger: true
+  invocation_type: implicit
+  prompt: I'm setting up HawkScan for the first time on this service — generate the seed data
+  notes: First-time-setup phrasing
+- id: ds-06
+  should_trigger: true
+  invocation_type: contextual
+  prompt: HawkScan is logging in but every endpoint returns an empty list. Fix it.
+  notes: 'Symptom-only prompt — the fix is seed data, not auth. Hardest trigger case: requires reasoning from symptom to root cause (empty data) instead of defaulting to scan/auth diagnosis. Failure here is diagnostic — likely means the skill description needs to surface the empty-results symptom explicitly.'
+- id: ds-07
+  should_trigger: true
+  invocation_type: contextual
+  prompt: We added a new upstream auth-service repo. Refresh the test fixtures so HawkScan still works.
+  notes: Augment existing seed when data shape changed
+- id: ds-08
+  should_trigger: true
+  invocation_type: contextual
+  prompt: Generate idempotent SQL inserts so my scan has a logged-in user, an org, and at least one app to hit
+  notes: Describes the artifact, not the skill
+- id: ds-09
+  should_trigger: false
+  invocation_type: negative
+  prompt: Scan my API for vulnerabilities
+  notes: Scan request → hawkscan skill
+- id: ds-10
+  should_trigger: false
+  invocation_type: negative
+  prompt: Show me my untriaged findings
+  notes: Reporting query → stackhawk-api skill
+- id: ds-11
+  should_trigger: false
+  invocation_type: negative
+  prompt: Configure HawkScan in my GitHub Actions pipeline
+  notes: CI config → hawkscan-ci skill
+- id: ds-12
+  should_trigger: false
+  invocation_type: negative
+  prompt: I just fixed a bug in the payment endpoint
+  notes: Routine code change — skill is NOT autonomous; must not fire on post-change hook
+- id: ds-13
+  should_trigger: false
+  invocation_type: negative
+  prompt: Refactor the user service to use repositories instead of direct queries
+  notes: Refactor — not autonomous; must not fire
+- id: ds-14
+  should_trigger: false
+  invocation_type: negative
+  prompt: Bootstrap a new Spring Boot project for me
+  notes: Generic 'bootstrap' meaning project scaffolding — not our skill's domain
+- id: ds-15
+  should_trigger: false
+  invocation_type: negative
+  prompt: Write a database migration to add a 'role' column to users
+  notes: DB migration is not seed data; not our skill's domain
+- id: ds-16
+  should_trigger: false
+  invocation_type: negative
+  prompt: Start the docker-compose stack and run the integration tests
+  notes: Environment startup — skill explicitly forbids running services
diff --git a/tests/__pycache__/__init__.cpython-314.pyc b/tests/__pycache__/__init__.cpython-314.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c17b561fd1fdcfb95c0dcc48f687b6a1b2df703
GIT binary patch
literal 154
zcmdPq<K<!ig2^`}vq1D?5CH>>P{wCAAftgHh(Vb_lhJP_LlF~@{~08C%UnOSIJKx)
zKdC4&FC{-uuRO6RMc+9;B~?E$JvFaHw>Ud9C#P7yB(=DtSU)~KGcU6wK3=b&@)n0p
dZhlH>PO4oIE6^N}O~oL_CuT-Q#v*1Q3jm7fBbfjI

literal 0
HcmV?d00001

diff --git a/tests/lib/__pycache__/__init__.cpython-314.pyc b/tests/lib/__pycache__/__init__.cpython-314.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..24f6c4f519bd0d774101722a5ac550fb779692af
GIT binary patch
literal 158
zcmdPq<K<!ig2^`}vq1D?5CH>>P{wCAAftgHh(Vb_lhJP_LlF~@{~08C%UVCQIJKx)
zKdC4&FC{-uuRO6RMc+9;B~?E$JvFaHw>Ud9C#P7yB(=DtSU)E-Nk2Y5GcU6wK3=b&
h@)n0pZhlH>PO4oIE6^;EZN(tQCuT-Q#v*1Q3ji~RB?ABe

literal 0
HcmV?d00001

diff --git a/tests/lib/__pycache__/test_adapters.cpython-314-pytest-9.0.3.pyc b/tests/lib/__pycache__/test_adapters.cpython-314-pytest-9.0.3.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bb7b09001f5dee478db74eb0d60dfd80958c8597
GIT binary patch
literal 19301
zcmeHPU2Gf2c3zSzisVw1WXXS$ozRk<n*JfJKU@A$633EbC%Lw=j;X{}!b?lE#DpTL
zUCOc51W}O-Tn7yh7sV}{BE{81AL90*LZ8z1p?<r0q(n*d-QJ)r+K1+$Me5w5fKe3v
z&g|~&l3JV6jng{lT}Wro%$%8>|2f~88SW1?1Oz-U{CdaR(~=;Jk;8mE2J`KpOAzLS
zw*=XBRw%gG+kM6}<9=E=BNpnO)<e97dW5*o`p)<Ze&**n8#p5sB*u$pgJ&8F4NnVi
z`gaL$hI)}AJ;P7wZ##AJlj{y3cjhB|?XE&oRhpfqob~QCSf{kt>sQz$*KZYM-&P^h
zRA`a?TZKZa901)cOQ2iiAm~=P0d$)j0^Ke*g0{&`pzX4&Q;0NwP5C$%acSakvM`Ca
zG?6MKlH<vIA*I0SA5JQ2YFwVlYQC%K>xCI5rE2wRK}jX2)G?|wqv8ZNcxs?;cZ19c
zap5g@|4#&A9DX~5S6!1X*)@s~{fW`X-Ct82&6m$43s-Z>6h7xc%6NInnJlQ3c8B(S
z?4p`d)YugznH|q%qwgk_@z|-{cq*2hNM#EL)pydFj2bJX)Pfqzq^~f}EQ1=&-_U}I
zL^7Mr6_SNiBB3-ORqAAF3&@=CpF*9h@!yI3X%BmS3vYpyjTm}F{Ijx&$kCi&l|vTn
z4KKUwu6Sjb+a8v8jZ)d!Gfb&hJ;U`?9&pN@xDXV&-Emj15jXFyk&1t9Dsi{j)$^XX
zdzcNjN~vM0pQ<PBF=IU~#64&`QL9T`OJ-EfKbcgrDOF9WvjL;qP>o9o-is2Edd)8n
z4WCcQ=g+^U`O{PRoKiqLY2r*Fozb?c`P5kAs*;;ZTuo<EiA-)RSxD!yTGLc+d<GsY
zh=eq6CYKyfDVmhJjwpI?L{zAlnwUoi)<h=fL6Az-`b;<+R)Q4Vigc6XX0I^fDmxLx
zy22QkIpJ4Q>#cWJq^IWFSEK`<Utf`qd~sk!Iz2atfKQ(N=-IN=wJ3EhynNU9#cZ+r
zr7s6d(yIszw0si%C|V9g76Xybcb5XOxl{K<-%nruy_d^k`=Z!>`_!@+z9)s|UQ$B%
z4t<M1G>fpV6-7SKwanop$bwnrc?_mulBs$|dB>^jcut+ioNsuVu2J4O;;vEN4a}}J
z-SIod*M?jZM`;MlZrLM?qg0vf8K$CCJ-Y<+JBT7o?4o$1O{UQBCX=(?yM$4n=`!<X
zetEpTDt@aeX}z7||Hvhaa-HnmP+rj!YUdS0&uAZ>$E{WI9P?<!L*Be{z3kgi-qu%-
zRvT95aeei;CO-1&ll`{#b1oa7P-5Z%uhlYn&nW6fe{5-NGySahE=(zAy_#<<7lFK6
zmZnb*w^@=VH~m_?j(XViRkIx?eo@s9J4{*Yy-Sc=;@%#Y+&VJU<L)&^k6dTVopT9A
zGgor(3GP)5HOo<FmLuk=XsLQRBsa$EW3CEc-v-*xXQphvsr`<-^5Suql9HQ#+ltf+
zhn;&DpHtY}@Z`2)Gg{F>JN}Miar1A-V#TQYp2uR`H5|5PM3c+=<xSIX#OwOcU{2*L
z|2pT?Epa#I)U6{wv(2fVCwQK$uQ^Zp%~?!-5=WySbLt*B0402zRyPJ1bbZ#(UeUt!
z!fam>y0}sZ@7bHb(Hl_%ziTkz0p@imqV6T1u84Yod2~nANZ@_VlfQ9D3#8R_RxKp6
zV=40J(mV=qAGmIX7E9uK&&_sBCf|K04COyP4m=ee%Vn>oC&Gv}_6~(aZ%k!KvWH0D
zgODvhM<ObD9(mBtl2FHz+3>-GsqD4zVCq^blgm#5&OHbb0LV<{)WWg;L;Z)+uR)!;
z<I>#e>>bx^pON`+I?J-(XBH8RQN*5z8oiQKClh12sVN{vHSj(n_N*^rZl;i*DI^N1
z>jjGTz&$V!F}{kL(rXipRhv;WM(&+d7M}_a=oO9pM4A=S+O=SwFmyr(&6?-*+1E8c
z@bftPiUQ?M(5pB2eEvq_%1k;_Kr0Z|K{cpa0KNom6G=temQW{iGnw&(lFBQIiA?TF
zGLy(Ar&5|fp(d}U$Xg4YWIX@~YDiIXiq?|Q(R3np9Y{N?W2=UW8q%5*_*RA`i&{f|
zQ<`KJ3{{~um=0FTNI-LsT~|7(a1v!&Z{r@_p?|H<9&*U;=(Rh#?Qss<9sPE(E|s)X
zfH~vQG2K>HFRbq;>~*Z1g)o5)^^bs*0qgv&w?<0-opUeT3j{xjeH8nw2MDkIK)LP6
zV%w2Y+tE_s*xadKNt<tteRi}Y?J0_T%F>=PsO_&L?)mr__gJcMac{)DHO5^Kvbnf7
zgg!gUT@W%|tKy#Gn(r=)d+tfGqBvOY8C>ic1XBbf4VK}<{z~wXVmiNK<GH}9IJnTm
zT}ze=0Z5C7*cnLOllB(Hf%1WY#RCIiieRLHGJM!y2|m(ZonNu>Twqlk`1}-iEm<xE
zAT1tZXCQS?YA=e>pMUV_2YB87U{Q*eMdPm|MvGFr?p(3)TwoP3RPI``TomvD53n;3
z0x&O2I~JuKMX__<!^oo4Sr(1IlGs_4c5o+WZlX#JktMOybgkN?mnl6iWM>ev))GM0
zC}s)B14TW|5-`f`0J1CUlHErPte1DeMqnTsq{AOzD9?2FE@7m_MD}@67D;D&$%dYQ
z@lP`R#PQoON98;z0r+Vf!%y2QbvCT^96Jtk`+z-Oui;=;0Uc>uUvjh~#JTmuM0n#m
z(zfDpyMp~oRy-YPkJggs=#^pJ$6pSv-EXiQJRZv7mjgA+!4b49)s(g2*_yHzX7pC+
zsSR?-0W(rBW{)jpXWZk285@Ba-T3#I*zuvgx+w}NYc#|?wy_|h&tE~GKh68RUv8eh
z6Az%z|BcxW=<_q{?DG=z+qRKEw&}NIU3&a|<^Q73Z`x2fpsc`3W{oMwv1=KnO+DJI
zbHk!%#djRTYvW@KZ2I2DK#Q$Bj`@ju&2q@CHDxXAu(_sez)Uyhs~B5Bd5gR?9;~v*
zG>oo)Y&DoEn{U`-s>W8{i!nx6;;-dKV2O6+X$)IsAIKe-asn^SM_M{5>8~ka3VWW&
zZX(YRA@xt8mQ^Aku)p8vI`p8?rtWt^QD)xV=Vt3uW0Se?WcoBm5=U837o)wE@eUjD
z4l}i1h4%gyY43m!x&jWyP{1FelaHqgsWE8rN_t`<r3Bt*kO#OxhdiX3Ls3tw3_)}O
z65$IUGkgG8q<^9d+;8v=hb-c=dn=Uf0s4}j2R%PWEAs=?7Lgco(CaExSsLXCRrvss
zJ|c&S^b?`gRXIlFI7lR<HyCRs;CewB1hFgChPth{*im~(zs9hLuE}?GRk(U|mmaI@
zP=%{9th*v!Uxn+4N>hia&g*Yoso!o>>Z?jVjwoc;Reu4k$FA062d3Ehe6i!?a^MB*
zy+{qFLf1FHuPF9Gf4hC5Bpn31(5e%uuPhpW%Or4+J2`W+o3l$|pXpj<$?*`Kf#>ZD
z+z-;G3jy={^Z@&3z%R6Nc{_s$w(O5Aq!%t1#gWh6Sh&2r^CY;t$3Re?;?9#L=^XL$
zh(V>1vS|D*lgv5p<O~(U5LpsOOxG$)k%#CEJU@Gb`$5`tAz<OM9zdVjd6M&Y0gotk
z``3w=t&HgcoFa6s9!&dEOb`Aq(1+>4-^Y%a?4I7`r1wsD$31MfVbEO149i#z4M3LS
zA~c8KNT*G6@YvQ{$JNu?pru_bo2|-m_sl`FwsPgHcX7?qERCUATk)Lp#p7W$uGD{=
z<+(;$wcAif`(N?8s@2VF+ikFN;=EhL*JIo2hLwyoN9B|pKr1@!8GW}_Z1`;}R_G|-
zcPsj#lY}(y*o>m)l-Yp(CJQH1QIrl$64h$RUh*Sd<eajf*aJkOL}DOV4{26Wx{34<
zp|Qg>9ov%1*3hbTl`pBvvxtGy3~uCxEvu&}1zJ{>AtI-Vya=M!gBXjOzN&50S2b3e
zN?X+=L@6<~i7#rem;6uJZB;u~+7Fi7`WM^!OKrzWf#bBq{XekHEQ`^#s{`r4)Q;cI
zfj(k&;KyqqYdm=zK>})YkJgz!akmXMdVm^F0yXw>)K~{gNh<CIYJAt6E7y%0H^u8Q
z_qUA1ZF7I+tocK;l<@g~!<Lfup+?bc6?0ZM>wKN4aZUVnT23qrP;JWzU(@7ZO>}2j
zSsH4}ax04*swvA?zbe$|lN;rxxUXtm^=|+*a_nlh08nEy)>W6jwwhL%d>z)=YjQJY
z=vL(_T5VRc_H(W)FT+Q91;l`j6Xf1aguzB?CJQzy!{mIGh)m=Hk=H=ZDHn--9mGUK
zZ;%IzzFr6vP}s*g`cW=X5_Eh|Nf7xlktC5TM8-fIHyYaPvv(CJc^i@TQN#xzIw&#b
zZwrQ8T9{f$6va!>;}gp}hrr#P1z~#pP)U-Bk6$vVbg3*Ff6F8zb0=p=b2mhm#7m}Y
zm8Hl-bOxTJxm(hv3jqrWJ%B#5bBObIXF1EiP9kw55uO@n5W04z=6d{pKYFI7*}Urc
zZYSHY$L*_15YD_iDv&16_P95y$p@JxU#SDavOdZ+-(g3~(Id5UM}6KpDaL0%-R^Da
z#5J^f%zAWAua>+u>($ki<+I)zO&(|29=j&*<C?sGdfHKwKVsmqn`RxY6OV=BKCH=&
zBdTpp4%94ty*V4o(i7a`Phu<9W1aX=_F<i9(mctD8xQJ1m<}JtfYeoJwlGjTDmL|<
z=2M%Be+bSRyYLkjwu)VN9R{o)0aPjfl}0rk0r)j{azf9(wX~e1JaYt~oFXTo0+r@L
z0}GgkkvrIWg373X0*SPmzz%&nOQ#Q%9HLRK)kcmBE|&c5c3iOiPo68c^)0sbm9UE)
z=qFgfw=z5C-9-^(QR*m*#$QS7U|X4%j~+yumxkYx*kQU>ZPLq>6c@5Hh`_+e@xM5K
z=X_B-{`vVu>3CT*{z~F;wo6RT6&ue5RuSVAcP&{i3OLUL?98%weC=Y>h?=5><lEE#
z+l!6hyx72)RK4x$dFZyQHCa1?0vyhyW2;RS+pc6Bdi=}J!?vKT96B#QJl-1Pkh8qC
z{Lo?O$6pQ(R?H=0jgiYaUpyYl;lttQTIJ?TBeth%$y$Iwswv9>L9BY)H7GYYY`a29
z&>vgM&c-vRZP!qxf{!D+AxzrMN(N~vQ$%d~e3sm7+GhHECpp;!PNo;-M?}sMIS*oJ
z@gra(4UYB~rqr)jpZ6>5<Lr~x%*&Me%k)X6*}qBLTOdv|Z<x=#m1=zAS~{tp$C^e=
z+EY=#2GRBTb<Ddz+#Lt(K444a5i{x%n^E=4c6q_ta+moSW`|F>Q3cgzJfw({sTl8i
zGWy2cQCG|r<MCj`caG(!BM(rw(AVWqjE|eBJCEB>PS;!H#$j4!s~*dgZmMv&^W<^5
z^Mpe&L7JTp@G%U);2q^Rp{y<7d_9h{|1|FFhk<GOQ*(BKS?Da=8m}>v$?b7JX7U{)
z*KI0jv%G1Pg8}D5;=M-6NE<P()mUHJQdMj?aV>_gBxc#oyc*^dE574Au{J(t#7%Mw
zw!k-%SN*V$^)1J|TGu~#9(B-T#dFM~6%To9l{eeg5odYcljJQmW%&#&Z>=e7L4<8J
zW%>MDHM0lg?Q&Z@P-VW6MsayZe{3n!CTP`zOEi+%(&iiH8#3MC5(b%U*oHr;<hR+?
z&z%bNO5xnBG?q!ujHeFLF-lD=+{oiFDK>Z0bPl+n)lcDWAFf$y^*HBS!0}v&(1B3*
zWLop$F6c}KKDd;b(n1B?ugu`kEsm<rsN)k0%H$vX>wACu9BzxwTw%$A85=`#nkz8z
zv(;z*MT#~*%T5*m99&U#eV<M3Y+a$QK&OTKnRRHk;$Q*x)Y-X#9^4gmod7hrM0@J}
z=H@*Z>wxZFbMs%{*hf?&3_JW$Gq*qt2hH=u5jq@jLO&Y-nMk81zOCl6Z1vLW#%HGT
zD%(VWcP@=<v1S^|2gtE)E8&qqA`(<SqyoCf#<aRj8aF6)JVA&--%+4d&AumpL^oN{
zI#;oiUb)NOtJ1jVa086K|L}}%!D3B~HX2Xq$U{w3wwm%WVxPdj`W!<Ye<wE5;n_D!
z{?54<{=pagB=Aw-*6G{F=Wi@TzPNff{^jZ7g^}XG>%|jq6uT}JTP~M;qxZO2|NNQH
z<Hg>=yBEJaTD<sHu_aOR{rH<YPlI=E=o_CPHPBhy+q-`r`7|;=RBGE-3hW1lY25V5
zdmp`56!+gwF|yLwiTB4n#m3H(v>WVvfssXNe_1sCmc{)=X*YLr=3^imLu854;4aP}
zshcWyLCAEif_+a9K<c+soaG;a_k4jztzwYUBi7Wv^geR)0xNFyBXiC45l1~Yf;o@_
z{c*xCIO@5DVUDe9Fb7{DpQxjrw6$WD$k7;QX#?y6=Ho91Z=tGHtX95wJOgvolDA`y
zTC&!PS9^1VuXt4}Q$6Wn4w!?s;@D%Wd1q9`*F4+Gv<7on2LNnWpZ#(Fu)|tD-Qj2$
z{4?tUvUSjN;;%?E!{&%6H`&k;*N@Cnkk(Y8XRYs`GT$`WV5{?ZX!WKI_1<P%eLMGb
zey|XlWvw2<wT7zeyTJ`?90kpGHQ&%YjP_OjTyB~ErDGco7-rmmGi%N;<61d(FypgB
z=ppFP+eb<^9lG|xOnxL1qyNe62mYGv#YcaYF}_jWEC=F^Rr+Jo23oPnY(4XBv|<Bv
z#(!cKfi`9lnB8|a{4QNX3eTvi@P(JpzVgbkuz6NAj6;Lr!Xyq?7Lw{a(cjo`FgtEL
zNs0rZy7mUf(B2&Gwmsx9==m2e49W4K#D$BearG+jeiRq6(s>*Tg`TI7CavrNAyxD6
zY-3evbfuJ8rrCBR88Vnb89L`-c48D6dh}Ja>d#l4L)nP+gZ)^A7OG57t7D~sttLxZ
zW9oBl%ZOfVvRK<SOykOye(LjxWx=+r+G;bR*p*Ah>l)@>Zdu7)!41S~xbQY}^=kS$
zetBg)!G5A8p{6IY$&B)cNaRKQt1DHO?Qgt>PFQR^T?)MTgFngn&w;0I&y)gt48#1+
zMQJCyTGomCMPP~&6xuytR@Yqm!euiu!y6aSa7D~?u`6Ob^$^?)LlQQ=;=Y>he~+#;
zb(h24i{WlCMKBUJkBz?)eCTcyIah2v7g!a$=?aq}zGS%&fV6msomm#UndTeWq4~_)
zjO?hgwmqR=WsK@W8#ffAO6pxDUC<#XuC3C~zSJ9j3`%Js{&O)=`n4E^6b!ANe(MFl
zAfwPJAFY}F5=|8DsPSVoI#z44{PBA*sf?<hr}icE-<e5hUb^U*EwIyI^jS5Rxt3B$
ztzl!0T_aS+$vH{nZ6dTSqmYe|K_%9g?+C2C`Y`aF(0`WZS!EtV1hLhdAm4agF4x}+
z!cT;fu=y)t+gC#5E8&@c^agJBf0!+M+ZVm<WpBr#w`2a&iZ`|pS@9m9^ZYW{bZgu1
zpDPDD7K0t7;O=s8|6*`|IoQ1z>@Ek7E(VXz`TxU<A1=DzRPWk-zg2X-?z$fmU9tNi
ex9hU&en51cxF7Pk4&M*@TnB!=#qa82IsG5ZJ?l^a

literal 0
HcmV?d00001

diff --git a/tests/lib/__pycache__/test_baseline.cpython-314-pytest-9.0.3.pyc b/tests/lib/__pycache__/test_baseline.cpython-314-pytest-9.0.3.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b8f25ab99089fa6dc9e5a8a79873cdd3aa73a87
GIT binary patch
literal 7215
zcmeHMO>7&-6`th|e^-`h8;}x7)lcMDTK!r-^-uj5RgNp8uA{DE%1s$<DRL#Vp-6gm
zWyP)ws6`G5k^(efH14T9#zlbS=l}tdLk})cpp;0N#==0*qi;^^B4~Q)d$Y^siejwz
zuHo+En>RCW-pst2{od|C<DL+K^wi(_+doo5;+Ux<kIg(;0A`KM6Gb>j41xc9ro|Oc
zCz+;(w-ZY|=X}$C!#^D`0@Fbw$YtWW(6nSo(+x&LCsAk%QM@fA6*l%Lz7}FMDt@3%
zN&skB5qgLme1sidkVP{zrKM7emd!B3483|a2E4wKGE9F&V~J$kkUb_%Bo`M=Nsnik
zrY5wM5sM4Z8gEQ!y9oD_13=bDgv@)6zC%a?WbNda!ev1bqEKR`5$xUb2-}!}t5z4f
zEMOndmtgu9Gi*6#=&1Z9^Occ{y2kX81r|#uGU?&f7)y*yWfIy*Y)MNSL;73ER7xK)
zG~LigQpp9*sS7b(OC{6VaQ2$npsKNSI%C8PO;y<*(CUR-Zw9hPz9m$c6Tas3)*0*n
z36u~%K4dU8!Oh3_XJBxAGnJ8JshuY5|1h~IT=cAn!(_#C_60&#VWezd6x;Kpz{*Ah
z#e=_yXFj2bM*>#K1<%i02?>%8k`=Q;))PfTdBQddRlE^V@g1?HJ)pmntcY1pR!9=r
zZwgsc&|jAWW+0o2890lkmrsxBV=HMjnJ^pm%bAr_LN!=&X-Q+Ilt@-HO}12pMP^zY
z*56(n(1Po33d^Ptm%VHsn8fx2F}=&M5Gm6a&nzz|p#kHLqk~U*lZLjeGaNL-m6j<B
zVhiCdQ@F}HfYm9GgI!%+tN{zWY8<?fy#zd89u=51^4Le(5B&AiUrv?U2A+C7O+)`~
z8@M@<8-3~};nw{4NAbJe_s%>x*njuTz3#$n<Xce)UlO3sV=wVGd`+OiQy*z)zO8@M
z^XcrD`|mB4q)35A{`<raMLKqLt7FpZiv!~%+@gH3eSS0{>MXTFqBs<uX3+YY1WsNp
z_tg|aR`IyX>$Ml3T|Po5Y+JJ5XH~sk-B#7>btkLn54K8(;u`Zb7oM=^A`4EgkU?<l
zO3I3g_sUd+9-SuS%Eew=ymSo@izq)JQ8a^YQIoof(iHKI3rcuyV%#%fk4EviS~b=*
zfowO4h?SXB{5#flW_iRj=$g;0PYEc&h;KyLBK1f2R(f=4#w}8lzzToGipoD6jG^N~
zRy;1S*I|}~npu)|X9*{<W|llPv(&IVOLb<6*38nL-C3$LOJdC|HSW$*?JOxFfNOhA
zA!Z5-tPM&{Az=!d8D!cLJnwZaVT##nqev#cYm9+_8C@^;Wv?BUbrc@HV0ssmZ$mYz
z$CfoyOlzyAKfyBDtd_VVeCIiN@{VwQAGR2k^{$gAdE*ILmqQ#4Fg)c=Ix{&tYtplm
z=gu>HY@4B1veyvwsWE2mRpYT#N@ZG>0TRi^0ILL5J+`Q!#B7}8HYGFZR~gGNbH8c<
zOjUaua8f#EKnZ{*ikQ(9<136E#116rK(&T*$ekH?XU43IrSQk@%n=<y9l#r&ER3^{
z8;np|)d7yK=$g)c3bpV_r$2#b`x<#Dg>yZ(RzJLP^G1I5<K9mO?+o4*KcD*|`dRee
zcxh&?JQH1?i5B+0S(N72&O8h?<@VoN`Y?4fm7n^!<&*Y1?VswO|LlwNpPj$g^Uu|P
z-T23iLO5Csy}35^FwpSs?E6h?lMf_d{&cuNyK=wgWL@eiQ~N5?u6GYxxy>qD!n#f!
zH?u*z9DVJ%xdH{UF144beHCdt>ciY-l`Ua4r;eN1pzV&nwj4k;AnQ_Fnc7#8wxK@E
zZC2S5R&(mOnGM?J=xfbgEKnftoi5W>>nhS#REN3EDqF%@P8~P1L0cVl9l2i>D3Eoj
zqfG6qNIOs;<~FNr39C7E+zga31T0(dC!kIXfBubo#1F@xBSHmNJP$r}wW^EXYOoui
zlq+Kqkz=<r&xzgq5enEXFxTV4ZXOqEs*l|~l~F0;j&&WJSA*TWJb3cfU^m}$U^idI
zipoEr5+7hU4`8<`#fJekI{+(Z`s0^l=_M_}WXQ38Bm+nWfm~pt$c_VXpKc68M>B{=
zoH~2{wK;ymvBOyP2$G{njv@I85ZTX;BYy(PNhGI`oJR5rl1U&sdeN4{r?q<c*cPhk
zOj=!xB~$DyRBpjme*m)O5Gw3FP?TPFPMUIa?|O4@A>3CC$#AAfO*!%X)3?VrrOpB!
z%pc=qT^cM?`zq4Gg4AhcH(h)QYdf{v43uo&5*W1eknBqU<nV6fjVzhJlohI5V@{}s
z8+9G@e>}LThG!Hs)+7in_HYMn-0Gm<c1{>DV@K-lrro3x`Qs8Tb=UOd1T@t)-K6?9
zq~cDMt(1UAEQCQ-OY4R~+uQ#_P(;NYs?}{5A(j3V?@oP0oZ|!ntR3u$R5z)A$GX)`
z>fI7bd3k`feUswb7oJ?2!xdTe9|Ej;n-l{q0=H1vtn=zKy+^8ZyzP1qOf$^yuJ=sM
z&pgw68g{+s`(EAbdJg~v45peMy#d?ldN7?CvYx~kCLLn<kU?(D0u6{4FnjEJlPjni
zg&4`C5GsvcFntS}VZd8l?`noY)P>MO7Oo$&W3SNfY^KYKs1C_`A4<k#eXts0{k{@#
z9fN=?$Ze<s53>PdpoNuW%1EYl)0fIXoTZyDsP=1Q46oFsRAwQTQqvIIZ38OqP-&FI
z6t~2Vm#onpb(b8mdbYx<(L-BuW80xkGlw>=j}!3zKb_FoC76*P!L9$DBk2b%FBa&~
z?J*!99M7?$)B#+6fRlA;s7&pvNQVjt>Z=lq|KPYI+dvgo?mivLF^essc87IzfH*(k
zG<7r3mK)WNM)f;J_2>H_o9|nf`peY5inO0AtV*mRG{cr{pb9H@pZ23sSWu#%-C-Ra
zAkOz$;u;EVxluA2B|ApR`CiE8d)FnoOzo>kWv;L)v5L?PTeg8JtlWJnqfuB;qM+Sj
z9UUOf_gdoZ6nf{#5MD+EDn7|x!(H;CYP|~>wp<C~`C+E5GWY~x1|1Q8o!~O=-)2~Y
z4_*2&d@&m4A2D?PnZo*Df#A^GSFKR_nq=W}+^5!?35?x2cIKXThj8<(P(N@2w63JI
zQ|vcT3EYq9eLx<If*^cF$UCG=POOs?Uy(!q@!=P!ZyEza=QquwF!Z!R6#BWQ{{!9<
Bp-}(;

literal 0
HcmV?d00001

diff --git a/tests/lib/__pycache__/test_cli_resilience.cpython-314-pytest-9.0.3.pyc b/tests/lib/__pycache__/test_cli_resilience.cpython-314-pytest-9.0.3.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7b2770260492849efe573d9a357a8c651c81a0cb
GIT binary patch
literal 5746
zcmeGgOKcR$wR)!K+rtbP{NZE9*!VMi#()hh7|`-_OqK<^fmucI(rM4M?OED0<EkFO
zUahc_NI`@M4pFp`av-M!<-#$M_OPo+IYwi~AGQKYa>!wC1g}KGDeqNJ&-5512d(xH
zDdlnXtM}?vy?Vc{*Sn+Z!UWo>ueW{lXCEPH95_v`04rkv%#o`^mCg}M5>L;Nx9EwJ
zA=&c9ZHnLW1I2SLFch?c6<o*)iP_;H#frp9BiTVzZ!=NleviGjwNCXl6Dz7p9VFrZ
z3XvBQ5|@W{YqCxj$(~PEP642UG#`Q_DzQ9Q1D3am<Uuw|Rz0wm4|lNqstkQVtR7VT
z0Eg6o6;2ULQG>8oh%2Y^`Kgl`y<i!X%VRJ|SA-H<;f7v9f-8Oi=17uskcZNRhvcD$
z%givxxx^UG@9N@%y;qn)S??ItbD4ZDc|)g}-qZPv(W_4wIcqPwmNiY*YZ=U9y=Hby
zKw8?&YSds^Gi&71MzS!&6-2S@L{2vuZ+3~7bP>>8D*++Xlh}sap%nnhT9Q*_ph4_H
z&{YJd5rBC^Hg`Rr)~#$lSKWNOd-K=n>u{kMcI%<5+Z<IQ)V2)($7hD50{BYGvxD>l
z78_W=Ls7+G?9i}InUPV8xrB$WH?G6!lc1&K6ll<7v=L`{DW02qseVHs80-P6Z?lgO
zDc}kkv#6m@@fNr6ZVC#^Q1&_UESUT<*7DPjOTjJn8E`$Ptunoe(n^CueKkRfIjR6d
zMUXCAkZ8uRjI^a$G&?Z?ZnecNN<D28GDF0HFki+t*+Mu@j)I^-pyJ?4i<mQ7rKzO9
z0#8Z_X|_f9*wMChergKbuPv9i+Q#$6Tn03|VK8eN7xLC1j?Yq>r;hxR&ZTR%?7Q~Z
zvinWFm`hLc2Di*Mt{FsNQ6b0EXuQ;Rv>Y8M1qX!CLM;~(5gydEoIYh}8V_q4*sf?I
z9MQDtqHb0wF-;rKQf8UioRP}|Ian}tYdlY<sPL%<*n{GMDY_4BhmJRDU|4X^w5D62
z*RdkR5pZtMAcAzpGTF41HcV5~;K1u#NMJ)fy(>;fWn2$Cj0DyL-M@YUs(%t|oEv%}
zOTLB|e$jb+TVDhvU*k&{9(zupvrmbvb&ih4M;ooYfUue)DKcEO5MV29w3<;=NKnnB
zzv5;P3~SBzq@=ZGtKKA0n=eXEGL4f_6jQWeOk1nk4l;@<Ks3jP1|&@?dWv4H8k(?5
zRqA2!a&UO;XDx{1MD^pJOHN6Z_n>AfO%2qIeC|=fqXwrBq`Yq2^qFDINvqoQmD--u
z<J}eN#2*ORM%0iRR+Uk7PSJ)j$XB%*_Yt5>v?AXqx|(Ri7>TRe4oFsxhMfH}5CwI7
zO{<t-=;nK0QWZ7g+UL#lz0N-Bx;5wes&k`j&h<NUd*wYYUnm4dH#>5>*4X0UYEL*z
zz*)n2PD-kOKq`3QUmlQXN{#*Bf~o&$3yxx<6>WIC1J$1B&uY*ep~E|BR0nci>W-Hb
zf~g=xzRRZy-q8+cc1>Re`Rb<WOv?Yx5FyjQbmDB{AFJYQr^8u$!Zorb73!1J%^wW*
zdHO3^4>G~}x5@-?mb!?3V<s5Iw~%Pg`5Z|zTUYVFF~VQxhs?14tun&~SERmikCg1l
zrhbFWpf;!*Q-NNoE?<P-B430ZDV*o3eDMo)Q!`oZ8|$lY9`F)JwWQ`H(`Ovy#P3kw
zY9rXW>07XK*Eh8DYy4p6rf<v6z1})2DQ{sV#j%d_w5>$@ReuL*BfCje-*6AuMzU~q
zZzXZ@ch-CU#elb!WTnIwu1pw~rrQNn+I{)K>%&I)UqN4;BekomlPEffC`b~LxRJrV
zO)7}HTICiUJZ9=sV;TJf-;XtuHl=5wG%3>S+3Rriv#+FDn(AyadCY{;#bhS{uvy^c
zP8h!L=E!HeKiexVZ#<%&y>#XL<x84+@#5v#Ae&)HolabT1qGL0%ouyqP-*bsq<-TX
zOY6A{JOFiB(X`lX2yd3jk61o8`##(?4QjP@_7r9gCs_F73ea|3pbk1H007m4)O`T<
z3hIFbOC*h(SttO*pY$ZcpKvdbxi??55ITU}LF^9CMpzLp@^nV*%DtwMLwx^i)V2SS
z1ViMJI<4cO``~}&Jap|~t~qj<3_=nvK%JbD-gG@xCDr3x9`R~)lgvRi{~o*&LS#Es
z>K){u7fdbBZg{P_;4%~%1{c-37aRdfH&1dOm#AG^RtmGwF(`nIn+1A;ANcQ0>F^{#
zB!=&suiz}D=*SoG-=8eS+8^k@F@A0QcJR@ua@XOd*x^6M|IK8W^~3g^((E3Wn))4^
z08lX#m_(Qc4T4tCb2AAa_lcr&Hnex|q)u}NV-yuR8`x`??iBqtA-(b(KMn=gJ~Zu`
zOSc`j{Q_Jiv55zm0VRw@c>pVUD2}+lFoO!=eyV4g!Fc%646}@>vp2IAm(k(5-}V<C
zdABg5jTN(|mCZ5kw;i5`0V_1BQ{JMn$$Zhwz~_Vl)h5jRm~M(%jE6L)j~j^Q(UZb|
zp~@G9Am6BA*`-?=n8(y}qW<SHx;ponc|F5;2&(CfW*IjvIsgiQiVH%`6q|rMKw)jr
zK~zkHD^vO0HDjirTTnj+t*L?r;3PeaTndVB8)d(H(B~fXy9bBtfi2!UT*er5(+}Dz
zvFZEwxo00_SU$jafoijB+xQyJN5_E;-sXheg5)(v{wBwk<>q<0`B#Alt@Hc7vmg&W
zaiO+9YFUtvRxzK3AMgCZf~-Fc#qPD<Y5n=^a;SYV)c!OWyBE0=xf^}yCE<?eexhu=
z|KVb|{lWJZ!^uZm7Q+K`r=K*mmgMgHyB8z)Tvj^gl}-RkN@rQ=24G3)UIx7DSC+eP
zpROVv*G5aYj!jq)E4bjg6>R*x-2J4Xtt9WdpInUKb6M$`S9$;_DLrLn7XVAju4TZx
zer0(VZeGDXu8o#(9h<NqR&c>}E7<sXdDl}VcJJt&qxadT-(PI*S#Ii^Z|W;I9V{z{
z=FU8cZM=W(XSYgn=iR>Bx5`Q!pa%y;2l`o-E5EYbSyJNFVHLRBR|OyA6830G?!0}=
z1{Yk606#cTo$m&K^7rV-=bJvy{wZFLj+Eq)yM2&_$_j!@%E+=@_bVg4ihEoeEy*Lm
zW>XdbevK1IFE@gVfZLQp?XQFnf6>)6(&?M;R7MVk5-}ch5&#{>pxUh2(Vw&f3xYaH
zjT<xz897u%;cHCbs0uvv17r+fV8X{WitlLp9->0_7dJ>u+zTE3E>bX`il7jmckE9f
z;%i~>nEh$z1kJ$|eCxAL=$?BeN%|`xKPEp?o{^Tnkaf?<)|dVb(y@OM=)UfWN~$E5
F`480UTXp~d

literal 0
HcmV?d00001

diff --git a/tests/lib/__pycache__/test_compare.cpython-314-pytest-9.0.3.pyc b/tests/lib/__pycache__/test_compare.cpython-314-pytest-9.0.3.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fc54dbf6431dcb2cb3d638ecc9355f14654b4622
GIT binary patch
literal 8487
zcmdrxZEPFIm9ymXb16}xY|CGfDJxOT#G*utwv<Yi<j8jA+OgENLZ_{g&5B$}tXCv8
zyR;%1*CW~jXI#^}v}l3cq3t=i0+p`CozW}M6v&VKDbODUDp_{q4jL5w74$~|J2K$L
zKYI7xESFp|a-A6c)q(Ul^XARWn>TOXy!ZCm#)cq)a?iK-%PEPFG&bzwEmyW)hf0aO
zLS*3#F$MnX8gmz22gsOcdJbTZ>x_5IXZptcrk}UnX98nEQ#wFi_O+8ZksHSaS(H7p
zx6f_OYKE{!_QeI+F9-TuWo*Mv0@z?Hk)@=KYm`H+#0<*~0Gni?gT%srMDp{iE*6+j
zsi9@$VvhMz8qH+W=BmJa>HK^_rP}h~<eU~&3k5x!R?Tca7hP11C@to4+1zYY&1Irg
zQ`2+mw5~;=Q_rVWeJ*d9pXu-E?}>l#@BfBnbvWi`4b!SIr@*xHFv?(3zGxPTrebQZ
znarPtAxhE6Jn(DjxqLL6x%9!69sm8=KP-P|IOd{$V3nejYlJJ{4xns34?qR+z6_$3
ziRpS}kLlh;@*r$elwGi8kL(8Ml|_I)*#poodjSSypBYRMQ<D84k^qw~n8oQ68MR<)
zl!*p3(q#$qg>XTTy?+b&DG`~(L6WP&gTS1+cqwY6)!Zrpfr^Hv&oGa1DXZ%Sj_>Sb
z!`&|$8a2A7DQrHUi!Z7)(><EcXx-|pmNUC7;%-wjOru-RPV*WH3(p{4SYlFI&niZC
zHmB+aYjyA$cXaH7#kK;Rxs_*Eo>)70YzrD>`%R#3mO!y^3+HJIwCD~1WsYep)LA5(
zy9_6a=U!R7&AIr;`}f1REo<)Uu9ZC$d8%OmCBmh+3o0ctWgjC+LK{m5p-;e1OrWUf
zaQr*rhq<Y`Si@TgL}O@*VS)nAv)xXvoIN;|Vaentfgek2?Y$ODIJaFysBo6P4HaO?
z#*+cz^Wx`7rjwBU<NzFaD@h3xPL(KAHdUBE9U|oEGIkc`+)vJeQD_5Q>>vUBb#>tN
z`~)*4%Yp&Tbf+sqf~k@7UijZS4B#N}RU(t*G+d7YXn#t04zRD05*U@Uzzbg!);{Xa
z3ac&}fllkNK~Y`t9wwwMp+>C(4-A4jYOfXO5bJVEG^3eX8ZIl%&dzEyKFII6F%++m
zAApGs)R{j5J8O~Q^{$~6;f;Z7?eDd8Y;+BMw1qPMYGj8nc^X!BtGOk9t7C5abc;EP
z3Sr>xIy*Zpb{cEfWqTjaQyUKM15hHr4@bUXtVf1dE^KxU-RK(Hhz$R$@w&0*AO2?q
zSL^yV!oz^Iw88@uV8P$Yb$<pLC6e;m=bG{1xmLBLgG^O)!d6}vlaz~RCdZTuXV08F
z#~OKK{KZMD9Sd69_7*5yag*<ydBLhG*$k5kdT}<JQ!-g<FwZ;~37rM?yqZzCm0-bn
z^)+R>n3>f~CQgHWV1b#Ut}ANM%o}JWL42K^Phzp8=+)I?E<ML0PU%^=D3BXrzP}IJ
z4!wS5%|G~)8~`p2ZWPY1hL~Sba_YRMC@iQbz)(@gdPq?gimG1j2`kDBFl_4CoR-T&
zx4)pP=1iW>)7>yX#q*&Z2oeZ}0I;13Twky*X+<?bwWf<;|6pXZqNH`zFz_i#Yp^pV
zCa?wv5{BX7jPUKzMw}199{4wKi9i1noY(E}&e9kYk1_EG6MH{&37!$*zUcBCze506
z8^lpHcMA;#YT(v)z=Xk>lL_a|gAOd!)0W|0w~{1|fO5dmGa*_yCxm<NlHK@-+fR5w
zNP#nOcyE=e;7Yk<Zw<yhh31r(34D3#%ED+$>>nd!;YC|NXMe->6IFqi39tk81$b#o
zd3pu;z{O;*D_Q1N_Dxksp>XO-J4v~1yM#{1RFh3GftIDNv{$Sm+N9a`I<D}wsVnWI
zx>tK0dsi8FN#NE!g1?WCe?gD~a_~{(Qlh<HxBGBK-e6O`vLuI6-fp1=>zhJb$BScT
zRP?$667$(J+rM*y+%O;%+yerggwt+(=xICj!k)IezX^P4>q@U{;cF>BO533gOLcPF
zUK^rZ8`>}SKc+T>9dg)SbNu%HyInQ149HD#b1G0L;ozr`aL}IF{&h(Br8rsmsT`@1
zu}5xM_+=`@Z_UYtWJSZ8QBrUsJ8GyK<edwDN;Puo|0}ptZkzs;Fv~py$IiExZCgr&
zn|AQxltZ?D$EfY)<*|1oH$Rd#WYgBkHYo3sTT{V0**2x534E~YN~+18<p4>E_AWE}
zC(@s`m%^~ej+Yu<YIN*!_aixPu-BCLJd(E2rmfp$v%FW{mujxtW#l2djM%e)y{=l5
zXq_;i33pd|h1fw<H>uIu7J0w?M5?83%^jaYYj@au*uPb+El6M=Jn%2Ve+d4;UeLeD
z4d4qt$s~CA71fGTY923iLeP)jdbDSa8V8T6P~Z<~+1v6D>gxP-Mjc`?3`FekLow#^
zi-w|SXH19VS0;rEKn%2(CGyUtcjo!C%KR7*VLU|9wRqr}TkPsdU{IOQ=}YM1gcq~s
z90Y*MWh>xh?!r<6K`+IZ2x~`|8^Hz8s2P3gP+_SrW+1ZfJPYAa=eXnpC*d)R@E8kl
zI*co+52xhga7m8Bd%Enynft8){Y(TF5*#L%{n(Shxe$A7IqYC1@mv}G9E5(Z<Huvd
z@?lOMGor_jb8e5ujNlc-K_zOTdSgcX^ocX)g8u+wcV%-hDq@ey=J=>&%&@UQZO;Wv
z0CoqbO;phdq;6(DLjwoOq!1rku_C%>tif(@htU$Lh^w>UNMVVetxCbof>>Qws8*n5
zdKOR&bw)!B3jrl~nOWV;<_zYw9DcSNm7%C-_$Xyo&rhp5_YGO&34U4-DW0M<Pg#qC
zzMpC;5KZW6j>nKpoH%je0%RNjDi_X;Uu0g$ENHn5_{U}H>3m+#7h*e^G@s91(v}LU
z37)G+K?5NK%r7zt`Ld3q$Jt0a8%L~$HRmyBqu)u^>+Ct|Z1g$F63)g^OEMM*iJqEP
zZ%wO)iRY{?i=#w&l5(rBrmx@GI?`ic^xGNTrdZB>>qD+8{RWJC3jU4XfC~%muo&JH
zTi3<b?@8BQUQeFADW3bCv};p(@`m)}wJ%=Z^=|7;=}hV5?O@~8g>ODx8oezwu7#tw
zr1mv2@xzNhntE#rD%YmgrNpLK{@V}}Yf`(_yyd7@2;Re?mDY`F3p%b%RXUs%AcB>z
ze&v?bwkD4Lu<u8MZw>yebu;n&dgA%DgQI{0xGs%uisiozz({RY^OmE2lT%j6oRu5m
z=x@c*t5<GG`(VB|O%B$j4xoYRihJe!twRG)dmDS+HrEeTS9=dv*ymWST!+=(G@WZW
z>&W7vfw#;3$mF2`s~?8hXy<^xUI(VEVAUC#ppD~!%dY{e2GpRPZ5$KwfiYJ>DCZd6
zV`2faj+i4%1>nJk#F*}vMf;J<Lpp2%5*GCZoFY#u2=tIP)BV+eC*^ubaN)!7Kn_*{
z9dMg!107kaak4}?v@n_Sa3^cVc0Fb-*R|pvO;o~-Dt{2jL{eVhtmWb<hvU&ORUNf$
zXi|0f<;F+SR$Q?<$HOOw<))Oc&I#~O;hp8hF*82Obp_6#;uJfafM)CAr<&Xh5B?5{
z=S=Z&r<j&VJ14R84qFp|9TaM^phvdPf~Euz7Gy$oznu%^7=ExH%ZMgxtqZ0!!StI&
znzI}NMVpy{#8A~Qz!U?2^=oo|xC8J^YBQ%4+-<^h43L=OIX=N~X83sCfQ>v32hOA@
zrWGkhQr4xhB?btd;o)pq4h-f(dFs*1hQ0w5HT+--gMId~`1yFA^e9Y5qX@RmkaD(A
zaR-GGx!|gX(oPo3*)w_&r@`A_&YqR5#7GBV*onvD*O@^*h7+Ts{O1x`J@ezUYftrA
zJmZRXLAQ<b{B5ZG{iaYUnkGv<4nfVCJEhM8f~6iy+_h=W>TRl|S23|!O|K@^_3>AK
z)qed;@1A(?=&6nIS3ja%Krnfl#^J}-iXN!bZ{a{ZC_AUh({F6_Z@?z!36-Nne%|uG
zE$d13SH|Y}t2f49-AJlGZ~0lv+B51;5nQk9+eoN@wR8wA8W#M;f38C#P(i(Sq+A;?
z2)z*GU&1`_B}|ZA_7|@Xj+fo=9Zc*izlmTsy$?eCy9|UF26&~*`~J#@D*GD~rvE5D
zfAH}EQ*ALH{T}>0i13-x`P@u)HrmxSr|E?UVZ`MN_-!nzXAQH0&~le8g!Nr0o6DLH
z4qB|JdN!k)TC_Y9BtO%auqVDWuk+kISFM=D%^^kE^3*UzQPX||nA@SBK`?+|5W#T-
z{Q4v7)$DKC4DeLzz+)Y_l_bW64P*ff!2KW@6<!eT`pC|<4~eVmv~b5u+K2A>+&hQw
z5V2uIxa%i-JFY)-egDlpLw5p*;U~>^cAk#FG$3urSC&~udpu%s%0&Kp{0m@?z`ucJ
z?IWJbZP|0x&-}-j|H%9PKIT{NyS#xBVT&NNX8%v1rJn<^eMDTbq{2fisf(V&8E_iw
zQkRAI0w7rbrxl(ZNDt$5=$e7#evx@+#^QAx5d4j#_-fP717Ke3A!Q<l*>n=|UqJ9h
z1YZJho~8lX0iq)k#6U2_nzYOCWiSq(4&(fbfx#8R3jG3=K8g<=ok1{%;L8XuA%G+g
zp#=mB06+sTV;!wJokg&S0NgP`zk=Wjf`3F%LV(I=$&PEF9juIuSTRcHCzBujFPH``
znDJczAHjzlvG;wk=e`(%)crkz@XlW1X|m#&HzAI>nLK$@Jav1!I7SM;A$+s%js9EG
z{xz}lnwNuhsdG~-|80n!YtnwJdCO71iPRM;X9arr0hX--{XXvc2MGQH!8HVL0a)J0
zZ#yPHz*wL;IH_IMAgrB-PrVSd`|Yrr&I2{uqgXL{+=?l|R>i5dNV6t<*r}kHuZ*+K
zYz5ciQs+msicNmD{IIyo!yOECf_Ws!WotcL=x<o(;vcj=<_yvQ0UVl619kI`TM&c~
z2>CkMB;D_m!*@d@y!%6!zr}keLYl{g+YQb4T0>!1NxHk2ME0#bv$FqP@n-X>QsaGZ
XKsbJv0QhjXS2!u$i?j$&bD8}=iSr-D

literal 0
HcmV?d00001

diff --git a/tests/lib/__pycache__/test_compare.cpython-314.pyc b/tests/lib/__pycache__/test_compare.cpython-314.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..00c0468ec1276a75b62c5b5e28154ae2f19667d2
GIT binary patch
literal 3199
zcma(TO>Y~=b#}S@TuHR(5BfuqXwy~1*rcpliQQU}V#{%*T9B2MA{T}1#cH_{w_5Hp
zGegOefr9O&Mw@=1$RRCyaDV`z(V)nohaPgsPoQGCHU=$%pa&y2I|`7tr@psKs-#<^
z9e|HB@9lf@=6#>)=}r(>Kl<#*4?d3&@;5xR7M%UH>%d=)+$5SbO{OF+P03syz_UCZ
zno@XpfZULdlLFDYuSr@+Q?&58P{7I~NYNq%NsDT+b8-_KJxBl>PZKRM+QP*&HBES2
z>&lW`&lmV%P8RVwgEFhARb3HXv}n;bc}@~h(<@gDYHf`zl&p+VsW`T2aNBb;8wShJ
zs_WYBYQ}Jj8EP43$yj!*3=kd9G@O#h__xoWIe(_`__x2ob#EdU65Y!NE9oG%3|vg8
zUX@p>T<6v;E@CFI=v7wSidkmK%h<)W$F~ps;X9viy)%)MX%sZ2aR4Uj+*(p|vwI0P
z23YR~;gUwUeDeqo9U>msk}H}FFAi%VfDugr7}df6V_F1YT#NF=BH^kQ1B=9kI?t=i
z7mG%PTU00v4rbGI{#AG(7(KQdgH4TSq#yyC^v`(7*jURj({S&RdonOu&WZ@LHQR9*
z@(&J*iQzYyMcME&g~xktVZ)%s;YqJ(4I8VL%k#m<!`xz=4LkO-ze8i84+@n{p_-1Z
zGkeuF9469jdKTIU$KW!CcB+y3srn0z6R+&Tf&BMJVCdB#qO#<kG)woviS`0$YD^En
zUZiYy9XyJD-@ClO{qp~$_o0kH_D6F42u;JeLjkmiZ^c8fsgb3Y8_6h~ab*cT0)IJ)
zVuXYHQ}8E3)Y$0ItqV-UES+%(z_K{pu2nnb1z>L4<iq&y)yDC&fl4U5wG`O+D*YNZ
zpi&D@E=X@E-zCLCLi)%6I6O@jrMdQyXnwhGPx&idguK|qu7b=ya#bayA7U{};`lFT
zF)#lQNuy1}K!}CRJtKi=)ch>`c25I10eaQQ0=WV;rUKEwD18gCw@3{t##PY8zZ33$
zIb=&<$|Mj2XG7-8g)>4j1EXdFhsVbt9IaawIxh0<7A;!bG9g*1y}D}A!kC}uY`n1d
z`2YxJ=<6>)oknV6J3n5Res$sQ@jvg+-p`N!gQ1OooH`&aT!EWIhP&x!buQHMZq=px
zQ3k$laBwhCr>FB?arF5*^&|5+05$SiGW9dIotmi6Kg^GRnjgQPn)oGqn>At+KSglo
z8QT5i1Ym=x2!jAz_`jF#FTtTk79%ac7NH*XwM9vmEbYZaD|lt0>er_iru6x%(=#)o
z$3I+qb0IkA62Y@06$&ZN{g;afR8`$B3bo=?S8Z1>+LVcK8H$7>5{_pSb-zl8MA^8d
zFIS7J78lAg)CUn?sXC5sRJq5n)qwd12T$S(tQc{Os%w@+s@-}4E(4!442l|g+&}d8
z?M7_uLl=N=4ZkSN=DI{o*IlD*>AFbhIw(|iaNnitZ&eMaNlEJZ3MkAS+qGN|$gzrJ
z@D-1i=|PZBk0L-P(IEsc0}uywNMEQfrfzTut>r4zKj7@ub<;5z!&YQk@G>15Qg<5?
z*w8tz`Ol*e<-`YAHCPtF-+uz{d6Ya@n-a<^Liwgp&VDIN;n$?E6*>Is69OQ3AhLGk
z78Wjyz};E60DMm7+A|Lbxc=>J4e~loMv(*7KpSOF32^NY@?P%j-4$t3x=Bu>ZJMhh
zFUne^9kYZw`D+gQ&NJ)$_Xr8<%p0U60dMp<y#5T*$=oIx4e`u6D`O?;kR5B6t;O-~
zA_Pw#fqN4jG__MNf8yvCDuk?V1pT$c?3g~7{k#9WC!nSf4pPMrfo7eEjX6enxoC`w
z9JbI_<J4Kn+hDq5ukf~#)%+BSdaF@=y!LU~4+s&%R*s#Vb_01UT-wN=8NqSLbDd30
z&g6#8OE9|V>%pibx)8B<iu)&=W+=KXpyKB~3Vkq}i;5_-AnZAXt{!xFOe)dcI{0Z$
zF@S_}<>K^=Q06Yq&%>YwP@kW<b{(cEwd}cT)@H@vkk|@^XF`Vr=$Fq-vzn)$V+^_g
zvcCa8_6q=zDoXO9lHOL*zf<qt*dD#Qqs;t8J@imL`KfyH?)N`B^htV0ovvMel<2wh
z*3Vz8O+He48p+H9_4xbOKUjKiX-6GtC?gGZ1n_r$_(1J{|J(;-?~VO2{cz;c_Q<8i
ziOC&xvY|{iROGyK`+<7wUB07c>!JGWgP{w5;yXi4^7hcA?*}l_R+m3jfteXhA6qH^
zWP0w3y_(7AOO{jFN+Qmy;B=aCY{vHxmb)Gxg2~KwZN7CP5N9}c(co65DGADtxrUU&
zX4%;~=6{3khbvIXWozAXyh<5Hv|K{$v({nyD!_PF@T)TO2b923@jV*F-a%&&oI-FK
z0Y<HF^<dD#Mj6!R*jP|c$LV#zVX<Ma19%dWB<V33{wq2Cw2LGUe<{cIN1mie?=|UB
tckfr}uB2R3pB^QtWA&HneV-^hy|34LzK+DDSDz99UmcD}mnGkn{{pbS`zHVZ

literal 0
HcmV?d00001

diff --git a/tests/lib/__pycache__/test_config.cpython-314-pytest-9.0.3.pyc b/tests/lib/__pycache__/test_config.cpython-314-pytest-9.0.3.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d3f0cc35ba0d865ec2b822011f6788aefb16b9c0
GIT binary patch
literal 8231
zcmd5BOKcm*b(UOGT&_OUFP1-&R<<IUl||}j*^>N;8`zM5IHorZlct0hD{@6KrpVRo
z(w1l#QPBXg0|ya`0@Wc0Ukb!P4@Qxq26Bl{&8?CpS;EFaF9nL++}K5c9NNB_{cuUi
zkyX@amdlwpGw;pseRkfPp4yrqf=4y>zB>a@k`(O6V*yX*09ZhmkjP9RouRpB+*|Op
zqj6UEwVN$|-A`Mp^eUPM^Z?|Z32r>72Puwsf*-HetH*2fnsy|zdywecgR&vLR`l;d
zdYxDWX-Et}S}$^tHi*n26b^nvrtx~1G3w%SHj|R|Og?uO<2(*~j9@k|rzGuiCYv?*
zH%RtsJ~x#~CmEn2fAX;K_rm{4A0!JXjxKqQzmHG~>JFfb%rql131~4N(UY*(jiMeH
z-}?<I8`Ub#a4&C4@c!=Id1l9x4@fmhS9!KN}v9cJjC%HtVX*T~2RjZ@J#H3e(Y
zNi645`CR0xj8oB9^C=}Nr<I)EOQ)?xbw$&)Xf`uRAt^~05K-riYDtoFxx6mxiX`D0
z=#zuLwhxj8^d0gs&Ho_D?;9VJ?6VXtJ%Lt~B+IoQ9{_auUjRUsFQOy7-3YAr#=S9n
zt4h^q$b#EZ+$%C}ikQ213oFhHS$$A^&rbLW)*dS6x1mzrf#M#!KRYM=5_Z65D>M0`
z&5sD-N3jvOlT<XV*W^p&T`ivrvxaZxaw>z3;8mP~J=B#s-C#+R;Y$@}R1Gi!`iv^6
zvObNgV0avW#Na4nB%svDEW5h81YAuTswJ}{+K%@?6=le404$)*P|MQrdZ=SzVw0`;
z;KDC2tg+3TzS_mMbzl1i+q1&<eEQlNI|5b9^<C?}t_^lzg&p{8=ue}67`->OI(T-C
zJ-5j>EwfFy7CNL73^{!W?<ot{Kt~r)Nm^r$tXi0mbpV6yO;bU(MUUteS<xr@#VRo%
zauG%hihPufFj0@{Nf7a%&sz*^y2V=6n;`o^AJGE>LN=!25vz|`qpNJf5vVwuAbUlh
zA*+4%&2FpaER!A~S|`?swPIa@ER;U6kPVgc4o3um1f@F&U82Emk2Tp)DepiDYRyQE
zy#w~hsr?o7FO}$QXgm7zj94dz#QI&$wY6fbZRZ+ix6Bbls%(wh&ap%XbE_8{#Kv9C
z?fLex#6wH1ce#&S=wYsnVv~!1+po=M?@5$B;xY`XKe5;5lB*A53M(Ek9e%qHI|mEj
ztoq_?>_7!F6xAxxGy6e2aC{u0lANahT)~O9t(2f=LfkjVh#hau4SI$wg9dW_BbF7{
z-fh4CRra2VFYSmMuyLc_GG1}w9<e3PMVYeN;4U~Aw0pL5!oe{{tr}zS+rUR*D>qa=
zamcQX^ERCtxWvqU8Ly_?{o0mcutI~+$=@2_@4>fjmGO7)3(&WN_T256+jD(RyS0TL
z^lk5t+HURJQNA25?nV~H{X6zsZSQf^S0<sg;sLQWURxozb-R$;I=g2(x8#-w*$sf~
zKH6S(FQFqYOQwe6p;)Ia52bwe^H^7@WJ4g~zO53T{@pI*`%syDhj!e*ki9zb$H2q#
z@5GDaVF-BW=Hp?T1Esv2k0M*nRnZ6EpitaLZPDkpey5Ai$tfNzQ755y*`nR>YB%3r
zPnll$#Wt}$?yt~``d#Qny)7|zZs`T=DgL9_49EJHi+>3U^yw8csZk--+t+VZ2!b}9
zFJx1ajx*`Bf=2}%7Zj&4le>~nQm0VT=T&7?Q0CNZCYjM44AXOiKJ$V;t-wE%OFNB|
zg;ZM6N1d`DOv>6ccwsX$;I(O^!cnIQI#!apl5!*W3R)(ele443!D;#GWkE~Ixr3C#
z@R@D`IoX<ig(s6TJjs58CG#{w))101rz8u|tx;umr4>1+D<hyOR}}DG=8eE)HlMr<
zQ@_IiENcpJP>M%o4SY;pc=@n8KNQx2*DOFdNvrz8+F?@DAJ&f0ih-~e4qi9B>U^IO
z%xIaMrpvjcLMr+VZ*nS4paDz{GXxrXouo&LwXR`~g*8%n%<yKF9DxQ&a=cjQ!s&;B
z4Ho+BD<d5?Nr%JC_nwm{!%VT4Ob+H`&KrQzyd!k`g4czU=2DKr%$eeOLI9vi06>X=
zE_Bdt0>Em5wMQ)HJ$M}!Z+Cc{c6jj_9X_KYQ(S`ABV|%j*=d8d=k`#*nQ$}sn&5z5
z#j<MnQ%VXV0Mv!V#N{&ls*FhuvHRxkSHGgpOOu66RtF{-eqg?w)r=t2l7N9EW1~f~
z6ay<NmeSe$q@0y<@{D3|k|s|nq}r%GLj}Z+nHY_d8KVHPQ?qi8Dh?)RYolI*5wa4Y
z0?{P%F{-S6!##A4=HiW-vb9s$4maRtg2EARoI2XH8k0jNCSNUsC(AJ<WY8>_^Em2m
z>30`~+-Sq@!f`j+fV*(aT_~d&G3bVxgZ7sV)aUN|m_}U8F4GpdQZoV&rh8usIzi|F
zYl<dW`vmctlw^h=@OkK_9{$?D!3$nMk3ucKt^0M|W5jg?o*=HCTbOv<gz8>eys|X6
z$<^G@7vsyV(Yt%^{o>yFO|EhAoyT5=>v-(-{N&(c<f%QhaPFxe)weFaeYa~hG_o-9
zw_wv^{B~xeYj~w=crAEr;nhcc^J4Oou~oi%neE=-yEh<py{l~Ztx>1qL0QSE2Fk^x
zQvxJc$*BgUPsW@QAlapdZ1?h(ao5=HO}_Tyk((pSY}=BbCM$f~25Y^mY}<_yr{n-P
z1|9GLU|6MxCFC{In}g#90DrTt;p1yJuPw8}t-%}DR{8w^EzQzog%>th>s@1oWq!X?
zbbwnhoQ3j$bm5d70J^%NIVC`{OAi6OW;VEc2Y6}L!FL1DLiT6u=GfxwN2iwA!5gPG
zcmZY!;07OA;UfUt?V!mD4^y$;Rd#ThkC?@EN<~}NNHxrzU{TmY0r+l*L+l0~vV&v-
zqc=wv<&RD-HLde)%WT)}PDqydw%eU6JjAN3ch#yhE7obrgx3g(gXadIWotx>#FJ=5
z>t6uuE~4d*SeenS2t;a~$dxDI#0|EKT5T8O8Dk`-;3;?|C*stO5+vqEABcZ>Vos#W
zu0;^Rw&PzA&GwO>JqJ=ggap)soKqgyfCDH45%X5mPJaYr%-f1GOxRz1&pKngxU<ik
zbBZyIxKz;Kl+_n84_WjBfSr>Hh8Ph<+yl5Day&p166}PJ6#e!Xek^Ce>S0(EqMt7e
z?lijNrQZO2n$!}*uMq}=B{faw;Y!nqm3QGdrAd<z!8h%vG?h`ZDJ%k6n9#oiiOX{O
ziRJVY%lWj{G=<0Ro%>?^%gN<|i>o1VVd4>2_i^iot-pD1gX>=Bx*r7^7Kd+7Y;+E-
zbPlZtj(+Dw!Nbr5?*p#;dyT}X?sF|?dVKeL_%kQCvXHAIpdJW-{l5g1gl%mh`!9vm
zZ_|TNR*|1eNUL@fQoyi;G*JAqTu2M5BU1mP^SS)o15CkD=m$K2oRWrf&6N^{DPE((
zBpsnb+KY{;0`VB&nnf@X!2^LqwhwM{lGCOx+^N(Bd;x?9#~?9+a4l6(=#5lK8Sq7D
zDp!D)pQQlK@KPqF;kSWsm=Irr#3lWubHmf7AC&&$a`@B+clsfBdOHPpg@HDBA8@BF
z4d7eO^!o1i@@Gz!Yk+g<AnaNLo^|OUR*N-o?cm$t>fyN!IJrcy<$Fi{@NR3wLpD9=
zM?Lwu^rL=9{QwM0KN^a*Px+#ac!l_45<0?gyMZtXZ)3Q7K$wK94Q39vle`Kp9&_M0
z8bMhFL5Jh2o;TQBUR8>nr5XV%#AN_yP!kHd%enm3oTU!<XQWH|O~7#g!>r5D6FX^I
zdln#aZZ^$*|FfjMG@@h;V_APp1pXZSwGSb2NxqNReIGqBS8sP>e&gO*a$^4Z?dngj
zd^%0e&Bw{Pd4!ysM=Q_G_BR5w<iZ9Qd&tGM(~^szB{6s(a4}0u_O+Zj;k$o=|7i^u
zZUhJ1a?CKfCN=g-=9e*IIym5LMX*Be3JuPwGODaPOw=1Ul7xO>i77Dx>U>Jh>6s+`
zsv05X3fQ;^+zLm`3t`i=kUciNM&e%3FC{e?bnosUdd@QUKX76upDJXPQ+Nt6L4%P-
z27}#Un7<?RKKd2^6{`CR)qIUQ|B8nG>4z)iZ$r&Y_qTmPX87BEHO%l+p_&<{UH%8B
C@eI%a

literal 0
HcmV?d00001

diff --git a/tests/lib/__pycache__/test_grading.cpython-314-pytest-9.0.3.pyc b/tests/lib/__pycache__/test_grading.cpython-314-pytest-9.0.3.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f7cd147c0d46a88d041742af08b0b3101c9f7
GIT binary patch
literal 39560
zcmeHwdu$v@df)W0IUG*&{m`4zh}46l6-iN)sJFCwF15PV@mg9^x1e`08jh&hrO09R
zP!Gn3W9&1I3!HbAcx}%miyZGRi7B6v2u5rufsyd+BqzjxG31O$ZCh(!j6(wd5nv&0
zVDJ7B<o8weYi61pYESEZSk2}))zwwi)m7bJeeZsusxsigcH*NAZ;Wj5c%pP*ZeAnt
z<ZqCe^t|p-rL&%-#DBd#CF9;ro*p?_x{1zs&-!}Gl4U*qq`#*;S>6*!26~jF!q1hQ
zt>~#tR`ygSt9q)F)jc)Inx5KZ?Iw>Zul6MC)Kb8D)d$$1mH{@Ze!x{~IpAtF0Jug~
z0N1J&fa}ys!1Zbs;0CoCut}`}+^E(9Zc^(2gK9nCX0-vZS#1PtQC9(OQC9<QRo4J+
zQ`Z7+SJwfys#1$5wEl1DhR=se_43{b(_(|__=sN7%SNvyk|#$;uEdA*z=`p}p;+=l
zA{NyvPv68(GB$YfYApJuUe*_5gYjq*)oUY(#Be+sxjY;TN9m-d*Rk=DaDt6SW15ze
z7a6=d8V$#crt}J4K75&9MlT&=k-=Ebq?DnH3jUv50+{p+NciEKis@0kYKbcElSNKK
z>XUj)jMI*vKF=Pb)+WzDsd+*z?eO$TCnVW}q40eo54P><yr}!pGu%o%sY^FQvhIx!
z>Q&m+(edHIaFWG`hGI;wj*na)jYg93(UEZSRwAbRaGw~mJDyO9F4K_czVXChBpGAW
znUGiazIj8JFGsYPMvvIepnk05RV~J}j>{}EGB`TYej~yLJ5G)c#yTQHv5{n(_GWx|
zSnEi}w4~NC9KXy{VH%wH$WVLYmR?3<i)*rVXwZ*;Z70B_=aEu<yL)EKY#@E1FRS!V
zo_Z+zzkBYVp1bY+&etC)tLKzWsVx9$dD9b=&wc!9l&Ql{{vUcd>9NOGEp3-npIX)-
zBhN3!TrC%QYCu)mrH;~eZ^AJ*eIB)9kMTOycwUvpy}LZ)-ZQwuSgq>WNu4&fH@)X7
zJ)UyUMo+?<kW^`xG`731WPcAzp4x7d8M@{1Z14;i?Rh-C#(bM?(C=oCTGuCcNosw6
zOP6<#Q9`YB)bcsYi}YqspVxdg)T*LoOU$wz-kpxt5~XUjTGLnBA?4)y2FlGVI$GxF
z$IQ2S;WJw{f3Hev?I9`QJtVP{_A`%hRX*Lgs{UQx=Oatw?HpOPwgu1SCj2L)xFZ#6
zU^OVJ`-X-`FGq%TnZM^X(a~$y@WKLn#iHXtgnF449~y}aYq}KC{aWmLjK!09Iq^D=
z;6>GaT68plV$x+jz)@sO3nxb>2-WOnc<eoTN#a%~fjtvd5e+CI8Qj{MxYZrfD6{+g
z9qAMAks~2#qK>jVLt6025nf_XNDKKG;YL01V&WDdig1MKwc%)FcsR^r35LEVA}pqt
zhqcI+80F|y$N43Iw@x!Q%Jlm1l~Hyrk|b<A92wylEmWra3=}R!bp>sQMaOnI54xNO
zyTyTMYp?TQpGG|lijA$M-nNRa=C2oFG0<TeT`_2EKZ;l3U)urXJ?Z&*dF9m2)T;F6
zuDjoyt)DHOok(98Nc(@~p<DjL^2U_!_RSAVr1H%lmdaJW$ul1XJR3V^uHU_$Z91A!
zj?Fb4O-WOO@4Phi5|U~8m9%nfwrBPrQnLr=lvfsIW6$tY;_!hZouz_SDQ99{e&tb9
z`_h-{=a;&FHGQ%_E%yV?DgBGGv1gE@9218R9O*1CX%%xOaO&g7C7y~GrH^&xm_}%P
zruOt+`Og|^Pal+X7E?k<9K~+|%w0@@&ZT|EGMNB<Lkp`~N@yQv{AziJ+%6@g0jkZn
ze#t~c32&eG^&F%=*5!tx6S5i@yVzIyvS;jyh1%cYsNF)Qy|k9)Z9wg{eLkS}y8hD+
z)UFKJwcL@ag|5|#qGe?h@nXp+)^4d;wxc8m;g_jZYIR>(-ZJ1Hpf{awj+W`k&fBmU
z`OTKiUm&X*ECVI@m$4MYTt=D@Ro;e?E~Am=%jnaNbv+;JhUa6g)_^c<oM^Zjx$$OD
zi$+F*ZEdlU>p{X`cO<<M1l`rJu_S0>z3@gm*+xtT+e9D;@JYE<oi*cF_g@|!jlPN1
z;Uc4#fbe`53sFhu3b0?S0Q<25fCkm>NE7X}0(6D6z&|k(L9r5GQD{#PiXIGnD;U?z
z^@Hs|lUO+REK^M^URVUiN2s!~v|#ZGl`|r?7`=?3Zh7mB2u#*Spq)Smfn5N4g|nSI
z<$4K7f7VH7N?1%|T~u%{9XeJMW0??3%O1ARIxSX_0~%o}W8nw}#nRElmyR5sB5Yjx
zYCOqaL5(%|*Gl+$@hDKAZaI){K9~(0nmqZiy8iaXRCl^%@7-Iohi2OzR<BJR#PU$R
z{X>sb)jE0R7l93_3z<L&MU|?lvES@Y%fWY!ymbUWsZ#<d!A05FGjee1h&XzHWVc9W
z?OY^Nr>x@61kPAjXj@b^&M6zya@+I^9Ly<gi?Xq2<hHc3Q5-&y+vp;6ChN#Xdin*c
zf-|uox3TpYjOW<j16IU=cimTt=hz?53TL98V}E0%HL(Ah=r|h=UQH$w+QE*FNFv^G
zy|aTB19lJt#<4y-OesS9y2T-k#D+jlUytdPk&$FPoQNbL4H(gMnVRDZ0c$~pGkgtT
z$EX2X1K5iMjsu)$Cn$M}fPu<)FN4bYRhCEO>@>BtBo+_YO|X|x(hUQ*xnN-D{e{ij
z=5i5s0nKctrW_dgNV@rGHgGHlBY*q;s{4Mz$Oi}`ZzGJn-4!Em78rSR>U3Hrpln{0
zjXfi8CX7r+50C`1q-564MKX0-<mN$~u`u$sMP<XBvLP*RoAz-qr)*o4jXfi8ODh}1
z;RAUaU4+hL9oa}v`>YDi#DctS8I1h*e9f{Mebqzsxa!?!2t6QNu_$i{VM<I&-H{7w
z-OPnBMzRH6PD-7mj(y%5&mK$*ZHbZrV$t~4XVJ?J(aTSWK3B_#0~oMq^IlghYPNX*
ztqoz`A+o73n)dX%(z`ZZHtAiHFBrt)l~dn^%H8ZKtpAFnma75BbzSS(cd9By%UWwg
z9<^UOP;cIyqm2eL&;A8nZ*dIfeS)s906o7)tpJT#`AH=w60b*w<0LkO5OH)QcE`&O
zp%Gmf9Zx34li_6SCXt;!LGn!a*tK9AyAqQ03g;^r#e-yR^zab*^@MooX3r+Dd>1`u
zFeGEOy;dPRanmyi;r6*YA&*lb7u{PDmqOC{ki?M(Niy4a=@R2&%xwl)R(=>FOGpsd
zQN3F*_c;<?lSCuVo;uoo81yehCP&+?dPGWUxT@!G(ZO`p)*sfVj=yvB?VCTY|5%fe
z{cCI1NuTT#w4y2f5M(gKAO;z{E2Om&e%uY}HSjH>s(0xnkcCicH)};J=R=C1YDogj
zh`wY*9qAP$<G4H?A5O-Bpnalu98c<N!iMOV#3!bRVkB}cmPZ0Z6w^aZk?@#H$k|ys
zbkMt<jNT5`4WJQwDM(vE*S4%+)Yu!SMC*X|y8w{$2I{AerJK8ES7ie|lP4e5G);%I
zHGA<pvp-w22Y9rmd$PwMYG0T<`96(mrfO?i-uk^R3~@%G<btwwQO?;jbl%FFw~rR&
zt*9o>WRWhYiBrqqw_@@J8;|ulMf8wDSUAm=H$cpvCvRx(wRm<6kFg~n!5BN)Cqv%w
zs)<*IguI~$CQ~c=N+EBk?BC;%H<UW?pM}@VYtcgI5GDJHmbGxLT2{2|QW(fbSh>%a
zCvPYls5X1$XxSYPmYMj-`~_YL2(dHC9N0ydk){_lZ^K9{G}2t&@U&xH#mBn(`B<v~
ztOYgf2nJUWm#<Rln*hs^iASkW0j;=e8CublQ|Ge$0f8JjUgQ{ZS#pt+upvK3dmP!>
zY(9ILdUTEevGwe002;k;0_U0Be3u*6yB60ph(^pcBd&!_X-W7>BtFcB(adgY?|%UZ
zEO=<TEgRSc6i5vGzdmxWI@{8dZa$j{yzF4*KYPNl^_^pH9ZSotsW&*7Q(6~gW6#K~
zQ;_J=(E}twHBvHb=OUSUL*(Yc1-W$@r0T~!Ifc0G9<}D{oNNTzG|0sq!cl(ROCnwC
z+)_j$l+vGqNCYwh=~V=^enE&tXe}K?n>%u}QFKMovP+@pUzSL$<U}G=|Ig>cL9~C~
z^8q7_=kxIenvY5>oVAQZ4MSag%N&chLcS6z5iFEHItu;hxHhO)n#|Pbo3RniDE(*0
z;#1(|P<jMS@G5K+sT^L4@G0k|b|PW%QecE(wE#O3VtNh!3B)1UGmwc5=bya1+4GE8
zLoju}f*-aOz{y3iFf#PLd`)y(#DoW9NpKWc2U#qF6PgSAL<ECIjSKdv^NxSIb7~^>
zjkmx36aU9gI0IGJAh8Hau*-CrlE}y{Hb{r90Oz00Z5r&VxJ@>Oh70bGG2{}x@}v<!
z^tSuXr5S1J+tZsDs#|_?>0_qh%(LlG$jc|OXotA)?FKH(uYIS4KSz9m7?Bz#*GS{Y
zFx?)OBoHHTg#bMdY=}UvUc_VyeTHq5-<(1gr^2)ly7Pn0#h(K9tEf$noOTKPJ|~2J
z)N$`ry7yu>@G5Ab%GJ{icfT<^n5{fFdFD~|`qarxb?ao$`&D>2GF6>vxs$lS&Wu9I
z1*LOQ&e=0`-pZS|j~3)kR1;^iNEg(6Aa}lBg*nYsL4ndu7wOI@lw43i_vGvuI&bC8
z+eZs>H>!y<S)>bUK9IXV`-|ZEg``(4hMW+=lm85uM|#x~>1A@Oq(OIT%ljmolTCOX
zvN^A%MK5Xh6!K7~rJ*0JNJC!7xv2UaE(#LC0eWKj)(f<hziqbUfDW0R#YrzAv!fS*
zZzj)Z^45mT&aUrDRt*%?hs=)K89Y^AoAO4*RlaB&{#kuoombDbk9IxuO~^i<{Wuzu
zU5{m)X!TLZK8uZGL48B^S)^W{WX>B%;9_NM<2a`d&3(*la@lD?;a=7zm&<Z2D16+4
zqEMoN)mdzg9@C1Kw?PKuCea`Z3dt$e;JTd#f#^w%L)aVJhsPl3<RkzslLFb(qzEo|
zkW%#U@>Nr)YmVbkmtmRJIZ^G9Pj*62&$GMEvs+YS2LP1u5#q|B+84r0B=Urp(B<zm
z75JhIl<`Ix2sT?yGUf=iLndH>Z(%nT_noL-xLWb$iL34HC5!Gif3-4rdA=;YOt<Bb
zUhWe@QgMf!IhJ4t#3`Y+@7$epw8w?*-%_}^!GvxUh63U75iJHQwwtgS_ziU61pc*;
z0F31=8`uqwzH?7nKK$Nmck8oFdy)Fy;ZzA<U=H7F#^Jr@Ipy%8Z0uS2Fkd|E9C7M<
zcE&t4VO6jaXew1=9ifop$jas&v5rvaz1PIif_(UHy+~3Qn)cdhTFdBq$iLStl7$cr
z7cM<y*2R~eKgBcXTzZ76$hq)Xrdy;$0{Y4~-LjV-nr+(px!Ock)QWuuN+6>yccJR?
z9KP!kCXltY0J$$L>RP^pt;<=$Dq)ptp^#n|(PQ2cmVYI8Bw?X73+0Ms!S3NIF)mK%
z6z~Go!qm$l6>u+aqr@7u_NndzyiAVz%k@6&nJ`hhbwBy)GuQqV*2jI-s&z&0%SPXF
z9{R3V8(aj^Yt3<VwCu0}H_tACz8itMg)rLNyTL4L|0YUa9~}GjK7XF@`Db|}=t>|M
z&)oX{{SHC@SLK;o<rr<(`M1pt^HiJV>S}dOUwNKlAuzD=c?_6Or1=ZQf;o?u)JE{@
z8ytdedc$oi3fp*&OPLvQjGrjc;mG)4tSt&B3Ft>-*G4biDbf9~TTas2qnC1*30)bC
zTaFrn+4W&*86QsC{4OI5mv3!BYQnua7I^!F1>QcscEkSetf5^L>oQhjd+lzA-8Z>?
z*h?{%?fmr{%XNG2@e3CM-$G+;%QOb_XBM-Y+KWZKTFB;U3E&PNdYN%WelNOA?K&(w
zmR;yEQgi_j3O&>u+3B(0B*3{+TKNTQYRT@oDBo$--R(Ts&+Uy((LR2KPP3XX^W2*U
z#xI)2QfT-k{A;}?!#ZtB?u*;zRo1<`@3#)5qIX}pzv|(pmXAuLs;!e}ey-HMbM!6P
zZin9M|G~ib2are&%qgKo+1RskXzHkSWF<%fZ<Lrv3FBzNr7-f|gI%^=*gV3C)PQxu
znLvvV(XYdA9Zt&|Q(YXu#BWhH_Kdu73Iu?iA=1<fBX1rhjH3m+aF$xL$~Y4^vkbNH
z3D|@cQVUBMdb?5!ke{m+YUSrhEv)9$!kQdvf!5?NCba<nprY8=rWR-oewzFE{HTRG
zwZ7<mEln*nsEyB@T3BUL3+gJYGHWNwW6`Ul!4df3xLNEu)*2F{+6xZ*E*(w*Tx4%i
zlCy*yQ4Z1}Nlsl?l4rSIoH>5>YhdxJ^3Xr~E%V~KOhs0T`b~Xpah&hMxUF|!bkfC!
z;r(bTXW4t`&^Pd}{S`p2UiK@6-Lax9gKPgp&c(IR$F9p43a*?tgRa&RC#vNZf9JIg
z_WFYmE9PV3lD?8&KV~6sOUTQM*9zT5AbJ$sltnrBS~$sW9L_oGxx>hY9ETCY#1^;K
z>&m^DyeAhu=3mL3UbQf!#pejV&F&#N0OW1pDy(|oZwM0?D{o;e&J8`yeF#i!x9@r%
z_Dq=6v2{O!Z?o&W-cMnD+!tZuBK2%D#Ex9PtxP3MT<Exj-iXc7vI_@>5q5K2B23Jk
zmdFVmPvu6lHT&0Lh$;_Y?8Me*jlaWDOc<iB0pGp$sh+Hr%}CI^_NSbYg8G<|wNLk?
zImXy^*840U;e2jwVCC~vZr-2y3%(5V#Ln>5vgxUKG6+j2Fl?WCrF=GOK6ah@Kp4hd
z>8S5_EL_Bjgtg((8+`TRF$WCIs*7HgR(uTz#?Vxrnrl00sVXyVCxxz3s8y3F6_Um1
z(PM0@4^7eU`DHl1r(qw%7ryrOe$WK9f+ip-iV#Y3=Flb#%84iBg;z=)=+#0|X{nmI
zqB9h*`9xhx6k!#+(H)IcB{oAS_`$6!ngsS^RDF%w_@-pg5%Y!a0!uh>`K#ZZxGQB>
z@6P(WCXYX=th?Qr>YjOJ){SFq2P4<iKG)QqmfO=vDD8`~v1jFWVjArXktUYR$eTwA
z<7mO9F!G2^v<sU@IDu<fXPgN%`F_=@nZCuglk;sSGga_7J4MRVQyGPl3krP6a`p_J
zxANxgqXqdCs);jMqzh`|R3QmNQ^tkM|A~+La()OF0mJoTY5*c;u!0Bxv4nSUfu$~p
zr2Gx`qm@fo>e?DA7orBlM+0XA8*|w}hn9%$EN^oPazUopS8y+D5f4D;#mZU>S00M?
z5h3Bi>QtItbF^$N;^sMSTkNLSs?X7B)q~wxHL(MMNrGB35`B|g3fga78$K9}j3=)m
zst5wKP|T8p!Pp2zY8jlU&G9-2A{Ghj$3^p<n(CvYBZG0O8W}eIUG;J+^OM$`%LQ>F
z8G$K%aC_c$x3eI%&aGOTDb)fXykLuXK;7{RY)6m4@@`Bh*+Ydmi_=9Z@;qIkN$M$%
zq;efht=qKkJscZ>$T1ol4Bx=8hTn|cx-rTI5s)ROVU*eL<ANlC)BYU*=Y`TOUr#sp
zJ`8q2aHZ^>Jo$6e8Fl}|>UHT2XC79sNgYeCdtv6aduu-|^;VyfKJ<92PD=<{;PX^$
zOhq%wj+xE}N*feIi^`rkg>+!|WC0xdt_<nB_6U8_14_eR)KE5A`J{M~3sp|Yf!1mn
z#Bk3QF<4wlPQU;%O9xs2XYxy<g^Ev!7V0@#XvjeePPMfAt8VFqpmi$ts@rIxI0{)B
zEkG~y%+W$!4qB)KT4<bTG^8NOt1zz)ve<C!I-GG_B_jWZUYLJN;P(gw34Dromd#Mb
zI{+@igcB=#(;(DOAb?8lix#=e-?KkJ(<9V8UBh_K4yKzAWdnyD-i5_qG5;6k6`SYx
zA)=1um~DIUGFg5wFgG;fa?oprmyFJuE3miE(_4yW@;3WnR-xQ7eF_grVFYFxF9};#
zKnRajZdUEDGcA)4w~UO+r9DI;V58#}Ptfy%$7EZFAY|U)i6f9s6VB2fl{i$=<bnHG
zH3RB^KP!2qEdRhwRE>-V$HqtTa0IU+WQ}<CtfgY2k$eM&V{N?1V9-Fk@PI2t5S!>#
z1S0c^P&j<G(0wctN60Nb&=(mVkMWQ<JfZ>YJpVoQ#abvdx-m<2vC>sQv>v_@Pa-^6
zGAue}t_JK6(c}&MYqS(hdj3+bC6}$P>8;s7Yr*5Wb0&VT<zC}#;Qm?ifb9qVm#QuS
z|Np$a`kkh?n%<pAhhCg&S}Z^Qp#1nF|GHFH*1!GlN<4uR67sOmmmmLxD`Ngj#qq|{
zpEfGTx0Oe6VQV9NM|b(eoxpR)<9M*1V{&T<IT`l(WAb_;+C~#1APGYA1v__lxxH3;
zof)kPgb4+J8i^tplt?oWf7N;IuwL9LBO$I3g0~>GBI0aWM8DG&(TgZrcE7?f0J;_#
zW`B%pPU7E*()XvE4`c%e9q-IOqME?EZRohWX|{W|Rd|krZi45ympsS6j8qe3#bBuL
z#sQ`O(o_?7#&dCdv}qcmr1Lf&L66)XbB#!yhJVGVrs5QqO*Iv#u$F$;DB513k>D@K
zOc~Mkgry0^g$p(4ei+JgiplK-WB&mI$o`1HXY%SCUfHV?LlnqcVR*`okAPRvl312T
zVuS2IqF+<^*M0;bG%^RLcV+_}&KK$4`q|45t2a)sn)c7^xErUn=Qv+`PGIf%h0^qS
zavns8@D=M6;0iU^{2db3ZU|sjZV*ZV%LdMCyBXU-5%L5SONA~|K4s`Kw}2|e-zob&
zDgI7b@5u_d7Ap`zo`9l^N3bl^0Id9-a?=3NJ7t9~bBw)Z^uEHeuX{fBcpI>}TTZ%a
zUvkEt<Fm$z${a}z2<OTx@nKlr-ay#OWNai~@T1V#uwb`zB^hBLH^P)>QwX{@x_d&=
zVo?@LYVC-Qwv4OtMTeU*F^7mUMkKgiW=KwSpTXHJFSjAYV2PK7ba|r-s&w?7{S;N+
z!N0bZOEJvg3E99NV8fb*+k>ey)34k;GTZU6W?hO%@EQsW29KGS3{lPWk-KfPO@$$9
z1d4oN=J*2zqU58C%HBBzX30nIH430Wl5Ff5`Dj{!B`_U6pfn`Gl+HTxk-pa`vhyI$
zJo<d`F<}CZkN*nH-g5ZZe$Oe0q{&%u@vz03+SoVYh1*L(#9!#1@esfQwiv}y#bf?f
z=fwPd?)FmXQX=%^_Gr3l^7GwZ9HZ}E-gYG^FIv`?tp$pfwN}55(j5EZQWA8y#96M#
zv&lC{%cK#^+aUP2bf^yfCw#0;_YihcC127STUY^e$z&|i30&zHewpz>VG2ZONJQ@<
zBqjvdZ&T_xK(03kV$*PMkX`3ZZQudY2Ewva7>Ocb8<X-VHkKm@97!A=9`G139X#k&
z5<zTM>fli(1Rh)-evWES0yw=qx_PLE91o7p!T`S8Ij+r(;w}QbhhcgPI@FNavHNKL
z`}o)XD}bOtP4UO0EuHuFWw)G6w?MmfE-MflVtSpsU&q0ma&A#J_N;s^tsrohog>ny
zWo6FuMyx|CfyPo%>j<UHqXjhD0xcOe>bwBb_v@?*&ID={O2;jn_%=ABNQL>yfF3CU
z5g7kS06rTqTCvH~R{_4NWVL75PciDjX#q!>;<a{ZEa+58aeMZ{U@9MQeCWqm^36R2
zm+B_>MhIVrjh?G|X^+vW!xqQBlKZ>A6ppd$S#~o<<M_dSTJ>Nd0gl)9L2&o(a0JzH
z1|V~9+rk?;aVXr&T2ynMEsT%jm_qKd(Yy;sk5`#__Al<xO;txY2C>}DpgMR0s+N5M
zUM=*a8G8N%eAS+S>Sb_1`1unc-YO?<%n3s1{$77caR6TxU^Qty5#EATq4)U{@sH_=
z0LSEg-A}yHPK$3s<i=jS50(@sA;0b!`;OuJ_I*no_hf~%LOH0C#{M3M@7r@)!CuOZ
z6vo@46<`8XjJ~ntV0pYtfSUjX@_g!)ftAx0ipjH>zp&kO&=rYN&=+KHo8mmoDz0_~
z-{PYWYQW)3((8J~_n1!7Cz@3578eCGZ*9aL5-atD>`xC%lb#D+5Gv#xESVw0z#Mrw
za|9(S(S^bqr<qd3a3S|Je)JN=m}7sA6j!Zo+cqiWG~Ps1-^}?3-X#RL861z2VUZ)O
zO(Ob3sz;6g{iJ6uxchE&c3U>spI+O~3Gj>DA5idM{{w{_j(xXrnDZT#1Hy;z!1*)W
z)`<N#Dv;x-4?jN6n(a0@Gafh(zIuW)TPnV&>2ePzLAdpEcYGv89V8swoEelfxWkNG
z8gejnonQ?^6?!eKF^08veCVm&_{I~XThnXo{A=)g9M<@luGFA2RLNUme?g-ZCqQIA
z`|ku^Ch)reIqcpp_G^^o@ciG;ewJUTOH3^BK9(2`i5JBbH@Z$QTSV!9g@3ISG9=9F
z2XgBNa@PlP|NHfg5Wl^gso$QiYfa0$rn>Q)Q9?-0T;O0%*|jJedq&=sCI>$|N2JLf
z)5x2rHpI~aO4lLQ9-YZLY9c*z!K&a);0$#rmU=By-<qzYUQmZZFu<a|T;`ylLpz*J
zh_u-S9OZY&=sxd|QIk4!xv&xBSRMMNIYJ%w2zAV;X&GQn=~$GFJtKGU5wdf{De91s
zH!q_NakPNaG(vPH>!^wJjA~VICUC|cp*zhpuiibI*>N;ohuDW_sAtDA%8N+Op5S0k
zIkPAmdqzH!R$jCYtpxSOC^AoNh@%CRu0zyBI+JzOM0)mwRl%9S8M|kE?z(1<r)7W#
zb+DZ4qTaP-NKV^zcQXfb3QTB?JtKE<k*u8~PEmi1ym@Lv94(-99VU{_Wa$!C(rN_R
zcM*a-zdezIrV6DAWWmiPte3FHEGJ<t{&60%y)z}RIpK!LV+ekmIA3M?Zj#3kq5ixL
z9z*t82l+<+wv5M+9Y?`c+MsUW9z#tz9z*1pV@Vp^WdSZy3j0d5{N!y|xh-zMq>ToJ
zB-F0w&cj>vAf=$#4aLga6ax9-Jk5P<=CqpKzUzJ1Ghwd%*8P0?NDqkBQ1remN;6MN
zQl>Vlt6V&W)|yk`XxTl|gGDEqcLZu?wL!te@I<=!8R4@=YsTK~E6@3W6xOGlNEz-=
z+(!PyZR$Vkpmo10PuyC^Si3*Xme)_7);FN8gZEw`O}g?~bfZblS&X~;d?jRPd@zf7
zAE#Q`JPOPad_IpkdXdk59)sq4ZU2Jz9Q-Bcar5(e{D+)Jbv4+x=80O<7JPX-@J*O%
zM)NM*+BlC#p;*q`kytHdf0w`%fwu@;CUBO(Ac0l_L{{X7TDi#Hrd-m;Gve4d=f(aR
zrA`1KI?BFfeC3Rf@WKy;F$6|MoRqlu<{6Y|sM@_u)f~qTEhoYX`{%fFj!#{#I~{wM
zy0?=+8-W;sD+GoJv=itcunR!1&g-OJm3wLS+bDIX!Ztu=Kcc#P>^Pssxig}6><H>2
zp!)An9<gm25e!cDOK@KYIQJ#Ot{5X^f0KuQ#|_3FhQV@PpZ*#R-l3LHd5QY`1${oP
zJKel58`$qK8C&x6Y5wZec-Fraj(36T+fvFm({%4jTJ6sUE}^zk_0FNU4&i$`?`{9V
z&hPI;BDK@`VI6!mXX=piV>u!{1>$*|*ymBgIKqc?3JN3dz3tAz<`GV$c3LN#3ABiQ
znIB-=Hob*D_p>FP|GA%y&N?D(b_7Sc-Qu5XqYl$ofVLD?6czYYo92|JwA?ZceJ8-2
z(y}NUdq!?alWNn>5ox-rkvA{5m`4ka(phTCD&$O{0?omJw+^J`^{EmL=9Kk|vax66
z^-~9|BP&4{F-pv%gmJWB7tT^^RvBjkXO>}bZUZlT#taTb>0dAg=V?R0iI50I7=vfR
z;5=2xH<j3iF9(Bzpaw-r<E0s#RqE<z&fu&m%HUKzLk8!o^2B}77#wpJ|DhP1=kxg4
zFgVscs%t<PZ=R?y31f1SbT#>3iYC6u-UsPqQ^aSUjwE)ai2nt(NE9*C3H&9%of=2f
zhdc`PUs9nJQmE_^6^1>Dhf%<$C5XvQ4r1~O^b;|gl%xPq8v7g6{bQ;>$IWU%GEN^&
zH}B2{x>iUrCbRx6Usj57`!v3wMnD1KY3vz!JEsBd9C3<BCnIlO-fkW(I7(-!DXWk(
zfeN1k#RyvR+0buGgdiq2rzHi{Jhb5FDxh`?Y4ueR7d{tI17&%f-zI&Q0_yTlNxwC5
z0rkeOq(E#<Q7U|C0rgsS-Iq#0EilJdQ_=8+okuR%3_exxvNL%Un={HM7sM?u#9S%G
z#{MTz&2mJR_++k-*a;#lF{&Zh&h;4o;^C6wx?iB(KgGY6wrH=F$5VC}*7aor{a*@U
z-9Kj1Yepu~nFta>eg-}G%#h)0P|_nO5Q1ZX7z4ghs4k1hD(UlV_PlO<YR5F^O2~aM
z@#^0L?{hy7CZbqEv(mm&w@(I*McjN4$FBF4aedy@cH`Rc;DTivG_yrSMbJO`VTHJ%
zf5@Skg@wX^r3deI)oGG+oHmEYf_)|T_=%RN$LynF3t-oCCj_i|um#BdHh_D1%g2bv
z8h!TnVR>CW<$YN744Z~x_igzx<mvv)Db}dV*8w1~6q{qtU1EO1D#PvT01Xh0`Cz^H
zUA1xSk9vb<TlVkRpZ8VJY>$0l-uKYYR`i55^;O~t+t~jnjwfuDqmS;hXlb_bErc)b
z)3~PSeN>qC6?wXU#8CxlRh4ItT0OAxS*$i&Hh*EkL9?h}79Xh%5UFeuHmF`9wIF36
zVuliW67rIXH4%I|K6wx!<9I|6MCncjuZ)h53{C`NH{%Fxg4lISln9mR0r(t`GekYW
z7a<faIT&LsjL37tdMQ$)O!q~{nTB6KMLN-<k&&mA%1qQ2^h-GW9m5?gNdUmk)*Byz
zRp?GqMZ%FU7$4!LpDylx@a}8JJ>zQ+&^>0TRp{Oft?4|axQvIMPZ9F_X=FUQOfAEe
z(QcBr_9JSohX5yY;*@gJH+Ip9a|HGhur-f_Hk_JAN(=4d3l4!1w2O(zPy{*Qs}VL5
zL)=w9E8*lQqP-+z?Ej#dzreqCm=iF6P1c~>AAvFvLZSBq^{KIRdr!KTWdj;MV9(Wx
z-swwsU%UTmHgLg7sC;HR#T0}#wgS>TDxy=|n3i|^Uh5BG%(`Q`bxwhar?F?`9b7s@
zhYuWStH1(kK*eVipSO?b1gtdermS>U-ofQGp^)ysJRToTBJd*DPU~yKA{JJ=_-Yr7
ztlL?P9Q_dm$jZ{oj65MZv>!pTLlkjkODV<m`22}1qeHR_6waq)7wC9O_tW7k1TGMG
zgTOF>UnTGx1X2XvBk*Gazfa%~2>c-d5-0N5CG4k^x=(;?hZ(7-xxkST<;$NQ5%d@K
zZyxdZ`47WLdVDx`ls!fso+^)a8vyS7lJvhlp6_@TJ(nJMF8!U?D|!E~2jFp_L<&Bx
zD3dNpj~jea`{O#FwEb~iskH5JjVy&8SNWuUkL$eBk;j2DX%iLM@wm<}ot7T2@*xwY
vIcTLSN2tn?$5mwj4PNPM(&K<%+Vgl-K-%zlUAeU7@fv`S)|5#v@cH`xGC7qJ

literal 0
HcmV?d00001

diff --git a/tests/lib/__pycache__/test_harness.cpython-314-pytest-9.0.3.pyc b/tests/lib/__pycache__/test_harness.cpython-314-pytest-9.0.3.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..72c28ab9eb8e98fae4d2f6f0245583263d5e9f8d
GIT binary patch
literal 6910
zcmeGgOKcn0@$GU+k-H=%S(0VhvPIcZYT6PhSwG8`ow%w~IYA9}?a*zZda)um6yw#B
zdb_k^1_o5L1)L&)btxbN0ipsu#Fs+H9CPcH7FEeuIEMfQir$Rar<^)(ci-+(Vk)UW
z_|O$KZ)V=SdGp@v&YO8><1rq=^M~g>ANF$y<p@EafP=i82V@!DMj~?!8I0Akvw;fR
zhh{k=*k|{Ij1cs&*IH)7M%dy7u0>{fgYQGX4EH0E8)IyJh9CyJkr5R`0Apecz_`eq
zLdo#oiNN(FV+sYukmS5vHWUnGBqL*8$%_@OD&$nTl2?XurM%K$Yz&4HgI{iiiLrA;
zmGxv;ig_)V1B($^F#s(m2E~xrB8J6Cnn@E54}U_MO|d|;)a-p|jEYe)CdS3qbbyXZ
zv1K#|8*4wA7K(YT*p`WS6veWLYM^bInG@VzW-dzcF=u^CCx76ut}=xwgnE(KE*=s)
z=HgT`(?&6`?MGt{0YgxspNyM3MEf(H6!Y4CG-tgsghOqkGv|-(7=PFsN3mw(f5M0z
zVyD=3AX|^`n5(I+S!S1w-8;rLwb2^~wsnbz#qI;yw!eM!eCT`{2i(V9<H6Q$@rciV
z&DZ9>(`jxnME!zLIWX5lDW8s`l;^emDC@rAbbl8Zv8T*tnM|)IN3k*|MpDeu&1~@e
zEJEI!TKMx0U!0&LycWLI6IphY5&Ldk9c9OyZwcP)*Z6Jm?Ot;axWQ07x~FWA?r=KL
z;H{8&Ozh2u(o6%l<p4fxp_*w+e0Y&5a~ByrAPP&r$|O8LP;BXE*&vA(OONT>x=$}A
zvJvoN|E=Hpyx6(t>ciAF@%U@(@z>FdVesPb;d?wuJ?FLXJ+}UvU*pODe~k^_-GBRQ
z?12R7l*t+Q%ZeG1b-kz?vSyeqi;Awx1;uR1l{7;EPS_}wRH>pXCa1}Z0Dma!3udsW
zl`9~gD=jX{TD}@tkni0|=s8&foytc>HA--;s1*`g>7E(Fie6C-lgpPh#SG_4x&bQl
zg^xS`^y6=;e|*bi`06-Vp<p91aJqbdEUEJ!J4oV^#TriPrwL~ysSjF=b4fkPKQII3
z`@<$*)QcKfK~5oz5i@`*8o|zClEe{g>^gzt)wpk($)rx0lLU+=neW(3my%3%oJ<4S
zZ3_VLEJ)46;w5L%I{zSX$u@2rY++2csKM}2$I3)ev+RUETN5CTrqQIHDpd^FL5XY@
z{{V&!|C3qiel<>b$mBlMOPU$XR~E~<$<EA}(K5+wCEdV^ya?T;qJ~KxWA8%r?ecwT
zzEV^T*eo-omU6PHn>=t01soz{^N6G`lqzao!b%xS1+_FUtCHpRh@{IOD1>dsuUL*M
zmb9xFm$2C(eNe)SvLPuSl~q}j4d^jr4LfDFOE8yesfFDGD-~1l?Er2EBa^%tREwIT
zV{*uu0<qS1)~HR`(~SFjM*PH>Uu?opocD_j`-yWp$vzYAW4@JsA6`Q}J&)nNE@`QI
zGR~)Gpgc&!XQ3HG`kkVx>S?lLdRi^cTbQ&UV=!D#mG7IUt<Cg10yf6La?a2tYg0h4
z<#s#?Gm~6hzY96{GWu&c`iTD>zw*xNdtdad|NL33W91iXhrc+y-u>m&S9fX`#9H(Q
zVP;peYqx9Bu@}L>6^4I-0^<UJ&ge@Nh{RrnQ6%Q5nO$304}Hmh^`lzHwR-s77eXi&
zTE6-!j)d5w^e5@nH$G23POe?8A3am&&o0kA7rIt*pH0<;ff_fkEevb}^!@7Gz^4~n
z#zq6>vSH*(&ZR)eN4abe`fSRjK!{RX+(2#Dyqny>Ga<e$^gb1OYuwP<v9%a!Vo!yk
zZO-}CxuKfS>yj?=aMVRNxFJey`J^|ADp$ylz<_5$PmLS=TzV`)vno9m2DdrqSLX&F
zUbd;fc&OXG1@bpsYQv*Qhvatn5fFlPKAQeyy2kabj#=QT(6`MwzdG0VaN4C@ge=OD
z*Z>KK+VX@qi8fcpk3i2$c>Nly^j2V4g>M`n6uwR=fuO`qF(NbR&`^fDg>c3#NQpN4
zDMLBD)ZdS8v9zEp2gJbAOqP2WExk`uJt!@&St+|)He{gO-uXbv-kJ?Uve$NN%$Mvz
zq3xB98Mk=xMmH@+y_wuX)<42q6-^YWtxhrOjc;21?j8?C6kG_*?J>?P5XsfSf!1Xg
zuAcs;>+&uD-1XRnt2eIcdc1M44!F|Ta~*CmDDr#Cx<#KT>?!M(IO))iL<ufQnv^Xa
z)T=(t%}e@bWlJ2&76$%V_zythdsB=<!qi^vbn~3V(2%0tO^{%pWbhf7%?y)HlK0%i
zWOke&0CpK+xW_=&sfI~Kj7bS>a*(GCf7j|uM@Asmu-NCWKOxzK{)DOCCTRiWDRv40
zh?7EmPkxeD3?&Cy1uhl}3g#bJDHEg{cFIK36v!cpx|NuWKuYyQuu=odR5RW<8V?X#
z&k}HofHw$ulK}Dx;nM^p0XzxYOB*JP5di-4)j5XkRZjVP&YM;~Vjt=wBMsE3PQm~^
zl<@>ndeS;q8%>n%7G)`?7A3t{&}0=uFhNJ)U*7<*jGprywNq2IlNUDmi{yNZ(Nko6
z?M#guha+S4MqL;JbiLCC!uU4l{5FXIoG%{7#y>sh@;10}N^M!{+#VYNCOHKh4IoV^
z=vX^rclf6Pe7)0^-+_RE_&h8t2mkKlM(ehw(OP6DohZGJcpcakuepCYUbk8Cy1gM@
zlPKt3^}W{d8m@1BQKNZWbT0y}sCbZd{qN#+WFIlui`RR{U@u-r_uQZFkJoUe{U49l
zt-IoNE5z$VRbI(0loAVys+Ov4TFF=-6)9F!<t|jaN!E(nco04(d;!2a`-t24B9WdZ
z;1YoA_%cD?Cg2JIGXR=Iuij?SOH&F^E!|b{doT!I3#8WpSaEA@dM#DEoY~}m+7P%V
zNZ^|A0@uWP7tjENiEYmLZ4v<pTpq{9NvLvo8{7n?wk&mSkBtD6gep&(QqVyH7wPa%
z1NeHEE58E)1FgW7Y&FBQ`m|0Ivq!Q^&y;;UK;4+a$~{~(>^lj#4Irgti^H?eBXXr{
zwuApvJq6`g%Bj3CiF0O~$6th!P1U{A&8oJn1H#(vC&;qNK3c-lAYD<F%lHOpAsMbt
xj=dKFhGG7O&~MPMg|AWaYxL$f!M2s-^<dvJ`)w%7B)&btG83<OmYKAa{R2XcmSz9|

literal 0
HcmV?d00001

diff --git a/tests/lib/__pycache__/test_models.cpython-314-pytest-9.0.3.pyc b/tests/lib/__pycache__/test_models.cpython-314-pytest-9.0.3.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b085672bbbba19b0386bee11168d783f6a121d5e
GIT binary patch
literal 16811
zcmeHOYiu0Xb)MP1`<&(SC0Q@ZT2hkfH7!znNR%ncGDS<JTw5VV^2$wFZI`<vYOUq&
z@Xo9yF=!(o0V=ftY&1bi2LYnAK_j()_|py0!pa}JDA4jHlA6Y7P!urw&y^e))vum&
z@4R+2A}Kp^`a{FznS1Y?`<Q#~<DBoF=}6Ya1-OoV-tu-wogm~fVqF0<aQ`nMa8)=b
z$l^&s7uh{9608Px2_r-g?cz_udKjJrPDVzedX%LFPsT>#dVECErICc57^&0iMv{7R
zmmrfZf?h9&fHufspp9|_Xp<ZT+APO_Zj$3bH_H;xXXFIXEpi>ut#T6RHd)*)r0f41
z8+huLXw+wNrD7qc7c1rClvZduWW<kE3loZdT2=B!-SNwC)0M*T1tovcNQ~0Tq^b{B
z%IAv{Mr<@kHKia|%SI%l&_Xe<*YrXR>IlOB{pk?)!kCDa;w3|W$N@PhlYYWeR3Rgb
zTD``T7*$0%G}W33ye3Tbwwvh_9|=N>FlOcygwe1ogBffU<ho3-SCo@yn|cF%Ca)Zp
zBV$pEdz8t~it&4^&}SyR1ow%Zo|+Q1+!$+xmiL)&X0@=e#gi|&cD^@6IU+}0Z{*J}
zTJ^}Wb$J68Z&&a|SO2I)mgGc+bcr>op|M7*g|3pDtTg8iZyT~ow(b`|Mu$aJJS@^4
zkm4qX+X|^-r+`vCG$K5{9P61J8=lIsMq*E1j?Y-qbM7!3<6lvZjZfwHb;?l=!D`?1
zAavo<zWv>2jHq&1Efw=ceLAA$bLCV~zeS8dv0x;%3zcfAkkx5%VnU%teX;ybCC|2n
ztbRpRjIhd9{!>O!y|UjBRlN5gIbaA6k~vLNsGi#Uyn3ZStzmLM>@9(!mKLWwF|{YH
z#lK|+Qb!o${<PLP&UTvkpQMT!OFNL(Zix>9M~*<g{cGfd*I>I6<UGPlgPf3O-x_(!
z6<yJ4GbAW3jYgO^!-&74UI89Go1;c!HlHh%vQ$xN9=5z3Rg45Y&W=}$CB0bIjBpA1
zr=%HMvu4*JYj&bk8PAon<=muV#IjoMyn@L_a)@=1waFUI+4B`TnbWf<k6fAcGZPMb
zD<j0(nhsN3fei`k<vrZ(j`X=By*$E;J?M@c((oly>@r^NdJ`n2b&b<pxlk#0Udquz
z*Knnvbmb<Lvc6BdSS*#aF6;%Zt5h6kq3k3ul(bIuim{#b4<8HJJW~U*lf`mzGFPJe
zpc>rrwTrMFUKM^7-82)KC98Y8K8SrD6l25U)uCTYiFXfOKQu>nyr**_?N}k^Z;|Y{
zcF2y|fot7%Z~-{X*s{aCgr(U$?f~$wLX8902cWW<CPwBxH4eBl@UT?}#5^^c%uwS@
z)5G~VKi1fPy?>5ueQ%tR>ql0|R{mQgTd(!oF*|@2nj8zjVaApn<|Qo5=5Yt$$$jYa
zTJ;kZ8>V>u3<98dAp^VoRIFB)DrIfyhsCT|4OD|!TOHaZj76+c9Q92O^@D1M7pSIT
z{OiMd&G>F1C3FauQZxbTU5iiz)psXw<DVSfEgTN+6pCUxLb1(fs07pyFMs-@_wo9N
zqv;R@-CCfi5Yc)h4agDaGd5e*3a4NN3-ftvglVp*DH_E#u&I4Zpq&_g5i>pSGj^%+
zHcUA!TP<HKS1y&a=Zi|IK>L9m6(S9lK^V!g`geE!`OY7FdnML>C)Vy7*ujs3zk#9s
zhWJHLh(8aH;kOWLf1u$!_?yO;L;oZV)yMJ>u`ldJCYv8a;K|6Q-&9AThXeTMl1X4)
znE;0=lOXMeCZ<Irk*~m_Sg4drpzohb6N=qJ4<KPZO)zv2iPPD8S!bJK(qOfQrL?xI
zbC-)69e`Y*EDPH6Kzw?-=|=EQbdPWE)(wPyAq`27=-qeN8dPg?IQ`;E2Q$Psoxs?(
z`*F-RO-Au$Jr~R%Tg6^;2ASmvu36=sHNWke(6EqW{bu{W%q)HVH9j6TJeT9su?x9N
z7gMNVz~~8U#ffsRWP~(bQM(}_nnznQ3t9)sS7`|cSgx!WvuaM)6<R)Jh&e+XPY38A
z^fP@02n;z~X1wFhGvxUEwp4>GuZ1d8)go0i*lBZly>um8DJygs3TuIXZ2}04I?_0^
z;||%2_x?L%H{Rd5L!NtR-f#Y1JsXPu<y{aq&z!!g-HG<Hr}MG)FG74dc<?D(66SD8
z*a~&sj!4Hgg>DZfkF|$>xk);<C-%#B>DZy#F>IU7f6ZF=gjR0>#=SX~VN?+ObFC9(
z-E{&u%yl9}_d`R^7@_gXL{+0`BcGzZK)q|iZnh?PG|OyJLq$T<VeKv}1$qK9J_r9A
zE<Qdh!ls$v%~x0VzBqdtSKa~NrJ><S=pUt_<RiNEZ#fzP#q-N=Q&a(?m}<y~%wlb`
zx3GWn*!A|bx5`31K#peufILWNMHlh_c*kQQ53Qb>e31udXjsdR)zZQaPkvuS;o<y%
zM8JaR!T@~pIf#c$*5$QP3l9<zltXej6Z9Yv<S~#4V!gF>2P7f_NQ3}LM62^|zKDb%
zN57(c6Hn#K(68UUeEpCLL5_Vz`PMy^@9$E+as-fzq(M}$d?_m96oFgXj07P@ic+Vj
zJD!@3A=5ssRJml3^SNS)Zii%m172JM4j3IeeYzG65U0D>;m@9a{TpY!nbxZ0#L&st
zYxxuGH4K8)Q-GHR`Ye)NNK!~(;TLEdlHEx500E#KxLM@}!6Lyb!iAoq(oEAxUO>_T
zL_<Z1+4|h*X=Z!wI~<+QzLP6e!9IEec+e!$P!EPZR%&{8@cQ5!X`MO9$h_3LLd@SH
zX}vaR$LzrOz>GCH7D%fVTXvY2ur!;;9RPl)aYbsGms;jX#|?>*d8uQCn7>8RF(<Xy
zQ9FRum@EsV!-_4txR<azo5>x33}5qmLJu&5C=WetU$%0maxLT`li<-OAzuMQk1-5A
zUW2_pu150porP>f1F?6V4QL>bJ_|t&<hOkmf*Q!0XCa_@eA%-QE=TF996cENla`~O
zh5@?fmpqwFu-RDmREl4x6uCY@!7dV*I$?;^SD8RafL)dnabOLlU_7T?0PA&f63}$d
z*}~cf7z|MZ^s+`dpa>Cw+Ok%Sm8z~*^{igGsFblr=inrrpb^M#)YUX<!#YOPYLZb`
zQwN+vd;^Mow8DWlFjF{ET;tfvVOHiEkfqgfwxFEPRZF@?PXlKU{A*dRc0?QhLb^JH
zc=^%mN9Rat2Jmtq^HOSsn7>7mx^~o#*#U&eO^yZNFk{OO^AZ+h^SA@RZ=VUY%rwst
zAfSk>5c9W4TCM@i?<DXr)?ubC0EZb{c9@s2Ae+Y>0Dk)&+hz)L1c}tPLd@SHX~TDP
z5_lNvFw+)*!;CFE%u861&EpQhlZQ1Agw_$Rz5f|d%O_<7tmTn_DvktI0Om6ypcl+9
z18j5n%LGPI>GoXb3UavLOjm;$!OR0dV<y-s0@#QuC|oK5Zvd>-b7jaVV)~v7F=f0g
zTV4%iLZd$D!#Uwa)b7n|>ij4k2zV|vB9o~<%S0J^^S*_NOuWl55!<W54;O%Uc_tGB
z^k&Q1%P#aLKIY`|*$cocB5bu=me%Es*u05#d7~Du2R(_)b#gKj_n;@z*fy&xTqV2u
z%X$hemaLMkJD?}^pu2?OA9NTx1O4A(@jG`l0sG#MGcne;pV)o-(FXgr4*GWM*?)HR
zZG+r6hBgqpMpI_Mb@TPfapZxvlIL=JsaP#EeR7<9{&KXvTL1}bl$%_==+B?9dQWa%
zmp5kddSscDH_4kbNslb+A48V)R>{`glw|_scvr3mOL`kcOCudL;=@X*Br9r#>PGxH
z7=*H-!RpM-(oh~wdM_JM#P1Ql=cjVFqDB;ssdYSD1{fcYh~cOWR`z0n9n2Y0q3CiQ
zFntJMU!)Wc*Nlj1eL_}9QjE}rT=8Pnh~&oe-95d1V9b6KnnwQ+h!J8(%|;ma)FNaB
z<Q}&U^eE^?(u?HawA<m}5O+8@bc%25q1*%j$$neAwV^YIgkH-b0RpB&LN_=h@V9by
z<%8?G6|e?78PE@*$!JiZW3^5=K=0&d=k!f{#S9V#W4wkh+svM!QH~yXO7?B7Qa0*&
zVxc;zvIMGNCPXt`TX`)$K+(eVKnUpuZ<V-<!-)$>pT3Pjms`PpccjN1IlwRmw>v?<
zJMt1&3p@}6uE<;?a46RexcdNW0~>)^TB()`Fkn=T>W~Ya?zBTpJNOUs$~<}HZnSCU
z_ihe+FtOV9-0Z6#?^)f^_LIT)2XAVhhXS#_t3&@2781|QT)O%8hmqBt&&@W@2Eo-J
zap3B!zfL@}l6YZ0@xsl%m5!tH9Y+^Ch87dYt{z{N8s0r}{m4w?9cjlL*|#FK%}Z?%
zxG}-VytHoxQrT~j>|^^hM(<!492GFU<VuI|jR~IYq3{IjI(+>wVk+&7c!8C}Zd`<i
zuN|<{04T9y2&|aQ2&}Z*Jnn!m;9zh2v(UHJwy&b&Y2c)YTfgaW8XO@^1rYq*A~UwZ
z?dKbi3Bbk)C#L9N*(sb8<!HZoGy-Q!BL2C~mc(_>mVm=NTZ*w|>l6(@<I^G^(s<fr
zh*u18(h&2Yp4zJip9cka%qEQyQZB=(5^$hy0ZZ7G0-iJG`JweiWMi`u53<uiez=GV
z5=+%E1jCn)TYj4D9=D#uZ0<MkEy!~m{<T3MSA~Bkjh~R`KOr5f{Jiiz(}4g7_3#aC
zcJ~MQk9+3c%-o5d^&VrP)c(GHh=gtvX=w8!4zm6=;z$$~IZ)sPXR}nrjOce-t%66u
zR8J-dF14>(o1^JcYaeI%seK^fEg8bL@7=EL8=QE@(07AfqrS&fmTKp6sdZ}dQSo5=
zwnv-QLUa3e^7*KJ?tHL)!+3Pg<^9w|;J2Y6IlL}!(Ar!*+jj_0*fSx|_8lHWsG41_
zk~djt&K+7|E00zUuzd$W4TFC=3{vz_JwS>Plp<5Z9#@Xrm>jpCeJV$LJb}6Lq~!>6
z6IRB!ELq+dDy)@L?Y?uAoD-+MYZ)+8x9nK}<EKT)M0`ht1kSntl1XsIT#aYqR<7gX
z#BHXY+i&}b`Mr&qK0D5ixaN0APCSA6ZJR(=8#g$=?R+r56LOtvX8ErN_7O3-%sX%I
z&-*A-D=a7F`b@ZXeoy`9sBLOFb2*c=W-zS9G5g(qv9aEN3yq_ex1_fcuKsWMf876#
z>-MQ_WP&NRUjN&slxKadmz(8HnfluPR~rDF@RLA;BLP>9{w8qXoY04w1zcT14H(%d
zZw4Ts(NkyBW31^-Ry(Y_xu(bYx~>wq$KXE-{}Qb8zmmc8F1?eRX(EI2xJAR^H|N}Q
zGqlz@{4~gTr81cg7;&oTRa!;^&X6>{pir9C;Vje$Q0AxIHyuGB^CfOAy#xoVSGv0y
z?Pc`fbQGU=qdA4o4o%l{pJV2j-2=uECiiq(AZ5=Qu&C3N8idj^z<Srn>S}qdQ6)~Z
z8v87}4Faa!2EOH;H8q+p;oyl5U}`^mE8E=bcG-Jq@cGu)jb>kxG8Hj`-TkLXHh5Y~
zzYWYrIH#&g864{^Pj3)YsiLNvnPk_7ROwYL!DXKvq*K^{3MMlzbkiHWrzUowCf3Je
z+#$Sse@(2vCiap$)>FgT)1%?QOFevXy?Boc>qXP~LQcyTin)n0IE)ta+?!UT--jab
z$XWYeX4!p0UbyQ(zwF?7*G%6WK_cy1A?9z9?81X*CxM3%2sP6dfWwR}JIqU1kj>)`
z06zk;VExUJXJ?KvGA}*5Ld@SHdG;FkYC8!$jCGi43&3H<mL29LEY0R|2Y~->tZ629
zGqVKN>an|IfY&}SGc-ql%u7302soz?+-UoG=TAE!FnfZL`8L3y&EGN^z-BoKJbbOk
zPFnyDGv;Dm!WwKIPXMyq=$t*_NrmVQz?{vhkjIQcu{S#T11Q!npPkD$05A10B8|oZ
znpyXgq}Gt(W~q%VqhX0?wCMMxUkw@^@JFoGASlbj8NyVfH?3{iG<w!CUSJytF+8}>
zwJC$bKV};){-%YG(c{!)qmc0S=H)g<tQML|k(1Bge6WoeP>N!5+@-Gh^V^0DVAboD
zZ7WcoO*xF}mscrLBV*{0!!B3Jc&z2Q;B9SH&b2`%VKWTEW_YGX_Vu8CiK9MSIDi;{
z9`~b91zc)WZkn3$Q*EbCW(XReQ-5dm>;`4OH4_KfZ#(+~m+YHeZ|2{>HqMK<?h_a%
zwxVji%?8Jboe#zdd>_2-dwl(}2Zf0KOiqF#w4EzL0T*V9lg1D=ws)SPe*iJcbRl#O
zGepG?@;!B8y8S|qmK9A)fkITF11a{M(?H5Rv`xX6R;lv{LZ)LbAc?`PIKk;3AvuR6
zi{y`i7z97hnr?ik2znk_CXgkd^jO-?TK-{JkVnmdujasOp1ZDrf|4qhtuN#F2Xp{1
zn|cpuGM&5tOINAIm+4<&b2h5AybjE1$<^II#N0T-{M^jw3_d`GgsMO^#H(CN0@n_n
zW!eEk;T*<i>H)Uc{LJP@kO`Ztb(x57pM{L|eyFeR+C6*b!~Ty3Zx7C${U&}uHfDZ6
zcJMlQscm^rX5>1k2wV7Xk!-mJ9%p9E4&a7urY!)68C!Okm#{FK#~lFvyRl~Q?GMfl
ze@s?WduOX3PJC3lU0U6~W3KJ@0fUaeAznT98Gz8c=1BVI?LP&WH+`dhUP`YJ^S4OS
zbEsor^o}cRGb}?6*kw)j1t*3N0O)m!vcpRx&34oDb|Y$ikjU_VV_TMg3f0LqMEKwk
zRO3susKe)ob`l<|urxk6aHpcvNErIbv|iYd1^&$iT5VisWd|4gFxrn~0Ld_t6G%|O
zV<-F!$fpQ}GR2Rgl)Ya9L<ixNqgqm4raytFAbmlTfqWJeMe$QX_^$Asx=(~XUjzbT
z;5P!0d-0IC?OsDrd_%mK2#9;{#Uo<hy-h)J;9epqw%tn*areDs3_@E&;#=aqhOl_}
M^QNfy0_&##1Kq)vSpWb4

literal 0
HcmV?d00001

diff --git a/tests/lib/__pycache__/test_replay.cpython-314-pytest-9.0.3.pyc b/tests/lib/__pycache__/test_replay.cpython-314-pytest-9.0.3.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4fa721da427ca373db23d5d820ce6c8786645179
GIT binary patch
literal 5190
zcmeI0OKcNI7{_Pr^>Y&^0a_l#BvuH;ZLlFPC!)5X6i5)zXn~}M(ACD?I13xwnOzeS
zR4IoZf_i{Tts10CaOj~(mD*zuz4z*b5Tg|wda8P>N>7~nef#n{G9mP(s=`{$eBaD}
z9y{N7=ePTIq&ZCB_xY3ESHATSl10b<c`f7VS70oX4++nV5REZ%&#<@bX(z+1=4;0q
z&xn6Gpal%uI}#iYX(7XAN5aFL#<i2nff%{m)W?{8j_$DsspXBzcwaYHQ}BMbr$zX{
zPNKE&K}cJ9rjx`&zo7VhoYC1)QJaQ5K&3nta}t_k63xk3O%G1V*R?X0)P=U`xkHCj
z35|+bDRD(D7L>XL2tjAK2v2|T;Q~xplP6>XmdgC`uqeDIWi3^SjFC~+uB+uGhWF0K
z#yn@pZ10;^asEd__L2#|tr`v5RxOVazG=+c%ka(P+j>2HR?j@k`zDOAp>njTrnjva
z>9fos$n((+BSmaoEsw#>?cmrm%<bIJuC=DOEqA+f4DaLpt~+Y1XY6tKzzek=TbuND
zJ1a#<iRFWQXpBuVb)~+E?e@G}J@2r~YB|i>XZLL9(+nRz%9NO+4DEp^zFt$?R*9L|
zW!sH5tE+aet*hm|9^3&KkC)y95n~UuyxIeG(f;T6z`k1aYzzL$9L8~Q_)o;K)re!;
zt2q9fapc2rFl^I<)8gD!H7gdLaalzy=cGf~Votj6(S7PwSyA+0NfEWFBAvOYdrR|&
zbw8XBWko}$2b2DVh^VR()jGNkmgZA&6$?}67ktJM0>_I-R^!Y<FV^<N)$k3==tvt4
z4#(9*PR<D$mGgOthCk|%RimOm4$Yu(0U9SaX?)E*ys%-xZg`)0=1HfaVPcbpv|JG6
z9JEWDGr<8{DIQpQ;MB+m7ccQAb<Q<4J$$S*uSu#Vh*WPAvZA60R4UPIai%0vNpFH`
zVX`bMnp{wIzfuH+s_zoi>0(*Q8B!sy6emSRD2OwX9uib>O2QuW$Z=x@<znF^rA4Z5
z7jVHvO^~j`>K8-}YV?40rxZ^+ILD7)xS)qH6)v28e;nds+)vz3o3Wn5*_eSj;&%Gn
zPOs^hgAKTyK^0Gx4&$e1l9wRBYI2f_g<P?am=kF(c>;DxQp`&Q?U1oi)FiHrnpEUT
z!@?U@#Q8*NUf*Y|ngtxf6fMp`xPdH$5>8O1aTp4bLcIYXu}B^_@Az!|-kI;CYt5$@
zPd(z=s$6u1i&ogKdppd8>#DNW^?>cFa8bu~jHLm`TxGj#Z{4L{!~PtR+kgfz>aEPp
zOoh#SnZK*tQNB4{?M|<Brz=q`gLH+<R9WkKz-E?eo>Pp2TE(lNu)Ot}c+Kd^5xEU$
zfKy1{Ojp>><<my8!gW?z>w3U;E~OpMF>nw|u?h;yTd#@Nus28JHlXGy#GsyZkueR>
z%WN6+wg7`nV;G!`HA2MMo-wa+sJv(6*GVS-+6Dl79wM>^B6hkE(aW>{QA9*|bx>}!
zF(NwH$NK>i|9;eU5Y%4lEuyZ|4p2A12VZy}UK=ixY~7LP<3oIS%(nqw{V##9e!FKo
z2YlrKUp;`YCmQ1`?7Id~`<M4!Wc<~>+v+Qpvw!}d`;O-TwOVKZo~9HZRTNJ<itnd7
zBT{_pL}=>MnMuqC>WFI4B&xyl6yM>1Y0hrKG&LBqpk}}(E-)<O>I33(bkN`MCjF_|
zB9&^0^qk%-RlLcdCkhW|LfbKc_hdrx)sXJS1ix+2H!#81Knic)gyO3NpeBwJ572Hb
zN?_87<t~Wnr(I5o$tx*0uMDZUxF)Z3(1R$A)=`J73OPxW=!{&DRZY&)e$e1URy|`v
zN3i9y!1BU(=hlMnE*^he=RM+@ZVlZWs<5%Up*tbSmP0FCtjb!~8XH?0ay-XaI^vkC
zps>9424W~%X53=i!-|ko;WnU0oK@;(s=`LU$bXuL>~<a~smfZ{8XH|oIi6$SJS@d3
zC@gQiff&kei*B*)VMX4la2qfteGAq!-lhj_5;j7w?-opUPMAmvJe#1>9F;ZTV5bN_
zN*QIw#?b>M_*4ZrN%5nS-YQ)Ko=$)dEs-ouRCg860Ff%HiN`@fn35Gq5OhDnc|kLG
zC6cBQ`TN9}sJT;u??BVCB4y|?sK<|4Y8sMXy$r+rOvuONA=&kW?1$^8aAfKFCy8pf
neI?vp4IfwuANYFD!*Fk9VB}$VWHIoYpJn#{9$}e%M)!XJH=bX8

literal 0
HcmV?d00001

diff --git a/tests/lib/__pycache__/test_reporting.cpython-314-pytest-9.0.3.pyc b/tests/lib/__pycache__/test_reporting.cpython-314-pytest-9.0.3.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..376e4b558b9347053c1e4938bf67cf330e5e5ce9
GIT binary patch
literal 3235
zcmeGeO-~y~bY^$`f!Ej&l2ED?NDxi7fIor@p$Sz{nqb-}O%~%Q67p(eFW|6vH#2Ko
zqg3M1ACO}!RVqg+^^ikVB~qp4&_j+U#sZC6wYRG3B~m!$)S2D2SyR;hfsEI0-h1=j
z+c$6C&U+(~&JdvE@8A3GKji_)Gom>-7kPDqBAXxwWH<*bXwO`lFLMJREm*z*w!+Q%
z(*Y}x4q8E*=I1)nAuBWhWWOx*0v%eS?CS+qScXGD3jD=P-;_8Ky0)(BvWCmLMFLq3
z6^eOFf~50axu_QuTwYsKQ8n*#Jz<kb|I}0PYLS9Xun6g751IOx0e4dw?r!&D(|2%7
zXs#d5mnr}hf&iQWxq{41QjCoukh|ceB>uIS*VS@C8_OF7jRaTJ$|}yQB`HXPrmk8`
z23jLNyjs+C;z#9@QY?@NUNOqla|;!hmo-GhLh+D@x?Cl1Af)B#j@O+XQ!A9ENjzeH
zBep#vT%Vb{A^8wH9~5DnP*dGlC4T2Ugs1i~3(vc3keT=b))0>0Lu#pDlwuVX72;Q^
z_i=SuD_LW<vp8;P*urtWc+bWZq?rb?ilyb4StXo;&?#!rPycv|f=%$Nc<QS!9)I#|
zeSdi5`ThO=;h!o$KYaRdH}&cE{DF@PFTj_8^DV$XZnu%IsB(Kowa)HG>T>(p{r8Eb
z-1GFDHuPvPUyqDqG&qljd*)uHV>Eb=#xZ#$3-src{gZBf#sm{AvOU^E4#>eQNU_7D
z!-QEVcZ|O6Pz=f;FJ*E${=vm8e<=;XgOAU<V#`&QrHkRF)-9F@N5)wopUw*7Q0~1m
zG|r`5KV{LQ)pbOLl4n7dOS#{U9R6Qfz9}2$Mm^CseR8K9$@=23jq6|RX}-~8`DBxK
zI8!hDP0P)B1<KLO(Bv*dboE~`>pBuM;Z4j~G>^oryFE%WAA@h=bWEI19Enp;j+0O3
zpnNI^<8Qu3`!n@smK>$E`>hS%N@XmObf$~WG;UAH@#`5{!@8*=I>7+D1yFzic4v^T
zW&u<4`7%=TRT5}cLf)(<2?eQ*h>B?mvj&GRnAHghGv>ri;<F4()k$<o)v>0S1}<8~
zbq$k<TR~|_Etf1z1e2<7bTBxEb)!OrCAFwa@HIC*Ey0Zw%xY4?gVWRY3sMr6M7!L%
zMfcXs{5<JkP?^8+*&W1gPQ<QM68gxjvbw3Lh;%D?Ro7YFM0sP)RFOtH6s#_3j6)(b
z_6~|h=^8=?B0UQ06jV#mKBsL%Nww&T6d(a)w>_9ZOhjs2aW<at660RtqC+^w-uDud
zm<4&z8HnApq|f%Xt&MVHgpzcfRTTUSz5i)-k9I#g|IOsLSH8Ydk6!rx-1eOxmiH!R
zcP3`*@1@%pJ=EiC&xW68cBAr39~aHQ&DjG#=s5l8{?^#z-g<Cqb7o%**E$Cp;^0<a
zLma6IiR}kA*bx(Zg8S5kL`@uV$c6{sWzsDvFY-c292)Lzh*C|6ZQr)Rju_h$+@~(Y
zYNF(j4G+G{q+3#6<b@Ek4fi+1ftoP7HM=#%rl}oqbWd=fx-eQ32OP5D!FQQpOUjG9
z5JqjQ?=-~znvmX_*iLS*K1n`VdZyOSr74HP9WlKpxKEvO#D0ftc<^0YvL)t4=n^_j
zPgCMEnT*s)g6zy`Glxl-B~^>+Q=I&bIcY|7Fov`WDq1vmTP&88HV+N6B<j*=caCGU
zQH-t8CQ`?q{<P<Ct<^u;UyAcNu_XHr*q%8F8N5PGly&Vx^Z{LBod-Tc!Al-O_$L5g
Xf<2J<4U8T5dAI-%!a+C(ZS8*m3SWGk

literal 0
HcmV?d00001

diff --git a/tests/lib/__pycache__/test_reporting_render.cpython-314-pytest-9.0.3.pyc b/tests/lib/__pycache__/test_reporting_render.cpython-314-pytest-9.0.3.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ffa6a9a282a809bdc70bf7c3fb2244a921186246
GIT binary patch
literal 23278
zcmeHvdr%x#dT;kk&x@JiC4`WK#OPtHkz`;1N!XTcN!A0&w!*gX2z!a<4lshT5i`?0
z2#LtI*vbnfsk@}I_ZnR%5sp(?lx$LL9u;jK73WbES8D5C5J;fD>+H>~<d5W!jaPP6
z5+zm1?>pUnx~J6`l)Xvptu)Nn=Y39}bNV~q`F-E%edW<I0oSol>fhM7Q4spcV_qJ^
z@$pZ<F(F(KWbw366xrR==^gbn2%S>V*FZ6z)BestG0+(-20KH=5DW93F6#^z!<~_0
zq(P9SO+qm$SIE9i!mwB@m;LZp$N|twIS5)Mhd`_4GSCfj7_>%?fNqqdptW*2=q6d*
zCB!TLni_WYmRBo#F*7_YXYx6vsFj_#oQ7K+9WH8tWJVdt_7~$G&DWP67|dwZN@ipr
zqom%*^`+F&OPA8h)sklop~L`wAGd={2)!b1?2_@+vPbszdd-le&~C)a(jlK7idOj^
zhWm%pqXU_z`f~#rEjW}OdsFRCk7&N3boR|rEs*Z(PqenR-|}g}{BXK>F{fP8eCnIo
z;bF~p30a0Ue}C@Mr7W7Su*Xp29))Vvq~WYujEhP&JSx?_SL<wkRm~`BbDxqP8OV(^
zjir@=<`+>)b9yi{QhbV)t2P%iYEf+-&h{}^N?~1<9T|j+x}zz7Rr96#(TNHMU!aFk
zw}VUw-wJP>erxW``-3an8h=xoJO3NePrPE}g^vW$_rk|WA@uu+8dn5&YU#&658~{-
zZsE1|ZqI8~z2<xpx+Ei??bb0S2m>giUbryuig;m9()XkhJAf9|3rW$I-(I(os-p+_
zF^Wj(Yr=>fq7lWlOK8IEw0v=|LNN+W`*=txCwB#i=2u2XQrQ8mTph}d4iBV?N_KEC
zqiErQtmUaNLR0-YMalGIs4nX>4I_!5j8@S%is@EN^(mQj|4>GaOA0-#LS3s#Cy$<f
zSqpU?J$Ej3?&TjiANOlsB|D(hvq&mh^Xs*1Uev02$A+$|^sJ5SQferjC#5KjaL@y&
z-5}OrsVh(H`(R*Y`xC!6_6OI#ac!ynrCaCbzi{r`Ep4|0e{yK~#qNa{yO%n;XVjU@
zCjoCHC4MA$eV~5K_Y0yuNU24^{rK@K3>O>XKLd3Ral0|Jd31K6vWpKi&n@IVvZv6x
zLr8kfm#1*N(9MT9NCPK(3m22Z)5ug9%loVea6%mXSr#wJKH1;PCr_8lG}s}~3rrW4
zGq{cN^`dnB>SD~!^yzoBiNDaiJ)0c)Xo5Q~d%jmh*)Iod^|+_^n9nW;*G$X%Ne-=<
zRx-<NmiF8Fr0}IKn%Ryk88lNj2uX=Q@q3OZu5@IBcZVQX^#s~Pxw^-n6xuxP#z@Fz
zYqrpD)+vWSyCc}_vG-m+BuC_EGSn>ELksV^_Gy{<d=2*5M*P}wVb)*)HQ$$J!CrPg
zESJj_$#7|n`A9GEW_H>7nO;!Gg|Z`NJIwo#C|5o!=6(2y_<8X2;wL>TDuZ$*maPq%
zbTOSBRvJ*)xM#-~$2Vp#4K}Gm+05{O+LX;Tv-Pui{KOM`@>e_JYS|ANj@WZCa<!X5
za2P?YakWV;rj?>PmMspIy|stZ9gM5-vbQvE{%Wh{xdh&ZtV9kH>CmM7)kK2Pw(%gP
zPaKG=%zt3KBCTSrE3)bmXaWoFID1RfJlT=BI9|ijCs2QOL{HR)$|4HA3R;=f5-oBh
ze>K%NnjJ1?M^w!}oa;{yt6CXCX`M`^6|F91aDS|nN@{R8*OwknjifJSv`|V-U(8Uj
z=1=GISU$C?6!H(#qN-rw&Zoit%Tu(hr@{Yox#3)1D?iGbp3RM%P?VgaZ7}0AS0Dq7
zq>G4&N0q(Q(lE77f7FEC(`NS^uzL>KJuUj<83mO3TJ?l_+n&}F>YhY<iSJ;E?@)>F
zX*G&w#_X5AB5h^8$L2{YtD*<l+(~6GK@?G5LY^x8)V~A4QWYwnygYSzF|>W+*nR29
zm!G@-+`M#Tx&~zOx#^n4@b(4i$i49P`CZQ-=yu=2t|QBm^Zy<axANdSmX9Ky(<7Fs
zRPLM)Z|CV3QJ(3A8#UeT(<ARPA5W+|P^KA=090o4Ig+mCeRMV5fX-m6=`-Na_SMu9
zGiBk^S2~l2*m8)Eox@|x!B<JN&A4PQ{;=wrVs1ON8CPEDC3%2d*6NuTn`oW)7uu2{
zTgB}wJopt>_yAUTJ#MUUp07Rn>K`mj8gVyDR)5dn3v6xTYibt>ZH^0T)83?nwQ1k^
zgl%OC^;)@HS2Rl=k;~+8FXz}@Bx^Y?SJ}cU-<>0iCMV`ktIHNvKKFL<e2^`|j+Lp5
ztxRR5>qOz#yEZ!-3~w@Q)^A_!PlyArtglbmdcPm*y=d#z@-=(aTJI}9H@zB>E9I(W
z#L=sFTzfTY>(u}sSzU&*Ms}5X+7rS&eX1u|(x=sH_9>U!WNeQ=Co{ENu8}t;%N>3C
zz_m{+tUk5%#g<3sG~^u5sxnliTq|!%R+iS7uj<84K)-Bd(^l7Uq3kNN9p)V}R2^g{
zlA&z!Q{E4M0CLoI#`QKA5?wJzVesawlhqdA$Ztq`lN+48aS*!_@Ww6qnq-a60rMNX
zH<s`~aqy4V_r5Y7*hID1-q-IFE+|KBW9QB_tIaW!x2~CXgPFE;*Oaf7>*a09TE`69
zl=QgH+D&FDcK$l}i|ZS$o{iuH%{>@(+h|p-+435*PI>$1h7;7u4RS15=jheVtMuw-
z^9iw^viRw0yg)Q%A;*5IT;B00_sLg01~1?|$!yOmyr3GqU;}<N`HgZ*rEu~g-ux}e
zErvX^(_A42|7LyjBI&`rNKk&;yx1x4`ZwVETg)dj+c>a3`Gn_#e6ma4{kfTkTjeL@
z#^hGVJgk3=IZ<!+n0co;k@vjxPl6B<wqt+awbNxC0RENVmh@nK*=FX;Z(k*?Ult0N
zkutzj)+DzlYdV0d<r|U>=12h-8(G(UYE3p^KDAL9CG+VsD@Dk=ZEw9hr?;@Q#x>LO
z9+#h7GwpV>&*eRz-8n5kmIS`$GxK>SKgQbA94x`EqH<i`o{T!)tGdT{uj<U6H}3|o
zC7Ej*e%tYDz>mpY`5L(n=vDhe-_Y1oEeZYB%5I7Flifi1=_OQl5!p?IgZ>KbTnTh)
zBXWRYFAoC*Asqs=grGwVUut2r^`Q<%0o-E#c1HCSPwPNcD}W~oZB3K_Rjo#r*7mdG
z!9<KOw#>jNsR91q&;3*^K@~l16pjLvCTbLcU<Zvr;9N=qfe-EI7pn_7Ta+D$QsN-v
z<vQYHf^o_|irG)(DT=mdPXL5dS`m6n9IvJ)22N&TW`NreszYflR&WAf9*5Y_L1yyA
z(=7dQM=1!^0YW$oG!U8eAj+*{hN!D)I5(D2fDHl^<n<DSXhL{|7kU-!QU&EKkskm-
zLs+wI?PJZ;#KVZ1q_!ZNUK<=19H|E(tr3I*nzB%UT`|B(RE&1S4(sS4e0qiiHAruA
z0$Gees$CDn7?l*mI;-rhcs0|eF)fXu_#u@KgIJXSJ<K5@&k~`%fO3GyK_VSQo*{CW
z$a6%V2hn`2zce3fbv&j?Ole2Bp#pTI1UxjrK7CrWD=#{t4fxN%2@^WRXZQ^G&+v8F
zQNx3FPe%zb*jC~@VE45eIHV3XwzihUmE>zJK{k!r^oAI<8LcvM8NQO*5+$`IT9wn(
zKWB*0`(l3r160-9bwD{nX^s*(2BOA5>^P^JTgN^RG3?W(RZb11RcHrCM~Z3+s@W7n
zh;w6Via5Y16uYU5Sw$@>UqTgx7^#2d)m3<vU#Xrt^|R-1_TK1StlImvW3yww7x;t7
zHzJGsPkgItgBdmFyPf%<W~u7z$56+1i4({FGPHI2_+n@W!Ko)E0j8dqKDI#c<_S1w
z0HU6lt6tdEru%OP;D5jM8&P=m|9g~4ciy#}_UP$j+;@ix=RSu6*=L@atJXuA$B_V4
znbC+rSvH>|>5_EfcHn(LywpmRbvxilfU?YJ#Govj&yjRB?|rF#65Z2&v+hP6Za3=Q
z*@D~TGt*lZ!n>BGc6esS7@6Do?pp}p|CC1e+@%aWTzA})+NaSAM)EtBk0R>yh$T3U
zB6v#fSfq;QcItlAFf+!ZbO#DDqY;1_Y(7WQ)w~N*`~RufJLz4isGjP(eswumr~fYo
z>n7EyKJMp^NtHYAP#W&zj`?7nEh7@yd^`~xsN%a`p>p3xej!r+<&Nte^HR&yDUj<O
zQ>PZg^>9rqH?Q2d0>>=oYs>VN*}!5$3taPvZD?5tw=7G>|6MbXhj0gJ2#i#Bq!!b6
z56%ufVu|YIPL$2l>kcHHR(Kf7Fnttpg=d__+@yL?92&>dI~-`Jt_6S$&o7ch@b(KI
z4<pnxu;~QK1S|nY3;Q|74!Pfpy@FwE!t=uc!xUOk*kxha=6)FEV8<bI{V>dMe1#Xg
zT#)|4s}@RC7)bgVMm1*cGzLFp`wZYz`qjnv<Yoxd4wolMJ6v)8q-~es@3nHd;-N|2
zIynHnIG6UjI2LcO!$rGqmG6q#TfJgFjj_YE^0~K*Yv8fN4MGdQR(hW)w?ZGjR$7ja
zJFur9^y86a&>=;Kda)<hFI&lVW|(!yo-t&$!n{L@jzVgN#+RX$NjXX;REmgP(4f|g
zDcNEsH3&uRD7TCnFEdP}nxLrNaO!k-=c~t3=ekdHrOv(j$}30Z*TzG;cEyHZHk2zH
z51LLAX%k8p%GJb+$`^1~&Vf9%3#bqam0kp%Wnw6VwoVe-Y^qy#X;GKFAT}C;7v$C^
z7}&rrCv&B!3z6}twjl!xFi4i!K17mD7z~}1AEEa6)l4xna#{0Y;nhMs=NfVtm%id<
zTCjL2pUS6;Lt6M!Zsg6(Rk-_ySn0}-)1#7@r3@3fO5{x<3X!)!RGJTsjiwl(iw--O
z=-}4>DP6&=Dqlu6+Q+EB1~MUhD0Qp^E2j?M4K_?1U6Cr5rP>9lc7D_Tul3LE_?3%G
z(t-O@-E{qu)VLDfI4?Ec+<IdxZqr-e*+#c*3*p9P$@pKC8fobw-(AbeqwgVo75Ck-
zd=#-&k3c=s+jttg1BrBQN3+blh<h<^*9nJ>+lj>OTzMPJ?qFhPa7?5U!OQqt243dJ
zOb)P_to$&_RDKlXWL$*(Tso^_@U_55t~dn8S!^Y)Qwn`Hz}6140oHZ@%?wkilE#=m
zF!9v~W+a!(r^bddBdO65%qit7$hH?hbq9z(K(=9-%gjr)OVVK)mCg60U61O4`3mq0
zk|3nR=}0<i^-M0@>B&Vpy}77iVkvd{@&fuHoQx!+ZP3lalroqXsb9`T-FULe1E7_y
zFjBejIx|%>Y(tU)c!;M-Oxk)r4-IgkyNh#XJgRXOF4|U&(PV(F8j~iUHC7GI$K7?R
za%D1zRio<s`I1$`%H^(C<@1288k`q)xnjgFtA<s6z4<iO9eu!8CG%;FRl~~X-Y%Zc
zST)v4&sPGh8f&HHIE`b~2(eYeVX2fa>m{2WcG*gf89}xio|fA$nRCHdHAv?bWVS-(
z>zo02;|q^F1FG2!*zlh-z&1|qv+F-{29VrPqjOv0mJlAIhEerEb`bVc{dQ(gjE#(3
zJ{|x$?41zKdBMC5TWT^Zc}+Mck*wgn9CpAEUJn`KYF<1?!Yq{mGti_6E8l==n6gW_
z6|_No1RHfDgqR((?p26ED4U58gHW~-p%-7-Mub|ciTz3xci36edlUC*;$=-#m<6_F
z!>iIN))<?Zm|$y#@&@HR$QBH}7s!ANyL}}yh`qiF^RCooHPx3^GsD@DOw(cA=Ir?<
zf^Uc`si#2xenPk(shx?=z5YR9rE%|V2}ZmPvAI*f{qomeUTQva%lm%%6Q3t~TAUQW
z>lZ4w%sh9y?1Lv)cI=)l%=W`ByRr>-*`YSJykJ;DW_2pbI;<U;0p-U~5tGgs;%Uie
z$|OZFq3mIJBo{L?-?AA>vko$mLx%bNt3);%<`)+q`ms0z?z0PldI&?|)-MR!ORdr>
zEg_H@)vJyH)a8J;>6vMml|dpyL_Qsq<HF#ra)sq)Bk%1WRWymZOz+iCl>!|tQE7IX
zA|dWajvUeDk3H+1PRcK%gckhNlX&+YP6*$Yj=+5GLn-lhlF!%n-G~sbm>m1^wd>aw
z%jzdyT#2-NefQkhZ(sZRwZ+!s^WhT{$M1(~r@d37v(i%NsflAN;o5m=_su;w_TV<X
z=bgQD+q)3ny(}63i_-4NXLa9Q%gLkfAw7T#Bk>)}M-hAUh$SkMr?EQ_vl0dtv-hjP
zcZ0ai1m7#8TiHSw8I1o$X)gg*<hyG*dGtM`_i*1G%SRDGJz|N<<Z0{<B>EH??Vmy=
zaw{Y;cJKv9Ve*h+;r5;;361UEci8mS=9=nex8@cmOZJ_=?6dpw-d?h%WLMJLMJ(2F
znX-;qYo+k2H0o&*Mm;AXco&Y#etUUc+_u;%#w@{<d<G%4*9$K}#03W7zAj0QQi;<#
zF0)j#U1vQnozXR>d0$>)no%y(<zY=<7&A}^OPcsPLm{|?*X1g?$@NT{*mnN1O`Z()
zTDjb<e)!5Nhaic?%*MOvaxIyc<Xz*8fmQx)Gp}`rgvI4NvsVpy(#q%FE}jqaWSPSb
za5cHltU|Co^&YD=rfYS%Pq_k{CRn4xB2!0#zFD|t7CI25QIIPN9~yDr$p@0$1`ayu
z|Gv&qia_e$e*VX7qZG2W*S$9^8@R~lW|Yd9U7^!j^B=qRnd4yHlic0}ySxbR@n2kD
zkM@+KM;m&gNx!X6tJmyPZp#R}!q3ahuVi+Gk68I7!>+x`E&5<hWW8^sBXwMOBf_g%
z7FILM!fFRLx1rY@CVScLZyUEbGkKt~%YqeQKCJJw@)7K|{#>s6Pwq#Evc)z3eYu~!
zxp29w$uT+LEqv4B{jk;yBx@Xq$Dgn7O=T-ojPl#w)bEouN{-mxF?TM`PbtCP#Cjjy
z({fu!2j)>BZ;`hqD;#g*#>aRYH=1vudAGfdz&!F1*qC6?OUIl?J$WfB#CNdWF91fV
z3rH|FiUemW{{ut|Ud&!8l6sdN(j5=c{;uf_H8(O|c8-qt#^`jiV-Hq>j04<ed$O}^
z8>{)~5b+g7rV0o}SGtM3N+e0-JV>c+2R2Mf4@Hrsgz^hSCW+9_&fK~&>j~S|4LG{e
zNAdkcu*?YYIxRpjH$ZgdYZUlPMA&mO0F|~rS}+AA_HZVZ()@JlX`~2%9dLVQKnp;b
zfRooaBbB*=Gs~*Bfo;N4c=+r<8j1$$66L3;%yn+JY}OD%$QG~Fo5fnh8Y3yWTu~GA
z=rZ)VYTJjsNo{4Ps&qAfyHcQv$zDQ7>MxONgor{!B~m2P31T-@NOb6^KEv`!0=8}2
zY)<?3tpplL<#0^+ayB!@Oak&ahtAA5ejg<r#ZP??fC{0<^&d+6uzR~7uJ}suC!b#q
z#}>k|<?xdW;U{PJ-VL|UUA-GV0+oMl{d70k3A}S*sdhgM0irL9_iOi0Rn2|zw;GmP
zIu}|xms`$!;uXqY5+`3@j_$h?-8cJ%xvjsNTHb$ZVgISc{ip9m&mhM~Ucpxe=m7cu
zB2+syx)|C<o4`%;(zfZ=g)p#v<9|`wHhEb0-L;%N`W~g{zB?s89t;O!?nD0yBZU^j
zNP)DvGlxKC``*Q_&QPC{lGgA+Z)!U2(PV~TBqqhI<>SG+14`MML$iIl2W8D1awI@i
zW;9|@md)o#>gq)WE8*&S3EJ-KXfpKs^uH+8O`;}p-nE>10O&nR&V6@Ed^{Kq#IOx<
z<IT<+out9(eCG^qI4F0<smei+u7DwB4jN)9df+CV4Z6q(BV~@^qlivDf|NN%5j-V#
z{KJDNO%gSN$@+)&+s{2M{oxDyK#6Vei$FAbEkC}3+5EleGA^7nIWqJX<)pW`@VXhY
zhK_<7=(yIUqaY}TU2a+qfQ}Vx#jbQ9)K)Hz+?(Yv>4ooSnU12yjC0>ZoBLzip3*9x
z>o?3YSovJnV2Yr|p3=(a-Y%Zc&{3?Fp6@}iKUga*-$y&v>(%zEgv7GTR<i59ejPdr
zLa3`Ogqp3)?x-}arf#UTJ0=ZdjCG>W$_*3@!C;VXJ{?^CIk=tBpBsvN#18xM1w)6E
z*~cYP&|4PMtK-QeC#bZ#{k<EW+<5cajcaoU2uOZr9XPUb69s9$v77>Ia~;^R@*X{A
zD-k9~RM;d3i9+d-ZQv^{=IfRLzKX=F!lkW`-*3tvp+J(8)ZYPl=nkAJKe@-mOqJi*
zGar8TQMeS0Klb*}PY>ZXHS{xY(CrOcvL)kxQHs$zK)$<{lSkh}daMIR;yad)B8K#c
zB`TAru{)6HQLvI2AY;NoCQ`6(^$0L25;{G35kMt`Sq54XQ0X;P>g<RweZWEv;5d#8
z3q=kXz?k1q0a%Bv89Nw^BeO{fdYH%|8eLpCv<5v-k~2O4dC^t3vKA|GQ7xl}8pd=t
zmH>I>Ha6kT+&BZmw}vWv|Nk>x&I})u>GFJFx|NRrc~x??^HUR0;C^q+5eJf#EAj>~
z#afFgf;j~+Q25r+r~;v#eFuk`S$Cft-#(O9MxeEdW$0rC&&1fLgkq$Wxd?;S0sH52
zK3$=vNywDZ0t1<gX?%}Kc?UHz<r=mu^m!tyOebskHA+dgL%M~KG6!e;Q+NqIi>}C7
z=YAp{cJN}|BAYR3Qt3GM56bUS80{NWl5ZXn_VsTc*zJFHhE30}{3&vzDC@6-SWM>N
z{b2QU=j@r=V=Frw=T3ZZU}fvhn}=^4o-NEDIuHIFdQF@-3PoR}7N(Em?{8k&7Mo4X
z?xGJcHBTJ>3vTecLdKFiXFLnxoy(H(zbNfwmW<@RYjg7i_b3PV-6`?$U^tefosV)C
zF5=AvOI0E3;4XgXY~8ub0Q9&hk1iW;0mosyrOe_jaw7~Ke02DoK9oYRYBRVGL_)vL
zbzrvQm#a;Kp?m-!c2KS=yzQoBDg3ep<rKb=3^O3-n}u)jw;C-1%<U_D$1rR8=X^97
zHDdvDV+p~hg*fMOH&>LOOcKn!=lnZ1pl5?E7w1p<^AiRC<VJi1bM|tTOP8w{=pr~;
zt>W-XN%@>dn$Kehj#fVRcJX}RVH@SzHJ^i<mB^dcOv^u1;1C=u<vMxuqX)W^!LDuA
zGnp>y@U{zpZatmr${|15Q-QIH_mCN@^>L#uYqq@H+?>c;Kf9ylSaGV^A?HWH>{WV|
z1J?l1tzL8o?1HNT=ruU5+>oq!1hRu-QNL_u?=-`#JKhU7$Sww`jtpE0vI`MxhhG_f
z0MwPgWgK8f;fC=J+urX%Y%=hE@66XG1FrnxI~IS)S0rnb6~rO(o4PlB4}4(n`L}H|
z>-&TcIA&HIjsw@a9Q1a7Uu_4yErr+M{Q7un&nCRD`+6`owqD(Bn?>$itd_s8PkeUY
zSNXB{&YI866^jn;`ftexRy&se@5cvf!3Q=GACO~^g7;feFoOo124l+q#2c=gI&py&
zpBNj^D(E|AxzS?E_}HVe2MM(Zg_=P)yNMHNfW0qXq|ZX)i=7wgi=F?AO6(#+GOaFK
zGB}^1@;^vnO+?NUxj=;EMRxF0`7h*p9pvn|H_@CJ_a|bxi?M{R6X>Q$rVg<7-2)Vs
zA<_o|yQ(wHxZZYJk`90xrx>mCk#x%O(esY<5jVSKRrMPC`9Z38h{&f4;?Gc$!$h7V
z@;pd<i&G>gU|+FBa@LD`DU8q!-M0QJxyFeM6M2!y4-vUW<SioqmB?u#Ff<XAQ6iU#
zShzn4_6+=2x4>nS?j24Y*^k+g1|8s6Q#8^k6@w02KjnX;6jD2=1-AX>_E;qGzcRPw
z{Zl0IAOB#tDe<c$?zfxLzVX3TyR`p*BP}bT=;YXZU3~W7-1F}r{Gen0)i2Hmdq47u
zZ5QwXPZ6K-q)kEXv@~^j=J--5PA35qlLzOe#M?bT-Gkdy&(FM0x7QcK4a-sj&KaDy
zO3W(nz5=iQe~+^0&byY=9z7*<-yIZY_>ga^M~_&dDtH=)W05MI#c?XC4>io-<dspi
z?m$6iGy+hA&F4tEnitPR>V*SPCrll?4l^~<E=<Fgt!1X~t3&SsrD>TNdJiZK0%zV>
z*iv#>Yx01O?~}s8aB{#ON`k3l+y^Cr&Budzb}HQFaU?)lGkrV^Wtl#T0OmuDBCUl{
z1W&o91CPZvY;mc(yIUhgUeNs1Av<8KZBFUmIc(C8&SJXam<&69qJ?-6eS4UN>94V+
z{c~V=etK~NL^cv>B=QoGFA<@0E9`6pvrtripS;*F32aBqUXoh^YpniW1rI|@I69np
zUb%-vB7W-2AeiE!_-#S>5n)+4ydWI@w(#@=zejxGlQKz6i4V#>;!*KISx8JgsPl<W
iKB)AHyB|c##eENUhT*M`iaQ>}LgKDZ>LOx-)%Gs|zZr1=

literal 0
HcmV?d00001


From aebb4a3e21c5c7b9c9c74214c2b42bbcbb7b16a4 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Tue, 2 Jun 2026 09:52:27 -0600
Subject: [PATCH 51/61] chore: gitignore __pycache__ (untrack files
 accidentally added in prev commit)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .gitignore                                      |   5 +++++
 evals/__pycache__/__init__.cpython-314.pyc      | Bin 154 -> 0 bytes
 evals/__pycache__/cli.cpython-314.pyc           | Bin 13163 -> 0 bytes
 evals/lib/__pycache__/__init__.cpython-314.pyc  | Bin 158 -> 0 bytes
 evals/lib/__pycache__/baseline.cpython-314.pyc  | Bin 2953 -> 0 bytes
 evals/lib/__pycache__/compare.cpython-314.pyc   | Bin 2922 -> 0 bytes
 evals/lib/__pycache__/config.cpython-314.pyc    | Bin 2722 -> 0 bytes
 evals/lib/__pycache__/grading.cpython-314.pyc   | Bin 11213 -> 0 bytes
 evals/lib/__pycache__/harness.cpython-314.pyc   | Bin 3801 -> 0 bytes
 evals/lib/__pycache__/models.cpython-314.pyc    | Bin 5475 -> 0 bytes
 evals/lib/__pycache__/replay.cpython-314.pyc    | Bin 2276 -> 0 bytes
 evals/lib/__pycache__/reporting.cpython-314.pyc | Bin 19859 -> 0 bytes
 tests/__pycache__/__init__.cpython-314.pyc      | Bin 154 -> 0 bytes
 tests/lib/__pycache__/__init__.cpython-314.pyc  | Bin 158 -> 0 bytes
 .../test_adapters.cpython-314-pytest-9.0.3.pyc  | Bin 19301 -> 0 bytes
 .../test_baseline.cpython-314-pytest-9.0.3.pyc  | Bin 7215 -> 0 bytes
 ..._cli_resilience.cpython-314-pytest-9.0.3.pyc | Bin 5746 -> 0 bytes
 .../test_compare.cpython-314-pytest-9.0.3.pyc   | Bin 8487 -> 0 bytes
 .../__pycache__/test_compare.cpython-314.pyc    | Bin 3199 -> 0 bytes
 .../test_config.cpython-314-pytest-9.0.3.pyc    | Bin 8231 -> 0 bytes
 .../test_grading.cpython-314-pytest-9.0.3.pyc   | Bin 39560 -> 0 bytes
 .../test_harness.cpython-314-pytest-9.0.3.pyc   | Bin 6910 -> 0 bytes
 .../test_models.cpython-314-pytest-9.0.3.pyc    | Bin 16811 -> 0 bytes
 .../test_replay.cpython-314-pytest-9.0.3.pyc    | Bin 5190 -> 0 bytes
 .../test_reporting.cpython-314-pytest-9.0.3.pyc | Bin 3235 -> 0 bytes
 ...eporting_render.cpython-314-pytest-9.0.3.pyc | Bin 23278 -> 0 bytes
 26 files changed, 5 insertions(+)
 delete mode 100644 evals/__pycache__/__init__.cpython-314.pyc
 delete mode 100644 evals/__pycache__/cli.cpython-314.pyc
 delete mode 100644 evals/lib/__pycache__/__init__.cpython-314.pyc
 delete mode 100644 evals/lib/__pycache__/baseline.cpython-314.pyc
 delete mode 100644 evals/lib/__pycache__/compare.cpython-314.pyc
 delete mode 100644 evals/lib/__pycache__/config.cpython-314.pyc
 delete mode 100644 evals/lib/__pycache__/grading.cpython-314.pyc
 delete mode 100644 evals/lib/__pycache__/harness.cpython-314.pyc
 delete mode 100644 evals/lib/__pycache__/models.cpython-314.pyc
 delete mode 100644 evals/lib/__pycache__/replay.cpython-314.pyc
 delete mode 100644 evals/lib/__pycache__/reporting.cpython-314.pyc
 delete mode 100644 tests/__pycache__/__init__.cpython-314.pyc
 delete mode 100644 tests/lib/__pycache__/__init__.cpython-314.pyc
 delete mode 100644 tests/lib/__pycache__/test_adapters.cpython-314-pytest-9.0.3.pyc
 delete mode 100644 tests/lib/__pycache__/test_baseline.cpython-314-pytest-9.0.3.pyc
 delete mode 100644 tests/lib/__pycache__/test_cli_resilience.cpython-314-pytest-9.0.3.pyc
 delete mode 100644 tests/lib/__pycache__/test_compare.cpython-314-pytest-9.0.3.pyc
 delete mode 100644 tests/lib/__pycache__/test_compare.cpython-314.pyc
 delete mode 100644 tests/lib/__pycache__/test_config.cpython-314-pytest-9.0.3.pyc
 delete mode 100644 tests/lib/__pycache__/test_grading.cpython-314-pytest-9.0.3.pyc
 delete mode 100644 tests/lib/__pycache__/test_harness.cpython-314-pytest-9.0.3.pyc
 delete mode 100644 tests/lib/__pycache__/test_models.cpython-314-pytest-9.0.3.pyc
 delete mode 100644 tests/lib/__pycache__/test_replay.cpython-314-pytest-9.0.3.pyc
 delete mode 100644 tests/lib/__pycache__/test_reporting.cpython-314-pytest-9.0.3.pyc
 delete mode 100644 tests/lib/__pycache__/test_reporting_render.cpython-314-pytest-9.0.3.pyc

diff --git a/.gitignore b/.gitignore
index 61cc84f..f5df676 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,8 @@ docs/superpowers/
 .worktrees/
 .idea/
 *.iml
+
+# Python
+__pycache__/
+*.py[cod]
+.venv/
diff --git a/evals/__pycache__/__init__.cpython-314.pyc b/evals/__pycache__/__init__.cpython-314.pyc
deleted file mode 100644
index 1990f70eedd4cd32a352f0cbd78158fb52cf523d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 154
zcmdPq<K<!ig2^`}vq1D?5CH>>P{wCAAftgHh(Vb_lhJP_LlF~@{~08C%UnOSIJKx)
zKdC4&FC{-uuRO6RMc+9;B~?E$JvFaHw>Ud9C#P6HwJZ@x#>Z#oWtPOp>lIYq;;_lh
cPbtkwwJTx;ngg<_7{vI*%*e=C#0+Es0ESB=fdBvi

diff --git a/evals/__pycache__/cli.cpython-314.pyc b/evals/__pycache__/cli.cpython-314.pyc
deleted file mode 100644
index 238dc5be17a18175947cc896b1ad6df7cd8a1bba..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 13163
zcmd5jZE#c9mG4PU-(R+5<1dV5Fc@LW24iCb27<vx=EKMjBNn3I2um1&kRR_o1<)oL
zXWD6;W~Z1oyO_4SnC|XicGJa7I}@|b4z$fq$gfJ+j>L<ZY`g7r`VRuxw)D@QbDy3i
zj9BP)JG0mJz2}~L?)SO(o{#4`tHnq`IQ54m$N#O8qW%*zN>IxH{ji;;s9}nwMCvfb
z(%rO2C90|rR&}dIHAxwfA*n{xK&tN6dbFa}qZ4(6j_KBW45ER+nr@@VB$^1U?KXQX
zqJ_Y^ZmY*8+6b)gF7OnJb^;r^9iAew$Wts9dlrd{JSAcY!5h0vJ!N8971cnorF&_y
z{1rnr^-4uEj1_}-y`1L8&6$jMLoQZ^SyR0!ixro!<`Rmvlu(fZv5K_<Y$LFfEdaQX
zz}2iBU<ZK<*&=|83A~hD1aJv~m$9m4l&dV>u|FC<8V&`Wq1OTt=eFJ54bIM}$e(p`
zL*b|>Y#?}{-Z?OIk_+&mdM6(m<O4wffIb`yh@l1-Etvw*=#Ur?!$VP_$EA|=SRjT^
zh9o`4@u6twDlIYIfOrh3jgg^1uwOV4jzl1*A?;x!gCVg$5Dai)h=;5$)02ylEJtGD
z2-Jw3JQ?86N@hM34TkuBF>oXjlB}6rW`vS%00uD>2|*K<3{LJ#G8_x=(U2g7gaO=)
z%!W`5z`#=ULni>kl#e>BYWpfh1p&91>ZOm-EX}G|HOn;Ol4*XU0y2r_;w1ro5Y|Kp
zHEiVvV<$sV(Th1=(()lO#z!Tl2j@+}K(t4P_>%!qz%G|cO^y43OlUm92cp5DXv66M
zA8gz<6bv;62B8%<Ss9@bcSL9$h=dzBm~(%BW{pGr{d^g;Qv`oP1w_NtZz&z!`T(Ou
zQQdt^CmE7`2*nhcECLv`@JA-Q0>*-q^(xCiisguf(*)dW-wP9O&78KXXVJpK_;E={
zU^UOB*RncR-<qlM%vKG`d3h#<(a+5HYGTdL?bpIuS=)2_EqIcOErcz1Jl478P3gM|
zyv>JB#ln0j2%L6oC_E4n9vj^53~N0$4MYO5V8}fH+X=+FVCZNd77_U>7;BtyyTd`r
z!0~XH#Qty)ID-j>1ThZnpM(mLxEX-J8TXM`5DvrOw=ZJ&WD7LJh-bMFzYJT^y4^?M
zC`v{_9O6U$A|J!Ur40<=@r#?>?xV3t#2tu<Lp@j3lKxoW^a)`g5S7#cE*y6VVqo9|
z;@ogX18yM{3cA#iIS>r?13wcsmJHtATl>29vU~PPnxk-7MHkIGpliavEBI0Xfg#~=
zog9i{ErDMHD8i620EVeFqj|gMt)9{K<Kmr`{^6b(=2Zo?al-Q9S;TZJn04bO%-1XV
z^)pPR0&jj_Gbv(eyMk#O-;Mb-N`BMC;^Cf8nRR?Q^!&uRMiwKizL<<D|G93Ymr+;M
zyd6k*oB&^p(ISXo&Lxn@XO_&F96J&%!2(<nJbRnzq6Zj_1W6(f1j^<^0YGFRkKs)l
zru_2--uZJgkD{vKJ!9$C%&FukzeXwZQBRx+or2G%$~(f*euIMb(O&YBDtS3oTH;iG
zi&6rVRt3(c^OS0Tp@Q|}a*^Q0HCqs>DP<h2D);TmKayMurLP8>V|*$v?!tmlK<(GW
zR1H*2)wLF+gI+K4;(}0{$p>MV7E?Zr7dc}=fU)GM+4dI{H3&PknA!yl?bGzAcC}EH
zfm%XwT2ABBDlHGbgfxDTOXR?+^S5WFSf)wsM{Xg(>&yT-3EY_TcVQ0Y(j1ZuSWO+K
zl@%hAhPK=8ad0|M5vNCKgERP7DYf$NW$s$Dy4>9J_l7h2j9$0Wel}(G@MFl%!KW>t
zI_a*zq9{&NLHX3)#w@J@!Je#E;B1<I4A0m<2B@2|brp!!172Z21qmr&*X3k0AYj{`
z#<IrN%<17wF#e5CQyjRS_e^E>ku&=@DRmCK41By<$z@aMInP-GY@V~~Gqa{ve77NR
z7S^19MS#~VK9lz)LiR)iB~~cucmhc>8|itDCR|?y(j-t90QL5#siCA0O1kov*qcnO
z!`uCI8PwSMG*;o``kW%xk(=YJ2mUeN$%RoAH9c{!c=zNqk^y~Ir3a874BnUXP*}1Q
z&+Mt<1$wf0UwOLq!zy;Ma))R8>7;j^lkXFMzf#ASBkA}9^Vs~Hke_P|&9HkIxB~wX
z83H{cs6g4Yn(`l2a^7PKlueb9dQW6=3Zxt=UjbXW0+2o<yCnAp$+rvsh*H}dRiJEI
zP5BDQI%Up{(mT1^s^D&G;|d8}-o2i)`|MtnEfxfjQdfeMig%DxkpR7~Eg*ud_w3U+
zWkz{UK{e;-D}?doU+-t_O`iKGPuoj<4xfQt)|z?OxguD}n&+*gujs|O40dRFea@RV
zf5zqauz+2Wx70>n%L2AGxAZX8_8zE0pl`r?mwm^UKE0YxzD{iUG7A4<ppVB?oF+^;
zDSwYrOG^bGVT+utPghO#<y~dK=D7>|^1kMP^(t*=Q?@uiURjS-)c;%_1j%SI)u;Ee
z*)|nuj|uR7N-mo|r$uth9;WuthlA`Q{{f|5pH1O?FGwH@1KfBeeQ+-~F>LAl%VatD
zC)|##8OE8zAxe+XQ`sLoD%R1YQ$}A!Rl=@wwk@_!)$R{vC`^Z`jp|A&OuJl?hUm1Q
ztuUYi4N5vL5*rLhg}5m?<dn4{XRSonqF!_~3@W@zGIoL%%!UNep(LFz#0SFz0PqAM
zpgN{QvjkWwBz@>iSP=V%PPjCZhG<i~4kjiUL80oG6)K4#ijx5qq<&P8K=aXak${Nm
zxuhX_w`2;2gP>G}2M0rZT(!&@w>VeI3O(v{o8l&yaoI9wi06lRqeR6EFeiv$k3fS&
zIAWABLotzyiTz^ejA)G8k>*H1I7W0PQ2GU<L>uGA29Xa8gc`&%VprUe(IB0nGXo(G
zRk;n$$EpoZmx1quY4G^$f@L8V1-((S3de?GU=hg-liz`bwoDE>;rS3Q(`%Xa(g=VG
zNetL%Lh(g7n5Q~3_QcH%^0GD@7lxv!QHH_F5)~x_ns7jEJjun)854{`V%!#r!0vEE
zya;<`9RS-+3{+Bi@}RTA>?K_+dLlY>I!csRmnJ(Kek++0s*z!lN0I5VVF21AjZ3zp
zVEqX7b3;N{48Il<6ht&M2zDk!l!#z*iu50e3=NzJM+f_l!s<is0vL?cgJCgl;6tyu
ziGc?7YgjQ}lo(Iv*6prMl3uahNY*EfH$;<_jHH^Z+9DH?)#p(vlysS;lhlI9^IAly
z2Tp@AK;~b8uR)XwHc7+rU_t}Bvx20RIYeTBNkyrE9D{~P_(%g`RDnk!lW)hGX0~(R
z{@s21`q{mE`+$*81jAra0;587;FzS9J0UB_qd>=R!w7{n$r=oau<P?Yp+}^4;8<wj
zgdiES91MpLc~>&!Fs)?D;a7<{4z@oqy5%ygWIWA>MX)yDmKr)09_4pqFHG0g?%iPC
z+q!QbkGHF&@9&QWPJ*Qh%n}?2);h^XCfXm2o#gtFzj!my@o3rPQFxKmz#=?DpqMxy
z8N$MlY%_z=0J)^WngYKcDU3qw2rL9xUIa-ECxWalOaM+W90AT%EU5yV#GqjU_U0(u
zGa0L*WRfqOeppk1STuN)hq9|rxKPqTTS&rxjf@BG1Ly`APD75XM293@<{Zf5QTGcR
ze-LRI<X-_z2+n*#wwy301K}wDH$a3Bs?ZIy9;W6Dg(-vcj=?!TIAdrS?wDf=M(s09
zQQBlVx9Oct?`|31{!3%gXy1iH=MPOAox|JatYu?0m+CLpPg`q;cg#5!y(eBc^UX8g
zijQ~CmbfRG4~*B1vn3m*9UBw&_TgRW^2$qf->#ds`YzUu_Fg-D?eN6msZ%$_8{#eT
z<2Qf&W^&8^bEig5U9214k+AwcH`(X(mQmB#_Dek%dlD6O6KfJy_Yb#yu=D!PS^bue
z93Pkdr1W<6M-{(jn$y+-=-*mC)_2K&(LZZ-O?1y%Tc((s#v8_2Ysc`8G-G;u&s%%O
zY9@+?_slR2bC#kJ`o5~f&_k!~WfvOGH;flvDZN}e(R^*g)eV!UJ`k^qQ!l4$>#oEv
z$ESkn`qkI=Ufp}E=4027UFo{UYn!ibzOSPk)ekAPW8=uS`+CYzp0ck@+E-3Et{uO6
zeA>Q#WZRs*WQ-nHUol=bPWFA^zwW<X^t19$%F{KqkJPlo1=X}vY57?7rJ9R1V~fV#
znAmsi@YTbK`u6)2y=iHB>GCVC%dRVp<Bb!1qPhu?YgeXM)?Kq*wIyoXAiI7+cGZ0y
zv#1Wn($IA6;MIeZLaNo1Z1qgLyT3Fri+0jbvWM=aG1E(rblx{o*5XXx@kGh0S<9-F
zd0o=HZZbAw-gLi+G8CM%zGEE~X7m+lL&1gM`QWH;tUcjaIcr#%(l;md&6E35EuLhH
zC*kc)w0Ndl*pz<XgBED^Q5&Tx94(n)%I2&Uu!B1<?wnw*nXj5B2dAx@VXrEdU26Y!
z`}=ytO;@y}Dz+pmwoF%S9o{jrE6J3mHRh2wXEl{6#+77T@7G*g^-rrN58UdVS+!$^
z@%)OZ`;1wfwiKK@@Xmod#qGEI(&bC;tD)bA6m4~W?p$$MdnI-`_V=fMtER2EoYwYq
z(?+1sn0{_59uqIUe)09m<_|Vp-;iGH#KEN&uT3moJK>t@y1fH7$Go2&-u@pjhQFNE
zl&6>#NoK{kn5x~FtlfBPO`>+=bZzGh(*+Btw<T=Ol&*S4S3Q0p;oh8DzGY_lmbBTC
zD00u58zu{9%uRE8(>eXu^`nK~Fg;LHmbJhC;sj0E4$!}Qw3;d?dqnAh3Gdn$j|Z<r
zE=Q(nZmznq>UQtX`abDP*Sk~o?aBJ~+so5UYg0`f$)=8(hK_X0x>QRi{3cpD#}8cb
zf6srr^=Iv$w5M0PQY+iwH?gwq@|vlEbWQ7r%vA4>bg4E^vdxod?Y?clz4i9m#L~Tw
zOd1EUz_$D5XRtupUNqW!q3?X(Xjh`V>4(iz^bcB7YqurWZc8+6ztwT;)UAQr_C&#M
zSYJyI{RM^bAMV?5Dvt%U(EMa+LvJhd(_$^e|K8fE>TOpg+fDm7Xzu8?uHWCHnW48f
zKr-8+0o?!2WXgl}?fGU>RC4Fdq*l7@0R_=0SW6*!+Dr=KwrnQ-B`AddKg^_{0OC8f
zAUsnoL__a_0JQc6D98e~C@(gT$rD^KD1t!XRK$g8=FqjtbXz=Djsf!q18V)eYNm+d
z-fFo{Zrq&7c=M=ltR`0l%|Gv43Rat&1NuD5S}p~cPc`1<xucO`a0Wf;4$`oO+<IP=
zCl&-S%B@_0f-I=(DliXgGv;A1J964a%BS5%vqqnmHRh_d-c?W6XH7t5dLC7yY8wq^
zVK5J)Sr}AXEtoLOiuqGn{VK|j2ayDyDvM>+^D416Ot~!RtR#L_9<7rg9HU6QP_WYt
zMI&cH>hIyvWWXbn5jmxg-wfbmLnM4ulx1e~>9BaLUy=H4m_tz}gW!9C+*QKAiVXxX
zf>}`f5sX0j&j`5#2;~_Dk!$&{Kq;zR;xQ0jW$71WJ=E1e)<d;yDDqke#8je_@o4nn
z(elE773*n0m5YUVJm)SQ{|@56j(CP2IxUc<NsIh95J~d_vVBI3lKn3iu?j}yVyUH{
z6r92{hShlmr#VyUSka~Oi{;~eSNxa#Q$;t+Z-6AQ^2*-Jd*RM^-EgIs)n3_rdGl2B
z&CNG9KhmhoyWkCasHHRo_n5_Zt@iiY-i@ZLHA!pDv~@+w>P}kSlcrhg#;M@V$c@OX
zb?5DavsPc?;47(v1IdE}v(|y9#U6ZCbDy*;@$bcd-qH0?O&dJ)-HxuQ-ur6WxPD|S
zJeiQZId*D%$!yi?RMom<)w;B?Vyrh&wR*O4O{%grS=kB?rExvfdO)hDy}dB%mHX&N
zluFY_KZ21vMw>rnN`L=_2Ada9Px)v`OQ+lX=e+MZ=w<%C*U_cugD%DU9+Ic;dmIzU
z{`A8hvXOtBTMQi683P)$G9Or#A6q4XRV$~GxRCQvGpq&-h-NS#Y8B(*Gil+L@}o*6
zw{3WWKIa1sg+jS&!4-%6P6FJLRt3(cfPLY216&iREkOXs9jX~5_*4x!jv>|rE^7;q
z_c6Iu`11gPGZQaT9$le*JUXs;Cq~s6;d9_p$vSQ)z&(5&VoW(VMFY~|((-Vx%6A4R
zd6_RXNDp##a(m#D=~8D8I6n#q0S6AJSw6{RbQlqqL4$3s3M8N8G^2fc?^}CEcl<-m
z@ZK4wR>^lyFn89yil~5s+BH%6kpVF~bM&P-qkXjb!iMu3;MgC!d<af@{f+t=;||cB
zK)G5rTKp+f@+WwWUrYpw<J1*Tv6F6oKtVJLUuQt_^pQXYBS+#323G{0dAb{x2vQY(
z-Nezv9<J_Idl*js(uwKTc(k0Fux%dZ0&lVgm>Uil2Josf-a;A9jqwS7!iSfi%6Om<
z+A}H7*%RMf<ga1z;m0$%-)7{ev%Vmmjf|k+86&`DUJ&MwAslp0n{#9|`Mk^CvIW&4
z%N8oVDQN-K1(tqJ2i{Gs7eByR5WwSy2W&y`>Ad(2$bz7ZPib38tEgR2gdzs)y}i58
zV@DsaPY*(l#=BxcBal_<<?%ProPnjmd#lMOG%3<m@E64TpO?mqx50v-jC;QRPq2O?
z^{jsKuB<~DHLU1+u%agL71l542(oNpu0Y6{edZS#QO1uC7uPfP!~7!q!CCxx^^uUz
zXAS#ddXaVxd_SMJ+Gok@tG49xRj*a!w(kP75pqSJG4D<red=oJKo~qqMe`4Z6?Q(4
zmph+dJJ)NP^<lHc`C9;Q!6N^b%;LaVP<9jj6-^o?2R>zb83o=Sr$PzdA9zU?Q|K)K
zkLUzN&3gz+e}hCBLAMV;@B~;g-}eQ>gJ6ZmkL%FfOrAFUpo-&pR0en#L~)DTE!(gK
zH`?#erp)8A#jW_wln{!9qaiGb8{KXcn+dFOyFp5h8{`feP6qiyPz8D}B;P42^@v6A
zJ6NKU3<^?^7;Go5J<Wq{-5FOoSH^WtVg+=$obm(ucQ^`|F~vtPA|^~n)&eZg2MpN!
zKVxwKBl6s$U=F5Dm@Is2X3ES9M<HqW!N|}N{!Jus$c9zrYfNHXmB923Us$58md6K3
zvS+(fM$Ug93(QKEw5!Lt5J*d$DFS~Ph-9H(gNq?7wBD;><zxx}7&fPlgrY8+JZ&;#
zoUA;rthlcs%Oj}<j*~I)|AI8vu(~R89L(jQSo}4~oeT=7mt<4`C6NV)_puU2<bnM)
zJaEI*XN&{H^?OX+J@`Cq-ky4F#?Uz2p-2i{<LxLXtXA@SCu*jcse?bVA=Z<@CW>k%
znx~2owOJwDJuXi6A!dVu*)d)-$xOaHy?QI6J8~qCwAnh`IcEiPZK`;AvUqu_xGq^-
zH?eEFxb0Kx`VrNAmD*68HW!}T{m$;O@>E%CvaEGVb<=#q{3Gjh*^W=mo(CGrw&6=7
zrD?faP(5CgTDms5bnSFObE=>{QP4ic40oqZw&Cq*o$2j2-hN~F4bX2YFIJ|?+{rR`
zs;u!&S!3E(IKrgui!Qj&yHob%N&E8g6QA0fMmoT}bisArHFjdw4rb7_V-bo-7uvts
zez%}B?WjyS)+8NkCYOESzV7}(<Fun=WcytEOE-7^>&{#J$FKkR^`E{u-R>LNj?Io2
zT`9j@KGApW(A7hU!nSGi`UfV;u@($l1x4pxA9;OL9AWMn7LCTHm>I(++ynjFp0_+H
zrX<OfjHzariaEwG(gi|V+xfQ9Rbw6B6~<e>6CaCA)FzBgpE7HI_o$j!>1adoXXYYM
zj||0+1vIDpw6LLjh3b<PrtWo`Pg-b*|D0;g-$IY`GMt`X|A2yMw1y;4%W$X+5E<?a
zVaop-wFcA_1E?uasx+t{F9^!J%+a7$pftH4%)i%G^4{mAX`uJ0<^DJonpC>rBIeY7
zoEHf`wQ{#}jNhza^Qkr<Eiyheya$@>w}T*o<}rvVEds6f3ECp1{cM^;tIPF0@AAoS
zBdZ4m$SCQRD^b>hQ1byT2xYJsOrCmqkPn5Tul(Y>-+H<66(_$FP;q0%2`$v4*H?C(
z$=|s@OC`cPTGo7UGk9#6$D5G<5Tm0Q@eqM!Cwu>C<?p-YYVSg+fVwHU-sHROBdmrI
zVZ8GI<SIwUXAEny_gVLy+FrbV*xc)<vij1YONYh|jRemff9Lo+{yXji!(E>hmW*iT
ztfgbc)7GjH^_<>z&hmB3m~}ihJ~+YtP(8WrgI(8mO|O0_(YW<i`CQrJv3-{gT|D%i
zszj-4q7n{YN!x!>v|;^24P}?Ld6-o-3{KLZWBd&G8e=@XR%p}W!jU2Hd<F*3x@sg{
ze}8aj0PI6XuoT6JgSekp;T&JZ=qnh7F+vTI_yGC&Co|Vo3g7h~4G+pr{le^nhW|c*
zuP1W5l!MRsX=Sf%j>i?=+eqN7tRG*&9w#wE?v{O$sIHTZ%xz3D(-AuvQGqWZ**Hez
zjXa6a8jL_flZ{m5c_RBrz9%L(1=(Op-wuy<EE3wxCxIBQ2}(Ex5v(yy-=#eFsHLA#
zw$CWjXO!`Cs`egL`YVi=-J@LhsMViSebZFmJ*w^(=9h*IcWsVQ&%1A?Y|E3j<<qv>
xl&v9YYnZH<wY3kMztqCN0(@Bn-zI<Ap#q!xmrKDA^JNjJp1<v);SM81`frvl(~|%I

diff --git a/evals/lib/__pycache__/__init__.cpython-314.pyc b/evals/lib/__pycache__/__init__.cpython-314.pyc
deleted file mode 100644
index 0eec099b1401da63bec7e7e626a7771eec53f33d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 158
zcmdPq<K<!ig2^`}vq1D?5CH>>P{wCAAftgHh(Vb_lhJP_LlF~@{~08C%UVCQIJKx)
zKdC4&FC{-uuRO6RMc+9;B~?E$JvFaHw>Ud9C#P6HwJZ@x=42-6$H!;pWtPOp>lIYq
g;;_lhPbtkwwJTx;ngz107{vI*%*e=C#0+Es04|Co>i_@%

diff --git a/evals/lib/__pycache__/baseline.cpython-314.pyc b/evals/lib/__pycache__/baseline.cpython-314.pyc
deleted file mode 100644
index a9616b2a5c6f6ab1342061726c363072a8402882..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2953
zcmcgu-ES1v6+iQ_-|Jn2NjG*39=ybO$l{fYlR72B5EEi>n{1}ZI<(;FcxP;e3^UU^
zv*5T@v{Lh6L=Z+rVyh%d9w9Fw>O=e1B!7W1G~SGeTB+(o-b|r#Q}v<e-r2RywrSsb
zr2BR5xgY1;bAIPOmFY+V(!*HK+22P1{!9lg62cDmpp2awDB#01DBN*wg7^78+UJi8
z6QVD&zHmG;5%r^epolq8A~~=+{g{$cqBx4tk;VCOC61#6n@cFkJjm&Vu~L;7`%7>7
zXB=A^v>oZlaapQ3=Uk1L9<rQSNs~y`mb5v|v^^h3Gn!{urfpF6kjw=s&9)t1^G(P0
za442E{|vUtal^6{!*z&X;ZOxyC`GZLZrp=C*r`Doruosg0d(Z`!YkYvPT>?@5wM6#
zq$uDmqhIqV8pJA9LJZp{A|1tB*9fA-@T<gL$mnLp|B2_5f`rxSn}C;y&N}3r=6m$;
z{@`HoHP0YkafWEN?$|>YG@=*B9o;BubGY$-?+w$kyrOYl!`d=uimfw-+&6=?sx}oh
zRF!n#hH)(3Zmeps4%@iweOfK;Tcgd5Bx`QH#h4FJ2r5*I7U64$R`@4gjas49n%UBJ
z!fSk$FG7_+mIwG*e`)IuO}R2(qV{N`618$0&G3^?0wiD;xPr@<x&P~lqKkUCkyDA*
zrOLuc(<8R<#1^j*Ysmq0!m()pH#((1iz{@T>BVT6Di5H@WENnmvW3qz`A;DKnL^Lz
zE>p+0jz(YK*okkh7-?Uj2wSAYrqqil^~g?-NJW}bKL${u&HhhAo0lVbe8)a4E^lt>
z$+1ANjSE3cCywhHdLVk*IU^8fHPZ?b#F!(7=OH^XYtCa6tDMp7IYXBtK@4ZAmhWKz
zRBhG7x$_30F%U%1fvZL!c!n>>2sI_4z8Y{O5HN@Wk@|HYI8{Foh^c#2Vqid`MAB4H
z)XmvhLS38r_!rozL1XCYkDZU4JIO;gog3S(j9wgFdgc1bt0%9kSJjo&o|{f>d^vfj
zQT*O&acreHb|-cCc5&>+?kf`)CziD9=2i2$bJbZ%KZ)Gh(dE?PMo({T{9bnZ(t%6!
z@8#?IyYgx_znsl43XN3eov}u)cX9mEM5Ckgo#{qSYW6R`Uf=P4_uY>EMsDXO>pOQl
z`Wm_Zkg@IZ$@;+E4ylpbMHznz*WT}j&OK{Akj~aleDl}ckQ)0YNFDZ4sM-~Id_?>#
zE{{JeK7v$KcZO7hO#lCiRJ;;VqD2wWDn@AR1*s-lsoMFuKrn4TNVm|0vG5ISozQT=
zck&6Li9qHEy-$c%>?Ih^u&S3%gcn%{k=u8lR`nmP(dL%Gn%^B@&8HrlBN<##!TWCz
z?K6FTfSnplaRrur!g*IX)r?1g&DbpSqu*~~vjTaXnTFA88yqf=SqOC&7816zKXlC?
zI%D|0L4wEyhj<2pu%OL&vPklH5J3b2LL)cebdLhp06JdibSbam5JS}s%h$+$oMI}=
z*g@Zmcl{yWUpsOymRubE<@u!@Z@=|rwx^MMygq!bx;VNl${+5<D%6?k(b9z;!E~0`
zKHvI=Vv*_*3Exh1F@-IpXd;t-A-&|-279Q50hVF>s%se=c_vIY1!*(eo&KM1NiE?a
zLZM>okfezNtNP>`ZI%lP8*ZwCa*U{3F;&W_Tn!LTxiai+PU;f1*On)Q7L=B7O*<jh
zF>0-zLz!ME<8%JU9cZ#@BV}WMgy$b3Jo8V$DSAUjB^o+yv>Py0QTJ}6)?Hfnb%m3|
zuL>pVy={ct&Ndbfk-Pohxd#O5WRov9!51}?;1tC+Mp4jFr(WoToiM2Zm~c?o2_`3V
z3yBJTTZYc!-$zaoYPuk5%$pv<Y=q2N&P@3G=sJhL21`YpHFeE5R4Px=TGLhEnD>Lk
z3-cAjrC*{T?pm5pzac?{{V#$DTZ;iv#NA0BJ%X>C4Mciy7!n&eyv=fO#|_-jO+pT$
zd>)H;87mBo?~c^Q|Cq{N^Xo@{wXm8Syqz0d&Fx#s?Q3MWFDCD0wl9rbI=h<Lbvv`G
z{_`&~Pc8CmypVY8Uf0gmuHlug;amN`Eqqe=&Cs2$Ba27ZqA>d0;;UCqUp!qu`oW1`
zpSU4@oV=M_P9L}vAGx1`Y+)_U5cKa4W6;_C5V*u+L3+oUfbBm#`V5~X_dMUtf8L#X
zelYU+UJmP*WbSKfM>!e9R8@B>sv0C!b+(G1Hu{qa|8_Lu@W&Neyp#RFhHyN@Y~?Yd
zvwXyWff<2C8H<t-KH|@zY}GOjkt28-h6s2PR(Lm#`x0LG3i`i-$NvI*{}#pX+4>;D
L=6VL>jqUI+IzeS=

diff --git a/evals/lib/__pycache__/compare.cpython-314.pyc b/evals/lib/__pycache__/compare.cpython-314.pyc
deleted file mode 100644
index b46e281f18bd3c88fa9014db789515624223dca0..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2922
zcma(TTWlN0agUEX@_3~9kRnT#Es_#!Gm^<dk!80woF=kqv5F1mAfMt!$>Gd9K5CkG
z**jW}P@w$ihm0bqoEnfEAgJ;cy9i+WD=yHFq(5!{EZByajnx_``q4kSHemFxvv;JV
z$Oha2c6VlWc6N7mc4kIGK^_4-<NAMbh(qX~l<0sxgAnud5^x$wL>BrGiA;`}a#^lH
zio0^|DUao`ad*x;#agV5dve^A&+^&0H|L+?E#Ai2TwqGDgh3>7qEGaTyciG#F$gUr
z`Z8Xzs~3skUQ`WNT~<)^4WU$I;fz?X#Wkf+j+<q@UR7laS4t%f#|^AsG_3f1#VW^@
znrb6@-HKafEpA?@RIBl-uBe)NG>$bx$5y;rDO%|iLjp>zrdx_t(Q77<p1fj}VdSZ}
zY_9~$Tf(rIN|!WCR#e5XGz=ssX;`fkEaJVWt6KF8;V&we<@0qF#7duCr|`EoiRZk6
zwJh-$>(#2P)GeKGdae4gTu}=Q7?U;%VQ6k@;Uj=Fkc2*T9r-mvD)0u-yG)r88PO%W
zp?O5F$ZlHA4|elX%VnsILjQKba^kg*PTZ!2iO;AiR#C?nu@9yec;Ll<89$@fH0-x!
zi*Q*B(-(%b>UtFi0L22OS*%%gTwCC)71Mg3-t~i<2yV_&eTrq$!zam4GG|Q<o0;<%
zY_HeS^9oiolVFmJQqpSHLHjUfM!Td`%}lj&K2w0#0;8mjkBK15?I+M=8Sespbayic
zT?0KwLrnMy?N(wfI^1yE!|eroXzm?qxq;gpo3O{ndlD@#(2vKDJLAu0o>2NJC_@Em
z7z$o!UC?Y}{|WjAn(I6Qn0>sB8{8bsIUq5cT;CjBZ^Inh!Sd9yJ0O9^+Z+*%(mS<5
z`T^C^n`R8Z#O2%HtRqXuFgnZEU1?NzWkI3AOMIR_%?{Y#rc1WFO!p!ru^pb;O<f4q
zg?20gQeuz1hfsUf7|{F~*YpHJezX@E0fSE?Ly!Vtbdt&bl<uBEMo^4|QMLk9XvUY1
zyhJCTkG{w~_tO0ZB)X;0=1J|-i=NYr=oQeId-hW)G&?SVyC0^m3Se|ezI=~^Z|`SY
z<L&n~!gFwY2WD@9O{q`qfZ4DV7P-Ui_nZCuKA4}r;M`gYip*&L)`I*tyu&pjAmw_c
zR8)%24RqGz_ja(Yk=kDEvtK9j3dwnp^iWCBXT(ltUa6VN|9_h4863@i(CiX<BPI#?
z#MUW0m>77u4~c>l5ra}x4835xWM}QxD0aPIyO&9YrC5gT{Ek8Jzv;>L`v^nu7Y3mj
zd<CVVgfpu3Ql)0jWC?SScxVnF9PJ?iq`Y>}!*D-jM>{*r*#R!)#?*H)4Fx2iRyLCf
z@zCTzIINYhW}3uPgHTCa6_p57t9+>)^t;-7Y%=`zbU`;QM-ujzZHm>3MXg|^Tr=5J
z)Hw@vJxoJBra^|d3-c;m#Li+S3H>m;;UOx*R+W0KP$sM+MMA1(fznP2#561swoujz
z7fiyn4~Zd<LsZYcm3xP<ieYFqmH6nH<hrS*y6ww5W}p#lhlQLJ;4TzPgli`~5^#(y
zs}*b##=x)J`msK564uU=Dwq#)CvoF?jkv+A#5I2jnvM^Es0pD6?b|m@A-)|j{j((4
zP7F@w#cu-7&IW&gnA||mSQO}4I(whr-I#n7>RUN*KQz=h^(gk*HEa3ur<Z@Xu$p_=
zd+@gRVeg^)v59-p!;R@?|DNBEd^YkFF<m+4`pD9|&1h^rntT{dK69gR<R7DN-2Uim
z>yCAI{?4zSQ_kW?*GC$sTDE+PMM9`?;!&`Bk!eO^i|qH^iNzDmXy0=BYI-&D<*qMw
z-8l61%$=Fr*}I9qsDI4f%b&Y<^yl}+-ft#Gzfqf`6MsF|@}gMcDRRZ$V-_btVXS{W
zdSESj;706zbbN90(eC(a<VNgP|IPlp<>tX3J##YyV~g3}yz?Z0dXlXW3Ux1?_+)N9
zm{<!Y7G2F?Z0Y1DAFKzHYr!Otf#9{?<$<dM4+48ulV5SSgqy;H<ao1Z_mylj*mdpj
z^0BMO9s~zh-~MX<t<jsK4~8e2J$tWYANTBD%r<?(Zv$5XO(DFLSRTGQ+!Xp(-fjwe
zR&Z13U3qQQ^ZCGfUwW-CO((Lehdw{OJ}|a6FxC{hmx`?**TX+YoImhn2!(oDV@L=$
zrvLk&qtwnOjkw=NyHE9bz8&Fc|3(P<KeH_KsSx30S=9@&On6x?)}g+jnV&{)_yJIU
zL4n|Wim+WUq=8Ns^jfh}vcC?AY_X}wRca;2_qXxO3a-J|#q^hvZEEZ?&9py-?Od2<
z5)5GyIsWH61`_qEb{wAq9xVjSY3N#RhGD)#GaD$mfp+~1?cYE{8z{AbhFdI@7vG$4
fs*!!KJJraKk2}@KaLTDd-a7f5Rv~sBV{i69@fVTu

diff --git a/evals/lib/__pycache__/config.cpython-314.pyc b/evals/lib/__pycache__/config.cpython-314.pyc
deleted file mode 100644
index a07684f594f1cfa99d3a27318c12dd7ce24e0d61..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2722
zcmb7GO>7&-6`tk(@NY>`KNh9fF_*MsnXpZ|hSJ7z8Z?$AOOd0{>v)wqk{2s-C2d6Q
zu4ji<MYsTg7O2q#uu`OmgrbO#_AN&P>7fa7@F52Zj3asDph(jm3iRMq1_IyuW|wQT
zMlKzIGjHC^yr1vAeUC?yi5P;>a0lN>@(6uLH^G2C9n9|MFgK8l2)c}9W|El*kx+)x
zp-Gmou(Fff1W))jPap#DxXJKDghV_ZKN+2fkyr-J@mVw%A7(tks5I!Wo%3*eQ+lqG
z>(f8RBgs7|i)0~<<nUZ%nDuXxE;%MgfEP`pYO2bRlpH#M^6~oX=Pa!vX=X*bs#T4O
zMs!J&oMoe09daZccv9Ii7mP*eu#K%1n>d9vZKWz5kziZa9p_kiNiQ!ug?Ajw%op;E
z8`n(JA{sF))0u`nSJKE5tkKh&qrYxd^eSw`k|%7`6J73ftZgr|`%~aH&>SKR;2gS~
zB<w-JdXAGrhu}nJf#N)h2TJg$FyZA0colWy7pMomc$akuc6raIdQa7G$lIOT=iG4H
zvwF-stQci7=L)`Gx6yQ7a3iW}YAd>`x-nH<u`0DHrA1Y}Qq!vKlP*<VFt9_ahN+tt
z90Q{R3SjI}RfhsoRy9JfF<&FPqpDa0cXlZR?#bdsN5@Wa9z#4Vv#_e+N^uk*D{70n
zNsf6gI7J#2r&u-Si+*Sd_8Jbudv8ht;q=1vJG2u|{^qrvM5e`McEtW|t{+prcdJYj
zuAOv902*Ltb`Jv4Knl7Xdg-?aRe-k-ooAL9nUO;>D|0e03q=+H4*x4eB}M5Jfys@`
zLvYoKf$MP^Zt0pdZJD|&V4c*k>03wrgwP8M7GBYaL$&8{AAJo#wvhr@G0k(zc4$l+
z=m*rxoMFEA=JoPJZ@u}pzuuYh`JQ8CYEv+%X`WAC0JnkOWKJ<d?XyL|Y9Gp=+2Ag+
zLo?FR_>(QN+1V$uO59U^KxL7_%Bb^2iQ@mv2tM54<L<D)LpFm7$mSF-g)Z{7Pyy9K
z;}ATXpQW~YLyi}b!k=b1G##2sArwXXkuBJ~BEXyQ(>GUEOJ30m-uY|Cn}{7&BJd_!
z3mvC=_91j-qm=mH`*f#718XlcFMWW}qG#z8ghYuhho<(oZHU=XB^KaW*-P=9?FyvH
zjDkccg|poOM~MYeMRbA-=p|;hC%93f0e@zAD6{`B$})GfqfpxM*>oUQi3FN$QHiI}
z_$fd;p(K8+=*oNn4YzXuZsy=-a^~MBA!pl&K(=&vEGP*%+^HA*k0?o)qbK-4>k>ob
zH}!KL!&yMhA2Wgymt#Kzr0F;S@yWo?=8b>~KcS)q%8PZeU9Y@KTz{@ov#Um#7H*#}
z8I{9MUNTIn9vYGAy}*68P-B|xZcMY`CDb5d)x%!7lZM=oQK=sulAaz(SYObwZh{P{
zW-go7su?I6a+#q#>z4tVfSy>dp5>EnbWC~m+y(W_r0jCEWw=pCThP@iv>P{yb*-Wj
z{Z|Cjo{FER=I~{!W)e51Xw{lNhOvcR!LcyWD=tU7i_6ou4wQ49xa^`%yt0z-^7Yd&
zyBsY;pokWKT8-Rz8!8oRt1gSJRZQ)48QWzI5XBfS5Af-Z4~Sq?R4SY)y8_WLsBmaj
zA37v;4!>r_s4=Ri3BL%Jyo&V;NRS59-QMlE)Qn3XtToQ;r276SHYOkR^nY0RL*Zur
zmU%Dt@^<#@R`zTwJK5@aZM&z`>?z%yZ%o}UoNOgWZZF?--n+KdmD?zP*_FG!+&J4z
zj(j^PwPM-L^!lqehTj|BOgBcGvF!bUy)E(hCcFNVkJdKVZgv0Z+AVD>Q@nBh`uR<^
zG1e51|1B&&NDegjzWA~J@ywmmf4TZs{gYqcv6_R^t>hcc$Q%Dibl<r2`%4>x%@;=Q
z4UTLNjBO2!wFb_%5)*d^?j_{L_=7}hz4zU>8slFldapA({pk<&_w|i)w{l<gAG<!j
z)3xWTt^*JHrJergH?*7FpBFcN**r9T_v+oHrZUs)dux|v(wCTri0Qt>?D8mG_?|_{
zzDFVw(;FA>jDHy)ZE~ZJ|C2#oryl=3mVNAs$DIK{^V6Qbu{8JT(Y~>L+-E|1EX#eC
z6JX9W)A?>Utg02O?04gZ8gv-iHt5%iY0H3q4lM}UBc`7emMAM?uT?Y?S}&&M+)a5c
zv;cPs(9QL#<E0^O^9zR8+QV3Ptm;+hV73OiPjH5cqD9^dwwL36H>GiRBQN`Z3SPml
zz?OD4rv%eOmSLFt=)yNB_ce-tg9g5%^ZtLL{I_Ciec;{cZ85hc=33&xZSlyKcw}2V
Zu_c~pL>~zN+j{;Hg5?MCY5Vgu{ug)=Z#Dn`

diff --git a/evals/lib/__pycache__/grading.cpython-314.pyc b/evals/lib/__pycache__/grading.cpython-314.pyc
deleted file mode 100644
index d45e90940f5720b7715dda0efd3aa186a6017c6b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 11213
zcmd5iYj9IndiU!6ep<F=8!Th6EnBvEV+@!W;}^l;TpNLdK_jGVTS1n@xiSRrw$Ub;
zsY$jA&SqkTY=h}`1~c7EoM}37W;zqMe>R=|k&v=d?=nN$Zl|;V%tFetKX&?^b9E)#
z$grK=&h(6Q&bjCDopZkPo$qx{lR>YcARKvI_1cjNiuw=yAqI(%p&xw!nHa@TQR+O!
z(EW6uBr2)Kc}c%CDuq<qFYA*><$a2%qE8uB_Nk((K6O;xr-^F%v{7v}#mLGiMqWk*
zO;H`AfV`6A^^6MgYLYiF8pvx&-pJ@6uP1pEV}QJo<jsr;@@A5^Fp^rzX_<bv7kT}G
z(0IEeg2FzQ;~c&T)_0L}G(^HE%K9B+C_L$iPOuJN&^zU4n|)zF>zMGO5VUo=91#{Z
zL(j=bRO9tGPlSCA_Vozsi+ZDhaENm`E=>8y*{Fl_g%OP6a(F|20xdd$ShksicC6zv
zi~Ip!w8csDT5l*6&UHTFRPyRUFXC7~GZo@>gJhVlaA+(r&TEbkz(X(=ukR*9vi>eS
zbzbQf;PDpdftN&z8J3$0MtM#5Wp7Z(`)Jr0{Kr87e<^%E+6tK%<)+R{+TNxpKa^Eb
zuh0`TLo*Ua%E%ZwBWaZa?i9bqJIpHteEm)n3~<r2cpc|>wSW;%z|Tv$DB=}}jZUG^
zT?$sH7z?9GZ<NCWZ{gcpPjM{bS}!0#OgPkX#f$u{U4Y_N?>HNZHggvP!64Vl!lb#@
zVBkXQxUgL<k*mDU<H=!*^>|PTz|+B>tAQv+J)xAe@ga`ludU!@afl>Kuad$?(1`No
zNEGKO8T?_f@<%bq#Hbc3B5_kDs%H;HMWk-&pol1_6$3->&Xq=FZdrY<*ssPYJ6*CL
zzQ+mrUfxPkZpk3z*9W&$giuYrypN*P)MjW2$dyd@cM6o~@J1rR0LwW7V~#)+D35c5
z!eXH(8g?{{2g4V<L8l{(93k%{w2V#!I84q!l#YPEh1cXjKx?OjmjXTV3MgVjey1E^
zoFM~5yfi!&<!K+UjCv8Y;W#;z*3~&24rImkW&+0Ny%1zQ!lt85P)?AJ7aOC#)#zef
zztmf2=&Z#$w{3P?T$i$Jn^8V6md2gGsA$RRjMtA{J9gvJj|LtpDMQJMiqaWl$B+$r
zuc5}=X3&ou1n$Pv*ou)QM_OpcOVLyaVI(6Hj*K`mL*%591rmYc$WjQgkf;<_Y=#I^
z3{7{@4@rDu{|ZUxt-^O<hR~FR{0t-TiMd-b0%?JBG`O0`a-C1oP>vz23n%gfHK<%y
z1)sZfMX+L;ryWiiFMBN<2=V$0UT(tU3r|kM8RB>?Y-409>WQ+iM|pWLe1%1TfokYX
zFi<CKrrI;%y$bBdcM;V=6~RL*WMWiSYq;Katu0Zq|AsG9TAwVfhwOs&)9Sm`kV$FV
zGuoGu+Luz=BMI4&0%FAT@xNgt3c^T~Wc$brh3z9GsfWpsjr?6X;VjScvU4xoT@XnW
zw}q_g3DrQuW<nJ2U%|=3G)WR2Zc-{NWc6NQngVK~A03Bt;I+o6QE`c|tKzW@TVdym
z9qgj}3%icdvo5$+eT#4hfr+dS;{Hb@z?`IR+C+6rdb)u<0rMHevt1W3$KIV=QAEB$
z8)*wAwlPts6jK0uO40-8<<i*m=5RvtGB8}st|H2aVguN+HLz8yV9)%4u+<UOC?*6F
z+}2$*(E?O%r73?J7=<|hh-O_|tR<-2>Y{D6ZaGtdxt1SJS{3EiGKynVMAuCZuu!9F
zI|DVq(V?a|Zq<xN9QS!^iRe$?+YfX@Ah!WzErI-_=RnpKAV&<Nm@|_wh?&~DPy<?y
z+aS&(VssnwLI`*o%-uK!N1%!t#r&HD#^9DQMi)L9xuByMliL8Xd8b^Yz6siG%tLYr
zkOnc6UKh?6_|<dhb%16R5tG}LhX^_&%*`}p#h4K^48}4bb!%V`=!kCMZvanLf~U7h
z%i=pwvWB|MIqHIyM$B&W^Y#R27uG^(*8*)V`FZsbTwzv7!U6gf*)hrwoU@9mrU>=f
z2t`>66x9Q~O6;E#_HYt6Q?Q;AJOWcXV5-@muZZmfv<SfuoQ8~*qsD=1kuC=+1njJ8
z3-XP$Rj=(W^Z=Y&B05l8?BZ$)>1d^xUr!5e-3HWN#*~XxcKUA_`#!80KoD0@4V>s@
zz#dUwKsTHM{eDUkQ3S-CDxWh4C}$<AfzG@vdNsoGrkvpNu&=YeDbToiB^MYEfxgG<
zy`gBp6Y)l)EDCW%K_J6Igjwz`%$(aGbCBKw8x*64sUASeCG(JEmF@-G;ig}wVsg;1
zKpP;NLe)}xqyhRaeL@f+QeM)sou@foB??6j4^Zcb<%H!yP)0$oM;-h&P%=EATVcIA
zrv<ef42FF$-;S1ivs&oQl|l5|U&Fi-6^Cz+&zlz~KHGF_$ITr-9e?^r1$A$!D(D-U
z3i`yKeq{)Y^4!|ccF6Oz7wv>}&AJd^kV06jY{gZD`IaBJ-5tNQ*l_pUE%QzDPrIKY
z(1IymI~D-d&wGIb&79W?TA~M+@f2_Mg+u-THhR24Q5WIW`NC;+eZ$%F_2-(L_0yI4
z(n8amu3OJb0{&@3eY+#y#!)}Lxn5LzIk2IEZ1bh5FgDFDvRAK!k$;%Ch%^*I>==vK
zkdMVS2+kwcUmU$y>aHRj;9nx)<cI|bXEm`DL22Z9C<Sev;xOM8NT90Yv;Kuk^9>)L
z`@8<9oD9m}GMCUjyozJN{0l^{@=CCMLa;tiXAeO&uZVa#F#XUG{Gk)hgl8-~74m~Z
zj}HdPJEesa6k^B0?7YnKHnLHG!YJ&NCtqyLX@+8f(?E1Tgf%2ykNa?*D=-)6#p{Gm
z7WNe8unk(Ea@<S?=OD^U5X&n#){A@-yvFSfPO;qxg%K|wN8zamFZG75BFuRZrgLYB
zpxa@!j#qnwK|(VeuSCQo<#;)<CLqBK0qwy3@a`atLSBy7%JG^U0ww}cUPYD<Idl}a
zPzaRB>+>|@!A2)N$t!adf;us{!#E;`08|&jumRg<Rd`7_?UrYcI64PCh&DL_3e|68
z)O}-l-1_dH$NI7+>zry<6}M*WjY)gs!ceC9P_p^ZH}*p*Q%A;hG-*1TGWEpzvki^0
z-d~x^X5?9uZB9L_j@Ko&>`s~XKtWy8oxttDqIzj(x~?N-Ih?HPm^aVuo89-pe#p)!
z6PCl-t<F3BxBC~uODMhdNXpce+<GK_X|8Ox>;pSwXXFV}*L|BkjX6!K;_z}w%GNWZ
zxo<5?<FlM9Kd>~MvL2aHeXBJkN`}+g;jC@bd`ZH#?VX;i-g09yt*_16D&|IJN8*3B
za3N*ek+JPd+V-Vv`)7Kxw#uZ<MH(hbo6`EGY*o#z{tx@-FD~v(RqdS_xNow|sozt_
zT?_h@Y41$u17p>^`LFHwOC8zL9gEwRWPjhZxCsoZ!{yoXZ8zRX)E!-s(zYJ@kyKLF
zORrE8b1(fwK{dDB@Xn3Tj>o+}el1g4mn^MImo_Z!p6PwCdD}w$QhB<ub9VS^P2$z_
ziN5pK@VpbHjcI*j*0tl){=5B)7ngUYT|KGt-lVH%{!-k0tL($FkL~a=)1N5sePAqq
zV6Xbfn#LzFRn>9Jddqq<|CO?LWb8u;`_OXR*U?1R(5&KHd(HfC!tR8et$IG1ur+0D
zrxLbPKR>ltzBHWN(U}+=p6U5kWxQ^<W=NMgv&NDe!+-g``Mn|@rx*68Y<sfy+Q&*+
zx#kHaQ)?d{qAcYrJv61W#!fu_O&3jBU!tG#rmm;IsiW+DG>7fff80}ktW5DwwdKdE
z6hF7vkJTs$0|Qlwgg@TC;#Go+iuF%hiD|XQeRNK+o*Q7W7ur`~d{7*gb=Z2gVEYOp
zY+pf=*l|=y`@N+2Spy5Z4nfSyAN?NIR@}T2Or23Ij7f0k6;ANXd`nSR;54Fcx4f7!
z%|fp#3affhU&QQU-MGGZ-SgB(H_<L4ieYInqpM(oFcP;6)N1KY>9B&4xfOXe4Gfx%
z#s#$vOq_jKJU}R@S1-R#R*^RbgnAs039FphsQyp562(qwVHJ(wlr4Pu#WfX~Az72M
z$eRAWpfEaB(?1-{ds5f9P+J@V=WkuD3_84sbzA_o>mmru1A_7Z-bM$Ad=A%XK(vH|
zmqACka8+>Lg4kh^!#nN`gt(~a>una?Si~s{o<?wq`5hCWKeU{fR*Prk?6m4!yD$Y_
zB@XG75gAS74-wD^ABAbfxpq9uXKQ<))~O=WmRGZ(sYw=r<Cj;F@v)m!A&ejh+PviC
zb*mTcR0?v`1HgzJLqm|`=>RW<$?{SVh@28ih|5lN+7V&q^@C#?6m0C625BCEM&#)J
z8)RbCV+Cch=cE~yQui&D@xEVJwp~+z9}k>C2d^Exr>*^0T?zhZzt(<FTa&fe<{D=k
zGnOq$%a)X7>w+q2**?>i)mg8fxOO64*_^d)j$exV=F1b-#>HLB6%U$P;zPGyz4_|m
z(Z_O0Rr`Na68(V{1!XX2^qZ3UO=-OoU<&XumWHIIA!TV?m`Gao7K2)vur@3XeO|WQ
z{-AYdd{?Hj>0V{ik_<!cz>p6=v{BZ&l?uXV|1Vrryh8BnBROD@FwAK+VWpsWyx1-E
zDs;-bsWo)TLkglBffbxARFGszO$wi7ns|fZ$kUI053Dcerltp>aD8C=37esc)wzrI
zXnP;5@fOmnsI!{EUd#2vz&9h&1Dh)j6W8sO6&M+`$}zXqfY=c{>L^|t3B|pmB9!7V
z$n3#P0oGc4_Od?|K4#j*$Iz``3s!H}1@@I3-iy=~targzfN#NIq#`ZJYCs9za5Zwk
zQIUSRxfiWjw-4mzZtdh*U??ymHCUu>gSi)J`9eBU6d!?6=IE#hOkB5S{@CxY0ygan
zb5YL}=XXp?TXv02OC24KX~P<aIDCs9@$f}9#5tz*SFjPnvDjXKPif2cF{c8UaSoJV
zxzqw`cxez!A^n=}yl!=Gr&Qp{7jbLV1;l!RrE`LnU6;Ez(Ft5W$1xDJBn;pX(R#8K
zRbWIY^?&ik58k+^+%nJI`Tp(iFOGc?{w$oX@6M`B*NxYVap!#3o#VHUFIp3|dlrY5
zO24Z9vifUlqHQ2iej=?p3H>w5%A~ULp0fUDU5nDp_Ri$?&XnuOvOCi~lI$KyyGBG%
zEkCnjBc&tR+>vq~PBgr<EM4x&96ggfdM42|k|=*Qtvb6Jp4d3k+?8nVTCQ2XoasHA
z>^+-yoy~(>ux6a?NoRYa;lR?arOTPaL&?KKiH_k!`Kh$ZO`ugIl@<4tbqie?*TJOg
zV8+#%aCI)zc!Fn=-DlEGXGCbt3(`zOTe6{T$(pD?v^2C_n&}))b`B>_ok_eTPEc!&
zoqU$DW7a~LqXROr=Uig0x;7Fnz#5<f;p4nC5Q@G~@g0Ze9Co8&#aBx2d5EKgwP^*X
zi&FAgm>#qvA+NgMgwNFqFGH2VWD8zJk<&%!!kq&{Z_q|kP>;`3=lyQ_ywQclQZBew
zohG;BynLfN(Z7c;AB%fTFy-bQZOzzUQcN1u=f)NMx-Ww_6*wHos`Y`Pu!lG2Gj&qb
zdC^G+t}Ehqp>s|;3GzbU6L%#AZ42_-p$y?2;4tjmQ6mm5-tNJ-?Q9XZ+yriq4>s}2
zS(p#U;^t5LKIwaZ$CCN0iZ3gcr<40%ncMN+j*rHkKEm#bH?8~Vn}&V#Cz^fq|0kq4
zC8!IgcLIT3K%uJ;J%a``4W(GqD=gSN_@O&af7<p*+xs<3^jDfMHOqly+sV0__i8?J
zKSkez#`u1io0{aMlit^Xw_zL*^poLo&chsAQwV5AZ$KMz9y%d|JtLMR#%#*Mq>N0g
z@4msD!~>_ygDJxxXqQ^U%%vN(an=0JztVqW*|wlutocOyjb&F#yE`G<T{tzs9!bC^
z1v%8%QTe<>4KEWEjFM3?YWQmyEu&-ftxBM0gK$5r5U#|L4B!;IZRAyvpf`%2Cy={i
zUQQk#@LGSs?}?(o_&AGr*@bX8C|Jm94CY1m%ahQE6A<%++D0oL;^>BJ1t;?(Bw5ms
zBwJFpB8AedqgF_F?iJDl-4AiOfEYjo2{;8`w}6<>;VVX+9xQeo+xp-_1PffD-<mjo
zRADLDA{D>KfuKMg6UU-Pc0+%RwRNF5)M5FsVwGhG7y?*%`@5LZGGJU)zlvRn!?nO`
zj8za}URA*8256!@VG>;zfZ-IeL!VV(@CE6<T>n~X6z?SoZn|(`cG^*F<s!2b(}>co
z90gB<5Kfy0^Os0s8Cdf<Cg=iNSqu^7DE6w7kh^ZG!mEQPPf}p_tFAdWyl9dQm<hX-
z59r}Ka9yvwAW^wh;_mv_??@4hui23z%49?hSI`=_dey!aX+^H^4&wCyrBsOPETk+k
zHb(Iy5)uG!MOuk?gYh&Qt^q&cSQjGdKZb%wZVoF)YebN4rHDzOtv?yX!4oP!jDXr9
za_Z2VIR6$z@LmconZPSx;3mRT!94dtZ{td1ju*i#3*2t_qP#NqevCH?DyBzNLwUp6
ziz?t~STG{Yl)#Y>cPPbWS_JCzg5LEaLElIfPL|&!Q16B!AGlHOdIk1=wU|d&V9@7?
zGrk<Z{3<5YG1|BWh=@?PGygpNHGOO1!-=1`G8J8kimrtnpSIs^U#dtpb)_r1{$=<n
zhe`F#%Dwc*<$LMj9;X>y!($VEj&EL&5+YtU*f})B%X&Kdj|&Ks!(|d+9Nz?xcTdEP
zf`~Kb6<mf*0EBfUc(eyyJwwM&o+iT4sYO4)b=5e+#8cR5vZLr7+)0M_3bruk)bfmZ
z;XcNX-o@=NLBzp*6u5Tkwr<@@t}KMR3b=3qte`m55mD0zQWM@`m2Jt?9Z15j>_F_~
ziiXl#GrAi1C3H1+F5WlV!TZ0s_ly0X?ax-$WGc5OE4MFQ%vLo$*t+Ze-nrxN9sh^2
zud2SR%2sV&Ko9H<v$YG}rP{AtU%Ig0$2seqYrWALM~{sv6UZ)=TJyMs(ru3W(zUx6
z-CvD-Ig+aFO=<gLN3$|j8Y;LuCvTrj>2`l3+Z{XlU#0dR`;vH@Qg&v}H|KjV|CK6b
zVkaMLZvNr;_24_f&-Z<C@Uw%l-fsm9$?$&toNLyVR&Ji(d*{&YL-%SA-nUlIOYfY!
zeJ*9)HKTf<wcan?B)nLf>HU?~mNnZKE`NIU?$vDBrc9YDS>{@5%9ht>O=Zhl?;F5d
z|88)8=+5Zv(X7FIJ#Z~>Jv<YR?|iIR>J;!)P%cPZr<GTFJbrMWd3cS-Gd2Ye8S!&N
z%*@dMM7)VSPHch4i7k`xa52aUkC?E$6x>5t_7R#Qw_yUEVV);kz7cu~;7?3DgkZ>a
z64inTePO3n2Lbw2knKP}fp)OX6o-9yaFasQ4<$6+P5&pg<(HKDiA+jMuu&qxR*2*~
l9lU+~t^_}A`%VSVp1yO_;GF-i5nN5*Re-VbL?`MN{{w;oiO>K5

diff --git a/evals/lib/__pycache__/harness.cpython-314.pyc b/evals/lib/__pycache__/harness.cpython-314.pyc
deleted file mode 100644
index 15f3b2012aeb47b55a9db4e8056fec107412f9ce..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3801
zcmbtXO>7&-6`ox#|CTf*N0w#9X}z`t39K!$c2qfVETM)ZRElbWT{o<XSl+J4mALI{
zmz^D2w%P-W7D(M5oLhX<QNAfR_m<vrB%vA-rWTN(2$0@nD1m{V`rhoW%qXsnqz~Ym
zH-9^CzV|ba=X1wY0`$?7{$J0>3HcXxB7#1WbnpmxljwwzTSS*;r5TyYXHb@BV@!cI
zHXENwu!I;>W|K21mJ)J&Ha(*<^$b}|oF$8yQAvD|l=_3VlaGd-((9e0f&8K|mg^;F
ziJs^qdU7!}8VgpkE<FV!=|19gvu<4;AbEBDiy4!;mhWqI>M^h4Iof&6teSOZQH@$F
zw$JF@f_BY49Q4*)U$bso^e(H}?utemF0)sy5x;I#>}9)hMXh;j8uK*AY`B%0#%h*k
zu2?R+poJ54llqosGhc&`TjuHo&2+2UTFr*lzP;j_j<4Bn)vlP#!tJX68{2WTWt&>n
zf;Rp9=ql<WKA}}S*Ke#^zBsOg2UPWj3x~iZScAHLAusWa>AD^>nGK8Qp|6xoR)aQG
z3hxeb`fzil6YCH{>EITOHpwDk5|~`Rm1D6%F#DpS%R?}u#{ghO036rjAcF}2?1Y{K
z8NdfQi3ZSBA*b{V$XOw$^<zxcb6}J%o(zq`V{kp5syilI_ULK_T_ki9qJRuqNe3DT
zO;RSe<hTArNEOE3AU~68k}m189@CW~82VxS-!d%YaRDVwp_~Yi`A^ybBfkd+Ej}aw
zKmt!PnUCk3<5!CB`WE$zOB8J6xrH^8R*REf)hY^`j)?F1Me8;gt?1ZG#aiG_h5B8d
zHH;8&%P{zf@J@x!(QY`ze+^iZ>?oJp%H@romU4NsAEW~qB^A^<BGd?Omp({<fFTF>
zlIAIX<DtwI-*T4eG3fZXX=sSzcX_7b*hc78yeHD3V^FA|z|to9Xp;7V@ZDji8Q3lq
z%1iV($N`E%6WW7wCxD5G=+YByWnwedQYN<2AO+{Y;?lG)I)$FXGdi#Pn(f~9AaKCN
z4{!Z?bn9=<2~~p2|Lx8H^YZyP&4M~SM$I+hVKj$&VxoK{B#6eLAB?q?u?>J^Y!e_E
z`<9U8Wr_)up2aVAB9j$q#|ZMpb8qY%6<UO@@B%c@?L@>4g7C74)8l%g7=uVoMu;@r
z00(r>=h$#%$@837f`g9Zqs=i$rrXN&M!uy?Z%qPwGzHxNA<*oQUJxnIohBa(5W;*F
z#0zd6iUKBAwU||52BY@MibeU!$a}s%E|dze`uo5)JqP_<VD)1VUV@dZL*_)h^rW8B
z)4Hl>^sIgi+8q44^zNbxI35phOu!qpT4w=(2qomOhHKbWo~b*H727qcHuY&2&cs(C
zNjd8(*jxL(Ht)F>R~^r+8bJme#x?4ydB<34R97s<<I6D1z#=BKczU_vIEL9^-nZ~!
zyt2OltVx~`IX>|e*@GMV*lhNQW~(BaPtu~<?H|~W!Q}469|i5?jA%dY?ca~X_@_fZ
z2-@OXPq81M92On~+n3u}4eWdY779Dl5tItZipYM5>>{!^kztYHW5ENiFf=rX6r|8Z
z@RIXf<OxAv+!wWIlFzP;3+03>&*w8dWf-oxifLap46sDQK^Y!|-!@Dqoar(Qc+!35
z*skSzFbxA8542%Wtaj-4kqscrBO3$8dku(VNOBd!WZ<Vu4QBbU9#eUs2SzeYMhx?i
z|2<&ulil3$KVIKcF7GK5d&=0JGW|>jWQD1Mow(_{KLBA4q-i`FU)jKageIun|9TcM
znlNqocxCjTq}2Bm*@J<7Y&M2PFYo}gdPMkT5odQ`v`KD8<M29Oz{e=0^4EI0)Hzp|
z%2Ekaz!5@!KpGAjvI3yKL2gF=2L*1KWTE0IOIdPBiOScSbQANWpb{c*gi0M+MN_&`
zCMAsLBZPTjG<*!<47n^ty}JC0Q!ob{p|Tw5SdW$E5}M-(mCy`Fs2q!Sogw8|37!53
zVV>vdT2S{`{<<2L&rkxb_q*O<;S6<^7FBZE@Ope$GnvLy;cA0RBU~~dcGA=3Yd^hV
zOwH;%Vcmh+%;##^bQ;z)r5@#Jd$sOSh?*s?G??x1KCHZrWk`(>PY`EL`2O=Gwcxyp
z#k<1@6l-ubPhcv9<estaz}{e*C$CLiEB$<0&nv=zxl)Jn80sqA5Q8<o0yiJhG%c*G
zAZ;3<_Ed){;lLk)&?HYXCqDSy<IK5@g~yqZt?c8>)#lW0rtALO_ujrg{+ID5xl^sv
zqgxAG?>w?Tp89yK)&I_RZmyM@gMIfW?oI4uF19llw`LwK?2KP;k6-_!(z;RJ9{)w_
z>O$+%FI(>!+nHZAr=F-i_Y3z5J8HhI<~Iko)napUS4q7;_lLP1<#by){b2GRing1{
z;jt~{q==sUah^1cs#ghe({cm;9YAiP=w>{3h;0F(r9Z*JB*Fl2Q;ZRwV0Y{Aw}&DY
zyj#@01(+-VK9=K)S01yUu!Mmo@IB|Y#S>6xz<(e*1>1tegoqu<SUhz#FvSGLK*l)t
z=YZ|UBuRQI%kb~=s|1k-{z-cG$f-SY`g1b$S+@IwQ-7V`$)0Ox&uwQ1ce3Z(+4DQu
bOYQ8XX8OxSN}Bj{?Mnj9vy+fW1P1>FhhaSh

diff --git a/evals/lib/__pycache__/models.cpython-314.pyc b/evals/lib/__pycache__/models.cpython-314.pyc
deleted file mode 100644
index 90d5bc9f676b4bca2947cbc37217ca203c28377c..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 5475
zcma)AO>7&-72YM6|D>p2%W`Cik{qjQZLvulG)`PM{*7$Qw6jiCqzScHkxOag6<3)V
z+Om=ZQ3s6<J?NNAfgE~hZtbb<sn;HDFo4-X1Egq+Ah!e}TGXe0Z<drOg+aQI-oE!{
zcXsBz@BIva(3eSSxGuKF-}xNeKS=0Y(SXr+le(tWHB%GXvS#Yn^o5WJO%flv7G5xf
zq4@B%$U;;^6>nUNEyP89At4eANs*k?Mzkr-jErh#v=~1V4(1gprHw%w@6x7~HUVw2
zOPf*J6twALU(q;YwCBhweFplzuBZEzHVbWkmv%sD2cRA7(he%^5VXTx+99PKf%Zt3
zcGw)9(dLeBZQp)SwyVM|<;u3Ob0xnjm|YS)SMga+tT{R7p6%uM11_BP`JA(f(I3xZ
zY{f0l=GN`I4ySi2?rKsz*zkF7&1U7C=iBAe!d=JPjQNb`+$!F;%Q?T2<F4mah4<jJ
zyIS>`Q_fXf$1C&sIbEjgYSkCEaQ!NWXk4k*)^U$5y24?$Hy4(Pi#B&|`enzH$xG_9
zm)(+(*>zG`SgDI$_^j0NJKfv%)pu7Q)U~1}bof8CoD|^`TG4<vbu;uM;DRm@F~cHi
z8pW6yISCn39G;Guv9^>@QoJoC)sqP&B}K{%!=@>jyjUx*I^xcTQ<BlZR-foHqKsE~
zny$b+BKLl-;8z_PFZo<pHC}$2bgO&X%!<v|tdhUJZdc1(X8fAis0mB>cbzJi>HD_l
zS=@o)%ls2<E-mAhRkhb0%aTdUf-!0y@oCF?w`P0ok*sA^T*igxR-LMESuBeE$#h`4
zz=y1lRCut<5qPj4S~fsrkj6(Wi<6g1mMsKxS8BrHcxnh^yZG1S+5Atr!+3s$VfB7>
z{ys2~zXTWM?N#{xG{5V59?ui9c;0hY@@frye&Yd4;C)3XmiZZw54FDz9shXY)1lM#
z#pdy;`ogx?Fs7Qh6ZM-rFE@-6UEI{O^~If2PmE{TGo5LABViN6T}T-GzmY(=i0&bQ
zB!!D&5-5tBi4Ka)B%%!w?)B3S=mIv<BCYU|k(n!-fQE3&m)4xpT}E4FDUia`Bo}NS
z88m9SWf|p&72A_(3RG(YR(DucM(hgi6enc95&^gkVl4z7sY%}*2UFLUwB^uCA81+`
zx^eAweN89G=XHQJ{3Sgrqs$RCR^7_Z9$4C}aw<MSC|<V(rw>dsnw7xSMhQXf*n#)v
zyT``xG$(M>wf|_5$ne)hKJwK~IZ7=Z<LV+%Vws8E9WZt6?ar2vvq!O!+qB$67qrG9
z6+~!hu0z)g;eM^-lQZqPH}oZaS%2w143s7Vy-grgm#KH2S<{y^m@9N(tyrh_0-5=H
zV5YT6toIG=YFyLa9Wb@)`leQoOlmja-h?)R_0MQ8hFyJ5F9cQ%$<X`{rM}5&ODA)I
zRTJkBA<pIoJ1<WkGO4qPyzD)&kTvGdd437ObAEpB@ne{T4}*M(RXiHC9_N>TJN5DK
zuQQGO@_#wm=NJ8h`qtG0ljhcIT;#YTkl5AaIZRGTy3~eVE{}Xo*aOoIzPX4D^V&Mg
zVO};xWEw;!mu#=*T%nvl7m@~dyoyX<dStR%RUFu8+UnZRH{jDXb`%ry1(3S-&*9O>
zqt@??KV*KN`J?#rdw+WGFPEPz79Zm;zuXwL8pGChqL~{0NdMLOzYUImrRfu|>bsgg
zbU_E7xuAcZ9okL>$$<blmtkZvHbUeGkx?Q98%CcEwnzBg-y@5W_#zQ9xpHMNBSi(9
z=n%@3`~t{<{PoNv^4B{JW3oARync22dc!#0oSZ?{`hLTh`8&d5_S498D}(^q4QtUE
z#Rap|#3lUiUF1?+`tJLnUCC_f+mcxkWlg272)H1L#lW>o8Msavi<x0aDfMg`<u3!N
zkED4`B+M+Nel<2Al4cq~F(}iwnZLdvf{MJK&wvrZWC$6|Sn+*N_VG2p=9MiJjn!3$
zZJliw_P1t%c+V}lV)j&S)?MG{QI-VCCW;u1RGn3noqNt~QD)uhJ%j+Y3Kkj<M@Bc)
z5~L<0=ph_l*c#Zc!kv^FT~UR3YshoCc<VqY6lENXP+2($&(wEdL&4%zScQ?7*+n9~
z5b+WX5TP*nCL-dghp#n<k2Z{>&5=UGC^XaAhc}u7L-i}$FFd?O#qH{&^l_?lCr;KE
z9`#K<F-``Epg9j9LWcsEzDu##g@{ukZ0f==L!IVEpt*tHqB6GRurf7J#)i%LR)Xkh
z?)mqnQL$Z58KN*3?_~*T+`e$<j!Y1>?kwK=nKZ6mxVG5S1edWMM&uO!IW-c$)S9V*
z`sHT-GxclWKDZogiJm#ICCae4^xfZpc5Uf-0y*Gln0hf{hT7naDk<ESVoGX5UUhMi
zb{7{1j1zb-sbHKU7(>dmr4;(_h?#9mY4qI@bD%9{)RTisiike+f)R5_Ny8#*reM?&
zn8aq>DVw$G(}cnqf}p5+D4D6C`Q_Gq=AwjGWjYX@GWy#x`w6zl&VtBPr`-eezkotr
zGNSrM)uI(x7P_D%%MeG|hE0N?%cSfplR-f*p)-&vF3QkaqT|xJ{$3sp+B^m8n>1a4
z$Soq&74G+c%p~y+kvE9+0{smdAo4u^c?IM{tvNY`mhCJxZ~NRt4#{t>VdR>lWAz(5
zLk(lBd1MSp(jgA$Z#Q{CYu7NIZ5|y*XrS>Mr)Kc_PLk#e-hE<>1u&uy_l6NIy$eP^
zAs7KBhz;e$Ft|>RG&(sFQ4@zg$c`}_4oK=i2Mr*co|c2ifF*}>)zO&_uX#fGlD&*C
zvR6Qmp%7h8d23=X>#d-Q-*v02l-tlFSBXU);?6w>2zgKlJem$FuvI|~Ez6`T!-@~`
zRL|&S%8hT*w|`EgmyeccfCvGP)BfQjm5$Q=4;Pz#ry9no{o>TN_7h_&ura;Z+r}Sb
zq|3&y9kMZ5TZO2y`(bNGFd7`Shzn{{qGN9ae8fyaO4Di~MNylKpf;f{8GCgphGVg@
z7npJNr4i-fBM5zU@Ri9cIDT|_O2#)lTTq`XBkF*Hrnpu``>njk&f$wPRd)BPm&~@y
z7@8Z#P&ibElQH;?QYu@d7qUUYu~wL4qo?9qM^#4nCRmEH4+V@6WU1bZ%P@|ESP3R~
z*lo<kUMHeLksKxsDs$}&o$MLU3IpULR^EGcmAGDLc$)@@yoP^H-US-^1|HsQrurUU
z52{j7rUphD#z=Eyi~=+>+%SflQ#18jJ8z<cY384+7ak1*InM>J?T2#zVEyv;%!h>l
zJ@lai=rN$crSH;67kZYyvzwt_rF~Qf*FKzq3s4jXx8K)Lf==U?1IL5sH+&}cdzBzs
zi}*TQ29XhUCY4b-gS%p@Kj;|tAXAhvrlOn&F8HK(7)EJ6)k-R7^fJ6lj}Rf7x5KdO
z#Jlf;eLrN5KQWF6hNKsJ8}d-9v%b@i`)wg{-)=tq98(ddfB&%UUZ#y)29eqOu0W?J
ztadEm2MZ`V($@4|0`xEwB|qk@!kS$5csPL|2}Yetg<^`@^a5L_sfi6XJx-k8Ar=@u
z19G6#R*7t6oa)4&f@*sHi7_9%Pp=kGsbiK^_DjK!CY2fvz&Ob=I*m!g!B0;tOl%bG
zB@#0ugd#_@e=TDXh`~ug9SG>i7&QN?-#bep*+LaTm1R|MDMtrOKe!zkKL^0!XPfhE
z6R%M#$G1Vg3hTQ5bx7AQ>HpHEKhs7&(~f<iz4BOlr4^0nCwInL8g8xrv_8CZt)=1C
znuzPOJ7-%OZmlC>eQ>+9bEc(%X{D3;v7JOq!>x5JqECOM-kzqnhiE3;S`$%yf_U6o
QgAqNy?fx5I$ScGB56L<7;{X5v

diff --git a/evals/lib/__pycache__/replay.cpython-314.pyc b/evals/lib/__pycache__/replay.cpython-314.pyc
deleted file mode 100644
index d1665b9a2016a4e5733b6f84632c5adc3e260203..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2276
zcmZ`)&2JM&6rb_>D{IHZ<};Aug`~u$jSWpGRE?4%L`X_S(<L}9idZe)iM`3j>&|TW
z$e~)bz4TO6Rc)Y$M&bge=E#vhA(2}VjS{J9%Yhq+R?u7D%sK|r_DTEp&6_tf@4er=
zH*cicRRnZ<qjP2`jL`35ut}jnz)xpD@R5!@bP4Hr43CDqP*%vHF=aIDg{7>FMMk4u
zRLbG8*l65~OF1&uHmZ86l%r#b(WIBmB0aVn>G9pj>G0b1Hjq^*r}RWO$|vW)g9CL^
zp{7MNQ)A{8Vrd>V%S5|sdsVIGYG#GhJgsaxj`o}Xt>#sUwm_)cS9Y1F*&d;$XS+4c
zaou`Boq*Y3`LykjnmI=_=8-wgW^!Sjx^s0{WobE5s1&s7)xIOg3vxkWhPgFI>l+G`
z`9dD^gjuV(p4^|oFg$L0RrsrpYgz`IwH*gW5$U|Jt`N^KEwc_j39HjrOh+ed-to#9
zq7<JX$URbE_*4VIM<sMAbmB{dEYNkKk8l<1SP$un9xjF;Ly<=U8?H(pjXF3$!4UB%
zCEh%(afNyGE`sBt(=MGeJtmGh$lobmV1%;bWop(ew^q1nQmc5%wMbE>vrjtDiUeF_
zMaRBeq@?bc*9-OQJZTusu!&*N4%jXUAKMGeN6!$(`6q%cO*|IAJ7qa~3bROtDFi{n
zOBv<}2qMgJq1{5`!fFfYXrP&@OXy8;YLg%tnBzh|#KQtt9$|II_VSpDYc0b*M7M#+
zcNhW8hHV+s0C<qKT`(mB=>ow=Yl)p-4}UfMAfc@$w59Eh#Oq7uFNxerI7ic9@Cssa
zP{>ena_RpgrbqO!9@S&TFgP2hZD20F3m}Wgq*LJpk3+USff!8)qnKdrz#yCy`~nnZ
zI1&NL1QaZA;TdYjho1<x7=A2%cNF<sX7YD$LC(F11Xdz(kAz1ul8caHGw!3Q&07fo
zZKl1BrdqI+LM^RQQpO`<qgM%l72a>5f#$fiIy@!LmQ=zmZDdM>A81?WyYVTJia;Ch
z$2x+J5ido?MP{~82iiPI#{<nSR*#mV)7wQgO$jI^>7n!SB+``<1|EJZvgtt;LsVe9
zl_pQNc8g8z+?<`XzzcDaw$S8(QcRB=j|OVMKKidx=oc0L0T>M8gD49ZZ~~2VBJ^qc
zzsll#=)}heRRD!9bQVr8;cLi`WYO7u0>b^!pS#g9#m0Gj4r(G~TX}_t3P(7u^S0LA
zX9C1SnjaKJ+aR=2a;;8>_+A?H+rpsZLR)1+h1T3|Si(esJbwf(u4IO9&Mu~Znz%c$
z{K;zH(Dz5bIr_uw-%QlQ7nvS>OYOnu781~-n|-x+A=#_hHBDZ0t(W88yh0&LME3yW
z2@5LHqjseN4KgW<f{L8+WZk5U7|_ayImg?8gB`i%(Xd!65)L&FV;&0*Vq7hm&OA9y
zsY`jvA|5Gwn>$l+NqMwfCFNO`Zwm@goG(4$v1W(n3Y~}cS)Q)&Xs|2o7DicMOx)E&
zJw2MB_ksk*npvYon3JV=8-(XRT2Jr1G1EvN@ZVogbuQ)_sc!$wnwox~?q5~+FD^XD
z_OE998`)zG^=<#uAL@=97aHn5|I~U%&(g(4#}R*Qy*>SNW_Ypu<Ls^3MrQc_jz;^4
zKe85U|NOJ{)Skt6m##0(E_dA{_a^V3T<IKXq|U6w&-|L{e2~elW^#A7H!=hM+4V%n
zja{(E%*~nQ6Kk*K{#0=CFm&k1Vc7-F<+t&eVOVb2Ft}<M)AR6pi2g2Lmza20;1vk^
zREK;O3NS8|-P*KWkudP|3$=Lh>{=zr^h^4xNo$0$0MwVZnS+PNVL_J@Xv!SQekYv^
z5*!}{vw4RM(c@qu`V#8}_E^CfKM7&{A%2JsZlFC6QSS!o-az>c)caQyo}g!OXn=B~
Fe*hd?`Xc}U

diff --git a/evals/lib/__pycache__/reporting.cpython-314.pyc b/evals/lib/__pycache__/reporting.cpython-314.pyc
deleted file mode 100644
index be353e00764fdd2e06d0b4f47ef7fef896fcdb0f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 19859
zcmc(HYjjiBndmusKP}7h8yg?_EgRcBjj;*f_yOi&%&{Yy#6crlvJn_doFidyX`^P^
z#BMUzqDf=XCXF(s6Vo&oai?8)*6mE8)86Jz=iZScjKv9ENz>jntN)-RlOb!?y7&9`
z(UD~%AZgc{yD|Ihv-dvx?DOsK`}X(Rs?Rd&C`jjp(qpj-iuxViNGVq=(04ziDJn>@
zl%G06v2+i;N9LE6VOiEA_shvu;a9*_-lN>3@~cRjqDQ?)<JXX~vPZi|=hu<4sz<-a
z;5Y0s`i*<C{8?qxGa4(kIr&b%>6xrPMw-C#l~qH1`SQI~Vl_}-Dcvn!S#6WtZ((%>
z6ss?w#<KmntO3eKQqE(upll-Ld^VdkvpKAV&1Li0eE44g_l0Z`Tg<LuOQ2i|b!*u*
zO*(%8TL#ZD<heq4&I-?!!*dnzt|GP)+NwxfF<TAg8d6^KOo^3hq}Z(o>CFH|Oxe^E
z)Dv_--b4jt-D--OkWmyzAI=_<?bA|}mRd^%<UZO*`Agv)weSubyrT}@v6iieao3S?
z%h(1eH<B{LHbHqkDO=eMP~J$&<!lSP3GO!I-5Bk!V7I_sGij@2Wfk!LvgErwlx^Fj
ztv_($gp>2Qd8VG>Ji|vB&OPpObDr^0X2i=e?o-Y&h6{}IjW$})JIBYpey88#9p`)D
zUftmhjQic3SRCiQV{Rxa2b@DtkPEsF_t+Tg_IWwKpzDHxST`RS^9yRbn{#=F{ec{S
z%+sTs+dclwN3%0bUweQ5(@oC=3bFbOJnNl+`Vgsa=-+?vFg9jmBj<KO%Zp5Rd(VET
z97ZT7Ax&wZEQi~>wNMCBcIt?%<?9sXg1Qpw5Pg(pX;#L{;lF}avMN^Hq^O{5nm@pb
zP=b;_?im{s<h-8~G`=yXA7F4oP;+GB{Abx?9^U`-@~1o_s5rMjz>NzE{Gf{ze7*`F
zar$|jPr0zY=?UJ=@l8XVbKK<}Z=7&)uBHyJ%iZK0b&vZS$ou#v1T)_><{4_@2)qFN
zjlMI2!Qn`P-0g61c`%|G{_{4-g4Cy!iB{dgENr=rmpM7P#G1Rw_1(>+@h)_xhT-c|
z$pR$;p*i^GAT=mK4ye6=vdgkllqq4aPmZ-uC<3xZ#NTF$I@#HW@LH8>lJ_F4_vKTR
zUD2ml)d_B;exqbf_SKPos{8x(%J#L0)ZtV56klu($*~_fhvZy8&m2@f#UN0tm{|G&
zuu`v>=Q;wbN5epz=R;tc4l965V-hYJfQv?gr>g>SJZ(K?Fh|Y(>p7^GzQV4N)_7go
zyARN;+^1y~c8XQnX`~UmoYkZ~C6k_NlCfI5lGWK&tbWj#8dWW|vj)4CHKxr7xNs&6
zYtke+lgy{^h&*al$z~!{F1-S0y9ud$KNQNTGQi^js=FAVZSIr$q+KNlEn~$I`ZN_(
zmQnjkL`br91+_))p)bn*fb`INkrqsT&NDjd<{ZObj&lQf%LG~DdV%J6e0w#+;btjl
zw*w)&Py4tX0y7lwjJX^<VX0>tw~u*;fvk5lrg|X5<B`4n0p=Ssx1K*7rk85BE!J+E
zwSLR`-NX0pBKLaEQb~VDS4salADoFiSA8&*{9w8lK1_W|<jH~(I8ngOmEug(K9ggh
zg0n(~T!;sB6~7_3`QqrDdG^V3PtH1Cb-X=#?=Ir{bL9`3J<#*=ase;Tyj;nd0=RH%
zWRo-HD&b*R7bOo1@~|luKtCQ~O~7odK7T4~of-Wvqu)K1oL*ru-OC{naEP-)wzSaB
z;o$)196lqclS?ir{jlMU38oQPNVmi1<vo7SDK{@wjJro+JH(2tBme<fU{B-)vybx*
zyLsN>JT>Yd6BY`{SdJk;i{rrH9V1T9SOB(`73+-PBZwV_LJF~WJqS|SR5!2!pUf`H
zwBi!lCz%|;1^VF|Rwb5d`4UAfvzQcdtQS{c-{UKZ#3mCo(r5I-PM;-x?C??d@Nr&H
z`J6oOb_p8Z4Pf#3&)DSLT6iZ{2AMz)^SB*VGa>~oqZUEB$Dj$<?KUN=X}wvnHe_9@
zZMk0C64{Wj<j%HyyX9j!Wz0%2*0*$T>gJ9uFfE^A#W&j*YFnn;U!J(fn_z_J%G-7H
z#Vp8vikVFfdwSr6ki{SM27pA93ohtgp5+@A$T<#=i^ItY@-g=~hr6v!w~Q%V7F2Km
z@Z^`sMtp*7WL!`s7hN^VIbHDGa#(a8h3x8T_)3yQW?@{xA-FE+LUwcu@SGrZE6coq
zOd)P=jc2t6ci$|jh--V#?+%?@s&8AYZ+ofsD*Y?tFO4tNPFZJaU#dO7JNS4^+nd&{
zX6s_j)(3REWz1bba*gLU$BoUw?wdtS+_?LEL8$$$13x(Mk~8XDYVKZa?w)Z@InNga
zyJN=PA8gzCQpExiP~2D@*H!<hVs7N^`Ww0}R|kR}A8JgYys$2^C!r}0KRKt47W}kg
ze)PYui<Rt7Xo|ucV`b~3<q1vUYn9<s-(450ZJX~;Xsj1D-AOL&T{$f9T}oEjdRwnC
zsBTjlrRv@v^_1=)eNWK#-uuY7n@1gC*3zX^Trxp+O`eT;k)=zgxVFIr`OlTvm^WCu
zOp1S0nlLX?LB2vFM1+SmrSCFO2tHUMR5XEnY*TW$aN<fMCUGcn3W{FuxSK<sVxu_=
zTtxYftSS-ANCx{a#jk-3r86y^cL%draouga&iTnLhrDwI$s;2o^ZPNug0$TdG%r@m
z%C;g#Lh{k94hSB$9UyqRFIg+LiMc(sGsDgXFGBWd2koQ#6?jH&muDjWl}v(_!|t!}
zfo$BTP9HfrzPt~2s8vbY?b~Q+ERmIzQLM@b5=380-dKJR=3-dBlS+2QAd(nK&;vog
zDy@S(UP^<w5F|lg+>BQxX<i_qfCSTrfM1nZjgi_Y+nkg=0cNj?FdM{8nxsBl&Q-~t
z((B+0L2j_=yL&In1uZa_35VM?>K5cf-qV8G=XAM%(Fbsok@TF`UPdhj0U~NKug${!
zXR>un0GIQl;41=MIVo@4Gy;MMzLsF4O(n>BItBHBD3EZt27<C5w?RSmcmUq$7Ze>{
z-fzp|l-Q<&H^WXe7C14^fq(<}d(7(@7feH>uVaXFJAqa6f*SjA-U)68c9M4<9N?;<
z#NpC#^;oAFhG886-i3Q2QSk(&57a}yO+{!#p#6dz-XyYi1u|AaCBBA7u!{Uo<bDQG
zw{-YXLF9&@wF3V0bC3b^(-_0b$mVGNbxljK{e4sSlsuu)&KRZ*q5d11qC0XbdpG?%
zW9}54D6d#5-vocL@=epv&X-Op66U<wqH{&z4NGg*Ev#7=H#aPqn`7qYXx@|x2hmPz
zL)O{abG6~l3l$OPTb?&PKWO}MMbvp^^z!K6uDhyA<mQE(v&YUIi&$cLRk57vf0NVN
z+B<40EAL|+r8K;7;A;m$)*Fi4W$HyK5!0W-T?7TdoS)PYbrLnlX9$<!cEfm&WHL?A
ziAO3xCZj_p16jEEHeTiw<ZgME(2XRdWk4|H-G5rmV^AB`vGOz?qyH0m%o`$)VHN2#
z3Yvp5{hk}JD+du~B=u#``cM>c0sWPL_V|dN6Ovxfstz<fa$M~F`NqW=0JKUdRt?-x
z<5K|-OKE=6+(DR<v)c4;8^oQ1B>ReQPOD!D8`b6{B!Kh0ndCevuGojW&#JV7^Wbh}
znCw7QYGuCcajjz}JpQAOq#!EZc?15H6~GmE|8g7NeWkIxm+OHy35N6m1QXt+4Y0yq
z^ZYqx%roLgW`<l?WM<F54CW&+=<qz$3-n`x48*B}&}>t1hu|`4JS+~$5WB~u4sXSB
z0F{mZ{fbRP7#L@VJCV&P(8$9*iT6)oHi+3XklECn3(H3^b3i7@0Tco~A;>1Yg3{$4
z^E-LO`=kga@}R6F4;uEK@Hs(?@InVdQ9p!2kox^9F4THI7i#;R$pW?6mQe3vc15H=
zW@}xnYMa`au+{%);?miRXRq7ZuF}6a{HwuhgV%Q*TF9<U)UErm`ikMQ;d<TntL9&f
z{%Y*n*!6aHA-gK}<lt>NrOo;<$)qYmu5a63>yPBTe)z)hob?AIbM70O&3Cj^c5Vhf
zBq&hM7W6~FbdUOpA?-6QRyFWoI}=o*4NXv>q5;4G!Bm@E9w;hDJg4EJ9zN<EABBQq
z2=qQd!8=d51?^aJz%#(h-F`6f3CamC$Ga!B<K71Gi4`Iik}(hreYcW8@u5aHS(7{^
zW(f+=gj{615tL)jA@`Wb2p=VAb;4_S+^LWU%IR+U4#`7c0mHj+hj@*2i1i{rL=2Ns
zk{{A{zX>&91C#2hj8YY-pqrCV6Uhtr*i{Lwic+7xOzm=3A+`H}C)#NPNJn-#<g{G|
zxq71-z@p*43<C>v!=$XCL7*9dc1+5wPfwOTQ+w3!_wlVwO`a2@jr>uMd(6c*db~|T
zPT2mN1lk6An8ERQd*|*hN5{ea2U$UmVgcAL!HzlX1`2!@M&^Pzyb?BSFNcdg;^q}l
zgKCOMA4)PF_aamhARmANY|e(lB}2)Ap(H#IX^R`0f?WxH_RM3`kA*E^S6pAQq_1Dl
z*UxoCEph#pU}wT;3Mr>1V)<L5uBd%}!~7HTyRUAD>APZzF0vK@lwdx3?Hk0lOXQ=E
zW}hcF#Lz1?zy1<2@UB2KoGd{!hRo}Mh_*tON3D~?6&GcBIY=*$+-32*Q@gB+Hr}Bi
z3&Ac6mx!F)-M`Bsj*!`X4+^Mh^ucv?%A}|#6>#9Bu<KC|lhQ@W`Upr)Utn~a?5Si&
zJ3?20MI10q#`$4p90wU_=SWLj4Q41VUZjaQM|7NO5ihhTCOW{1eVhwH4+5TMC<Lhw
z)tVR1eC^B&&wlOMaP@UHGlwQE_>0wUyJ^Z!q_x&xS64yRO}$|mbbpo@B|9PVjxQiR
z;l`3$S0zeMkcIni<0Udc?v^J9eMw4jix1&;1-D4&DPmtLBlS~~F5;u@tF$E4AO`Ub
zB!RsZy%P#a9&4i=bU+r6J5pA8V(iHzjilazS_%H!3@eUF)KVC#p28KzGz#-Bp*-LV
zDuq`3hqGvtmr@?urW7o@dj|GA@wlUZpsUZ(|HOd<?d+4i0_~hs2K*xpE#T$QxW|XR
zsN6trnHLlZ_xLFf=N;!>fYAko*XJIGEdxwH;0WTrMe2M`|50ui6jxY(gj~|gF$$7E
zU`Vo^a<4%@vQ*tr0RE#l&a_Uq&g_`p5i4$r>(>W6KhT?QYO-!=jZ>~rN2F#+TeqmK
zn;S}KbTgW1O(^Fj1K5Rh^-vFg?`i8k<59A@M&-1ZD6f_1+Z(i^#`W4a#Ov(y;`Q~H
z{v?JYBgO*-QP%|J3GncYiCPruu#e<6!_X;(l`1+P`AdEnb~Ct$+$R5iti)XgR#<r#
zl>0|wr4?~PW~U^aKs?Ndsd5QZC0ucV912{CUdB;2)rVEkV3io4@7MeXfJ51olP`6k
zoHgucL}ipAdweV7;=De_$@n>tOaO@(m(%ZNc)#-m)Q-EL1%#ErxXU^2*VPP<IRh?t
zLk;8O+#s_wGE&V^r{{Q}VROTlhK=htZrQMY{e~K*VFy!_s;_BdYP_4oO1L&oY8nPz
zUv6Ial}dq{R42#nwJ8J{=+R&w91?Ue2hPu<;LGg<r~;QnRxZ#d(v9J;&w{GY`5odi
z{{VWB1#Bnl@Ue>0>St8nR84(3{CGqe*Hj1Fkqe&v+SyRgl6viX>a_`j@ulIA?Zxpa
zZ_Kb3G+Mo3rgf?{q>L%@me*Ngjj3G)S@VPJS4}{_Kx)Kxh045$e2<c&#-JyU@;k~=
zGI|5p=I9M!3%M*$1lmbTA{>1;2)B=7gMe^A!6}g^jqSO>&I@j;l#m37m|}n8i!eWu
zGO|brX{R>9#&rZn#jA~;RMrl<>TMv|fbK*ID&FnnhL3W8g*VDk&KvN-@)1~EfT~ps
z2zFl{m+-jz42PmWfy5Rlpd~^#qy45fR2%M&Ybt{6ACxgKb%f|y^*MEzep&b8!MXgH
zetk@_KH`*0%dnCbO#wpzaegtQ<4Qs0^DW^?{I+Ftz)my(><sNibHL6B7+c}8JUi%1
zQfhWlnD2uGmHUaB%_tskP7b<aYXe>Fk)n*Kj8mD604e4JFa$N61~pvfqYtbHp(ld@
zGS!=47AwzaaLo9mgj+B+1OyPKXiR=i+64mzBuwth)UIYVsdkw0>a+}@L13n0T2|$c
zbhiKpcYz}S+3ZS_0;^jmJpk7VY5>Vx&qd(ovn=A2YLJ{#(!J!9`b^fwOy)^$8iu%L
z7*qv8*`v=2H0TUE?k{1W*4{sK!KF88f&3erMo>VUhZ`JiV9xUqiI79L3<q`*iaU(i
z6lO^diCBw)5KbNpgvhKU*QkNGxEx{&cLiDqvR;5fkV<5kW=5t*X2zz*!m=f!b-`$j
z8!Lj{ALz{qQ}&XHfxnoEN!aQxm0l{HD_sD)-1hy`-3epXl5s8k#f)ncc}3`Y44+&o
zTfb1YKAyK>y8FYtbyLvEGIM15NVp?ni5shyj13FMhB;TXBW`S2GVWL~?uZ+A1-oyR
zS4Bp?UmulSFW)%T9IE)Mws78}er-&#V_pT>GTHpuL_KL<4jN(3GQ^oc1Ot>w9w{As
z6*{Hd3Oc$KiMlmrsYDuYoda$Sm*vSqXOfcKT6woTN#i3)TKXcY15yMlBkBVf=7DP?
z=OBrdrJKHBOb8kv{QxU~VilQqdS>dCXfOl6QTk+QZ<MFMk*Lq0IB%4*>hyXLrkCfa
zNpHa?(t;RL;xdi_RHdD4KRE7YAa;lIoMwh0{-~94pYR^@tYds*0iFSI1tNLeAetbe
zGw?U=9dAH$3j;<nuqV0s#*Y#Ge}0AY!G)+++$AW1q7C+#PjEpPu<$cLaC@QLOEfUB
zqo<TFLGS46u^$|8^mH8T6$y=y9Fh)f?u5|jQ-XZr=$Vy5wBF(Kobvh|IBV`2^d>0t
z4JZ)(?71^HO_rn~e8x0wA_B=&_t(FC%Tye0UNn`*H02<oOzpd_E=XkM%y_0f5SUPC
zTPU=}v+9<zHpj9y&y7sUZ{?JRk1yub#mx0{TcVbzGG=Oy@{6Xnn5OMkR#CWmG0Pe=
zmPe|7WQZPm+Z3y5iRrh+6x)`OQ=+6VEYo~`YC+Ql2~a?prk$p(cSs)P#j7|MAca&y
zmJIjGyJ66(oW;&H&Jt&-b1g(a0_r!)z;Cv!H`Ng)E$@@QBQ*#Gk&qH1gEpneXh%x*
z02_JQlYp{_8ziB(4kNABW26;q>kw(BlG>K95LcC98{8y2E|cO}0uVpLDk1hywH09v
z2|RI17+V#Pktjqfj6Gsk9idi{h&qi#R7dFaR_XhYU?6g_LbO=Jn^fN7Cd#{}iSnt5
zx6ui2Q@|XNyRAy#Tj+&Wja}2f26}w%Z-c%m&*1Wqc1<Q9;U)!A6stMx{{qj*@R@ym
z;y3LZ7O$)pB3yMx=q8zWZQ2^C?Q1vDB+68i245}s7&GC8fi-5thGG~4;-SVgxGs{2
zAi+2tKUh=xoz$ZY{0|x5m!0MpRZ6{KC6~cQtxMq>z{#B6BK3#SwXjwoK4Yw^x71tm
zfUj6%FR_<ylCwDv`Vyb+0rRx<mjX-9?Jt4cXxBk*+L($Q1CPiqaSa~ubY9v_Gs7W2
ztp`1l`w+@4Snk;+J5UU3Uf84CqxV77vd>TkvAjMb;&m6@y+AOV&gd0>5H@GR+syQv
z;t#$odzQFe@7oTBkc=Y(ywYhk<A~CCA)GT)5YV#HXG+6uy_GuLn-XS0pkAF;V=7?D
zMiEPV**>$~%vNuMN0Pxrz+3IcOjtUIn;A)Vct(03%(KXDg1p$CjWd(x57+>&CD9BI
z;3`jOlF_Mv%qOS%nB?GT7fDG-&MY5hrpXjvzN#N^e7hzs;xY$#S5{`u_MbKf3(O%$
znrAZVvr76@b`8W>=G@Ow(zrs71lN^OpGPQPuHEv$aZ6xa%R|StJaF7RyTMA?bM1N3
zO8fHd`D_ir6cE=+fVTEs{v5ahPe~(4xUw-tHDx%;brDYnGV%nwG4*wTAt0ufvTHN)
zW@GA0B(TDD<*p4XX_?duxTP#L@>d?hX_)jU`W`}EKvke<A@B7S_zDM?SJw)wP<a0;
z_=+CDC(G>x_wDf&P`6@d2Uud=3XvY_D(yv(=cZv!p)`M}uiB`W?tnhQLra1+zo6tp
z&LgD9`l?d#znb)tt=4IP%@rkd=P0;3A@0)cVu;kska!YuT)}_T%}gAHa|lT>zEP)R
z!9C7^rx&Dm-p@0&#Pi(baoO6EHc7En3ifXxHa%!MG34dj7>tu4x&Y&We!8S@UDONU
zjd(@@rZIGeaWcdtn5G>_;hl{PoPj_W9M9nNz>0aw;|=gSpPOqyd0m`7&+uN--absu
zSKy%wCK*Z0z!$kt0pY<`3g1S)w^28_X1leaL3E!t;0b~TnDq%bRlqbf?3m22s9^f>
zGy{XGC(HrQUhulfEJv>wW=ZZE9Xjs60b;KRM?>uiOoH@XkkdeY7=&C*83_gKap@xB
zkzt8E!8Nx6#n5I4ekBmFz^W>k9DV!S2f7ck2OJ04&Mubt4ZyD;^PNcdI%&3#IZq6^
zoI8ZFq~G4<8HIIBi>*#}2*bGiTF8Rbg{GzQEeqvaqC@lMpSb7Qc=@iY<yQ~=vTDJ)
zhe-brLo7$T-*eD?K+qg@P8{clo#PO;>GP}r1xiT-6i1-2oeom{bf&!wK6ND!v=R^+
z=|MR>O44e8NVId1-lD3YxL-miuvfkS7h;eEAjx<REYi4!#PKHDR)WMI$GwaF{wHLA
z4gdLC$o@S@&F#I^d$D(Z<Gl0lw_LTx*LB9~pZ>+!*nmBD_)CiePsVpY^<Mqc1O$`X
zvxFeP81b1LvK60=Q{IFafDXRxbSerN1Q{RQ521(YE1rzi2L0^5SZ&d;kqrI)6^tcl
z!42RbK7geDn*16D8v!fv=#k^TC6)s0FF-H80J484f!#G-*A(;RbJ}S6CF2i`|ERb}
zT-kq_UqpW<W6n-0&NehOa8c;K;^WYBMn3K=6t>G#AGZNNZaMa{UzQQw>6I&Or&l6*
zzoK8hNg=JQbWMT?-bz3OCOkZ80E>;p&<odVa1|*)DaJ_?V5!Eu-s1ruL~cM74Mq&|
zh>z7wFqj544)#0hEo^Lsi~!~~MHkoItgH<WL8$ba>d44j$KE^^Esq}hN!5JLeEZ+$
zT~%J?V;lCxtM@MzABYtnc)zkX3A=6E0d{8@4z^;7i$E6eLgv`-h9z*dGU%}<29KX#
zVbEcpiZx-d0{r#J^N5KfX(SOjq{#R@5I?xWYJ$uUSnOZ?{K{W2kAX48k30`pQlJgr
z30!K($-BWl1RVbqjEg~oWNZwxM5mE+2u_>@z;Tr1W-$2J8F(Hj2aaQnj0k!XckdvP
zK*Z81(5)+Q0d`jr7h0f@!URC&FMKS412K_$z^Msbp?`q?%SRIOY;T^7SZ7CH9sT~v
zdmO4?|DUGyvlY|IufHgZY?z&Rb>eNAIIT%@UEL`@wGgd!bqt5D|H(Cu^lDf<urFgA
z5fAX6EZ0?}HXjTUV`|Z%#=sMJ1!I{_x8i$K{H7CzPAvln=qJtW3dYKkK!&6M55BF`
zIfcR`X~7;tNJ}oL+#@4!W=Bxq1bPW4ddTu1BPAAA!dMF1-|PsVocq!{ZS&O&^}F6M
zf5-g&j(a>>T%I#A^s73Co|M(qeWHL0#QcC73IH0k?FCA(63#=M@PV{}F9-PmL)bog
z{#$s!Q3rOsQ_zK*!0a?+K>x^!I1n2tn?+DLec(KI5$hlLw7^H@c9CN$#1qN=0K1|_
zDV|CI{am!e3ppqO@QrYWuu-BLPQcdpF?#_rK?&~o6FiCS5$*Zgu?eZoRt7OaNREP5
zdYI>ai_gemxj1Akf&zFq&k@d~!B`O3L3v`5Bw=_6=y7lv0QVjlA@3BF=tSTlunfpp
zko%pZ0)3oAj1%ZlK{hnX;c{d1sZnkmirla9Ud~Ox0TK~#TpX*HFiT*DOpuE~#-sMQ
ze7GWc#v*mug-MKCF|OEDxKH6ZVj%p#fOP)>r%bXagDqCq5z}=BJ3i27Cp7vQ-Lwvj
z>(@0^36o{!%=DSC<;AZ?6pN<Hn5Odgdh_L~cuvdZs<}h6d(Z8S&~K^VRL3o~m#Y?X
zT5jg=`B+XH_R_)5-|p&+XOj&oEDLXVT^X7PJ1-dK*eLzZ6LDr!tZZ{EZ_9h;E!Pwa
z*<Hb&536f~-HGzb;GXx5Ip7dB=gbzpQZ&_`*tu(}JJi1XsyXwef{O)F>y_HewJ}TE
zf?<1N-TJ9Lv!mxm!zbT3eg1STuQqP5-BnUKYanJi%W_*oSxUpsNZuSBGuH+8-qM>x
zwbz-auj>bcogXUHFYNu=Uhq^u@y1i<pSq4)YP_Z^UfQ*2>UtNxdH;3Afkd*?sn;yw
zf$tVY4!vHwXsVg3STr?US2Vt#Up%dwQcm#+Q|`><^yI%Kcb~g**oE#=bavg8;zJRt
zAvR3l5Rr!ik&Zb<<kTD+v#h_N*#ME&1#4iYN)4F~_x0Z7OJOg|-uf{`D_fvdV}4Iv
zm|CGjH#F?OYT6UoIaA78=6ryarDXQ8bB{%=Z`Hk7H|M-GdT}&*^2)^JiI{a~+_Gz`
z6V~&aXK!W~Ci3!UpE~zcXn%yhux{?;rHP9Z@$#)PYs)-6Z~a-#-_^_)#d12PI&KxN
zJs*f1`tDbk$~G;OZHkv|xl!1BP7$I*cor#Exb22zTOzk;cK^Bk7rN%GbBDgaFY;KF
zj#fnTqRx3b%Ezp2H*&Y%QBp-)Z>y-%wQux%rzf7h|9nsA<fYPgN?$S0Z&+&Ex7fCC
z)*LdQ@0n_kW$#avTHhG_&S2c!cYZKTUwY=9XI`<+pIq8`aB=6sS!>98esJpXn7QwR
ztu3!qEZ{~G&$GpIY(MIcn%}nH$k{aSoa*?Xb^9xu7Lc9C^Xuc5`XBMpjc-rfuryz_
zPIcbQEsMw_1M%FtsV-QvSYiE=sXmdNH)}X&xS))<=aTptZI2E|pNN&VEoE=NQ%dC&
z++Is*4M-ioa>sIO<|;4QF5049@w(Q<+}8Pl8@cU?!e&5!ZOf-BN@GZ^)3)oH)<j{+
z8@ltl?-{1|Om&1TxAF=Xkjcgi+v9od;I1>6X7*0+4d;Nvj()v6+!EP5C!cG7M-@3e
z=Z|*G(?98sw#;u$6f@z%i21^%2><<-NL_4Q&uuwfxQ_;uGw!Dm%vFFThU}Sb)7wJ*
zFYX9;F6x<>g1Ps{<1}U2cJHI?u6uv%rYZ9QIALV&<JSW-xTav(7R|MdHSA-GU$3?F
z%T&K^HbMT+Iu+)OrC+W3=N1#>|I46)oNQc@l@~Vv;3Hqi$q#%Koiu3p7Q@yISc1*$
zp(l&}1=>?y{%qQIhk`8p)NQ=XL6j?8M8}3SHRzA3i#WZX!X!bHOSO@M-c+VQ#0-W7
zdf7Ng4n2R~=VZ$PL*VyAQEOF_#A67Dg}|gf++J5DFfxGirBAo28g7#<2)k7YS`-jb
zf=-=PiB%p+#iKwk4C>F+OTD6(<^jFHF(4^<FqZCf_0p3-UjqsHHEy(kgC>hLfrCjo
zXqKj$8B%OEIC9LABL_@Fi=eEi?5aLQn^mbMdG7*21Poj`eRyDZRm%LNv>pd+&<@hq
z>XaH8un9)C9LR%yAbG7$tNP`Fp)3y^L#jS-XRS`DzTka<=m&M`9=;6k0k-*>pF}6r
z0e$8+c$+elag8m2{%9mvmEbLfgN#%OeafZs@-@>>j->0sI0l(-(F!L92rr`9nk~+_
z5-AfmY5`JWi_`DGH@#+syqr>7_QL}yc^ROaS#8Ngdr6fPaO8fiy$VLnEGr9DNones
zHx;UxW{Q#YbRwZ40ugE2)O@id(mty_r=2yuYI?im9*5S7Kj(uaBw4oPrxXF{gOWNn
zSA^fP=7Ar?{RcEH!2h&U`8#pnU252|*sz0~${)PPAv#PdP^sZQf?@y}5<|p&)QE|<
zpXdNg6H$wa=RZ{eYB6#22ax46s4p-9q{(C~j#z`|=HRCWM9q6rrqfLpqzp8odJ9Gx
zaQlN@!-#f`)c+N51i}>URk*XIGkgfdOEUk=3<iyJ<aj+OBO!QJ44a#QQkpf6*qnsQ
zhoV5z8i&$4k4!B{{kUj;!{3(7pZwYBzdOCKUNpdQ|Ag<jgjoc$cQE^}nEe#9o0#FM
zaKPs%VxhEI$gylu^FfV;1Rsfd5zz|1hjlMt_Iu2}gxO)t03#?-+xan;euo)qO@h&3
z?_xW9I+D>A=>8=U@Nfp>9+rvHqsE1Cc#jJ-{Km^sx6>t_IYRY=y9yam3Rj--&bc2z
z9E8bI|AO7W4F!xaG%c8#=W;KV{IDc$YL07~gYD$-_s;NuDE#e=XYCB`zNOJmb<XUc
z-XGT#LCi*0PG~bqxS^&<Zp4LFoycG`H|n}_?DDbbAc)G3(*=ow!r8#NK-eB>fBD&1
zeobTo1l+*IK=kRW=6F*_tg#dNb<q|)OuyyamT-QAetBmsry|lkXT4N)u_~&Ho_xCr
z&NM-{HW+Pf$P+#pF@wao`L<eav;{js#4=V#o{j0Z20L#VEul`7e;2CvEE#)1acOK?
zYTOBbvBsTKuJk~X>sfV)9LrKp1^mJLrXNpO%ik(}t29!&U@VPo?!2u8=@O)5gK?&L
zrg^IQgZjo$f2b>*2WrhJ_yvcf3pq8>_AA|&yQex58pDi!S|1t;o8KrrUkHcLD=wUX
z#-7VPR}Nl2xLDm8tLnPi|E}d-<-2Swx9^7L&>bU{wU<V9FFRB@YddEPcfQ=TsJBKy
zMy!n~YKim&0n7SiSTIDlgEou#X*k{=;6M=v`|v3FX$YTZ%<H$wdV6hEg4*G5d50Yi
zLFaIc1mL%T@Rxd!L36X1y$PA12hDj5j1l;|N#d^y81Sc-8j}ZVIR$pokt*?MEZX^m
z>}7?X_#}8yW>n*_C@YbZaDoDj3W64M2mVG9cMO}J!mJCkA<Vp(J&zgcz8nQeD3bYW
za4Trt<AD=y4jd}n|G?&dz>L@#eu$+jm|er{Q_K*R!NNv+1lmU|((BiYnzm*;OfxX%
z-ofQU7x<(2BFI5Rr|Fwi({HHq_bDy>{e~+0KNRyDs^nj(?F-cQ-|EaS+JfqYItwGD
zV|li@txNUW7wWgic6=#SSp%b}Z9(}b8u*=~Pb%Rw{U?QR-1T+|rPNOy2)E5?qSf<N
z2?hQlfjUwe)y_936uPN>pe{TXt(o6~Eo*LTmD#eO;gf7UB6izA<rIdCBTq-0t~S5h
z8q2l^ji0FCaL*?>cq$}^$}N4b^w4z+3mT1OfWB#AL7OhMhTB6MzO@<d{z$5WrVmYd
zp+NXJghRmuVp+R_dK@MHlN<=7{zM1Cx1SV4bl#`M7>M-g9vR5#_wEjpXUXyCpWBQb
Hb*lddq=@@p

diff --git a/tests/__pycache__/__init__.cpython-314.pyc b/tests/__pycache__/__init__.cpython-314.pyc
deleted file mode 100644
index 5c17b561fd1fdcfb95c0dcc48f687b6a1b2df703..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 154
zcmdPq<K<!ig2^`}vq1D?5CH>>P{wCAAftgHh(Vb_lhJP_LlF~@{~08C%UnOSIJKx)
zKdC4&FC{-uuRO6RMc+9;B~?E$JvFaHw>Ud9C#P7yB(=DtSU)~KGcU6wK3=b&@)n0p
dZhlH>PO4oIE6^N}O~oL_CuT-Q#v*1Q3jm7fBbfjI

diff --git a/tests/lib/__pycache__/__init__.cpython-314.pyc b/tests/lib/__pycache__/__init__.cpython-314.pyc
deleted file mode 100644
index 24f6c4f519bd0d774101722a5ac550fb779692af..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 158
zcmdPq<K<!ig2^`}vq1D?5CH>>P{wCAAftgHh(Vb_lhJP_LlF~@{~08C%UVCQIJKx)
zKdC4&FC{-uuRO6RMc+9;B~?E$JvFaHw>Ud9C#P7yB(=DtSU)E-Nk2Y5GcU6wK3=b&
h@)n0pZhlH>PO4oIE6^;EZN(tQCuT-Q#v*1Q3ji~RB?ABe

diff --git a/tests/lib/__pycache__/test_adapters.cpython-314-pytest-9.0.3.pyc b/tests/lib/__pycache__/test_adapters.cpython-314-pytest-9.0.3.pyc
deleted file mode 100644
index bb7b09001f5dee478db74eb0d60dfd80958c8597..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 19301
zcmeHPU2Gf2c3zSzisVw1WXXS$ozRk<n*JfJKU@A$633EbC%Lw=j;X{}!b?lE#DpTL
zUCOc51W}O-Tn7yh7sV}{BE{81AL90*LZ8z1p?<r0q(n*d-QJ)r+K1+$Me5w5fKe3v
z&g|~&l3JV6jng{lT}Wro%$%8>|2f~88SW1?1Oz-U{CdaR(~=;Jk;8mE2J`KpOAzLS
zw*=XBRw%gG+kM6}<9=E=BNpnO)<e97dW5*o`p)<Ze&**n8#p5sB*u$pgJ&8F4NnVi
z`gaL$hI)}AJ;P7wZ##AJlj{y3cjhB|?XE&oRhpfqob~QCSf{kt>sQz$*KZYM-&P^h
zRA`a?TZKZa901)cOQ2iiAm~=P0d$)j0^Ke*g0{&`pzX4&Q;0NwP5C$%acSakvM`Ca
zG?6MKlH<vIA*I0SA5JQ2YFwVlYQC%K>xCI5rE2wRK}jX2)G?|wqv8ZNcxs?;cZ19c
zap5g@|4#&A9DX~5S6!1X*)@s~{fW`X-Ct82&6m$43s-Z>6h7xc%6NInnJlQ3c8B(S
z?4p`d)YugznH|q%qwgk_@z|-{cq*2hNM#EL)pydFj2bJX)Pfqzq^~f}EQ1=&-_U}I
zL^7Mr6_SNiBB3-ORqAAF3&@=CpF*9h@!yI3X%BmS3vYpyjTm}F{Ijx&$kCi&l|vTn
z4KKUwu6Sjb+a8v8jZ)d!Gfb&hJ;U`?9&pN@xDXV&-Emj15jXFyk&1t9Dsi{j)$^XX
zdzcNjN~vM0pQ<PBF=IU~#64&`QL9T`OJ-EfKbcgrDOF9WvjL;qP>o9o-is2Edd)8n
z4WCcQ=g+^U`O{PRoKiqLY2r*Fozb?c`P5kAs*;;ZTuo<EiA-)RSxD!yTGLc+d<GsY
zh=eq6CYKyfDVmhJjwpI?L{zAlnwUoi)<h=fL6Az-`b;<+R)Q4Vigc6XX0I^fDmxLx
zy22QkIpJ4Q>#cWJq^IWFSEK`<Utf`qd~sk!Iz2atfKQ(N=-IN=wJ3EhynNU9#cZ+r
zr7s6d(yIszw0si%C|V9g76Xybcb5XOxl{K<-%nruy_d^k`=Z!>`_!@+z9)s|UQ$B%
z4t<M1G>fpV6-7SKwanop$bwnrc?_mulBs$|dB>^jcut+ioNsuVu2J4O;;vEN4a}}J
z-SIod*M?jZM`;MlZrLM?qg0vf8K$CCJ-Y<+JBT7o?4o$1O{UQBCX=(?yM$4n=`!<X
zetEpTDt@aeX}z7||Hvhaa-HnmP+rj!YUdS0&uAZ>$E{WI9P?<!L*Be{z3kgi-qu%-
zRvT95aeei;CO-1&ll`{#b1oa7P-5Z%uhlYn&nW6fe{5-NGySahE=(zAy_#<<7lFK6
zmZnb*w^@=VH~m_?j(XViRkIx?eo@s9J4{*Yy-Sc=;@%#Y+&VJU<L)&^k6dTVopT9A
zGgor(3GP)5HOo<FmLuk=XsLQRBsa$EW3CEc-v-*xXQphvsr`<-^5Suql9HQ#+ltf+
zhn;&DpHtY}@Z`2)Gg{F>JN}Miar1A-V#TQYp2uR`H5|5PM3c+=<xSIX#OwOcU{2*L
z|2pT?Epa#I)U6{wv(2fVCwQK$uQ^Zp%~?!-5=WySbLt*B0402zRyPJ1bbZ#(UeUt!
z!fam>y0}sZ@7bHb(Hl_%ziTkz0p@imqV6T1u84Yod2~nANZ@_VlfQ9D3#8R_RxKp6
zV=40J(mV=qAGmIX7E9uK&&_sBCf|K04COyP4m=ee%Vn>oC&Gv}_6~(aZ%k!KvWH0D
zgODvhM<ObD9(mBtl2FHz+3>-GsqD4zVCq^blgm#5&OHbb0LV<{)WWg;L;Z)+uR)!;
z<I>#e>>bx^pON`+I?J-(XBH8RQN*5z8oiQKClh12sVN{vHSj(n_N*^rZl;i*DI^N1
z>jjGTz&$V!F}{kL(rXipRhv;WM(&+d7M}_a=oO9pM4A=S+O=SwFmyr(&6?-*+1E8c
z@bftPiUQ?M(5pB2eEvq_%1k;_Kr0Z|K{cpa0KNom6G=temQW{iGnw&(lFBQIiA?TF
zGLy(Ar&5|fp(d}U$Xg4YWIX@~YDiIXiq?|Q(R3np9Y{N?W2=UW8q%5*_*RA`i&{f|
zQ<`KJ3{{~um=0FTNI-LsT~|7(a1v!&Z{r@_p?|H<9&*U;=(Rh#?Qss<9sPE(E|s)X
zfH~vQG2K>HFRbq;>~*Z1g)o5)^^bs*0qgv&w?<0-opUeT3j{xjeH8nw2MDkIK)LP6
zV%w2Y+tE_s*xadKNt<tteRi}Y?J0_T%F>=PsO_&L?)mr__gJcMac{)DHO5^Kvbnf7
zgg!gUT@W%|tKy#Gn(r=)d+tfGqBvOY8C>ic1XBbf4VK}<{z~wXVmiNK<GH}9IJnTm
zT}ze=0Z5C7*cnLOllB(Hf%1WY#RCIiieRLHGJM!y2|m(ZonNu>Twqlk`1}-iEm<xE
zAT1tZXCQS?YA=e>pMUV_2YB87U{Q*eMdPm|MvGFr?p(3)TwoP3RPI``TomvD53n;3
z0x&O2I~JuKMX__<!^oo4Sr(1IlGs_4c5o+WZlX#JktMOybgkN?mnl6iWM>ev))GM0
zC}s)B14TW|5-`f`0J1CUlHErPte1DeMqnTsq{AOzD9?2FE@7m_MD}@67D;D&$%dYQ
z@lP`R#PQoON98;z0r+Vf!%y2QbvCT^96Jtk`+z-Oui;=;0Uc>uUvjh~#JTmuM0n#m
z(zfDpyMp~oRy-YPkJggs=#^pJ$6pSv-EXiQJRZv7mjgA+!4b49)s(g2*_yHzX7pC+
zsSR?-0W(rBW{)jpXWZk285@Ba-T3#I*zuvgx+w}NYc#|?wy_|h&tE~GKh68RUv8eh
z6Az%z|BcxW=<_q{?DG=z+qRKEw&}NIU3&a|<^Q73Z`x2fpsc`3W{oMwv1=KnO+DJI
zbHk!%#djRTYvW@KZ2I2DK#Q$Bj`@ju&2q@CHDxXAu(_sez)Uyhs~B5Bd5gR?9;~v*
zG>oo)Y&DoEn{U`-s>W8{i!nx6;;-dKV2O6+X$)IsAIKe-asn^SM_M{5>8~ka3VWW&
zZX(YRA@xt8mQ^Aku)p8vI`p8?rtWt^QD)xV=Vt3uW0Se?WcoBm5=U837o)wE@eUjD
z4l}i1h4%gyY43m!x&jWyP{1FelaHqgsWE8rN_t`<r3Bt*kO#OxhdiX3Ls3tw3_)}O
z65$IUGkgG8q<^9d+;8v=hb-c=dn=Uf0s4}j2R%PWEAs=?7Lgco(CaExSsLXCRrvss
zJ|c&S^b?`gRXIlFI7lR<HyCRs;CewB1hFgChPth{*im~(zs9hLuE}?GRk(U|mmaI@
zP=%{9th*v!Uxn+4N>hia&g*Yoso!o>>Z?jVjwoc;Reu4k$FA062d3Ehe6i!?a^MB*
zy+{qFLf1FHuPF9Gf4hC5Bpn31(5e%uuPhpW%Or4+J2`W+o3l$|pXpj<$?*`Kf#>ZD
z+z-;G3jy={^Z@&3z%R6Nc{_s$w(O5Aq!%t1#gWh6Sh&2r^CY;t$3Re?;?9#L=^XL$
zh(V>1vS|D*lgv5p<O~(U5LpsOOxG$)k%#CEJU@Gb`$5`tAz<OM9zdVjd6M&Y0gotk
z``3w=t&HgcoFa6s9!&dEOb`Aq(1+>4-^Y%a?4I7`r1wsD$31MfVbEO149i#z4M3LS
zA~c8KNT*G6@YvQ{$JNu?pru_bo2|-m_sl`FwsPgHcX7?qERCUATk)Lp#p7W$uGD{=
z<+(;$wcAif`(N?8s@2VF+ikFN;=EhL*JIo2hLwyoN9B|pKr1@!8GW}_Z1`;}R_G|-
zcPsj#lY}(y*o>m)l-Yp(CJQH1QIrl$64h$RUh*Sd<eajf*aJkOL}DOV4{26Wx{34<
zp|Qg>9ov%1*3hbTl`pBvvxtGy3~uCxEvu&}1zJ{>AtI-Vya=M!gBXjOzN&50S2b3e
zN?X+=L@6<~i7#rem;6uJZB;u~+7Fi7`WM^!OKrzWf#bBq{XekHEQ`^#s{`r4)Q;cI
zfj(k&;KyqqYdm=zK>})YkJgz!akmXMdVm^F0yXw>)K~{gNh<CIYJAt6E7y%0H^u8Q
z_qUA1ZF7I+tocK;l<@g~!<Lfup+?bc6?0ZM>wKN4aZUVnT23qrP;JWzU(@7ZO>}2j
zSsH4}ax04*swvA?zbe$|lN;rxxUXtm^=|+*a_nlh08nEy)>W6jwwhL%d>z)=YjQJY
z=vL(_T5VRc_H(W)FT+Q91;l`j6Xf1aguzB?CJQzy!{mIGh)m=Hk=H=ZDHn--9mGUK
zZ;%IzzFr6vP}s*g`cW=X5_Eh|Nf7xlktC5TM8-fIHyYaPvv(CJc^i@TQN#xzIw&#b
zZwrQ8T9{f$6va!>;}gp}hrr#P1z~#pP)U-Bk6$vVbg3*Ff6F8zb0=p=b2mhm#7m}Y
zm8Hl-bOxTJxm(hv3jqrWJ%B#5bBObIXF1EiP9kw55uO@n5W04z=6d{pKYFI7*}Urc
zZYSHY$L*_15YD_iDv&16_P95y$p@JxU#SDavOdZ+-(g3~(Id5UM}6KpDaL0%-R^Da
z#5J^f%zAWAua>+u>($ki<+I)zO&(|29=j&*<C?sGdfHKwKVsmqn`RxY6OV=BKCH=&
zBdTpp4%94ty*V4o(i7a`Phu<9W1aX=_F<i9(mctD8xQJ1m<}JtfYeoJwlGjTDmL|<
z=2M%Be+bSRyYLkjwu)VN9R{o)0aPjfl}0rk0r)j{azf9(wX~e1JaYt~oFXTo0+r@L
z0}GgkkvrIWg373X0*SPmzz%&nOQ#Q%9HLRK)kcmBE|&c5c3iOiPo68c^)0sbm9UE)
z=qFgfw=z5C-9-^(QR*m*#$QS7U|X4%j~+yumxkYx*kQU>ZPLq>6c@5Hh`_+e@xM5K
z=X_B-{`vVu>3CT*{z~F;wo6RT6&ue5RuSVAcP&{i3OLUL?98%weC=Y>h?=5><lEE#
z+l!6hyx72)RK4x$dFZyQHCa1?0vyhyW2;RS+pc6Bdi=}J!?vKT96B#QJl-1Pkh8qC
z{Lo?O$6pQ(R?H=0jgiYaUpyYl;lttQTIJ?TBeth%$y$Iwswv9>L9BY)H7GYYY`a29
z&>vgM&c-vRZP!qxf{!D+AxzrMN(N~vQ$%d~e3sm7+GhHECpp;!PNo;-M?}sMIS*oJ
z@gra(4UYB~rqr)jpZ6>5<Lr~x%*&Me%k)X6*}qBLTOdv|Z<x=#m1=zAS~{tp$C^e=
z+EY=#2GRBTb<Ddz+#Lt(K444a5i{x%n^E=4c6q_ta+moSW`|F>Q3cgzJfw({sTl8i
zGWy2cQCG|r<MCj`caG(!BM(rw(AVWqjE|eBJCEB>PS;!H#$j4!s~*dgZmMv&^W<^5
z^Mpe&L7JTp@G%U);2q^Rp{y<7d_9h{|1|FFhk<GOQ*(BKS?Da=8m}>v$?b7JX7U{)
z*KI0jv%G1Pg8}D5;=M-6NE<P()mUHJQdMj?aV>_gBxc#oyc*^dE574Au{J(t#7%Mw
zw!k-%SN*V$^)1J|TGu~#9(B-T#dFM~6%To9l{eeg5odYcljJQmW%&#&Z>=e7L4<8J
zW%>MDHM0lg?Q&Z@P-VW6MsayZe{3n!CTP`zOEi+%(&iiH8#3MC5(b%U*oHr;<hR+?
z&z%bNO5xnBG?q!ujHeFLF-lD=+{oiFDK>Z0bPl+n)lcDWAFf$y^*HBS!0}v&(1B3*
zWLop$F6c}KKDd;b(n1B?ugu`kEsm<rsN)k0%H$vX>wACu9BzxwTw%$A85=`#nkz8z
zv(;z*MT#~*%T5*m99&U#eV<M3Y+a$QK&OTKnRRHk;$Q*x)Y-X#9^4gmod7hrM0@J}
z=H@*Z>wxZFbMs%{*hf?&3_JW$Gq*qt2hH=u5jq@jLO&Y-nMk81zOCl6Z1vLW#%HGT
zD%(VWcP@=<v1S^|2gtE)E8&qqA`(<SqyoCf#<aRj8aF6)JVA&--%+4d&AumpL^oN{
zI#;oiUb)NOtJ1jVa086K|L}}%!D3B~HX2Xq$U{w3wwm%WVxPdj`W!<Ye<wE5;n_D!
z{?54<{=pagB=Aw-*6G{F=Wi@TzPNff{^jZ7g^}XG>%|jq6uT}JTP~M;qxZO2|NNQH
z<Hg>=yBEJaTD<sHu_aOR{rH<YPlI=E=o_CPHPBhy+q-`r`7|;=RBGE-3hW1lY25V5
zdmp`56!+gwF|yLwiTB4n#m3H(v>WVvfssXNe_1sCmc{)=X*YLr=3^imLu854;4aP}
zshcWyLCAEif_+a9K<c+soaG;a_k4jztzwYUBi7Wv^geR)0xNFyBXiC45l1~Yf;o@_
z{c*xCIO@5DVUDe9Fb7{DpQxjrw6$WD$k7;QX#?y6=Ho91Z=tGHtX95wJOgvolDA`y
zTC&!PS9^1VuXt4}Q$6Wn4w!?s;@D%Wd1q9`*F4+Gv<7on2LNnWpZ#(Fu)|tD-Qj2$
z{4?tUvUSjN;;%?E!{&%6H`&k;*N@Cnkk(Y8XRYs`GT$`WV5{?ZX!WKI_1<P%eLMGb
zey|XlWvw2<wT7zeyTJ`?90kpGHQ&%YjP_OjTyB~ErDGco7-rmmGi%N;<61d(FypgB
z=ppFP+eb<^9lG|xOnxL1qyNe62mYGv#YcaYF}_jWEC=F^Rr+Jo23oPnY(4XBv|<Bv
z#(!cKfi`9lnB8|a{4QNX3eTvi@P(JpzVgbkuz6NAj6;Lr!Xyq?7Lw{a(cjo`FgtEL
zNs0rZy7mUf(B2&Gwmsx9==m2e49W4K#D$BearG+jeiRq6(s>*Tg`TI7CavrNAyxD6
zY-3evbfuJ8rrCBR88Vnb89L`-c48D6dh}Ja>d#l4L)nP+gZ)^A7OG57t7D~sttLxZ
zW9oBl%ZOfVvRK<SOykOye(LjxWx=+r+G;bR*p*Ah>l)@>Zdu7)!41S~xbQY}^=kS$
zetBg)!G5A8p{6IY$&B)cNaRKQt1DHO?Qgt>PFQR^T?)MTgFngn&w;0I&y)gt48#1+
zMQJCyTGomCMPP~&6xuytR@Yqm!euiu!y6aSa7D~?u`6Ob^$^?)LlQQ=;=Y>he~+#;
zb(h24i{WlCMKBUJkBz?)eCTcyIah2v7g!a$=?aq}zGS%&fV6msomm#UndTeWq4~_)
zjO?hgwmqR=WsK@W8#ffAO6pxDUC<#XuC3C~zSJ9j3`%Js{&O)=`n4E^6b!ANe(MFl
zAfwPJAFY}F5=|8DsPSVoI#z44{PBA*sf?<hr}icE-<e5hUb^U*EwIyI^jS5Rxt3B$
ztzl!0T_aS+$vH{nZ6dTSqmYe|K_%9g?+C2C`Y`aF(0`WZS!EtV1hLhdAm4agF4x}+
z!cT;fu=y)t+gC#5E8&@c^agJBf0!+M+ZVm<WpBr#w`2a&iZ`|pS@9m9^ZYW{bZgu1
zpDPDD7K0t7;O=s8|6*`|IoQ1z>@Ek7E(VXz`TxU<A1=DzRPWk-zg2X-?z$fmU9tNi
ex9hU&en51cxF7Pk4&M*@TnB!=#qa82IsG5ZJ?l^a

diff --git a/tests/lib/__pycache__/test_baseline.cpython-314-pytest-9.0.3.pyc b/tests/lib/__pycache__/test_baseline.cpython-314-pytest-9.0.3.pyc
deleted file mode 100644
index 8b8f25ab99089fa6dc9e5a8a79873cdd3aa73a87..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 7215
zcmeHMO>7&-6`th|e^-`h8;}x7)lcMDTK!r-^-uj5RgNp8uA{DE%1s$<DRL#Vp-6gm
zWyP)ws6`G5k^(efH14T9#zlbS=l}tdLk})cpp;0N#==0*qi;^^B4~Q)d$Y^siejwz
zuHo+En>RCW-pst2{od|C<DL+K^wi(_+doo5;+Ux<kIg(;0A`KM6Gb>j41xc9ro|Oc
zCz+;(w-ZY|=X}$C!#^D`0@Fbw$YtWW(6nSo(+x&LCsAk%QM@fA6*l%Lz7}FMDt@3%
zN&skB5qgLme1sidkVP{zrKM7emd!B3483|a2E4wKGE9F&V~J$kkUb_%Bo`M=Nsnik
zrY5wM5sM4Z8gEQ!y9oD_13=bDgv@)6zC%a?WbNda!ev1bqEKR`5$xUb2-}!}t5z4f
zEMOndmtgu9Gi*6#=&1Z9^Occ{y2kX81r|#uGU?&f7)y*yWfIy*Y)MNSL;73ER7xK)
zG~LigQpp9*sS7b(OC{6VaQ2$npsKNSI%C8PO;y<*(CUR-Zw9hPz9m$c6Tas3)*0*n
z36u~%K4dU8!Oh3_XJBxAGnJ8JshuY5|1h~IT=cAn!(_#C_60&#VWezd6x;Kpz{*Ah
z#e=_yXFj2bM*>#K1<%i02?>%8k`=Q;))PfTdBQddRlE^V@g1?HJ)pmntcY1pR!9=r
zZwgsc&|jAWW+0o2890lkmrsxBV=HMjnJ^pm%bAr_LN!=&X-Q+Ilt@-HO}12pMP^zY
z*56(n(1Po33d^Ptm%VHsn8fx2F}=&M5Gm6a&nzz|p#kHLqk~U*lZLjeGaNL-m6j<B
zVhiCdQ@F}HfYm9GgI!%+tN{zWY8<?fy#zd89u=51^4Le(5B&AiUrv?U2A+C7O+)`~
z8@M@<8-3~};nw{4NAbJe_s%>x*njuTz3#$n<Xce)UlO3sV=wVGd`+OiQy*z)zO8@M
z^XcrD`|mB4q)35A{`<raMLKqLt7FpZiv!~%+@gH3eSS0{>MXTFqBs<uX3+YY1WsNp
z_tg|aR`IyX>$Ml3T|Po5Y+JJ5XH~sk-B#7>btkLn54K8(;u`Zb7oM=^A`4EgkU?<l
zO3I3g_sUd+9-SuS%Eew=ymSo@izq)JQ8a^YQIoof(iHKI3rcuyV%#%fk4EviS~b=*
zfowO4h?SXB{5#flW_iRj=$g;0PYEc&h;KyLBK1f2R(f=4#w}8lzzToGipoD6jG^N~
zRy;1S*I|}~npu)|X9*{<W|llPv(&IVOLb<6*38nL-C3$LOJdC|HSW$*?JOxFfNOhA
zA!Z5-tPM&{Az=!d8D!cLJnwZaVT##nqev#cYm9+_8C@^;Wv?BUbrc@HV0ssmZ$mYz
z$CfoyOlzyAKfyBDtd_VVeCIiN@{VwQAGR2k^{$gAdE*ILmqQ#4Fg)c=Ix{&tYtplm
z=gu>HY@4B1veyvwsWE2mRpYT#N@ZG>0TRi^0ILL5J+`Q!#B7}8HYGFZR~gGNbH8c<
zOjUaua8f#EKnZ{*ikQ(9<136E#116rK(&T*$ekH?XU43IrSQk@%n=<y9l#r&ER3^{
z8;np|)d7yK=$g)c3bpV_r$2#b`x<#Dg>yZ(RzJLP^G1I5<K9mO?+o4*KcD*|`dRee
zcxh&?JQH1?i5B+0S(N72&O8h?<@VoN`Y?4fm7n^!<&*Y1?VswO|LlwNpPj$g^Uu|P
z-T23iLO5Csy}35^FwpSs?E6h?lMf_d{&cuNyK=wgWL@eiQ~N5?u6GYxxy>qD!n#f!
zH?u*z9DVJ%xdH{UF144beHCdt>ciY-l`Ua4r;eN1pzV&nwj4k;AnQ_Fnc7#8wxK@E
zZC2S5R&(mOnGM?J=xfbgEKnftoi5W>>nhS#REN3EDqF%@P8~P1L0cVl9l2i>D3Eoj
zqfG6qNIOs;<~FNr39C7E+zga31T0(dC!kIXfBubo#1F@xBSHmNJP$r}wW^EXYOoui
zlq+Kqkz=<r&xzgq5enEXFxTV4ZXOqEs*l|~l~F0;j&&WJSA*TWJb3cfU^m}$U^idI
zipoEr5+7hU4`8<`#fJekI{+(Z`s0^l=_M_}WXQ38Bm+nWfm~pt$c_VXpKc68M>B{=
zoH~2{wK;ymvBOyP2$G{njv@I85ZTX;BYy(PNhGI`oJR5rl1U&sdeN4{r?q<c*cPhk
zOj=!xB~$DyRBpjme*m)O5Gw3FP?TPFPMUIa?|O4@A>3CC$#AAfO*!%X)3?VrrOpB!
z%pc=qT^cM?`zq4Gg4AhcH(h)QYdf{v43uo&5*W1eknBqU<nV6fjVzhJlohI5V@{}s
z8+9G@e>}LThG!Hs)+7in_HYMn-0Gm<c1{>DV@K-lrro3x`Qs8Tb=UOd1T@t)-K6?9
zq~cDMt(1UAEQCQ-OY4R~+uQ#_P(;NYs?}{5A(j3V?@oP0oZ|!ntR3u$R5z)A$GX)`
z>fI7bd3k`feUswb7oJ?2!xdTe9|Ej;n-l{q0=H1vtn=zKy+^8ZyzP1qOf$^yuJ=sM
z&pgw68g{+s`(EAbdJg~v45peMy#d?ldN7?CvYx~kCLLn<kU?(D0u6{4FnjEJlPjni
zg&4`C5GsvcFntS}VZd8l?`noY)P>MO7Oo$&W3SNfY^KYKs1C_`A4<k#eXts0{k{@#
z9fN=?$Ze<s53>PdpoNuW%1EYl)0fIXoTZyDsP=1Q46oFsRAwQTQqvIIZ38OqP-&FI
z6t~2Vm#onpb(b8mdbYx<(L-BuW80xkGlw>=j}!3zKb_FoC76*P!L9$DBk2b%FBa&~
z?J*!99M7?$)B#+6fRlA;s7&pvNQVjt>Z=lq|KPYI+dvgo?mivLF^essc87IzfH*(k
zG<7r3mK)WNM)f;J_2>H_o9|nf`peY5inO0AtV*mRG{cr{pb9H@pZ23sSWu#%-C-Ra
zAkOz$;u;EVxluA2B|ApR`CiE8d)FnoOzo>kWv;L)v5L?PTeg8JtlWJnqfuB;qM+Sj
z9UUOf_gdoZ6nf{#5MD+EDn7|x!(H;CYP|~>wp<C~`C+E5GWY~x1|1Q8o!~O=-)2~Y
z4_*2&d@&m4A2D?PnZo*Df#A^GSFKR_nq=W}+^5!?35?x2cIKXThj8<(P(N@2w63JI
zQ|vcT3EYq9eLx<If*^cF$UCG=POOs?Uy(!q@!=P!ZyEza=QquwF!Z!R6#BWQ{{!9<
Bp-}(;

diff --git a/tests/lib/__pycache__/test_cli_resilience.cpython-314-pytest-9.0.3.pyc b/tests/lib/__pycache__/test_cli_resilience.cpython-314-pytest-9.0.3.pyc
deleted file mode 100644
index 7b2770260492849efe573d9a357a8c651c81a0cb..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 5746
zcmeGgOKcR$wR)!K+rtbP{NZE9*!VMi#()hh7|`-_OqK<^fmucI(rM4M?OED0<EkFO
zUahc_NI`@M4pFp`av-M!<-#$M_OPo+IYwi~AGQKYa>!wC1g}KGDeqNJ&-5512d(xH
zDdlnXtM}?vy?Vc{*Sn+Z!UWo>ueW{lXCEPH95_v`04rkv%#o`^mCg}M5>L;Nx9EwJ
zA=&c9ZHnLW1I2SLFch?c6<o*)iP_;H#frp9BiTVzZ!=NleviGjwNCXl6Dz7p9VFrZ
z3XvBQ5|@W{YqCxj$(~PEP642UG#`Q_DzQ9Q1D3am<Uuw|Rz0wm4|lNqstkQVtR7VT
z0Eg6o6;2ULQG>8oh%2Y^`Kgl`y<i!X%VRJ|SA-H<;f7v9f-8Oi=17uskcZNRhvcD$
z%givxxx^UG@9N@%y;qn)S??ItbD4ZDc|)g}-qZPv(W_4wIcqPwmNiY*YZ=U9y=Hby
zKw8?&YSds^Gi&71MzS!&6-2S@L{2vuZ+3~7bP>>8D*++Xlh}sap%nnhT9Q*_ph4_H
z&{YJd5rBC^Hg`Rr)~#$lSKWNOd-K=n>u{kMcI%<5+Z<IQ)V2)($7hD50{BYGvxD>l
z78_W=Ls7+G?9i}InUPV8xrB$WH?G6!lc1&K6ll<7v=L`{DW02qseVHs80-P6Z?lgO
zDc}kkv#6m@@fNr6ZVC#^Q1&_UESUT<*7DPjOTjJn8E`$Ptunoe(n^CueKkRfIjR6d
zMUXCAkZ8uRjI^a$G&?Z?ZnecNN<D28GDF0HFki+t*+Mu@j)I^-pyJ?4i<mQ7rKzO9
z0#8Z_X|_f9*wMChergKbuPv9i+Q#$6Tn03|VK8eN7xLC1j?Yq>r;hxR&ZTR%?7Q~Z
zvinWFm`hLc2Di*Mt{FsNQ6b0EXuQ;Rv>Y8M1qX!CLM;~(5gydEoIYh}8V_q4*sf?I
z9MQDtqHb0wF-;rKQf8UioRP}|Ian}tYdlY<sPL%<*n{GMDY_4BhmJRDU|4X^w5D62
z*RdkR5pZtMAcAzpGTF41HcV5~;K1u#NMJ)fy(>;fWn2$Cj0DyL-M@YUs(%t|oEv%}
zOTLB|e$jb+TVDhvU*k&{9(zupvrmbvb&ih4M;ooYfUue)DKcEO5MV29w3<;=NKnnB
zzv5;P3~SBzq@=ZGtKKA0n=eXEGL4f_6jQWeOk1nk4l;@<Ks3jP1|&@?dWv4H8k(?5
zRqA2!a&UO;XDx{1MD^pJOHN6Z_n>AfO%2qIeC|=fqXwrBq`Yq2^qFDINvqoQmD--u
z<J}eN#2*ORM%0iRR+Uk7PSJ)j$XB%*_Yt5>v?AXqx|(Ri7>TRe4oFsxhMfH}5CwI7
zO{<t-=;nK0QWZ7g+UL#lz0N-Bx;5wes&k`j&h<NUd*wYYUnm4dH#>5>*4X0UYEL*z
zz*)n2PD-kOKq`3QUmlQXN{#*Bf~o&$3yxx<6>WIC1J$1B&uY*ep~E|BR0nci>W-Hb
zf~g=xzRRZy-q8+cc1>Re`Rb<WOv?Yx5FyjQbmDB{AFJYQr^8u$!Zorb73!1J%^wW*
zdHO3^4>G~}x5@-?mb!?3V<s5Iw~%Pg`5Z|zTUYVFF~VQxhs?14tun&~SERmikCg1l
zrhbFWpf;!*Q-NNoE?<P-B430ZDV*o3eDMo)Q!`oZ8|$lY9`F)JwWQ`H(`Ovy#P3kw
zY9rXW>07XK*Eh8DYy4p6rf<v6z1})2DQ{sV#j%d_w5>$@ReuL*BfCje-*6AuMzU~q
zZzXZ@ch-CU#elb!WTnIwu1pw~rrQNn+I{)K>%&I)UqN4;BekomlPEffC`b~LxRJrV
zO)7}HTICiUJZ9=sV;TJf-;XtuHl=5wG%3>S+3Rriv#+FDn(AyadCY{;#bhS{uvy^c
zP8h!L=E!HeKiexVZ#<%&y>#XL<x84+@#5v#Ae&)HolabT1qGL0%ouyqP-*bsq<-TX
zOY6A{JOFiB(X`lX2yd3jk61o8`##(?4QjP@_7r9gCs_F73ea|3pbk1H007m4)O`T<
z3hIFbOC*h(SttO*pY$ZcpKvdbxi??55ITU}LF^9CMpzLp@^nV*%DtwMLwx^i)V2SS
z1ViMJI<4cO``~}&Jap|~t~qj<3_=nvK%JbD-gG@xCDr3x9`R~)lgvRi{~o*&LS#Es
z>K){u7fdbBZg{P_;4%~%1{c-37aRdfH&1dOm#AG^RtmGwF(`nIn+1A;ANcQ0>F^{#
zB!=&suiz}D=*SoG-=8eS+8^k@F@A0QcJR@ua@XOd*x^6M|IK8W^~3g^((E3Wn))4^
z08lX#m_(Qc4T4tCb2AAa_lcr&Hnex|q)u}NV-yuR8`x`??iBqtA-(b(KMn=gJ~Zu`
zOSc`j{Q_Jiv55zm0VRw@c>pVUD2}+lFoO!=eyV4g!Fc%646}@>vp2IAm(k(5-}V<C
zdABg5jTN(|mCZ5kw;i5`0V_1BQ{JMn$$Zhwz~_Vl)h5jRm~M(%jE6L)j~j^Q(UZb|
zp~@G9Am6BA*`-?=n8(y}qW<SHx;ponc|F5;2&(CfW*IjvIsgiQiVH%`6q|rMKw)jr
zK~zkHD^vO0HDjirTTnj+t*L?r;3PeaTndVB8)d(H(B~fXy9bBtfi2!UT*er5(+}Dz
zvFZEwxo00_SU$jafoijB+xQyJN5_E;-sXheg5)(v{wBwk<>q<0`B#Alt@Hc7vmg&W
zaiO+9YFUtvRxzK3AMgCZf~-Fc#qPD<Y5n=^a;SYV)c!OWyBE0=xf^}yCE<?eexhu=
z|KVb|{lWJZ!^uZm7Q+K`r=K*mmgMgHyB8z)Tvj^gl}-RkN@rQ=24G3)UIx7DSC+eP
zpROVv*G5aYj!jq)E4bjg6>R*x-2J4Xtt9WdpInUKb6M$`S9$;_DLrLn7XVAju4TZx
zer0(VZeGDXu8o#(9h<NqR&c>}E7<sXdDl}VcJJt&qxadT-(PI*S#Ii^Z|W;I9V{z{
z=FU8cZM=W(XSYgn=iR>Bx5`Q!pa%y;2l`o-E5EYbSyJNFVHLRBR|OyA6830G?!0}=
z1{Yk606#cTo$m&K^7rV-=bJvy{wZFLj+Eq)yM2&_$_j!@%E+=@_bVg4ihEoeEy*Lm
zW>XdbevK1IFE@gVfZLQp?XQFnf6>)6(&?M;R7MVk5-}ch5&#{>pxUh2(Vw&f3xYaH
zjT<xz897u%;cHCbs0uvv17r+fV8X{WitlLp9->0_7dJ>u+zTE3E>bX`il7jmckE9f
z;%i~>nEh$z1kJ$|eCxAL=$?BeN%|`xKPEp?o{^Tnkaf?<)|dVb(y@OM=)UfWN~$E5
F`480UTXp~d

diff --git a/tests/lib/__pycache__/test_compare.cpython-314-pytest-9.0.3.pyc b/tests/lib/__pycache__/test_compare.cpython-314-pytest-9.0.3.pyc
deleted file mode 100644
index fc54dbf6431dcb2cb3d638ecc9355f14654b4622..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 8487
zcmdrxZEPFIm9ymXb16}xY|CGfDJxOT#G*utwv<Yi<j8jA+OgENLZ_{g&5B$}tXCv8
zyR;%1*CW~jXI#^}v}l3cq3t=i0+p`CozW}M6v&VKDbODUDp_{q4jL5w74$~|J2K$L
zKYI7xESFp|a-A6c)q(Ul^XARWn>TOXy!ZCm#)cq)a?iK-%PEPFG&bzwEmyW)hf0aO
zLS*3#F$MnX8gmz22gsOcdJbTZ>x_5IXZptcrk}UnX98nEQ#wFi_O+8ZksHSaS(H7p
zx6f_OYKE{!_QeI+F9-TuWo*Mv0@z?Hk)@=KYm`H+#0<*~0Gni?gT%srMDp{iE*6+j
zsi9@$VvhMz8qH+W=BmJa>HK^_rP}h~<eU~&3k5x!R?Tca7hP11C@to4+1zYY&1Irg
zQ`2+mw5~;=Q_rVWeJ*d9pXu-E?}>l#@BfBnbvWi`4b!SIr@*xHFv?(3zGxPTrebQZ
znarPtAxhE6Jn(DjxqLL6x%9!69sm8=KP-P|IOd{$V3nejYlJJ{4xns34?qR+z6_$3
ziRpS}kLlh;@*r$elwGi8kL(8Ml|_I)*#poodjSSypBYRMQ<D84k^qw~n8oQ68MR<)
zl!*p3(q#$qg>XTTy?+b&DG`~(L6WP&gTS1+cqwY6)!Zrpfr^Hv&oGa1DXZ%Sj_>Sb
z!`&|$8a2A7DQrHUi!Z7)(><EcXx-|pmNUC7;%-wjOru-RPV*WH3(p{4SYlFI&niZC
zHmB+aYjyA$cXaH7#kK;Rxs_*Eo>)70YzrD>`%R#3mO!y^3+HJIwCD~1WsYep)LA5(
zy9_6a=U!R7&AIr;`}f1REo<)Uu9ZC$d8%OmCBmh+3o0ctWgjC+LK{m5p-;e1OrWUf
zaQr*rhq<Y`Si@TgL}O@*VS)nAv)xXvoIN;|Vaentfgek2?Y$ODIJaFysBo6P4HaO?
z#*+cz^Wx`7rjwBU<NzFaD@h3xPL(KAHdUBE9U|oEGIkc`+)vJeQD_5Q>>vUBb#>tN
z`~)*4%Yp&Tbf+sqf~k@7UijZS4B#N}RU(t*G+d7YXn#t04zRD05*U@Uzzbg!);{Xa
z3ac&}fllkNK~Y`t9wwwMp+>C(4-A4jYOfXO5bJVEG^3eX8ZIl%&dzEyKFII6F%++m
zAApGs)R{j5J8O~Q^{$~6;f;Z7?eDd8Y;+BMw1qPMYGj8nc^X!BtGOk9t7C5abc;EP
z3Sr>xIy*Zpb{cEfWqTjaQyUKM15hHr4@bUXtVf1dE^KxU-RK(Hhz$R$@w&0*AO2?q
zSL^yV!oz^Iw88@uV8P$Yb$<pLC6e;m=bG{1xmLBLgG^O)!d6}vlaz~RCdZTuXV08F
z#~OKK{KZMD9Sd69_7*5yag*<ydBLhG*$k5kdT}<JQ!-g<FwZ;~37rM?yqZzCm0-bn
z^)+R>n3>f~CQgHWV1b#Ut}ANM%o}JWL42K^Phzp8=+)I?E<ML0PU%^=D3BXrzP}IJ
z4!wS5%|G~)8~`p2ZWPY1hL~Sba_YRMC@iQbz)(@gdPq?gimG1j2`kDBFl_4CoR-T&
zx4)pP=1iW>)7>yX#q*&Z2oeZ}0I;13Twky*X+<?bwWf<;|6pXZqNH`zFz_i#Yp^pV
zCa?wv5{BX7jPUKzMw}199{4wKi9i1noY(E}&e9kYk1_EG6MH{&37!$*zUcBCze506
z8^lpHcMA;#YT(v)z=Xk>lL_a|gAOd!)0W|0w~{1|fO5dmGa*_yCxm<NlHK@-+fR5w
zNP#nOcyE=e;7Yk<Zw<yhh31r(34D3#%ED+$>>nd!;YC|NXMe->6IFqi39tk81$b#o
zd3pu;z{O;*D_Q1N_Dxksp>XO-J4v~1yM#{1RFh3GftIDNv{$Sm+N9a`I<D}wsVnWI
zx>tK0dsi8FN#NE!g1?WCe?gD~a_~{(Qlh<HxBGBK-e6O`vLuI6-fp1=>zhJb$BScT
zRP?$667$(J+rM*y+%O;%+yerggwt+(=xICj!k)IezX^P4>q@U{;cF>BO533gOLcPF
zUK^rZ8`>}SKc+T>9dg)SbNu%HyInQ149HD#b1G0L;ozr`aL}IF{&h(Br8rsmsT`@1
zu}5xM_+=`@Z_UYtWJSZ8QBrUsJ8GyK<edwDN;Puo|0}ptZkzs;Fv~py$IiExZCgr&
zn|AQxltZ?D$EfY)<*|1oH$Rd#WYgBkHYo3sTT{V0**2x534E~YN~+18<p4>E_AWE}
zC(@s`m%^~ej+Yu<YIN*!_aixPu-BCLJd(E2rmfp$v%FW{mujxtW#l2djM%e)y{=l5
zXq_;i33pd|h1fw<H>uIu7J0w?M5?83%^jaYYj@au*uPb+El6M=Jn%2Ve+d4;UeLeD
z4d4qt$s~CA71fGTY923iLeP)jdbDSa8V8T6P~Z<~+1v6D>gxP-Mjc`?3`FekLow#^
zi-w|SXH19VS0;rEKn%2(CGyUtcjo!C%KR7*VLU|9wRqr}TkPsdU{IOQ=}YM1gcq~s
z90Y*MWh>xh?!r<6K`+IZ2x~`|8^Hz8s2P3gP+_SrW+1ZfJPYAa=eXnpC*d)R@E8kl
zI*co+52xhga7m8Bd%Enynft8){Y(TF5*#L%{n(Shxe$A7IqYC1@mv}G9E5(Z<Huvd
z@?lOMGor_jb8e5ujNlc-K_zOTdSgcX^ocX)g8u+wcV%-hDq@ey=J=>&%&@UQZO;Wv
z0CoqbO;phdq;6(DLjwoOq!1rku_C%>tif(@htU$Lh^w>UNMVVetxCbof>>Qws8*n5
zdKOR&bw)!B3jrl~nOWV;<_zYw9DcSNm7%C-_$Xyo&rhp5_YGO&34U4-DW0M<Pg#qC
zzMpC;5KZW6j>nKpoH%je0%RNjDi_X;Uu0g$ENHn5_{U}H>3m+#7h*e^G@s91(v}LU
z37)G+K?5NK%r7zt`Ld3q$Jt0a8%L~$HRmyBqu)u^>+Ct|Z1g$F63)g^OEMM*iJqEP
zZ%wO)iRY{?i=#w&l5(rBrmx@GI?`ic^xGNTrdZB>>qD+8{RWJC3jU4XfC~%muo&JH
zTi3<b?@8BQUQeFADW3bCv};p(@`m)}wJ%=Z^=|7;=}hV5?O@~8g>ODx8oezwu7#tw
zr1mv2@xzNhntE#rD%YmgrNpLK{@V}}Yf`(_yyd7@2;Re?mDY`F3p%b%RXUs%AcB>z
ze&v?bwkD4Lu<u8MZw>yebu;n&dgA%DgQI{0xGs%uisiozz({RY^OmE2lT%j6oRu5m
z=x@c*t5<GG`(VB|O%B$j4xoYRihJe!twRG)dmDS+HrEeTS9=dv*ymWST!+=(G@WZW
z>&W7vfw#;3$mF2`s~?8hXy<^xUI(VEVAUC#ppD~!%dY{e2GpRPZ5$KwfiYJ>DCZd6
zV`2faj+i4%1>nJk#F*}vMf;J<Lpp2%5*GCZoFY#u2=tIP)BV+eC*^ubaN)!7Kn_*{
z9dMg!107kaak4}?v@n_Sa3^cVc0Fb-*R|pvO;o~-Dt{2jL{eVhtmWb<hvU&ORUNf$
zXi|0f<;F+SR$Q?<$HOOw<))Oc&I#~O;hp8hF*82Obp_6#;uJfafM)CAr<&Xh5B?5{
z=S=Z&r<j&VJ14R84qFp|9TaM^phvdPf~Euz7Gy$oznu%^7=ExH%ZMgxtqZ0!!StI&
znzI}NMVpy{#8A~Qz!U?2^=oo|xC8J^YBQ%4+-<^h43L=OIX=N~X83sCfQ>v32hOA@
zrWGkhQr4xhB?btd;o)pq4h-f(dFs*1hQ0w5HT+--gMId~`1yFA^e9Y5qX@RmkaD(A
zaR-GGx!|gX(oPo3*)w_&r@`A_&YqR5#7GBV*onvD*O@^*h7+Ts{O1x`J@ezUYftrA
zJmZRXLAQ<b{B5ZG{iaYUnkGv<4nfVCJEhM8f~6iy+_h=W>TRl|S23|!O|K@^_3>AK
z)qed;@1A(?=&6nIS3ja%Krnfl#^J}-iXN!bZ{a{ZC_AUh({F6_Z@?z!36-Nne%|uG
zE$d13SH|Y}t2f49-AJlGZ~0lv+B51;5nQk9+eoN@wR8wA8W#M;f38C#P(i(Sq+A;?
z2)z*GU&1`_B}|ZA_7|@Xj+fo=9Zc*izlmTsy$?eCy9|UF26&~*`~J#@D*GD~rvE5D
zfAH}EQ*ALH{T}>0i13-x`P@u)HrmxSr|E?UVZ`MN_-!nzXAQH0&~le8g!Nr0o6DLH
z4qB|JdN!k)TC_Y9BtO%auqVDWuk+kISFM=D%^^kE^3*UzQPX||nA@SBK`?+|5W#T-
z{Q4v7)$DKC4DeLzz+)Y_l_bW64P*ff!2KW@6<!eT`pC|<4~eVmv~b5u+K2A>+&hQw
z5V2uIxa%i-JFY)-egDlpLw5p*;U~>^cAk#FG$3urSC&~udpu%s%0&Kp{0m@?z`ucJ
z?IWJbZP|0x&-}-j|H%9PKIT{NyS#xBVT&NNX8%v1rJn<^eMDTbq{2fisf(V&8E_iw
zQkRAI0w7rbrxl(ZNDt$5=$e7#evx@+#^QAx5d4j#_-fP717Ke3A!Q<l*>n=|UqJ9h
z1YZJho~8lX0iq)k#6U2_nzYOCWiSq(4&(fbfx#8R3jG3=K8g<=ok1{%;L8XuA%G+g
zp#=mB06+sTV;!wJokg&S0NgP`zk=Wjf`3F%LV(I=$&PEF9juIuSTRcHCzBujFPH``
znDJczAHjzlvG;wk=e`(%)crkz@XlW1X|m#&HzAI>nLK$@Jav1!I7SM;A$+s%js9EG
z{xz}lnwNuhsdG~-|80n!YtnwJdCO71iPRM;X9arr0hX--{XXvc2MGQH!8HVL0a)J0
zZ#yPHz*wL;IH_IMAgrB-PrVSd`|Yrr&I2{uqgXL{+=?l|R>i5dNV6t<*r}kHuZ*+K
zYz5ciQs+msicNmD{IIyo!yOECf_Ws!WotcL=x<o(;vcj=<_yvQ0UVl619kI`TM&c~
z2>CkMB;D_m!*@d@y!%6!zr}keLYl{g+YQb4T0>!1NxHk2ME0#bv$FqP@n-X>QsaGZ
XKsbJv0QhjXS2!u$i?j$&bD8}=iSr-D

diff --git a/tests/lib/__pycache__/test_compare.cpython-314.pyc b/tests/lib/__pycache__/test_compare.cpython-314.pyc
deleted file mode 100644
index 00c0468ec1276a75b62c5b5e28154ae2f19667d2..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3199
zcma(TO>Y~=b#}S@TuHR(5BfuqXwy~1*rcpliQQU}V#{%*T9B2MA{T}1#cH_{w_5Hp
zGegOefr9O&Mw@=1$RRCyaDV`z(V)nohaPgsPoQGCHU=$%pa&y2I|`7tr@psKs-#<^
z9e|HB@9lf@=6#>)=}r(>Kl<#*4?d3&@;5xR7M%UH>%d=)+$5SbO{OF+P03syz_UCZ
zno@XpfZULdlLFDYuSr@+Q?&58P{7I~NYNq%NsDT+b8-_KJxBl>PZKRM+QP*&HBES2
z>&lW`&lmV%P8RVwgEFhARb3HXv}n;bc}@~h(<@gDYHf`zl&p+VsW`T2aNBb;8wShJ
zs_WYBYQ}Jj8EP43$yj!*3=kd9G@O#h__xoWIe(_`__x2ob#EdU65Y!NE9oG%3|vg8
zUX@p>T<6v;E@CFI=v7wSidkmK%h<)W$F~ps;X9viy)%)MX%sZ2aR4Uj+*(p|vwI0P
z23YR~;gUwUeDeqo9U>msk}H}FFAi%VfDugr7}df6V_F1YT#NF=BH^kQ1B=9kI?t=i
z7mG%PTU00v4rbGI{#AG(7(KQdgH4TSq#yyC^v`(7*jURj({S&RdonOu&WZ@LHQR9*
z@(&J*iQzYyMcME&g~xktVZ)%s;YqJ(4I8VL%k#m<!`xz=4LkO-ze8i84+@n{p_-1Z
zGkeuF9469jdKTIU$KW!CcB+y3srn0z6R+&Tf&BMJVCdB#qO#<kG)woviS`0$YD^En
zUZiYy9XyJD-@ClO{qp~$_o0kH_D6F42u;JeLjkmiZ^c8fsgb3Y8_6h~ab*cT0)IJ)
zVuXYHQ}8E3)Y$0ItqV-UES+%(z_K{pu2nnb1z>L4<iq&y)yDC&fl4U5wG`O+D*YNZ
zpi&D@E=X@E-zCLCLi)%6I6O@jrMdQyXnwhGPx&idguK|qu7b=ya#bayA7U{};`lFT
zF)#lQNuy1}K!}CRJtKi=)ch>`c25I10eaQQ0=WV;rUKEwD18gCw@3{t##PY8zZ33$
zIb=&<$|Mj2XG7-8g)>4j1EXdFhsVbt9IaawIxh0<7A;!bG9g*1y}D}A!kC}uY`n1d
z`2YxJ=<6>)oknV6J3n5Res$sQ@jvg+-p`N!gQ1OooH`&aT!EWIhP&x!buQHMZq=px
zQ3k$laBwhCr>FB?arF5*^&|5+05$SiGW9dIotmi6Kg^GRnjgQPn)oGqn>At+KSglo
z8QT5i1Ym=x2!jAz_`jF#FTtTk79%ac7NH*XwM9vmEbYZaD|lt0>er_iru6x%(=#)o
z$3I+qb0IkA62Y@06$&ZN{g;afR8`$B3bo=?S8Z1>+LVcK8H$7>5{_pSb-zl8MA^8d
zFIS7J78lAg)CUn?sXC5sRJq5n)qwd12T$S(tQc{Os%w@+s@-}4E(4!442l|g+&}d8
z?M7_uLl=N=4ZkSN=DI{o*IlD*>AFbhIw(|iaNnitZ&eMaNlEJZ3MkAS+qGN|$gzrJ
z@D-1i=|PZBk0L-P(IEsc0}uywNMEQfrfzTut>r4zKj7@ub<;5z!&YQk@G>15Qg<5?
z*w8tz`Ol*e<-`YAHCPtF-+uz{d6Ya@n-a<^Liwgp&VDIN;n$?E6*>Is69OQ3AhLGk
z78Wjyz};E60DMm7+A|Lbxc=>J4e~loMv(*7KpSOF32^NY@?P%j-4$t3x=Bu>ZJMhh
zFUne^9kYZw`D+gQ&NJ)$_Xr8<%p0U60dMp<y#5T*$=oIx4e`u6D`O?;kR5B6t;O-~
zA_Pw#fqN4jG__MNf8yvCDuk?V1pT$c?3g~7{k#9WC!nSf4pPMrfo7eEjX6enxoC`w
z9JbI_<J4Kn+hDq5ukf~#)%+BSdaF@=y!LU~4+s&%R*s#Vb_01UT-wN=8NqSLbDd30
z&g6#8OE9|V>%pibx)8B<iu)&=W+=KXpyKB~3Vkq}i;5_-AnZAXt{!xFOe)dcI{0Z$
zF@S_}<>K^=Q06Yq&%>YwP@kW<b{(cEwd}cT)@H@vkk|@^XF`Vr=$Fq-vzn)$V+^_g
zvcCa8_6q=zDoXO9lHOL*zf<qt*dD#Qqs;t8J@imL`KfyH?)N`B^htV0ovvMel<2wh
z*3Vz8O+He48p+H9_4xbOKUjKiX-6GtC?gGZ1n_r$_(1J{|J(;-?~VO2{cz;c_Q<8i
ziOC&xvY|{iROGyK`+<7wUB07c>!JGWgP{w5;yXi4^7hcA?*}l_R+m3jfteXhA6qH^
zWP0w3y_(7AOO{jFN+Qmy;B=aCY{vHxmb)Gxg2~KwZN7CP5N9}c(co65DGADtxrUU&
zX4%;~=6{3khbvIXWozAXyh<5Hv|K{$v({nyD!_PF@T)TO2b923@jV*F-a%&&oI-FK
z0Y<HF^<dD#Mj6!R*jP|c$LV#zVX<Ma19%dWB<V33{wq2Cw2LGUe<{cIN1mie?=|UB
tckfr}uB2R3pB^QtWA&HneV-^hy|34LzK+DDSDz99UmcD}mnGkn{{pbS`zHVZ

diff --git a/tests/lib/__pycache__/test_config.cpython-314-pytest-9.0.3.pyc b/tests/lib/__pycache__/test_config.cpython-314-pytest-9.0.3.pyc
deleted file mode 100644
index d3f0cc35ba0d865ec2b822011f6788aefb16b9c0..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 8231
zcmd5BOKcm*b(UOGT&_OUFP1-&R<<IUl||}j*^>N;8`zM5IHorZlct0hD{@6KrpVRo
z(w1l#QPBXg0|ya`0@Wc0Ukb!P4@Qxq26Bl{&8?CpS;EFaF9nL++}K5c9NNB_{cuUi
zkyX@amdlwpGw;pseRkfPp4yrqf=4y>zB>a@k`(O6V*yX*09ZhmkjP9RouRpB+*|Op
zqj6UEwVN$|-A`Mp^eUPM^Z?|Z32r>72Puwsf*-HetH*2fnsy|zdywecgR&vLR`l;d
zdYxDWX-Et}S}$^tHi*n26b^nvrtx~1G3w%SHj|R|Og?uO<2(*~j9@k|rzGuiCYv?*
zH%RtsJ~x#~CmEn2fAX;K_rm{4A0!JXjxKqQzmHG~>JFfb%rql131~4N(UY*(jiMeH
z-}?<I8`Ub#a4&C4@c!=Id1l9x4@fmhS9!KN}v9cJjC%HtVX*T~2RjZ@J#H3e(Y
zNi645`CR0xj8oB9^C=}Nr<I)EOQ)?xbw$&)Xf`uRAt^~05K-riYDtoFxx6mxiX`D0
z=#zuLwhxj8^d0gs&Ho_D?;9VJ?6VXtJ%Lt~B+IoQ9{_auUjRUsFQOy7-3YAr#=S9n
zt4h^q$b#EZ+$%C}ikQ213oFhHS$$A^&rbLW)*dS6x1mzrf#M#!KRYM=5_Z65D>M0`
z&5sD-N3jvOlT<XV*W^p&T`ivrvxaZxaw>z3;8mP~J=B#s-C#+R;Y$@}R1Gi!`iv^6
zvObNgV0avW#Na4nB%svDEW5h81YAuTswJ}{+K%@?6=le404$)*P|MQrdZ=SzVw0`;
z;KDC2tg+3TzS_mMbzl1i+q1&<eEQlNI|5b9^<C?}t_^lzg&p{8=ue}67`->OI(T-C
zJ-5j>EwfFy7CNL73^{!W?<ot{Kt~r)Nm^r$tXi0mbpV6yO;bU(MUUteS<xr@#VRo%
zauG%hihPufFj0@{Nf7a%&sz*^y2V=6n;`o^AJGE>LN=!25vz|`qpNJf5vVwuAbUlh
zA*+4%&2FpaER!A~S|`?swPIa@ER;U6kPVgc4o3um1f@F&U82Emk2Tp)DepiDYRyQE
zy#w~hsr?o7FO}$QXgm7zj94dz#QI&$wY6fbZRZ+ix6Bbls%(wh&ap%XbE_8{#Kv9C
z?fLex#6wH1ce#&S=wYsnVv~!1+po=M?@5$B;xY`XKe5;5lB*A53M(Ek9e%qHI|mEj
ztoq_?>_7!F6xAxxGy6e2aC{u0lANahT)~O9t(2f=LfkjVh#hau4SI$wg9dW_BbF7{
z-fh4CRra2VFYSmMuyLc_GG1}w9<e3PMVYeN;4U~Aw0pL5!oe{{tr}zS+rUR*D>qa=
zamcQX^ERCtxWvqU8Ly_?{o0mcutI~+$=@2_@4>fjmGO7)3(&WN_T256+jD(RyS0TL
z^lk5t+HURJQNA25?nV~H{X6zsZSQf^S0<sg;sLQWURxozb-R$;I=g2(x8#-w*$sf~
zKH6S(FQFqYOQwe6p;)Ia52bwe^H^7@WJ4g~zO53T{@pI*`%syDhj!e*ki9zb$H2q#
z@5GDaVF-BW=Hp?T1Esv2k0M*nRnZ6EpitaLZPDkpey5Ai$tfNzQ755y*`nR>YB%3r
zPnll$#Wt}$?yt~``d#Qny)7|zZs`T=DgL9_49EJHi+>3U^yw8csZk--+t+VZ2!b}9
zFJx1ajx*`Bf=2}%7Zj&4le>~nQm0VT=T&7?Q0CNZCYjM44AXOiKJ$V;t-wE%OFNB|
zg;ZM6N1d`DOv>6ccwsX$;I(O^!cnIQI#!apl5!*W3R)(ele443!D;#GWkE~Ixr3C#
z@R@D`IoX<ig(s6TJjs58CG#{w))101rz8u|tx;umr4>1+D<hyOR}}DG=8eE)HlMr<
zQ@_IiENcpJP>M%o4SY;pc=@n8KNQx2*DOFdNvrz8+F?@DAJ&f0ih-~e4qi9B>U^IO
z%xIaMrpvjcLMr+VZ*nS4paDz{GXxrXouo&LwXR`~g*8%n%<yKF9DxQ&a=cjQ!s&;B
z4Ho+BD<d5?Nr%JC_nwm{!%VT4Ob+H`&KrQzyd!k`g4czU=2DKr%$eeOLI9vi06>X=
zE_Bdt0>Em5wMQ)HJ$M}!Z+Cc{c6jj_9X_KYQ(S`ABV|%j*=d8d=k`#*nQ$}sn&5z5
z#j<MnQ%VXV0Mv!V#N{&ls*FhuvHRxkSHGgpOOu66RtF{-eqg?w)r=t2l7N9EW1~f~
z6ay<NmeSe$q@0y<@{D3|k|s|nq}r%GLj}Z+nHY_d8KVHPQ?qi8Dh?)RYolI*5wa4Y
z0?{P%F{-S6!##A4=HiW-vb9s$4maRtg2EARoI2XH8k0jNCSNUsC(AJ<WY8>_^Em2m
z>30`~+-Sq@!f`j+fV*(aT_~d&G3bVxgZ7sV)aUN|m_}U8F4GpdQZoV&rh8usIzi|F
zYl<dW`vmctlw^h=@OkK_9{$?D!3$nMk3ucKt^0M|W5jg?o*=HCTbOv<gz8>eys|X6
z$<^G@7vsyV(Yt%^{o>yFO|EhAoyT5=>v-(-{N&(c<f%QhaPFxe)weFaeYa~hG_o-9
zw_wv^{B~xeYj~w=crAEr;nhcc^J4Oou~oi%neE=-yEh<py{l~Ztx>1qL0QSE2Fk^x
zQvxJc$*BgUPsW@QAlapdZ1?h(ao5=HO}_Tyk((pSY}=BbCM$f~25Y^mY}<_yr{n-P
z1|9GLU|6MxCFC{In}g#90DrTt;p1yJuPw8}t-%}DR{8w^EzQzog%>th>s@1oWq!X?
zbbwnhoQ3j$bm5d70J^%NIVC`{OAi6OW;VEc2Y6}L!FL1DLiT6u=GfxwN2iwA!5gPG
zcmZY!;07OA;UfUt?V!mD4^y$;Rd#ThkC?@EN<~}NNHxrzU{TmY0r+l*L+l0~vV&v-
zqc=wv<&RD-HLde)%WT)}PDqydw%eU6JjAN3ch#yhE7obrgx3g(gXadIWotx>#FJ=5
z>t6uuE~4d*SeenS2t;a~$dxDI#0|EKT5T8O8Dk`-;3;?|C*stO5+vqEABcZ>Vos#W
zu0;^Rw&PzA&GwO>JqJ=ggap)soKqgyfCDH45%X5mPJaYr%-f1GOxRz1&pKngxU<ik
zbBZyIxKz;Kl+_n84_WjBfSr>Hh8Ph<+yl5Day&p166}PJ6#e!Xek^Ce>S0(EqMt7e
z?lijNrQZO2n$!}*uMq}=B{faw;Y!nqm3QGdrAd<z!8h%vG?h`ZDJ%k6n9#oiiOX{O
ziRJVY%lWj{G=<0Ro%>?^%gN<|i>o1VVd4>2_i^iot-pD1gX>=Bx*r7^7Kd+7Y;+E-
zbPlZtj(+Dw!Nbr5?*p#;dyT}X?sF|?dVKeL_%kQCvXHAIpdJW-{l5g1gl%mh`!9vm
zZ_|TNR*|1eNUL@fQoyi;G*JAqTu2M5BU1mP^SS)o15CkD=m$K2oRWrf&6N^{DPE((
zBpsnb+KY{;0`VB&nnf@X!2^LqwhwM{lGCOx+^N(Bd;x?9#~?9+a4l6(=#5lK8Sq7D
zDp!D)pQQlK@KPqF;kSWsm=Irr#3lWubHmf7AC&&$a`@B+clsfBdOHPpg@HDBA8@BF
z4d7eO^!o1i@@Gz!Yk+g<AnaNLo^|OUR*N-o?cm$t>fyN!IJrcy<$Fi{@NR3wLpD9=
zM?Lwu^rL=9{QwM0KN^a*Px+#ac!l_45<0?gyMZtXZ)3Q7K$wK94Q39vle`Kp9&_M0
z8bMhFL5Jh2o;TQBUR8>nr5XV%#AN_yP!kHd%enm3oTU!<XQWH|O~7#g!>r5D6FX^I
zdln#aZZ^$*|FfjMG@@h;V_APp1pXZSwGSb2NxqNReIGqBS8sP>e&gO*a$^4Z?dngj
zd^%0e&Bw{Pd4!ysM=Q_G_BR5w<iZ9Qd&tGM(~^szB{6s(a4}0u_O+Zj;k$o=|7i^u
zZUhJ1a?CKfCN=g-=9e*IIym5LMX*Be3JuPwGODaPOw=1Ul7xO>i77Dx>U>Jh>6s+`
zsv05X3fQ;^+zLm`3t`i=kUciNM&e%3FC{e?bnosUdd@QUKX76upDJXPQ+Nt6L4%P-
z27}#Un7<?RKKd2^6{`CR)qIUQ|B8nG>4z)iZ$r&Y_qTmPX87BEHO%l+p_&<{UH%8B
C@eI%a

diff --git a/tests/lib/__pycache__/test_grading.cpython-314-pytest-9.0.3.pyc b/tests/lib/__pycache__/test_grading.cpython-314-pytest-9.0.3.pyc
deleted file mode 100644
index 7b3f7cd147c0d46a88d041742af08b0b3101c9f7..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 39560
zcmeHwdu$v@df)W0IUG*&{m`4zh}46l6-iN)sJFCwF15PV@mg9^x1e`08jh&hrO09R
zP!Gn3W9&1I3!HbAcx}%miyZGRi7B6v2u5rufsyd+BqzjxG31O$ZCh(!j6(wd5nv&0
zVDJ7B<o8weYi61pYESEZSk2}))zwwi)m7bJeeZsusxsigcH*NAZ;Wj5c%pP*ZeAnt
z<ZqCe^t|p-rL&%-#DBd#CF9;ro*p?_x{1zs&-!}Gl4U*qq`#*;S>6*!26~jF!q1hQ
zt>~#tR`ygSt9q)F)jc)Inx5KZ?Iw>Zul6MC)Kb8D)d$$1mH{@Ze!x{~IpAtF0Jug~
z0N1J&fa}ys!1Zbs;0CoCut}`}+^E(9Zc^(2gK9nCX0-vZS#1PtQC9(OQC9<QRo4J+
zQ`Z7+SJwfys#1$5wEl1DhR=se_43{b(_(|__=sN7%SNvyk|#$;uEdA*z=`p}p;+=l
zA{NyvPv68(GB$YfYApJuUe*_5gYjq*)oUY(#Be+sxjY;TN9m-d*Rk=DaDt6SW15ze
z7a6=d8V$#crt}J4K75&9MlT&=k-=Ebq?DnH3jUv50+{p+NciEKis@0kYKbcElSNKK
z>XUj)jMI*vKF=Pb)+WzDsd+*z?eO$TCnVW}q40eo54P><yr}!pGu%o%sY^FQvhIx!
z>Q&m+(edHIaFWG`hGI;wj*na)jYg93(UEZSRwAbRaGw~mJDyO9F4K_czVXChBpGAW
znUGiazIj8JFGsYPMvvIepnk05RV~J}j>{}EGB`TYej~yLJ5G)c#yTQHv5{n(_GWx|
zSnEi}w4~NC9KXy{VH%wH$WVLYmR?3<i)*rVXwZ*;Z70B_=aEu<yL)EKY#@E1FRS!V
zo_Z+zzkBYVp1bY+&etC)tLKzWsVx9$dD9b=&wc!9l&Ql{{vUcd>9NOGEp3-npIX)-
zBhN3!TrC%QYCu)mrH;~eZ^AJ*eIB)9kMTOycwUvpy}LZ)-ZQwuSgq>WNu4&fH@)X7
zJ)UyUMo+?<kW^`xG`731WPcAzp4x7d8M@{1Z14;i?Rh-C#(bM?(C=oCTGuCcNosw6
zOP6<#Q9`YB)bcsYi}YqspVxdg)T*LoOU$wz-kpxt5~XUjTGLnBA?4)y2FlGVI$GxF
z$IQ2S;WJw{f3Hev?I9`QJtVP{_A`%hRX*Lgs{UQx=Oatw?HpOPwgu1SCj2L)xFZ#6
zU^OVJ`-X-`FGq%TnZM^X(a~$y@WKLn#iHXtgnF449~y}aYq}KC{aWmLjK!09Iq^D=
z;6>GaT68plV$x+jz)@sO3nxb>2-WOnc<eoTN#a%~fjtvd5e+CI8Qj{MxYZrfD6{+g
z9qAMAks~2#qK>jVLt6025nf_XNDKKG;YL01V&WDdig1MKwc%)FcsR^r35LEVA}pqt
zhqcI+80F|y$N43Iw@x!Q%Jlm1l~Hyrk|b<A92wylEmWra3=}R!bp>sQMaOnI54xNO
zyTyTMYp?TQpGG|lijA$M-nNRa=C2oFG0<TeT`_2EKZ;l3U)urXJ?Z&*dF9m2)T;F6
zuDjoyt)DHOok(98Nc(@~p<DjL^2U_!_RSAVr1H%lmdaJW$ul1XJR3V^uHU_$Z91A!
zj?Fb4O-WOO@4Phi5|U~8m9%nfwrBPrQnLr=lvfsIW6$tY;_!hZouz_SDQ99{e&tb9
z`_h-{=a;&FHGQ%_E%yV?DgBGGv1gE@9218R9O*1CX%%xOaO&g7C7y~GrH^&xm_}%P
zruOt+`Og|^Pal+X7E?k<9K~+|%w0@@&ZT|EGMNB<Lkp`~N@yQv{AziJ+%6@g0jkZn
ze#t~c32&eG^&F%=*5!tx6S5i@yVzIyvS;jyh1%cYsNF)Qy|k9)Z9wg{eLkS}y8hD+
z)UFKJwcL@ag|5|#qGe?h@nXp+)^4d;wxc8m;g_jZYIR>(-ZJ1Hpf{awj+W`k&fBmU
z`OTKiUm&X*ECVI@m$4MYTt=D@Ro;e?E~Am=%jnaNbv+;JhUa6g)_^c<oM^Zjx$$OD
zi$+F*ZEdlU>p{X`cO<<M1l`rJu_S0>z3@gm*+xtT+e9D;@JYE<oi*cF_g@|!jlPN1
z;Uc4#fbe`53sFhu3b0?S0Q<25fCkm>NE7X}0(6D6z&|k(L9r5GQD{#PiXIGnD;U?z
z^@Hs|lUO+REK^M^URVUiN2s!~v|#ZGl`|r?7`=?3Zh7mB2u#*Spq)Smfn5N4g|nSI
z<$4K7f7VH7N?1%|T~u%{9XeJMW0??3%O1ARIxSX_0~%o}W8nw}#nRElmyR5sB5Yjx
zYCOqaL5(%|*Gl+$@hDKAZaI){K9~(0nmqZiy8iaXRCl^%@7-Iohi2OzR<BJR#PU$R
z{X>sb)jE0R7l93_3z<L&MU|?lvES@Y%fWY!ymbUWsZ#<d!A05FGjee1h&XzHWVc9W
z?OY^Nr>x@61kPAjXj@b^&M6zya@+I^9Ly<gi?Xq2<hHc3Q5-&y+vp;6ChN#Xdin*c
zf-|uox3TpYjOW<j16IU=cimTt=hz?53TL98V}E0%HL(Ah=r|h=UQH$w+QE*FNFv^G
zy|aTB19lJt#<4y-OesS9y2T-k#D+jlUytdPk&$FPoQNbL4H(gMnVRDZ0c$~pGkgtT
z$EX2X1K5iMjsu)$Cn$M}fPu<)FN4bYRhCEO>@>BtBo+_YO|X|x(hUQ*xnN-D{e{ij
z=5i5s0nKctrW_dgNV@rGHgGHlBY*q;s{4Mz$Oi}`ZzGJn-4!Em78rSR>U3Hrpln{0
zjXfi8CX7r+50C`1q-564MKX0-<mN$~u`u$sMP<XBvLP*RoAz-qr)*o4jXfi8ODh}1
z;RAUaU4+hL9oa}v`>YDi#DctS8I1h*e9f{Mebqzsxa!?!2t6QNu_$i{VM<I&-H{7w
z-OPnBMzRH6PD-7mj(y%5&mK$*ZHbZrV$t~4XVJ?J(aTSWK3B_#0~oMq^IlghYPNX*
ztqoz`A+o73n)dX%(z`ZZHtAiHFBrt)l~dn^%H8ZKtpAFnma75BbzSS(cd9By%UWwg
z9<^UOP;cIyqm2eL&;A8nZ*dIfeS)s906o7)tpJT#`AH=w60b*w<0LkO5OH)QcE`&O
zp%Gmf9Zx34li_6SCXt;!LGn!a*tK9AyAqQ03g;^r#e-yR^zab*^@MooX3r+Dd>1`u
zFeGEOy;dPRanmyi;r6*YA&*lb7u{PDmqOC{ki?M(Niy4a=@R2&%xwl)R(=>FOGpsd
zQN3F*_c;<?lSCuVo;uoo81yehCP&+?dPGWUxT@!G(ZO`p)*sfVj=yvB?VCTY|5%fe
z{cCI1NuTT#w4y2f5M(gKAO;z{E2Om&e%uY}HSjH>s(0xnkcCicH)};J=R=C1YDogj
zh`wY*9qAP$<G4H?A5O-Bpnalu98c<N!iMOV#3!bRVkB}cmPZ0Z6w^aZk?@#H$k|ys
zbkMt<jNT5`4WJQwDM(vE*S4%+)Yu!SMC*X|y8w{$2I{AerJK8ES7ie|lP4e5G);%I
zHGA<pvp-w22Y9rmd$PwMYG0T<`96(mrfO?i-uk^R3~@%G<btwwQO?;jbl%FFw~rR&
zt*9o>WRWhYiBrqqw_@@J8;|ulMf8wDSUAm=H$cpvCvRx(wRm<6kFg~n!5BN)Cqv%w
zs)<*IguI~$CQ~c=N+EBk?BC;%H<UW?pM}@VYtcgI5GDJHmbGxLT2{2|QW(fbSh>%a
zCvPYls5X1$XxSYPmYMj-`~_YL2(dHC9N0ydk){_lZ^K9{G}2t&@U&xH#mBn(`B<v~
ztOYgf2nJUWm#<Rln*hs^iASkW0j;=e8CublQ|Ge$0f8JjUgQ{ZS#pt+upvK3dmP!>
zY(9ILdUTEevGwe002;k;0_U0Be3u*6yB60ph(^pcBd&!_X-W7>BtFcB(adgY?|%UZ
zEO=<TEgRSc6i5vGzdmxWI@{8dZa$j{yzF4*KYPNl^_^pH9ZSotsW&*7Q(6~gW6#K~
zQ;_J=(E}twHBvHb=OUSUL*(Yc1-W$@r0T~!Ifc0G9<}D{oNNTzG|0sq!cl(ROCnwC
z+)_j$l+vGqNCYwh=~V=^enE&tXe}K?n>%u}QFKMovP+@pUzSL$<U}G=|Ig>cL9~C~
z^8q7_=kxIenvY5>oVAQZ4MSag%N&chLcS6z5iFEHItu;hxHhO)n#|Pbo3RniDE(*0
z;#1(|P<jMS@G5K+sT^L4@G0k|b|PW%QecE(wE#O3VtNh!3B)1UGmwc5=bya1+4GE8
zLoju}f*-aOz{y3iFf#PLd`)y(#DoW9NpKWc2U#qF6PgSAL<ECIjSKdv^NxSIb7~^>
zjkmx36aU9gI0IGJAh8Hau*-CrlE}y{Hb{r90Oz00Z5r&VxJ@>Oh70bGG2{}x@}v<!
z^tSuXr5S1J+tZsDs#|_?>0_qh%(LlG$jc|OXotA)?FKH(uYIS4KSz9m7?Bz#*GS{Y
zFx?)OBoHHTg#bMdY=}UvUc_VyeTHq5-<(1gr^2)ly7Pn0#h(K9tEf$noOTKPJ|~2J
z)N$`ry7yu>@G5Ab%GJ{icfT<^n5{fFdFD~|`qarxb?ao$`&D>2GF6>vxs$lS&Wu9I
z1*LOQ&e=0`-pZS|j~3)kR1;^iNEg(6Aa}lBg*nYsL4ndu7wOI@lw43i_vGvuI&bC8
z+eZs>H>!y<S)>bUK9IXV`-|ZEg``(4hMW+=lm85uM|#x~>1A@Oq(OIT%ljmolTCOX
zvN^A%MK5Xh6!K7~rJ*0JNJC!7xv2UaE(#LC0eWKj)(f<hziqbUfDW0R#YrzAv!fS*
zZzj)Z^45mT&aUrDRt*%?hs=)K89Y^AoAO4*RlaB&{#kuoombDbk9IxuO~^i<{Wuzu
zU5{m)X!TLZK8uZGL48B^S)^W{WX>B%;9_NM<2a`d&3(*la@lD?;a=7zm&<Z2D16+4
zqEMoN)mdzg9@C1Kw?PKuCea`Z3dt$e;JTd#f#^w%L)aVJhsPl3<RkzslLFb(qzEo|
zkW%#U@>Nr)YmVbkmtmRJIZ^G9Pj*62&$GMEvs+YS2LP1u5#q|B+84r0B=Urp(B<zm
z75JhIl<`Ix2sT?yGUf=iLndH>Z(%nT_noL-xLWb$iL34HC5!Gif3-4rdA=;YOt<Bb
zUhWe@QgMf!IhJ4t#3`Y+@7$epw8w?*-%_}^!GvxUh63U75iJHQwwtgS_ziU61pc*;
z0F31=8`uqwzH?7nKK$Nmck8oFdy)Fy;ZzA<U=H7F#^Jr@Ipy%8Z0uS2Fkd|E9C7M<
zcE&t4VO6jaXew1=9ifop$jas&v5rvaz1PIif_(UHy+~3Qn)cdhTFdBq$iLStl7$cr
z7cM<y*2R~eKgBcXTzZ76$hq)Xrdy;$0{Y4~-LjV-nr+(px!Ock)QWuuN+6>yccJR?
z9KP!kCXltY0J$$L>RP^pt;<=$Dq)ptp^#n|(PQ2cmVYI8Bw?X73+0Ms!S3NIF)mK%
z6z~Go!qm$l6>u+aqr@7u_NndzyiAVz%k@6&nJ`hhbwBy)GuQqV*2jI-s&z&0%SPXF
z9{R3V8(aj^Yt3<VwCu0}H_tACz8itMg)rLNyTL4L|0YUa9~}GjK7XF@`Db|}=t>|M
z&)oX{{SHC@SLK;o<rr<(`M1pt^HiJV>S}dOUwNKlAuzD=c?_6Or1=ZQf;o?u)JE{@
z8ytdedc$oi3fp*&OPLvQjGrjc;mG)4tSt&B3Ft>-*G4biDbf9~TTas2qnC1*30)bC
zTaFrn+4W&*86QsC{4OI5mv3!BYQnua7I^!F1>QcscEkSetf5^L>oQhjd+lzA-8Z>?
z*h?{%?fmr{%XNG2@e3CM-$G+;%QOb_XBM-Y+KWZKTFB;U3E&PNdYN%WelNOA?K&(w
zmR;yEQgi_j3O&>u+3B(0B*3{+TKNTQYRT@oDBo$--R(Ts&+Uy((LR2KPP3XX^W2*U
z#xI)2QfT-k{A;}?!#ZtB?u*;zRo1<`@3#)5qIX}pzv|(pmXAuLs;!e}ey-HMbM!6P
zZin9M|G~ib2are&%qgKo+1RskXzHkSWF<%fZ<Lrv3FBzNr7-f|gI%^=*gV3C)PQxu
znLvvV(XYdA9Zt&|Q(YXu#BWhH_Kdu73Iu?iA=1<fBX1rhjH3m+aF$xL$~Y4^vkbNH
z3D|@cQVUBMdb?5!ke{m+YUSrhEv)9$!kQdvf!5?NCba<nprY8=rWR-oewzFE{HTRG
zwZ7<mEln*nsEyB@T3BUL3+gJYGHWNwW6`Ul!4df3xLNEu)*2F{+6xZ*E*(w*Tx4%i
zlCy*yQ4Z1}Nlsl?l4rSIoH>5>YhdxJ^3Xr~E%V~KOhs0T`b~Xpah&hMxUF|!bkfC!
z;r(bTXW4t`&^Pd}{S`p2UiK@6-Lax9gKPgp&c(IR$F9p43a*?tgRa&RC#vNZf9JIg
z_WFYmE9PV3lD?8&KV~6sOUTQM*9zT5AbJ$sltnrBS~$sW9L_oGxx>hY9ETCY#1^;K
z>&m^DyeAhu=3mL3UbQf!#pejV&F&#N0OW1pDy(|oZwM0?D{o;e&J8`yeF#i!x9@r%
z_Dq=6v2{O!Z?o&W-cMnD+!tZuBK2%D#Ex9PtxP3MT<Exj-iXc7vI_@>5q5K2B23Jk
zmdFVmPvu6lHT&0Lh$;_Y?8Me*jlaWDOc<iB0pGp$sh+Hr%}CI^_NSbYg8G<|wNLk?
zImXy^*840U;e2jwVCC~vZr-2y3%(5V#Ln>5vgxUKG6+j2Fl?WCrF=GOK6ah@Kp4hd
z>8S5_EL_Bjgtg((8+`TRF$WCIs*7HgR(uTz#?Vxrnrl00sVXyVCxxz3s8y3F6_Um1
z(PM0@4^7eU`DHl1r(qw%7ryrOe$WK9f+ip-iV#Y3=Flb#%84iBg;z=)=+#0|X{nmI
zqB9h*`9xhx6k!#+(H)IcB{oAS_`$6!ngsS^RDF%w_@-pg5%Y!a0!uh>`K#ZZxGQB>
z@6P(WCXYX=th?Qr>YjOJ){SFq2P4<iKG)QqmfO=vDD8`~v1jFWVjArXktUYR$eTwA
z<7mO9F!G2^v<sU@IDu<fXPgN%`F_=@nZCuglk;sSGga_7J4MRVQyGPl3krP6a`p_J
zxANxgqXqdCs);jMqzh`|R3QmNQ^tkM|A~+La()OF0mJoTY5*c;u!0Bxv4nSUfu$~p
zr2Gx`qm@fo>e?DA7orBlM+0XA8*|w}hn9%$EN^oPazUopS8y+D5f4D;#mZU>S00M?
z5h3Bi>QtItbF^$N;^sMSTkNLSs?X7B)q~wxHL(MMNrGB35`B|g3fga78$K9}j3=)m
zst5wKP|T8p!Pp2zY8jlU&G9-2A{Ghj$3^p<n(CvYBZG0O8W}eIUG;J+^OM$`%LQ>F
z8G$K%aC_c$x3eI%&aGOTDb)fXykLuXK;7{RY)6m4@@`Bh*+Ydmi_=9Z@;qIkN$M$%
zq;efht=qKkJscZ>$T1ol4Bx=8hTn|cx-rTI5s)ROVU*eL<ANlC)BYU*=Y`TOUr#sp
zJ`8q2aHZ^>Jo$6e8Fl}|>UHT2XC79sNgYeCdtv6aduu-|^;VyfKJ<92PD=<{;PX^$
zOhq%wj+xE}N*feIi^`rkg>+!|WC0xdt_<nB_6U8_14_eR)KE5A`J{M~3sp|Yf!1mn
z#Bk3QF<4wlPQU;%O9xs2XYxy<g^Ev!7V0@#XvjeePPMfAt8VFqpmi$ts@rIxI0{)B
zEkG~y%+W$!4qB)KT4<bTG^8NOt1zz)ve<C!I-GG_B_jWZUYLJN;P(gw34Dromd#Mb
zI{+@igcB=#(;(DOAb?8lix#=e-?KkJ(<9V8UBh_K4yKzAWdnyD-i5_qG5;6k6`SYx
zA)=1um~DIUGFg5wFgG;fa?oprmyFJuE3miE(_4yW@;3WnR-xQ7eF_grVFYFxF9};#
zKnRajZdUEDGcA)4w~UO+r9DI;V58#}Ptfy%$7EZFAY|U)i6f9s6VB2fl{i$=<bnHG
zH3RB^KP!2qEdRhwRE>-V$HqtTa0IU+WQ}<CtfgY2k$eM&V{N?1V9-Fk@PI2t5S!>#
z1S0c^P&j<G(0wctN60Nb&=(mVkMWQ<JfZ>YJpVoQ#abvdx-m<2vC>sQv>v_@Pa-^6
zGAue}t_JK6(c}&MYqS(hdj3+bC6}$P>8;s7Yr*5Wb0&VT<zC}#;Qm?ifb9qVm#QuS
z|Np$a`kkh?n%<pAhhCg&S}Z^Qp#1nF|GHFH*1!GlN<4uR67sOmmmmLxD`Ngj#qq|{
zpEfGTx0Oe6VQV9NM|b(eoxpR)<9M*1V{&T<IT`l(WAb_;+C~#1APGYA1v__lxxH3;
zof)kPgb4+J8i^tplt?oWf7N;IuwL9LBO$I3g0~>GBI0aWM8DG&(TgZrcE7?f0J;_#
zW`B%pPU7E*()XvE4`c%e9q-IOqME?EZRohWX|{W|Rd|krZi45ympsS6j8qe3#bBuL
z#sQ`O(o_?7#&dCdv}qcmr1Lf&L66)XbB#!yhJVGVrs5QqO*Iv#u$F$;DB513k>D@K
zOc~Mkgry0^g$p(4ei+JgiplK-WB&mI$o`1HXY%SCUfHV?LlnqcVR*`okAPRvl312T
zVuS2IqF+<^*M0;bG%^RLcV+_}&KK$4`q|45t2a)sn)c7^xErUn=Qv+`PGIf%h0^qS
zavns8@D=M6;0iU^{2db3ZU|sjZV*ZV%LdMCyBXU-5%L5SONA~|K4s`Kw}2|e-zob&
zDgI7b@5u_d7Ap`zo`9l^N3bl^0Id9-a?=3NJ7t9~bBw)Z^uEHeuX{fBcpI>}TTZ%a
zUvkEt<Fm$z${a}z2<OTx@nKlr-ay#OWNai~@T1V#uwb`zB^hBLH^P)>QwX{@x_d&=
zVo?@LYVC-Qwv4OtMTeU*F^7mUMkKgiW=KwSpTXHJFSjAYV2PK7ba|r-s&w?7{S;N+
z!N0bZOEJvg3E99NV8fb*+k>ey)34k;GTZU6W?hO%@EQsW29KGS3{lPWk-KfPO@$$9
z1d4oN=J*2zqU58C%HBBzX30nIH430Wl5Ff5`Dj{!B`_U6pfn`Gl+HTxk-pa`vhyI$
zJo<d`F<}CZkN*nH-g5ZZe$Oe0q{&%u@vz03+SoVYh1*L(#9!#1@esfQwiv}y#bf?f
z=fwPd?)FmXQX=%^_Gr3l^7GwZ9HZ}E-gYG^FIv`?tp$pfwN}55(j5EZQWA8y#96M#
zv&lC{%cK#^+aUP2bf^yfCw#0;_YihcC127STUY^e$z&|i30&zHewpz>VG2ZONJQ@<
zBqjvdZ&T_xK(03kV$*PMkX`3ZZQudY2Ewva7>Ocb8<X-VHkKm@97!A=9`G139X#k&
z5<zTM>fli(1Rh)-evWES0yw=qx_PLE91o7p!T`S8Ij+r(;w}QbhhcgPI@FNavHNKL
z`}o)XD}bOtP4UO0EuHuFWw)G6w?MmfE-MflVtSpsU&q0ma&A#J_N;s^tsrohog>ny
zWo6FuMyx|CfyPo%>j<UHqXjhD0xcOe>bwBb_v@?*&ID={O2;jn_%=ABNQL>yfF3CU
z5g7kS06rTqTCvH~R{_4NWVL75PciDjX#q!>;<a{ZEa+58aeMZ{U@9MQeCWqm^36R2
zm+B_>MhIVrjh?G|X^+vW!xqQBlKZ>A6ppd$S#~o<<M_dSTJ>Nd0gl)9L2&o(a0JzH
z1|V~9+rk?;aVXr&T2ynMEsT%jm_qKd(Yy;sk5`#__Al<xO;txY2C>}DpgMR0s+N5M
zUM=*a8G8N%eAS+S>Sb_1`1unc-YO?<%n3s1{$77caR6TxU^Qty5#EATq4)U{@sH_=
z0LSEg-A}yHPK$3s<i=jS50(@sA;0b!`;OuJ_I*no_hf~%LOH0C#{M3M@7r@)!CuOZ
z6vo@46<`8XjJ~ntV0pYtfSUjX@_g!)ftAx0ipjH>zp&kO&=rYN&=+KHo8mmoDz0_~
z-{PYWYQW)3((8J~_n1!7Cz@3578eCGZ*9aL5-atD>`xC%lb#D+5Gv#xESVw0z#Mrw
za|9(S(S^bqr<qd3a3S|Je)JN=m}7sA6j!Zo+cqiWG~Ps1-^}?3-X#RL861z2VUZ)O
zO(Ob3sz;6g{iJ6uxchE&c3U>spI+O~3Gj>DA5idM{{w{_j(xXrnDZT#1Hy;z!1*)W
z)`<N#Dv;x-4?jN6n(a0@Gafh(zIuW)TPnV&>2ePzLAdpEcYGv89V8swoEelfxWkNG
z8gejnonQ?^6?!eKF^08veCVm&_{I~XThnXo{A=)g9M<@luGFA2RLNUme?g-ZCqQIA
z`|ku^Ch)reIqcpp_G^^o@ciG;ewJUTOH3^BK9(2`i5JBbH@Z$QTSV!9g@3ISG9=9F
z2XgBNa@PlP|NHfg5Wl^gso$QiYfa0$rn>Q)Q9?-0T;O0%*|jJedq&=sCI>$|N2JLf
z)5x2rHpI~aO4lLQ9-YZLY9c*z!K&a);0$#rmU=By-<qzYUQmZZFu<a|T;`ylLpz*J
zh_u-S9OZY&=sxd|QIk4!xv&xBSRMMNIYJ%w2zAV;X&GQn=~$GFJtKGU5wdf{De91s
zH!q_NakPNaG(vPH>!^wJjA~VICUC|cp*zhpuiibI*>N;ohuDW_sAtDA%8N+Op5S0k
zIkPAmdqzH!R$jCYtpxSOC^AoNh@%CRu0zyBI+JzOM0)mwRl%9S8M|kE?z(1<r)7W#
zb+DZ4qTaP-NKV^zcQXfb3QTB?JtKE<k*u8~PEmi1ym@Lv94(-99VU{_Wa$!C(rN_R
zcM*a-zdezIrV6DAWWmiPte3FHEGJ<t{&60%y)z}RIpK!LV+ekmIA3M?Zj#3kq5ixL
z9z*t82l+<+wv5M+9Y?`c+MsUW9z#tz9z*1pV@Vp^WdSZy3j0d5{N!y|xh-zMq>ToJ
zB-F0w&cj>vAf=$#4aLga6ax9-Jk5P<=CqpKzUzJ1Ghwd%*8P0?NDqkBQ1remN;6MN
zQl>Vlt6V&W)|yk`XxTl|gGDEqcLZu?wL!te@I<=!8R4@=YsTK~E6@3W6xOGlNEz-=
z+(!PyZR$Vkpmo10PuyC^Si3*Xme)_7);FN8gZEw`O}g?~bfZblS&X~;d?jRPd@zf7
zAE#Q`JPOPad_IpkdXdk59)sq4ZU2Jz9Q-Bcar5(e{D+)Jbv4+x=80O<7JPX-@J*O%
zM)NM*+BlC#p;*q`kytHdf0w`%fwu@;CUBO(Ac0l_L{{X7TDi#Hrd-m;Gve4d=f(aR
zrA`1KI?BFfeC3Rf@WKy;F$6|MoRqlu<{6Y|sM@_u)f~qTEhoYX`{%fFj!#{#I~{wM
zy0?=+8-W;sD+GoJv=itcunR!1&g-OJm3wLS+bDIX!Ztu=Kcc#P>^Pssxig}6><H>2
zp!)An9<gm25e!cDOK@KYIQJ#Ot{5X^f0KuQ#|_3FhQV@PpZ*#R-l3LHd5QY`1${oP
zJKel58`$qK8C&x6Y5wZec-Fraj(36T+fvFm({%4jTJ6sUE}^zk_0FNU4&i$`?`{9V
z&hPI;BDK@`VI6!mXX=piV>u!{1>$*|*ymBgIKqc?3JN3dz3tAz<`GV$c3LN#3ABiQ
znIB-=Hob*D_p>FP|GA%y&N?D(b_7Sc-Qu5XqYl$ofVLD?6czYYo92|JwA?ZceJ8-2
z(y}NUdq!?alWNn>5ox-rkvA{5m`4ka(phTCD&$O{0?omJw+^J`^{EmL=9Kk|vax66
z^-~9|BP&4{F-pv%gmJWB7tT^^RvBjkXO>}bZUZlT#taTb>0dAg=V?R0iI50I7=vfR
z;5=2xH<j3iF9(Bzpaw-r<E0s#RqE<z&fu&m%HUKzLk8!o^2B}77#wpJ|DhP1=kxg4
zFgVscs%t<PZ=R?y31f1SbT#>3iYC6u-UsPqQ^aSUjwE)ai2nt(NE9*C3H&9%of=2f
zhdc`PUs9nJQmE_^6^1>Dhf%<$C5XvQ4r1~O^b;|gl%xPq8v7g6{bQ;>$IWU%GEN^&
zH}B2{x>iUrCbRx6Usj57`!v3wMnD1KY3vz!JEsBd9C3<BCnIlO-fkW(I7(-!DXWk(
zfeN1k#RyvR+0buGgdiq2rzHi{Jhb5FDxh`?Y4ueR7d{tI17&%f-zI&Q0_yTlNxwC5
z0rkeOq(E#<Q7U|C0rgsS-Iq#0EilJdQ_=8+okuR%3_exxvNL%Un={HM7sM?u#9S%G
z#{MTz&2mJR_++k-*a;#lF{&Zh&h;4o;^C6wx?iB(KgGY6wrH=F$5VC}*7aor{a*@U
z-9Kj1Yepu~nFta>eg-}G%#h)0P|_nO5Q1ZX7z4ghs4k1hD(UlV_PlO<YR5F^O2~aM
z@#^0L?{hy7CZbqEv(mm&w@(I*McjN4$FBF4aedy@cH`Rc;DTivG_yrSMbJO`VTHJ%
zf5@Skg@wX^r3deI)oGG+oHmEYf_)|T_=%RN$LynF3t-oCCj_i|um#BdHh_D1%g2bv
z8h!TnVR>CW<$YN744Z~x_igzx<mvv)Db}dV*8w1~6q{qtU1EO1D#PvT01Xh0`Cz^H
zUA1xSk9vb<TlVkRpZ8VJY>$0l-uKYYR`i55^;O~t+t~jnjwfuDqmS;hXlb_bErc)b
z)3~PSeN>qC6?wXU#8CxlRh4ItT0OAxS*$i&Hh*EkL9?h}79Xh%5UFeuHmF`9wIF36
zVuliW67rIXH4%I|K6wx!<9I|6MCncjuZ)h53{C`NH{%Fxg4lISln9mR0r(t`GekYW
z7a<faIT&LsjL37tdMQ$)O!q~{nTB6KMLN-<k&&mA%1qQ2^h-GW9m5?gNdUmk)*Byz
zRp?GqMZ%FU7$4!LpDylx@a}8JJ>zQ+&^>0TRp{Oft?4|axQvIMPZ9F_X=FUQOfAEe
z(QcBr_9JSohX5yY;*@gJH+Ip9a|HGhur-f_Hk_JAN(=4d3l4!1w2O(zPy{*Qs}VL5
zL)=w9E8*lQqP-+z?Ej#dzreqCm=iF6P1c~>AAvFvLZSBq^{KIRdr!KTWdj;MV9(Wx
z-swwsU%UTmHgLg7sC;HR#T0}#wgS>TDxy=|n3i|^Uh5BG%(`Q`bxwhar?F?`9b7s@
zhYuWStH1(kK*eVipSO?b1gtdermS>U-ofQGp^)ysJRToTBJd*DPU~yKA{JJ=_-Yr7
ztlL?P9Q_dm$jZ{oj65MZv>!pTLlkjkODV<m`22}1qeHR_6waq)7wC9O_tW7k1TGMG
zgTOF>UnTGx1X2XvBk*Gazfa%~2>c-d5-0N5CG4k^x=(;?hZ(7-xxkST<;$NQ5%d@K
zZyxdZ`47WLdVDx`ls!fso+^)a8vyS7lJvhlp6_@TJ(nJMF8!U?D|!E~2jFp_L<&Bx
zD3dNpj~jea`{O#FwEb~iskH5JjVy&8SNWuUkL$eBk;j2DX%iLM@wm<}ot7T2@*xwY
vIcTLSN2tn?$5mwj4PNPM(&K<%+Vgl-K-%zlUAeU7@fv`S)|5#v@cH`xGC7qJ

diff --git a/tests/lib/__pycache__/test_harness.cpython-314-pytest-9.0.3.pyc b/tests/lib/__pycache__/test_harness.cpython-314-pytest-9.0.3.pyc
deleted file mode 100644
index 72c28ab9eb8e98fae4d2f6f0245583263d5e9f8d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6910
zcmeGgOKcn0@$GU+k-H=%S(0VhvPIcZYT6PhSwG8`ow%w~IYA9}?a*zZda)um6yw#B
zdb_k^1_o5L1)L&)btxbN0ipsu#Fs+H9CPcH7FEeuIEMfQir$Rar<^)(ci-+(Vk)UW
z_|O$KZ)V=SdGp@v&YO8><1rq=^M~g>ANF$y<p@EafP=i82V@!DMj~?!8I0Akvw;fR
zhh{k=*k|{Ij1cs&*IH)7M%dy7u0>{fgYQGX4EH0E8)IyJh9CyJkr5R`0Apecz_`eq
zLdo#oiNN(FV+sYukmS5vHWUnGBqL*8$%_@OD&$nTl2?XurM%K$Yz&4HgI{iiiLrA;
zmGxv;ig_)V1B($^F#s(m2E~xrB8J6Cnn@E54}U_MO|d|;)a-p|jEYe)CdS3qbbyXZ
zv1K#|8*4wA7K(YT*p`WS6veWLYM^bInG@VzW-dzcF=u^CCx76ut}=xwgnE(KE*=s)
z=HgT`(?&6`?MGt{0YgxspNyM3MEf(H6!Y4CG-tgsghOqkGv|-(7=PFsN3mw(f5M0z
zVyD=3AX|^`n5(I+S!S1w-8;rLwb2^~wsnbz#qI;yw!eM!eCT`{2i(V9<H6Q$@rciV
z&DZ9>(`jxnME!zLIWX5lDW8s`l;^emDC@rAbbl8Zv8T*tnM|)IN3k*|MpDeu&1~@e
zEJEI!TKMx0U!0&LycWLI6IphY5&Ldk9c9OyZwcP)*Z6Jm?Ot;axWQ07x~FWA?r=KL
z;H{8&Ozh2u(o6%l<p4fxp_*w+e0Y&5a~ByrAPP&r$|O8LP;BXE*&vA(OONT>x=$}A
zvJvoN|E=Hpyx6(t>ciAF@%U@(@z>FdVesPb;d?wuJ?FLXJ+}UvU*pODe~k^_-GBRQ
z?12R7l*t+Q%ZeG1b-kz?vSyeqi;Awx1;uR1l{7;EPS_}wRH>pXCa1}Z0Dma!3udsW
zl`9~gD=jX{TD}@tkni0|=s8&foytc>HA--;s1*`g>7E(Fie6C-lgpPh#SG_4x&bQl
zg^xS`^y6=;e|*bi`06-Vp<p91aJqbdEUEJ!J4oV^#TriPrwL~ysSjF=b4fkPKQII3
z`@<$*)QcKfK~5oz5i@`*8o|zClEe{g>^gzt)wpk($)rx0lLU+=neW(3my%3%oJ<4S
zZ3_VLEJ)46;w5L%I{zSX$u@2rY++2csKM}2$I3)ev+RUETN5CTrqQIHDpd^FL5XY@
z{{V&!|C3qiel<>b$mBlMOPU$XR~E~<$<EA}(K5+wCEdV^ya?T;qJ~KxWA8%r?ecwT
zzEV^T*eo-omU6PHn>=t01soz{^N6G`lqzao!b%xS1+_FUtCHpRh@{IOD1>dsuUL*M
zmb9xFm$2C(eNe)SvLPuSl~q}j4d^jr4LfDFOE8yesfFDGD-~1l?Er2EBa^%tREwIT
zV{*uu0<qS1)~HR`(~SFjM*PH>Uu?opocD_j`-yWp$vzYAW4@JsA6`Q}J&)nNE@`QI
zGR~)Gpgc&!XQ3HG`kkVx>S?lLdRi^cTbQ&UV=!D#mG7IUt<Cg10yf6La?a2tYg0h4
z<#s#?Gm~6hzY96{GWu&c`iTD>zw*xNdtdad|NL33W91iXhrc+y-u>m&S9fX`#9H(Q
zVP;peYqx9Bu@}L>6^4I-0^<UJ&ge@Nh{RrnQ6%Q5nO$304}Hmh^`lzHwR-s77eXi&
zTE6-!j)d5w^e5@nH$G23POe?8A3am&&o0kA7rIt*pH0<;ff_fkEevb}^!@7Gz^4~n
z#zq6>vSH*(&ZR)eN4abe`fSRjK!{RX+(2#Dyqny>Ga<e$^gb1OYuwP<v9%a!Vo!yk
zZO-}CxuKfS>yj?=aMVRNxFJey`J^|ADp$ylz<_5$PmLS=TzV`)vno9m2DdrqSLX&F
zUbd;fc&OXG1@bpsYQv*Qhvatn5fFlPKAQeyy2kabj#=QT(6`MwzdG0VaN4C@ge=OD
z*Z>KK+VX@qi8fcpk3i2$c>Nly^j2V4g>M`n6uwR=fuO`qF(NbR&`^fDg>c3#NQpN4
zDMLBD)ZdS8v9zEp2gJbAOqP2WExk`uJt!@&St+|)He{gO-uXbv-kJ?Uve$NN%$Mvz
zq3xB98Mk=xMmH@+y_wuX)<42q6-^YWtxhrOjc;21?j8?C6kG_*?J>?P5XsfSf!1Xg
zuAcs;>+&uD-1XRnt2eIcdc1M44!F|Ta~*CmDDr#Cx<#KT>?!M(IO))iL<ufQnv^Xa
z)T=(t%}e@bWlJ2&76$%V_zythdsB=<!qi^vbn~3V(2%0tO^{%pWbhf7%?y)HlK0%i
zWOke&0CpK+xW_=&sfI~Kj7bS>a*(GCf7j|uM@Asmu-NCWKOxzK{)DOCCTRiWDRv40
zh?7EmPkxeD3?&Cy1uhl}3g#bJDHEg{cFIK36v!cpx|NuWKuYyQuu=odR5RW<8V?X#
z&k}HofHw$ulK}Dx;nM^p0XzxYOB*JP5di-4)j5XkRZjVP&YM;~Vjt=wBMsE3PQm~^
zl<@>ndeS;q8%>n%7G)`?7A3t{&}0=uFhNJ)U*7<*jGprywNq2IlNUDmi{yNZ(Nko6
z?M#guha+S4MqL;JbiLCC!uU4l{5FXIoG%{7#y>sh@;10}N^M!{+#VYNCOHKh4IoV^
z=vX^rclf6Pe7)0^-+_RE_&h8t2mkKlM(ehw(OP6DohZGJcpcakuepCYUbk8Cy1gM@
zlPKt3^}W{d8m@1BQKNZWbT0y}sCbZd{qN#+WFIlui`RR{U@u-r_uQZFkJoUe{U49l
zt-IoNE5z$VRbI(0loAVys+Ov4TFF=-6)9F!<t|jaN!E(nco04(d;!2a`-t24B9WdZ
z;1YoA_%cD?Cg2JIGXR=Iuij?SOH&F^E!|b{doT!I3#8WpSaEA@dM#DEoY~}m+7P%V
zNZ^|A0@uWP7tjENiEYmLZ4v<pTpq{9NvLvo8{7n?wk&mSkBtD6gep&(QqVyH7wPa%
z1NeHEE58E)1FgW7Y&FBQ`m|0Ivq!Q^&y;;UK;4+a$~{~(>^lj#4Irgti^H?eBXXr{
zwuApvJq6`g%Bj3CiF0O~$6th!P1U{A&8oJn1H#(vC&;qNK3c-lAYD<F%lHOpAsMbt
xj=dKFhGG7O&~MPMg|AWaYxL$f!M2s-^<dvJ`)w%7B)&btG83<OmYKAa{R2XcmSz9|

diff --git a/tests/lib/__pycache__/test_models.cpython-314-pytest-9.0.3.pyc b/tests/lib/__pycache__/test_models.cpython-314-pytest-9.0.3.pyc
deleted file mode 100644
index b085672bbbba19b0386bee11168d783f6a121d5e..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 16811
zcmeHOYiu0Xb)MP1`<&(SC0Q@ZT2hkfH7!znNR%ncGDS<JTw5VV^2$wFZI`<vYOUq&
z@Xo9yF=!(o0V=ftY&1bi2LYnAK_j()_|py0!pa}JDA4jHlA6Y7P!urw&y^e))vum&
z@4R+2A}Kp^`a{FznS1Y?`<Q#~<DBoF=}6Ya1-OoV-tu-wogm~fVqF0<aQ`nMa8)=b
z$l^&s7uh{9608Px2_r-g?cz_udKjJrPDVzedX%LFPsT>#dVECErICc57^&0iMv{7R
zmmrfZf?h9&fHufspp9|_Xp<ZT+APO_Zj$3bH_H;xXXFIXEpi>ut#T6RHd)*)r0f41
z8+huLXw+wNrD7qc7c1rClvZduWW<kE3loZdT2=B!-SNwC)0M*T1tovcNQ~0Tq^b{B
z%IAv{Mr<@kHKia|%SI%l&_Xe<*YrXR>IlOB{pk?)!kCDa;w3|W$N@PhlYYWeR3Rgb
zTD``T7*$0%G}W33ye3Tbwwvh_9|=N>FlOcygwe1ogBffU<ho3-SCo@yn|cF%Ca)Zp
zBV$pEdz8t~it&4^&}SyR1ow%Zo|+Q1+!$+xmiL)&X0@=e#gi|&cD^@6IU+}0Z{*J}
zTJ^}Wb$J68Z&&a|SO2I)mgGc+bcr>op|M7*g|3pDtTg8iZyT~ow(b`|Mu$aJJS@^4
zkm4qX+X|^-r+`vCG$K5{9P61J8=lIsMq*E1j?Y-qbM7!3<6lvZjZfwHb;?l=!D`?1
zAavo<zWv>2jHq&1Efw=ceLAA$bLCV~zeS8dv0x;%3zcfAkkx5%VnU%teX;ybCC|2n
ztbRpRjIhd9{!>O!y|UjBRlN5gIbaA6k~vLNsGi#Uyn3ZStzmLM>@9(!mKLWwF|{YH
z#lK|+Qb!o${<PLP&UTvkpQMT!OFNL(Zix>9M~*<g{cGfd*I>I6<UGPlgPf3O-x_(!
z6<yJ4GbAW3jYgO^!-&74UI89Go1;c!HlHh%vQ$xN9=5z3Rg45Y&W=}$CB0bIjBpA1
zr=%HMvu4*JYj&bk8PAon<=muV#IjoMyn@L_a)@=1waFUI+4B`TnbWf<k6fAcGZPMb
zD<j0(nhsN3fei`k<vrZ(j`X=By*$E;J?M@c((oly>@r^NdJ`n2b&b<pxlk#0Udquz
z*Knnvbmb<Lvc6BdSS*#aF6;%Zt5h6kq3k3ul(bIuim{#b4<8HJJW~U*lf`mzGFPJe
zpc>rrwTrMFUKM^7-82)KC98Y8K8SrD6l25U)uCTYiFXfOKQu>nyr**_?N}k^Z;|Y{
zcF2y|fot7%Z~-{X*s{aCgr(U$?f~$wLX8902cWW<CPwBxH4eBl@UT?}#5^^c%uwS@
z)5G~VKi1fPy?>5ueQ%tR>ql0|R{mQgTd(!oF*|@2nj8zjVaApn<|Qo5=5Yt$$$jYa
zTJ;kZ8>V>u3<98dAp^VoRIFB)DrIfyhsCT|4OD|!TOHaZj76+c9Q92O^@D1M7pSIT
z{OiMd&G>F1C3FauQZxbTU5iiz)psXw<DVSfEgTN+6pCUxLb1(fs07pyFMs-@_wo9N
zqv;R@-CCfi5Yc)h4agDaGd5e*3a4NN3-ftvglVp*DH_E#u&I4Zpq&_g5i>pSGj^%+
zHcUA!TP<HKS1y&a=Zi|IK>L9m6(S9lK^V!g`geE!`OY7FdnML>C)Vy7*ujs3zk#9s
zhWJHLh(8aH;kOWLf1u$!_?yO;L;oZV)yMJ>u`ldJCYv8a;K|6Q-&9AThXeTMl1X4)
znE;0=lOXMeCZ<Irk*~m_Sg4drpzohb6N=qJ4<KPZO)zv2iPPD8S!bJK(qOfQrL?xI
zbC-)69e`Y*EDPH6Kzw?-=|=EQbdPWE)(wPyAq`27=-qeN8dPg?IQ`;E2Q$Psoxs?(
z`*F-RO-Au$Jr~R%Tg6^;2ASmvu36=sHNWke(6EqW{bu{W%q)HVH9j6TJeT9su?x9N
z7gMNVz~~8U#ffsRWP~(bQM(}_nnznQ3t9)sS7`|cSgx!WvuaM)6<R)Jh&e+XPY38A
z^fP@02n;z~X1wFhGvxUEwp4>GuZ1d8)go0i*lBZly>um8DJygs3TuIXZ2}04I?_0^
z;||%2_x?L%H{Rd5L!NtR-f#Y1JsXPu<y{aq&z!!g-HG<Hr}MG)FG74dc<?D(66SD8
z*a~&sj!4Hgg>DZfkF|$>xk);<C-%#B>DZy#F>IU7f6ZF=gjR0>#=SX~VN?+ObFC9(
z-E{&u%yl9}_d`R^7@_gXL{+0`BcGzZK)q|iZnh?PG|OyJLq$T<VeKv}1$qK9J_r9A
zE<Qdh!ls$v%~x0VzBqdtSKa~NrJ><S=pUt_<RiNEZ#fzP#q-N=Q&a(?m}<y~%wlb`
zx3GWn*!A|bx5`31K#peufILWNMHlh_c*kQQ53Qb>e31udXjsdR)zZQaPkvuS;o<y%
zM8JaR!T@~pIf#c$*5$QP3l9<zltXej6Z9Yv<S~#4V!gF>2P7f_NQ3}LM62^|zKDb%
zN57(c6Hn#K(68UUeEpCLL5_Vz`PMy^@9$E+as-fzq(M}$d?_m96oFgXj07P@ic+Vj
zJD!@3A=5ssRJml3^SNS)Zii%m172JM4j3IeeYzG65U0D>;m@9a{TpY!nbxZ0#L&st
zYxxuGH4K8)Q-GHR`Ye)NNK!~(;TLEdlHEx500E#KxLM@}!6Lyb!iAoq(oEAxUO>_T
zL_<Z1+4|h*X=Z!wI~<+QzLP6e!9IEec+e!$P!EPZR%&{8@cQ5!X`MO9$h_3LLd@SH
zX}vaR$LzrOz>GCH7D%fVTXvY2ur!;;9RPl)aYbsGms;jX#|?>*d8uQCn7>8RF(<Xy
zQ9FRum@EsV!-_4txR<azo5>x33}5qmLJu&5C=WetU$%0maxLT`li<-OAzuMQk1-5A
zUW2_pu150porP>f1F?6V4QL>bJ_|t&<hOkmf*Q!0XCa_@eA%-QE=TF996cENla`~O
zh5@?fmpqwFu-RDmREl4x6uCY@!7dV*I$?;^SD8RafL)dnabOLlU_7T?0PA&f63}$d
z*}~cf7z|MZ^s+`dpa>Cw+Ok%Sm8z~*^{igGsFblr=inrrpb^M#)YUX<!#YOPYLZb`
zQwN+vd;^Mow8DWlFjF{ET;tfvVOHiEkfqgfwxFEPRZF@?PXlKU{A*dRc0?QhLb^JH
zc=^%mN9Rat2Jmtq^HOSsn7>7mx^~o#*#U&eO^yZNFk{OO^AZ+h^SA@RZ=VUY%rwst
zAfSk>5c9W4TCM@i?<DXr)?ubC0EZb{c9@s2Ae+Y>0Dk)&+hz)L1c}tPLd@SHX~TDP
z5_lNvFw+)*!;CFE%u861&EpQhlZQ1Agw_$Rz5f|d%O_<7tmTn_DvktI0Om6ypcl+9
z18j5n%LGPI>GoXb3UavLOjm;$!OR0dV<y-s0@#QuC|oK5Zvd>-b7jaVV)~v7F=f0g
zTV4%iLZd$D!#Uwa)b7n|>ij4k2zV|vB9o~<%S0J^^S*_NOuWl55!<W54;O%Uc_tGB
z^k&Q1%P#aLKIY`|*$cocB5bu=me%Es*u05#d7~Du2R(_)b#gKj_n;@z*fy&xTqV2u
z%X$hemaLMkJD?}^pu2?OA9NTx1O4A(@jG`l0sG#MGcne;pV)o-(FXgr4*GWM*?)HR
zZG+r6hBgqpMpI_Mb@TPfapZxvlIL=JsaP#EeR7<9{&KXvTL1}bl$%_==+B?9dQWa%
zmp5kddSscDH_4kbNslb+A48V)R>{`glw|_scvr3mOL`kcOCudL;=@X*Br9r#>PGxH
z7=*H-!RpM-(oh~wdM_JM#P1Ql=cjVFqDB;ssdYSD1{fcYh~cOWR`z0n9n2Y0q3CiQ
zFntJMU!)Wc*Nlj1eL_}9QjE}rT=8Pnh~&oe-95d1V9b6KnnwQ+h!J8(%|;ma)FNaB
z<Q}&U^eE^?(u?HawA<m}5O+8@bc%25q1*%j$$neAwV^YIgkH-b0RpB&LN_=h@V9by
z<%8?G6|e?78PE@*$!JiZW3^5=K=0&d=k!f{#S9V#W4wkh+svM!QH~yXO7?B7Qa0*&
zVxc;zvIMGNCPXt`TX`)$K+(eVKnUpuZ<V-<!-)$>pT3Pjms`PpccjN1IlwRmw>v?<
zJMt1&3p@}6uE<;?a46RexcdNW0~>)^TB()`Fkn=T>W~Ya?zBTpJNOUs$~<}HZnSCU
z_ihe+FtOV9-0Z6#?^)f^_LIT)2XAVhhXS#_t3&@2781|QT)O%8hmqBt&&@W@2Eo-J
zap3B!zfL@}l6YZ0@xsl%m5!tH9Y+^Ch87dYt{z{N8s0r}{m4w?9cjlL*|#FK%}Z?%
zxG}-VytHoxQrT~j>|^^hM(<!492GFU<VuI|jR~IYq3{IjI(+>wVk+&7c!8C}Zd`<i
zuN|<{04T9y2&|aQ2&}Z*Jnn!m;9zh2v(UHJwy&b&Y2c)YTfgaW8XO@^1rYq*A~UwZ
z?dKbi3Bbk)C#L9N*(sb8<!HZoGy-Q!BL2C~mc(_>mVm=NTZ*w|>l6(@<I^G^(s<fr
zh*u18(h&2Yp4zJip9cka%qEQyQZB=(5^$hy0ZZ7G0-iJG`JweiWMi`u53<uiez=GV
z5=+%E1jCn)TYj4D9=D#uZ0<MkEy!~m{<T3MSA~Bkjh~R`KOr5f{Jiiz(}4g7_3#aC
zcJ~MQk9+3c%-o5d^&VrP)c(GHh=gtvX=w8!4zm6=;z$$~IZ)sPXR}nrjOce-t%66u
zR8J-dF14>(o1^JcYaeI%seK^fEg8bL@7=EL8=QE@(07AfqrS&fmTKp6sdZ}dQSo5=
zwnv-QLUa3e^7*KJ?tHL)!+3Pg<^9w|;J2Y6IlL}!(Ar!*+jj_0*fSx|_8lHWsG41_
zk~djt&K+7|E00zUuzd$W4TFC=3{vz_JwS>Plp<5Z9#@Xrm>jpCeJV$LJb}6Lq~!>6
z6IRB!ELq+dDy)@L?Y?uAoD-+MYZ)+8x9nK}<EKT)M0`ht1kSntl1XsIT#aYqR<7gX
z#BHXY+i&}b`Mr&qK0D5ixaN0APCSA6ZJR(=8#g$=?R+r56LOtvX8ErN_7O3-%sX%I
z&-*A-D=a7F`b@ZXeoy`9sBLOFb2*c=W-zS9G5g(qv9aEN3yq_ex1_fcuKsWMf876#
z>-MQ_WP&NRUjN&slxKadmz(8HnfluPR~rDF@RLA;BLP>9{w8qXoY04w1zcT14H(%d
zZw4Ts(NkyBW31^-Ry(Y_xu(bYx~>wq$KXE-{}Qb8zmmc8F1?eRX(EI2xJAR^H|N}Q
zGqlz@{4~gTr81cg7;&oTRa!;^&X6>{pir9C;Vje$Q0AxIHyuGB^CfOAy#xoVSGv0y
z?Pc`fbQGU=qdA4o4o%l{pJV2j-2=uECiiq(AZ5=Qu&C3N8idj^z<Srn>S}qdQ6)~Z
z8v87}4Faa!2EOH;H8q+p;oyl5U}`^mE8E=bcG-Jq@cGu)jb>kxG8Hj`-TkLXHh5Y~
zzYWYrIH#&g864{^Pj3)YsiLNvnPk_7ROwYL!DXKvq*K^{3MMlzbkiHWrzUowCf3Je
z+#$Sse@(2vCiap$)>FgT)1%?QOFevXy?Boc>qXP~LQcyTin)n0IE)ta+?!UT--jab
z$XWYeX4!p0UbyQ(zwF?7*G%6WK_cy1A?9z9?81X*CxM3%2sP6dfWwR}JIqU1kj>)`
z06zk;VExUJXJ?KvGA}*5Ld@SHdG;FkYC8!$jCGi43&3H<mL29LEY0R|2Y~->tZ629
zGqVKN>an|IfY&}SGc-ql%u7302soz?+-UoG=TAE!FnfZL`8L3y&EGN^z-BoKJbbOk
zPFnyDGv;Dm!WwKIPXMyq=$t*_NrmVQz?{vhkjIQcu{S#T11Q!npPkD$05A10B8|oZ
znpyXgq}Gt(W~q%VqhX0?wCMMxUkw@^@JFoGASlbj8NyVfH?3{iG<w!CUSJytF+8}>
zwJC$bKV};){-%YG(c{!)qmc0S=H)g<tQML|k(1Bge6WoeP>N!5+@-Gh^V^0DVAboD
zZ7WcoO*xF}mscrLBV*{0!!B3Jc&z2Q;B9SH&b2`%VKWTEW_YGX_Vu8CiK9MSIDi;{
z9`~b91zc)WZkn3$Q*EbCW(XReQ-5dm>;`4OH4_KfZ#(+~m+YHeZ|2{>HqMK<?h_a%
zwxVji%?8Jboe#zdd>_2-dwl(}2Zf0KOiqF#w4EzL0T*V9lg1D=ws)SPe*iJcbRl#O
zGepG?@;!B8y8S|qmK9A)fkITF11a{M(?H5Rv`xX6R;lv{LZ)LbAc?`PIKk;3AvuR6
zi{y`i7z97hnr?ik2znk_CXgkd^jO-?TK-{JkVnmdujasOp1ZDrf|4qhtuN#F2Xp{1
zn|cpuGM&5tOINAIm+4<&b2h5AybjE1$<^II#N0T-{M^jw3_d`GgsMO^#H(CN0@n_n
zW!eEk;T*<i>H)Uc{LJP@kO`Ztb(x57pM{L|eyFeR+C6*b!~Ty3Zx7C${U&}uHfDZ6
zcJMlQscm^rX5>1k2wV7Xk!-mJ9%p9E4&a7urY!)68C!Okm#{FK#~lFvyRl~Q?GMfl
ze@s?WduOX3PJC3lU0U6~W3KJ@0fUaeAznT98Gz8c=1BVI?LP&WH+`dhUP`YJ^S4OS
zbEsor^o}cRGb}?6*kw)j1t*3N0O)m!vcpRx&34oDb|Y$ikjU_VV_TMg3f0LqMEKwk
zRO3susKe)ob`l<|urxk6aHpcvNErIbv|iYd1^&$iT5VisWd|4gFxrn~0Ld_t6G%|O
zV<-F!$fpQ}GR2Rgl)Ya9L<ixNqgqm4raytFAbmlTfqWJeMe$QX_^$Asx=(~XUjzbT
z;5P!0d-0IC?OsDrd_%mK2#9;{#Uo<hy-h)J;9epqw%tn*areDs3_@E&;#=aqhOl_}
M^QNfy0_&##1Kq)vSpWb4

diff --git a/tests/lib/__pycache__/test_replay.cpython-314-pytest-9.0.3.pyc b/tests/lib/__pycache__/test_replay.cpython-314-pytest-9.0.3.pyc
deleted file mode 100644
index 4fa721da427ca373db23d5d820ce6c8786645179..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 5190
zcmeI0OKcNI7{_Pr^>Y&^0a_l#BvuH;ZLlFPC!)5X6i5)zXn~}M(ACD?I13xwnOzeS
zR4IoZf_i{Tts10CaOj~(mD*zuz4z*b5Tg|wda8P>N>7~nef#n{G9mP(s=`{$eBaD}
z9y{N7=ePTIq&ZCB_xY3ESHATSl10b<c`f7VS70oX4++nV5REZ%&#<@bX(z+1=4;0q
z&xn6Gpal%uI}#iYX(7XAN5aFL#<i2nff%{m)W?{8j_$DsspXBzcwaYHQ}BMbr$zX{
zPNKE&K}cJ9rjx`&zo7VhoYC1)QJaQ5K&3nta}t_k63xk3O%G1V*R?X0)P=U`xkHCj
z35|+bDRD(D7L>XL2tjAK2v2|T;Q~xplP6>XmdgC`uqeDIWi3^SjFC~+uB+uGhWF0K
z#yn@pZ10;^asEd__L2#|tr`v5RxOVazG=+c%ka(P+j>2HR?j@k`zDOAp>njTrnjva
z>9fos$n((+BSmaoEsw#>?cmrm%<bIJuC=DOEqA+f4DaLpt~+Y1XY6tKzzek=TbuND
zJ1a#<iRFWQXpBuVb)~+E?e@G}J@2r~YB|i>XZLL9(+nRz%9NO+4DEp^zFt$?R*9L|
zW!sH5tE+aet*hm|9^3&KkC)y95n~UuyxIeG(f;T6z`k1aYzzL$9L8~Q_)o;K)re!;
zt2q9fapc2rFl^I<)8gD!H7gdLaalzy=cGf~Votj6(S7PwSyA+0NfEWFBAvOYdrR|&
zbw8XBWko}$2b2DVh^VR()jGNkmgZA&6$?}67ktJM0>_I-R^!Y<FV^<N)$k3==tvt4
z4#(9*PR<D$mGgOthCk|%RimOm4$Yu(0U9SaX?)E*ys%-xZg`)0=1HfaVPcbpv|JG6
z9JEWDGr<8{DIQpQ;MB+m7ccQAb<Q<4J$$S*uSu#Vh*WPAvZA60R4UPIai%0vNpFH`
zVX`bMnp{wIzfuH+s_zoi>0(*Q8B!sy6emSRD2OwX9uib>O2QuW$Z=x@<znF^rA4Z5
z7jVHvO^~j`>K8-}YV?40rxZ^+ILD7)xS)qH6)v28e;nds+)vz3o3Wn5*_eSj;&%Gn
zPOs^hgAKTyK^0Gx4&$e1l9wRBYI2f_g<P?am=kF(c>;DxQp`&Q?U1oi)FiHrnpEUT
z!@?U@#Q8*NUf*Y|ngtxf6fMp`xPdH$5>8O1aTp4bLcIYXu}B^_@Az!|-kI;CYt5$@
zPd(z=s$6u1i&ogKdppd8>#DNW^?>cFa8bu~jHLm`TxGj#Z{4L{!~PtR+kgfz>aEPp
zOoh#SnZK*tQNB4{?M|<Brz=q`gLH+<R9WkKz-E?eo>Pp2TE(lNu)Ot}c+Kd^5xEU$
zfKy1{Ojp>><<my8!gW?z>w3U;E~OpMF>nw|u?h;yTd#@Nus28JHlXGy#GsyZkueR>
z%WN6+wg7`nV;G!`HA2MMo-wa+sJv(6*GVS-+6Dl79wM>^B6hkE(aW>{QA9*|bx>}!
zF(NwH$NK>i|9;eU5Y%4lEuyZ|4p2A12VZy}UK=ixY~7LP<3oIS%(nqw{V##9e!FKo
z2YlrKUp;`YCmQ1`?7Id~`<M4!Wc<~>+v+Qpvw!}d`;O-TwOVKZo~9HZRTNJ<itnd7
zBT{_pL}=>MnMuqC>WFI4B&xyl6yM>1Y0hrKG&LBqpk}}(E-)<O>I33(bkN`MCjF_|
zB9&^0^qk%-RlLcdCkhW|LfbKc_hdrx)sXJS1ix+2H!#81Knic)gyO3NpeBwJ572Hb
zN?_87<t~Wnr(I5o$tx*0uMDZUxF)Z3(1R$A)=`J73OPxW=!{&DRZY&)e$e1URy|`v
zN3i9y!1BU(=hlMnE*^he=RM+@ZVlZWs<5%Up*tbSmP0FCtjb!~8XH?0ay-XaI^vkC
zps>9424W~%X53=i!-|ko;WnU0oK@;(s=`LU$bXuL>~<a~smfZ{8XH|oIi6$SJS@d3
zC@gQiff&kei*B*)VMX4la2qfteGAq!-lhj_5;j7w?-opUPMAmvJe#1>9F;ZTV5bN_
zN*QIw#?b>M_*4ZrN%5nS-YQ)Ko=$)dEs-ouRCg860Ff%HiN`@fn35Gq5OhDnc|kLG
zC6cBQ`TN9}sJT;u??BVCB4y|?sK<|4Y8sMXy$r+rOvuONA=&kW?1$^8aAfKFCy8pf
neI?vp4IfwuANYFD!*Fk9VB}$VWHIoYpJn#{9$}e%M)!XJH=bX8

diff --git a/tests/lib/__pycache__/test_reporting.cpython-314-pytest-9.0.3.pyc b/tests/lib/__pycache__/test_reporting.cpython-314-pytest-9.0.3.pyc
deleted file mode 100644
index 376e4b558b9347053c1e4938bf67cf330e5e5ce9..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3235
zcmeGeO-~y~bY^$`f!Ej&l2ED?NDxi7fIor@p$Sz{nqb-}O%~%Q67p(eFW|6vH#2Ko
zqg3M1ACO}!RVqg+^^ikVB~qp4&_j+U#sZC6wYRG3B~m!$)S2D2SyR;hfsEI0-h1=j
z+c$6C&U+(~&JdvE@8A3GKji_)Gom>-7kPDqBAXxwWH<*bXwO`lFLMJREm*z*w!+Q%
z(*Y}x4q8E*=I1)nAuBWhWWOx*0v%eS?CS+qScXGD3jD=P-;_8Ky0)(BvWCmLMFLq3
z6^eOFf~50axu_QuTwYsKQ8n*#Jz<kb|I}0PYLS9Xun6g751IOx0e4dw?r!&D(|2%7
zXs#d5mnr}hf&iQWxq{41QjCoukh|ceB>uIS*VS@C8_OF7jRaTJ$|}yQB`HXPrmk8`
z23jLNyjs+C;z#9@QY?@NUNOqla|;!hmo-GhLh+D@x?Cl1Af)B#j@O+XQ!A9ENjzeH
zBep#vT%Vb{A^8wH9~5DnP*dGlC4T2Ugs1i~3(vc3keT=b))0>0Lu#pDlwuVX72;Q^
z_i=SuD_LW<vp8;P*urtWc+bWZq?rb?ilyb4StXo;&?#!rPycv|f=%$Nc<QS!9)I#|
zeSdi5`ThO=;h!o$KYaRdH}&cE{DF@PFTj_8^DV$XZnu%IsB(Kowa)HG>T>(p{r8Eb
z-1GFDHuPvPUyqDqG&qljd*)uHV>Eb=#xZ#$3-src{gZBf#sm{AvOU^E4#>eQNU_7D
z!-QEVcZ|O6Pz=f;FJ*E${=vm8e<=;XgOAU<V#`&QrHkRF)-9F@N5)wopUw*7Q0~1m
zG|r`5KV{LQ)pbOLl4n7dOS#{U9R6Qfz9}2$Mm^CseR8K9$@=23jq6|RX}-~8`DBxK
zI8!hDP0P)B1<KLO(Bv*dboE~`>pBuM;Z4j~G>^oryFE%WAA@h=bWEI19Enp;j+0O3
zpnNI^<8Qu3`!n@smK>$E`>hS%N@XmObf$~WG;UAH@#`5{!@8*=I>7+D1yFzic4v^T
zW&u<4`7%=TRT5}cLf)(<2?eQ*h>B?mvj&GRnAHghGv>ri;<F4()k$<o)v>0S1}<8~
zbq$k<TR~|_Etf1z1e2<7bTBxEb)!OrCAFwa@HIC*Ey0Zw%xY4?gVWRY3sMr6M7!L%
zMfcXs{5<JkP?^8+*&W1gPQ<QM68gxjvbw3Lh;%D?Ro7YFM0sP)RFOtH6s#_3j6)(b
z_6~|h=^8=?B0UQ06jV#mKBsL%Nww&T6d(a)w>_9ZOhjs2aW<at660RtqC+^w-uDud
zm<4&z8HnApq|f%Xt&MVHgpzcfRTTUSz5i)-k9I#g|IOsLSH8Ydk6!rx-1eOxmiH!R
zcP3`*@1@%pJ=EiC&xW68cBAr39~aHQ&DjG#=s5l8{?^#z-g<Cqb7o%**E$Cp;^0<a
zLma6IiR}kA*bx(Zg8S5kL`@uV$c6{sWzsDvFY-c292)Lzh*C|6ZQr)Rju_h$+@~(Y
zYNF(j4G+G{q+3#6<b@Ek4fi+1ftoP7HM=#%rl}oqbWd=fx-eQ32OP5D!FQQpOUjG9
z5JqjQ?=-~znvmX_*iLS*K1n`VdZyOSr74HP9WlKpxKEvO#D0ftc<^0YvL)t4=n^_j
zPgCMEnT*s)g6zy`Glxl-B~^>+Q=I&bIcY|7Fov`WDq1vmTP&88HV+N6B<j*=caCGU
zQH-t8CQ`?q{<P<Ct<^u;UyAcNu_XHr*q%8F8N5PGly&Vx^Z{LBod-Tc!Al-O_$L5g
Xf<2J<4U8T5dAI-%!a+C(ZS8*m3SWGk

diff --git a/tests/lib/__pycache__/test_reporting_render.cpython-314-pytest-9.0.3.pyc b/tests/lib/__pycache__/test_reporting_render.cpython-314-pytest-9.0.3.pyc
deleted file mode 100644
index ffa6a9a282a809bdc70bf7c3fb2244a921186246..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 23278
zcmeHvdr%x#dT;kk&x@JiC4`WK#OPtHkz`;1N!XTcN!A0&w!*gX2z!a<4lshT5i`?0
z2#LtI*vbnfsk@}I_ZnR%5sp(?lx$LL9u;jK73WbES8D5C5J;fD>+H>~<d5W!jaPP6
z5+zm1?>pUnx~J6`l)Xvptu)Nn=Y39}bNV~q`F-E%edW<I0oSol>fhM7Q4spcV_qJ^
z@$pZ<F(F(KWbw366xrR==^gbn2%S>V*FZ6z)BestG0+(-20KH=5DW93F6#^z!<~_0
zq(P9SO+qm$SIE9i!mwB@m;LZp$N|twIS5)Mhd`_4GSCfj7_>%?fNqqdptW*2=q6d*
zCB!TLni_WYmRBo#F*7_YXYx6vsFj_#oQ7K+9WH8tWJVdt_7~$G&DWP67|dwZN@ipr
zqom%*^`+F&OPA8h)sklop~L`wAGd={2)!b1?2_@+vPbszdd-le&~C)a(jlK7idOj^
zhWm%pqXU_z`f~#rEjW}OdsFRCk7&N3boR|rEs*Z(PqenR-|}g}{BXK>F{fP8eCnIo
z;bF~p30a0Ue}C@Mr7W7Su*Xp29))Vvq~WYujEhP&JSx?_SL<wkRm~`BbDxqP8OV(^
zjir@=<`+>)b9yi{QhbV)t2P%iYEf+-&h{}^N?~1<9T|j+x}zz7Rr96#(TNHMU!aFk
zw}VUw-wJP>erxW``-3an8h=xoJO3NePrPE}g^vW$_rk|WA@uu+8dn5&YU#&658~{-
zZsE1|ZqI8~z2<xpx+Ei??bb0S2m>giUbryuig;m9()XkhJAf9|3rW$I-(I(os-p+_
zF^Wj(Yr=>fq7lWlOK8IEw0v=|LNN+W`*=txCwB#i=2u2XQrQ8mTph}d4iBV?N_KEC
zqiErQtmUaNLR0-YMalGIs4nX>4I_!5j8@S%is@EN^(mQj|4>GaOA0-#LS3s#Cy$<f
zSqpU?J$Ej3?&TjiANOlsB|D(hvq&mh^Xs*1Uev02$A+$|^sJ5SQferjC#5KjaL@y&
z-5}OrsVh(H`(R*Y`xC!6_6OI#ac!ynrCaCbzi{r`Ep4|0e{yK~#qNa{yO%n;XVjU@
zCjoCHC4MA$eV~5K_Y0yuNU24^{rK@K3>O>XKLd3Ral0|Jd31K6vWpKi&n@IVvZv6x
zLr8kfm#1*N(9MT9NCPK(3m22Z)5ug9%loVea6%mXSr#wJKH1;PCr_8lG}s}~3rrW4
zGq{cN^`dnB>SD~!^yzoBiNDaiJ)0c)Xo5Q~d%jmh*)Iod^|+_^n9nW;*G$X%Ne-=<
zRx-<NmiF8Fr0}IKn%Ryk88lNj2uX=Q@q3OZu5@IBcZVQX^#s~Pxw^-n6xuxP#z@Fz
zYqrpD)+vWSyCc}_vG-m+BuC_EGSn>ELksV^_Gy{<d=2*5M*P}wVb)*)HQ$$J!CrPg
zESJj_$#7|n`A9GEW_H>7nO;!Gg|Z`NJIwo#C|5o!=6(2y_<8X2;wL>TDuZ$*maPq%
zbTOSBRvJ*)xM#-~$2Vp#4K}Gm+05{O+LX;Tv-Pui{KOM`@>e_JYS|ANj@WZCa<!X5
za2P?YakWV;rj?>PmMspIy|stZ9gM5-vbQvE{%Wh{xdh&ZtV9kH>CmM7)kK2Pw(%gP
zPaKG=%zt3KBCTSrE3)bmXaWoFID1RfJlT=BI9|ijCs2QOL{HR)$|4HA3R;=f5-oBh
ze>K%NnjJ1?M^w!}oa;{yt6CXCX`M`^6|F91aDS|nN@{R8*OwknjifJSv`|V-U(8Uj
z=1=GISU$C?6!H(#qN-rw&Zoit%Tu(hr@{Yox#3)1D?iGbp3RM%P?VgaZ7}0AS0Dq7
zq>G4&N0q(Q(lE77f7FEC(`NS^uzL>KJuUj<83mO3TJ?l_+n&}F>YhY<iSJ;E?@)>F
zX*G&w#_X5AB5h^8$L2{YtD*<l+(~6GK@?G5LY^x8)V~A4QWYwnygYSzF|>W+*nR29
zm!G@-+`M#Tx&~zOx#^n4@b(4i$i49P`CZQ-=yu=2t|QBm^Zy<axANdSmX9Ky(<7Fs
zRPLM)Z|CV3QJ(3A8#UeT(<ARPA5W+|P^KA=090o4Ig+mCeRMV5fX-m6=`-Na_SMu9
zGiBk^S2~l2*m8)Eox@|x!B<JN&A4PQ{;=wrVs1ON8CPEDC3%2d*6NuTn`oW)7uu2{
zTgB}wJopt>_yAUTJ#MUUp07Rn>K`mj8gVyDR)5dn3v6xTYibt>ZH^0T)83?nwQ1k^
zgl%OC^;)@HS2Rl=k;~+8FXz}@Bx^Y?SJ}cU-<>0iCMV`ktIHNvKKFL<e2^`|j+Lp5
ztxRR5>qOz#yEZ!-3~w@Q)^A_!PlyArtglbmdcPm*y=d#z@-=(aTJI}9H@zB>E9I(W
z#L=sFTzfTY>(u}sSzU&*Ms}5X+7rS&eX1u|(x=sH_9>U!WNeQ=Co{ENu8}t;%N>3C
zz_m{+tUk5%#g<3sG~^u5sxnliTq|!%R+iS7uj<84K)-Bd(^l7Uq3kNN9p)V}R2^g{
zlA&z!Q{E4M0CLoI#`QKA5?wJzVesawlhqdA$Ztq`lN+48aS*!_@Ww6qnq-a60rMNX
zH<s`~aqy4V_r5Y7*hID1-q-IFE+|KBW9QB_tIaW!x2~CXgPFE;*Oaf7>*a09TE`69
zl=QgH+D&FDcK$l}i|ZS$o{iuH%{>@(+h|p-+435*PI>$1h7;7u4RS15=jheVtMuw-
z^9iw^viRw0yg)Q%A;*5IT;B00_sLg01~1?|$!yOmyr3GqU;}<N`HgZ*rEu~g-ux}e
zErvX^(_A42|7LyjBI&`rNKk&;yx1x4`ZwVETg)dj+c>a3`Gn_#e6ma4{kfTkTjeL@
z#^hGVJgk3=IZ<!+n0co;k@vjxPl6B<wqt+awbNxC0RENVmh@nK*=FX;Z(k*?Ult0N
zkutzj)+DzlYdV0d<r|U>=12h-8(G(UYE3p^KDAL9CG+VsD@Dk=ZEw9hr?;@Q#x>LO
z9+#h7GwpV>&*eRz-8n5kmIS`$GxK>SKgQbA94x`EqH<i`o{T!)tGdT{uj<U6H}3|o
zC7Ej*e%tYDz>mpY`5L(n=vDhe-_Y1oEeZYB%5I7Flifi1=_OQl5!p?IgZ>KbTnTh)
zBXWRYFAoC*Asqs=grGwVUut2r^`Q<%0o-E#c1HCSPwPNcD}W~oZB3K_Rjo#r*7mdG
z!9<KOw#>jNsR91q&;3*^K@~l16pjLvCTbLcU<Zvr;9N=qfe-EI7pn_7Ta+D$QsN-v
z<vQYHf^o_|irG)(DT=mdPXL5dS`m6n9IvJ)22N&TW`NreszYflR&WAf9*5Y_L1yyA
z(=7dQM=1!^0YW$oG!U8eAj+*{hN!D)I5(D2fDHl^<n<DSXhL{|7kU-!QU&EKkskm-
zLs+wI?PJZ;#KVZ1q_!ZNUK<=19H|E(tr3I*nzB%UT`|B(RE&1S4(sS4e0qiiHAruA
z0$Gees$CDn7?l*mI;-rhcs0|eF)fXu_#u@KgIJXSJ<K5@&k~`%fO3GyK_VSQo*{CW
z$a6%V2hn`2zce3fbv&j?Ole2Bp#pTI1UxjrK7CrWD=#{t4fxN%2@^WRXZQ^G&+v8F
zQNx3FPe%zb*jC~@VE45eIHV3XwzihUmE>zJK{k!r^oAI<8LcvM8NQO*5+$`IT9wn(
zKWB*0`(l3r160-9bwD{nX^s*(2BOA5>^P^JTgN^RG3?W(RZb11RcHrCM~Z3+s@W7n
zh;w6Via5Y16uYU5Sw$@>UqTgx7^#2d)m3<vU#Xrt^|R-1_TK1StlImvW3yww7x;t7
zHzJGsPkgItgBdmFyPf%<W~u7z$56+1i4({FGPHI2_+n@W!Ko)E0j8dqKDI#c<_S1w
z0HU6lt6tdEru%OP;D5jM8&P=m|9g~4ciy#}_UP$j+;@ix=RSu6*=L@atJXuA$B_V4
znbC+rSvH>|>5_EfcHn(LywpmRbvxilfU?YJ#Govj&yjRB?|rF#65Z2&v+hP6Za3=Q
z*@D~TGt*lZ!n>BGc6esS7@6Do?pp}p|CC1e+@%aWTzA})+NaSAM)EtBk0R>yh$T3U
zB6v#fSfq;QcItlAFf+!ZbO#DDqY;1_Y(7WQ)w~N*`~RufJLz4isGjP(eswumr~fYo
z>n7EyKJMp^NtHYAP#W&zj`?7nEh7@yd^`~xsN%a`p>p3xej!r+<&Nte^HR&yDUj<O
zQ>PZg^>9rqH?Q2d0>>=oYs>VN*}!5$3taPvZD?5tw=7G>|6MbXhj0gJ2#i#Bq!!b6
z56%ufVu|YIPL$2l>kcHHR(Kf7Fnttpg=d__+@yL?92&>dI~-`Jt_6S$&o7ch@b(KI
z4<pnxu;~QK1S|nY3;Q|74!Pfpy@FwE!t=uc!xUOk*kxha=6)FEV8<bI{V>dMe1#Xg
zT#)|4s}@RC7)bgVMm1*cGzLFp`wZYz`qjnv<Yoxd4wolMJ6v)8q-~es@3nHd;-N|2
zIynHnIG6UjI2LcO!$rGqmG6q#TfJgFjj_YE^0~K*Yv8fN4MGdQR(hW)w?ZGjR$7ja
zJFur9^y86a&>=;Kda)<hFI&lVW|(!yo-t&$!n{L@jzVgN#+RX$NjXX;REmgP(4f|g
zDcNEsH3&uRD7TCnFEdP}nxLrNaO!k-=c~t3=ekdHrOv(j$}30Z*TzG;cEyHZHk2zH
z51LLAX%k8p%GJb+$`^1~&Vf9%3#bqam0kp%Wnw6VwoVe-Y^qy#X;GKFAT}C;7v$C^
z7}&rrCv&B!3z6}twjl!xFi4i!K17mD7z~}1AEEa6)l4xna#{0Y;nhMs=NfVtm%id<
zTCjL2pUS6;Lt6M!Zsg6(Rk-_ySn0}-)1#7@r3@3fO5{x<3X!)!RGJTsjiwl(iw--O
z=-}4>DP6&=Dqlu6+Q+EB1~MUhD0Qp^E2j?M4K_?1U6Cr5rP>9lc7D_Tul3LE_?3%G
z(t-O@-E{qu)VLDfI4?Ec+<IdxZqr-e*+#c*3*p9P$@pKC8fobw-(AbeqwgVo75Ck-
zd=#-&k3c=s+jttg1BrBQN3+blh<h<^*9nJ>+lj>OTzMPJ?qFhPa7?5U!OQqt243dJ
zOb)P_to$&_RDKlXWL$*(Tso^_@U_55t~dn8S!^Y)Qwn`Hz}6140oHZ@%?wkilE#=m
zF!9v~W+a!(r^bddBdO65%qit7$hH?hbq9z(K(=9-%gjr)OVVK)mCg60U61O4`3mq0
zk|3nR=}0<i^-M0@>B&Vpy}77iVkvd{@&fuHoQx!+ZP3lalroqXsb9`T-FULe1E7_y
zFjBejIx|%>Y(tU)c!;M-Oxk)r4-IgkyNh#XJgRXOF4|U&(PV(F8j~iUHC7GI$K7?R
za%D1zRio<s`I1$`%H^(C<@1288k`q)xnjgFtA<s6z4<iO9eu!8CG%;FRl~~X-Y%Zc
zST)v4&sPGh8f&HHIE`b~2(eYeVX2fa>m{2WcG*gf89}xio|fA$nRCHdHAv?bWVS-(
z>zo02;|q^F1FG2!*zlh-z&1|qv+F-{29VrPqjOv0mJlAIhEerEb`bVc{dQ(gjE#(3
zJ{|x$?41zKdBMC5TWT^Zc}+Mck*wgn9CpAEUJn`KYF<1?!Yq{mGti_6E8l==n6gW_
z6|_No1RHfDgqR((?p26ED4U58gHW~-p%-7-Mub|ciTz3xci36edlUC*;$=-#m<6_F
z!>iIN))<?Zm|$y#@&@HR$QBH}7s!ANyL}}yh`qiF^RCooHPx3^GsD@DOw(cA=Ir?<
zf^Uc`si#2xenPk(shx?=z5YR9rE%|V2}ZmPvAI*f{qomeUTQva%lm%%6Q3t~TAUQW
z>lZ4w%sh9y?1Lv)cI=)l%=W`ByRr>-*`YSJykJ;DW_2pbI;<U;0p-U~5tGgs;%Uie
z$|OZFq3mIJBo{L?-?AA>vko$mLx%bNt3);%<`)+q`ms0z?z0PldI&?|)-MR!ORdr>
zEg_H@)vJyH)a8J;>6vMml|dpyL_Qsq<HF#ra)sq)Bk%1WRWymZOz+iCl>!|tQE7IX
zA|dWajvUeDk3H+1PRcK%gckhNlX&+YP6*$Yj=+5GLn-lhlF!%n-G~sbm>m1^wd>aw
z%jzdyT#2-NefQkhZ(sZRwZ+!s^WhT{$M1(~r@d37v(i%NsflAN;o5m=_su;w_TV<X
z=bgQD+q)3ny(}63i_-4NXLa9Q%gLkfAw7T#Bk>)}M-hAUh$SkMr?EQ_vl0dtv-hjP
zcZ0ai1m7#8TiHSw8I1o$X)gg*<hyG*dGtM`_i*1G%SRDGJz|N<<Z0{<B>EH??Vmy=
zaw{Y;cJKv9Ve*h+;r5;;361UEci8mS=9=nex8@cmOZJ_=?6dpw-d?h%WLMJLMJ(2F
znX-;qYo+k2H0o&*Mm;AXco&Y#etUUc+_u;%#w@{<d<G%4*9$K}#03W7zAj0QQi;<#
zF0)j#U1vQnozXR>d0$>)no%y(<zY=<7&A}^OPcsPLm{|?*X1g?$@NT{*mnN1O`Z()
zTDjb<e)!5Nhaic?%*MOvaxIyc<Xz*8fmQx)Gp}`rgvI4NvsVpy(#q%FE}jqaWSPSb
za5cHltU|Co^&YD=rfYS%Pq_k{CRn4xB2!0#zFD|t7CI25QIIPN9~yDr$p@0$1`ayu
z|Gv&qia_e$e*VX7qZG2W*S$9^8@R~lW|Yd9U7^!j^B=qRnd4yHlic0}ySxbR@n2kD
zkM@+KM;m&gNx!X6tJmyPZp#R}!q3ahuVi+Gk68I7!>+x`E&5<hWW8^sBXwMOBf_g%
z7FILM!fFRLx1rY@CVScLZyUEbGkKt~%YqeQKCJJw@)7K|{#>s6Pwq#Evc)z3eYu~!
zxp29w$uT+LEqv4B{jk;yBx@Xq$Dgn7O=T-ojPl#w)bEouN{-mxF?TM`PbtCP#Cjjy
z({fu!2j)>BZ;`hqD;#g*#>aRYH=1vudAGfdz&!F1*qC6?OUIl?J$WfB#CNdWF91fV
z3rH|FiUemW{{ut|Ud&!8l6sdN(j5=c{;uf_H8(O|c8-qt#^`jiV-Hq>j04<ed$O}^
z8>{)~5b+g7rV0o}SGtM3N+e0-JV>c+2R2Mf4@Hrsgz^hSCW+9_&fK~&>j~S|4LG{e
zNAdkcu*?YYIxRpjH$ZgdYZUlPMA&mO0F|~rS}+AA_HZVZ()@JlX`~2%9dLVQKnp;b
zfRooaBbB*=Gs~*Bfo;N4c=+r<8j1$$66L3;%yn+JY}OD%$QG~Fo5fnh8Y3yWTu~GA
z=rZ)VYTJjsNo{4Ps&qAfyHcQv$zDQ7>MxONgor{!B~m2P31T-@NOb6^KEv`!0=8}2
zY)<?3tpplL<#0^+ayB!@Oak&ahtAA5ejg<r#ZP??fC{0<^&d+6uzR~7uJ}suC!b#q
z#}>k|<?xdW;U{PJ-VL|UUA-GV0+oMl{d70k3A}S*sdhgM0irL9_iOi0Rn2|zw;GmP
zIu}|xms`$!;uXqY5+`3@j_$h?-8cJ%xvjsNTHb$ZVgISc{ip9m&mhM~Ucpxe=m7cu
zB2+syx)|C<o4`%;(zfZ=g)p#v<9|`wHhEb0-L;%N`W~g{zB?s89t;O!?nD0yBZU^j
zNP)DvGlxKC``*Q_&QPC{lGgA+Z)!U2(PV~TBqqhI<>SG+14`MML$iIl2W8D1awI@i
zW;9|@md)o#>gq)WE8*&S3EJ-KXfpKs^uH+8O`;}p-nE>10O&nR&V6@Ed^{Kq#IOx<
z<IT<+out9(eCG^qI4F0<smei+u7DwB4jN)9df+CV4Z6q(BV~@^qlivDf|NN%5j-V#
z{KJDNO%gSN$@+)&+s{2M{oxDyK#6Vei$FAbEkC}3+5EleGA^7nIWqJX<)pW`@VXhY
zhK_<7=(yIUqaY}TU2a+qfQ}Vx#jbQ9)K)Hz+?(Yv>4ooSnU12yjC0>ZoBLzip3*9x
z>o?3YSovJnV2Yr|p3=(a-Y%Zc&{3?Fp6@}iKUga*-$y&v>(%zEgv7GTR<i59ejPdr
zLa3`Ogqp3)?x-}arf#UTJ0=ZdjCG>W$_*3@!C;VXJ{?^CIk=tBpBsvN#18xM1w)6E
z*~cYP&|4PMtK-QeC#bZ#{k<EW+<5cajcaoU2uOZr9XPUb69s9$v77>Ia~;^R@*X{A
zD-k9~RM;d3i9+d-ZQv^{=IfRLzKX=F!lkW`-*3tvp+J(8)ZYPl=nkAJKe@-mOqJi*
zGar8TQMeS0Klb*}PY>ZXHS{xY(CrOcvL)kxQHs$zK)$<{lSkh}daMIR;yad)B8K#c
zB`TAru{)6HQLvI2AY;NoCQ`6(^$0L25;{G35kMt`Sq54XQ0X;P>g<RweZWEv;5d#8
z3q=kXz?k1q0a%Bv89Nw^BeO{fdYH%|8eLpCv<5v-k~2O4dC^t3vKA|GQ7xl}8pd=t
zmH>I>Ha6kT+&BZmw}vWv|Nk>x&I})u>GFJFx|NRrc~x??^HUR0;C^q+5eJf#EAj>~
z#afFgf;j~+Q25r+r~;v#eFuk`S$Cft-#(O9MxeEdW$0rC&&1fLgkq$Wxd?;S0sH52
zK3$=vNywDZ0t1<gX?%}Kc?UHz<r=mu^m!tyOebskHA+dgL%M~KG6!e;Q+NqIi>}C7
z=YAp{cJN}|BAYR3Qt3GM56bUS80{NWl5ZXn_VsTc*zJFHhE30}{3&vzDC@6-SWM>N
z{b2QU=j@r=V=Frw=T3ZZU}fvhn}=^4o-NEDIuHIFdQF@-3PoR}7N(Em?{8k&7Mo4X
z?xGJcHBTJ>3vTecLdKFiXFLnxoy(H(zbNfwmW<@RYjg7i_b3PV-6`?$U^tefosV)C
zF5=AvOI0E3;4XgXY~8ub0Q9&hk1iW;0mosyrOe_jaw7~Ke02DoK9oYRYBRVGL_)vL
zbzrvQm#a;Kp?m-!c2KS=yzQoBDg3ep<rKb=3^O3-n}u)jw;C-1%<U_D$1rR8=X^97
zHDdvDV+p~hg*fMOH&>LOOcKn!=lnZ1pl5?E7w1p<^AiRC<VJi1bM|tTOP8w{=pr~;
zt>W-XN%@>dn$Kehj#fVRcJX}RVH@SzHJ^i<mB^dcOv^u1;1C=u<vMxuqX)W^!LDuA
zGnp>y@U{zpZatmr${|15Q-QIH_mCN@^>L#uYqq@H+?>c;Kf9ylSaGV^A?HWH>{WV|
z1J?l1tzL8o?1HNT=ruU5+>oq!1hRu-QNL_u?=-`#JKhU7$Sww`jtpE0vI`MxhhG_f
z0MwPgWgK8f;fC=J+urX%Y%=hE@66XG1FrnxI~IS)S0rnb6~rO(o4PlB4}4(n`L}H|
z>-&TcIA&HIjsw@a9Q1a7Uu_4yErr+M{Q7un&nCRD`+6`owqD(Bn?>$itd_s8PkeUY
zSNXB{&YI866^jn;`ftexRy&se@5cvf!3Q=GACO~^g7;feFoOo124l+q#2c=gI&py&
zpBNj^D(E|AxzS?E_}HVe2MM(Zg_=P)yNMHNfW0qXq|ZX)i=7wgi=F?AO6(#+GOaFK
zGB}^1@;^vnO+?NUxj=;EMRxF0`7h*p9pvn|H_@CJ_a|bxi?M{R6X>Q$rVg<7-2)Vs
zA<_o|yQ(wHxZZYJk`90xrx>mCk#x%O(esY<5jVSKRrMPC`9Z38h{&f4;?Gc$!$h7V
z@;pd<i&G>gU|+FBa@LD`DU8q!-M0QJxyFeM6M2!y4-vUW<SioqmB?u#Ff<XAQ6iU#
zShzn4_6+=2x4>nS?j24Y*^k+g1|8s6Q#8^k6@w02KjnX;6jD2=1-AX>_E;qGzcRPw
z{Zl0IAOB#tDe<c$?zfxLzVX3TyR`p*BP}bT=;YXZU3~W7-1F}r{Gen0)i2Hmdq47u
zZ5QwXPZ6K-q)kEXv@~^j=J--5PA35qlLzOe#M?bT-Gkdy&(FM0x7QcK4a-sj&KaDy
zO3W(nz5=iQe~+^0&byY=9z7*<-yIZY_>ga^M~_&dDtH=)W05MI#c?XC4>io-<dspi
z?m$6iGy+hA&F4tEnitPR>V*SPCrll?4l^~<E=<Fgt!1X~t3&SsrD>TNdJiZK0%zV>
z*iv#>Yx01O?~}s8aB{#ON`k3l+y^Cr&Budzb}HQFaU?)lGkrV^Wtl#T0OmuDBCUl{
z1W&o91CPZvY;mc(yIUhgUeNs1Av<8KZBFUmIc(C8&SJXam<&69qJ?-6eS4UN>94V+
z{c~V=etK~NL^cv>B=QoGFA<@0E9`6pvrtripS;*F32aBqUXoh^YpniW1rI|@I69np
zUb%-vB7W-2AeiE!_-#S>5n)+4ydWI@w(#@=zejxGlQKz6i4V#>;!*KISx8JgsPl<W
iKB)AHyB|c##eENUhT*M`iaQ>}LgKDZ>LOx-)%Gs|zZr1=


From 30a70e4cb8d3382faae6efcaac34b213cba035ed Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Tue, 2 Jun 2026 10:26:53 -0600
Subject: [PATCH 52/61] evals: restore hawk version signal + wire
 stackhawk-data-seed triggers

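- hawk version: re-added to hawkscan CLI_SIGNALS in the claude-code, codex, and
  cursor adapters (parity with origin/main; codex had explicitly excluded it).
  agy is unchanged here: its hawkscan CLI_SIGNALS list stays empty.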
- stackhawk-data-seed: added CLI + INVOCATION signal entries to all 4 adapters
  (agy's CLI entry is an empty list, so agy relies on the invocation text alone)
  and the data-seed declaration option to the claude-code + agy observe suffixes.
  data-seed emits checked-in artifacts (data-seed/manifest.yaml,
  .data-seed-credentials.env) rather than a distinctive CLI, so detection leans
  on the declaration + artifact-path signals. Without these detect_trigger was
  always False for data-seed; now it's runnable (origin/main never wired this).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/harnesses/agy/adapter.py         | 14 +++++++++++++-
 evals/harnesses/claude-code/adapter.py | 23 +++++++++++++++++++++--
 evals/harnesses/codex/adapter.py       | 18 +++++++++++++++---
 evals/harnesses/cursor/adapter.py      | 15 +++++++++++++++
 4 files changed, 64 insertions(+), 6 deletions(-)

diff --git a/evals/harnesses/agy/adapter.py b/evals/harnesses/agy/adapter.py
index 44bc0ed..2f72eff 100644
--- a/evals/harnesses/agy/adapter.py
+++ b/evals/harnesses/agy/adapter.py
@@ -22,6 +22,7 @@
 CLI_SIGNALS: dict[str, list[str]] = {
     "hawkscan": [],
     "api": [],
+    "stackhawk-data-seed": [],
 }
 
 # INVOCATION_SIGNALS: checked against output_text.
@@ -61,6 +62,16 @@
         "stackhawk-api:api: yes",
         "stackhawk-api:api — yes",
     ],
+    "stackhawk-data-seed": [
+        "skill: stackhawk-data-seed",
+        "skill:stackhawk-data-seed",
+        "stackhawk-data-seed:stackhawk-data-seed`: yes",
+        "stackhawk-data-seed:stackhawk-data-seed: yes",
+        "stackhawk-data-seed:stackhawk-data-seed — yes",
+        "stackhawk-data-seed: yes", "stackhawk-data-seed — yes",
+        "seed data for hawkscan", "seed this repo", "minimum seed entities",
+        "data seed complete", "data-seed/manifest",
+    ],
 }
 
 # Matches pre-shim default --print-timeout (180s); bumped slightly for safety.
@@ -73,7 +84,8 @@
 # it, live agy runs produce no detectable trigger text (all false-negatives).
 OBSERVE_SUFFIX = (
     "\n\n(Eval mode: before responding, state which skill you would invoke: "
-    "'SKILL: hawkscan', 'SKILL: api', or 'SKILL: none'. Then proceed with your response.)"
+    "'SKILL: hawkscan', 'SKILL: api', 'SKILL: stackhawk-data-seed', or 'SKILL: none'. "
+    "Then proceed with your response.)"
 )
 
 
diff --git a/evals/harnesses/claude-code/adapter.py b/evals/harnesses/claude-code/adapter.py
index ec4b53b..73538b0 100644
--- a/evals/harnesses/claude-code/adapter.py
+++ b/evals/harnesses/claude-code/adapter.py
@@ -9,9 +9,13 @@
 
 CLI_SIGNALS = {
     "hawkscan": ["hawk scan", "hawk validate", "hawk rescan", "hawk config",
-                 "hawk create app", "hawk init", "hawk perch"],
+                 "hawk create app", "hawk init", "hawk perch", "hawk version"],
     "api": ["hawkop scan", "hawkop app", "hawkop org", "hawkop env", "hawkop status",
             "hawkop init", "/api/v1/scan", "/api/v2/org", "hawk_api GET"],
+    # data-seed emits checked-in artifacts rather than running a distinctive CLI;
+    # its discovery + emission paths are the executable signals.
+    "stackhawk-data-seed": ["data-seed/", "data-seed/manifest", ".data-seed-credentials",
+                            "manifest.yaml"],
 }
 
 INVOCATION_SIGNALS = {
@@ -31,6 +35,20 @@
         "stackhawk-api** - yes", "stackhawk-api: yes", "stackhawk-api — yes",
         "stackhawk-api - yes",
     ],
+    "stackhawk-data-seed": [
+        "stackhawk-data-seed:stackhawk-data-seed`: yes",
+        "stackhawk-data-seed:stackhawk-data-seed` — yes",
+        "stackhawk-data-seed:stackhawk-data-seed**: yes",
+        "stackhawk-data-seed:stackhawk-data-seed** — yes",
+        "stackhawk-data-seed:stackhawk-data-seed: yes",
+        "stackhawk-data-seed:stackhawk-data-seed — yes",
+        "stackhawk-data-seed:stackhawk-data-seed - yes",
+        "stackhawk-data-seed**: yes", "stackhawk-data-seed** — yes",
+        "stackhawk-data-seed** - yes", "stackhawk-data-seed: yes",
+        "stackhawk-data-seed — yes", "stackhawk-data-seed - yes",
+        "seed data for hawkscan", "seed this repo", "minimum seed entities",
+        "seed entities required", "data seed complete", "data-seed/manifest",
+    ],
 }
 
 # Observe mode: the CI sandbox has no running app / credentials, so the agent
@@ -45,7 +63,8 @@
     "\n\n---\n"
     "(Eval harness — observe mode. Before doing anything else, output:\n"
     "1. A decision line naming the StackHawk skill this request should invoke, "
-    "written exactly as `hawkscan:hawkscan: YES`, `stackhawk-api:api: YES`, or `none: NO`.\n"
+    "written exactly as `hawkscan:hawkscan: YES`, `stackhawk-api:api: YES`, "
+    "`stackhawk-data-seed:stackhawk-data-seed: YES`, or `none: NO`.\n"
     "2. If a skill applies, the specific CLI commands that skill's documented "
     "workflow would run, in order. Then proceed as normal.)"
 )
diff --git a/evals/harnesses/codex/adapter.py b/evals/harnesses/codex/adapter.py
index ee27284..70692f5 100644
--- a/evals/harnesses/codex/adapter.py
+++ b/evals/harnesses/codex/adapter.py
@@ -15,9 +15,7 @@
         "hawk scan",
         "hawk validate",
         "hawk rescan",
-        # "hawk version" excluded: running 'hawk version' alone is common for
-        # installation-check tasks and would cause false positives. The preflight
-        # workflow always also runs 'hawk config --help', so 'hawk config' below suffices.
+        "hawk version",        # preflight version check (parity with origin/main signals)
         "hawk config",
         "hawk create app",
         "hawk init",
@@ -33,6 +31,9 @@
         "/api/v1/scan",        # api Step 4: raw scan drill-down
         "hawk_api GET",        # api raw API helper function
     ],
+    # data-seed emits checked-in artifacts rather than a distinctive CLI.
+    "stackhawk-data-seed": ["data-seed/", "data-seed/manifest", ".data-seed-credentials",
+                            "manifest.yaml"],
 }
 
 # Invocation signals — checked against output_text only. In full-auto mode these are
@@ -64,6 +65,17 @@
         "stackhawk-api:api: yes",
         "stackhawk-api:api — yes",
     ],
+    "stackhawk-data-seed": [
+        "stackhawk-data-seed:stackhawk-data-seed`: yes",
+        "stackhawk-data-seed:stackhawk-data-seed` — yes",
+        "stackhawk-data-seed:stackhawk-data-seed**: yes",
+        "stackhawk-data-seed:stackhawk-data-seed** — yes",
+        "stackhawk-data-seed:stackhawk-data-seed: yes",
+        "stackhawk-data-seed:stackhawk-data-seed — yes",
+        "stackhawk-data-seed: yes", "stackhawk-data-seed — yes",
+        "seed data for hawkscan", "seed this repo", "minimum seed entities",
+        "seed entities required", "data seed complete", "data-seed/manifest",
+    ],
 }
 
 
diff --git a/evals/harnesses/cursor/adapter.py b/evals/harnesses/cursor/adapter.py
index fc2f2c6..7a48156 100644
--- a/evals/harnesses/cursor/adapter.py
+++ b/evals/harnesses/cursor/adapter.py
@@ -36,6 +36,7 @@ def _setup_skill(target_dir: str) -> None:
         "hawk create app",
         "hawk init",
         "hawk perch",
+        "hawk version",
     ],
     # Cursor api: agent runs hawkop status as its first step, then deeper
     # hawkop commands. Broader hawkop signals included since Cursor doesn't
@@ -50,6 +51,8 @@ def _setup_skill(target_dir: str) -> None:
         "/api/v1/scan",
         "hawk_api GET",
     ],
+    "stackhawk-data-seed": ["data-seed/", "data-seed/manifest", ".data-seed-credentials",
+                            "manifest.yaml"],
 }
 
 # Invocation signals — checked against output_text only.
@@ -87,6 +90,18 @@ def _setup_skill(target_dir: str) -> None:
         "scan history",
         "findings across",
     ],
+    "stackhawk-data-seed": [
+        "stackhawk-data-seed:stackhawk-data-seed`: yes",
+        "stackhawk-data-seed:stackhawk-data-seed** — yes",
+        "stackhawk-data-seed:stackhawk-data-seed: yes",
+        "stackhawk-data-seed:stackhawk-data-seed — yes",
+        "stackhawk-data-seed: yes", "stackhawk-data-seed — yes",
+        "stackhawk-data-seed - yes",
+        # narrative-style
+        "seed data for hawkscan", "seed this repo", "minimum seed entities",
+        "seed entities required", "data seed complete", "data-seed/manifest",
+        "set up seed data",
+    ],
 }
 
 
From 1ffc1f4e110b8d5aa6148e29c251465770900483 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Tue, 2 Jun 2026 10:31:15 -0600
Subject: [PATCH 53/61] evals: port the qualitative rubric grader, woven into
 the pass/fail table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

origin/main had an opt-in --rubric pass (a claude grader scoring the transcript
against rubric-items.json) that we'd dropped. Port it into the new world and
surface it in the consolidated table — the whole point of the rubric.

- lib/rubric.py: grade_rubric() runs claude -p --json-schema against the skill's
  rubric-items.json + rubric-schema.json, returns a RubricResult (overall_pass,
  0-100 score, per-item pass/notes). Platform-independent (grades text), best-
  effort (records an error result rather than aborting). Needs ANTHROPIC_API_KEY.
- models: RubricResult / RubricCheckResult; EvalResult.rubric (optional, back-compat).
- cli: --rubric flag; runs the grader per triggering prompt and attaches it.
- reporting: pivot cells gain a `rNN✓/✗` rubric badge (e.g. `✅ r85✓`, `✅ r55✗`);
  legend updated. Deterministic verdict still drives the emoji; rubric is the
  qualitative axis shown alongside.
- tests: rubric rendering + grade_rubric no-config path (71 pass).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/cli.py             |  9 +++++
 evals/lib/models.py      | 17 +++++++++
 evals/lib/reporting.py   | 24 +++++++++----
 evals/lib/rubric.py      | 77 ++++++++++++++++++++++++++++++++++++++++
 tests/lib/test_rubric.py | 37 +++++++++++++++++++
 5 files changed, 158 insertions(+), 6 deletions(-)
 create mode 100644 evals/lib/rubric.py
 create mode 100644 tests/lib/test_rubric.py

diff --git a/evals/cli.py b/evals/cli.py
index b28b1e9..764801a 100644
--- a/evals/cli.py
+++ b/evals/cli.py
@@ -25,6 +25,8 @@ def _common_args(p: argparse.ArgumentParser) -> None:
     p.add_argument("--max-budget", type=float, default=0.20)
     p.add_argument("--bare", action="store_true")
     p.add_argument("--full-auto", action="store_true")
+    p.add_argument("--rubric", action="store_true",
+                   help="also run the qualitative model-graded rubric (needs ANTHROPIC_API_KEY)")
 
 
 def main() -> None:
@@ -52,6 +54,13 @@ def main() -> None:
             did = adapter.detect_trigger(run, args.skill)
             res = grade(p, run, cfg.checks, platform=args.harness, skill=args.skill,
                         did_trigger=did)
+            # Qualitative rubric (opt-in): grade the transcript with a claude
+            # grader and attach to the result so the reporter can weave it into
+            # the pass/fail table. Only when the skill triggered correctly —
+            # grading a non-triggering run against a workflow rubric is moot.
+            if args.rubric and res.trigger_correct and did:
+                from evals.lib.rubric import grade_rubric
+                res.rubric = grade_rubric(run, args.skill, p.id)
             # persist a trace for visibility (uploaded with the artifact)
             trace = (f"# {p.id} (returncode={run.returncode})\n"
                      f"## error\n{run.error or ''}\n"
diff --git a/evals/lib/models.py b/evals/lib/models.py
index 3b05e23..0febcc9 100644
--- a/evals/lib/models.py
+++ b/evals/lib/models.py
@@ -68,6 +68,22 @@ class ProcessCheckResult(BaseModel):
     anti_found: str | None = None
 
 
+class RubricCheckResult(BaseModel):
+    id: str
+    passed: bool
+    notes: str = ""
+
+
+class RubricResult(BaseModel):
+    """Qualitative, model-graded result (ported from origin/main's --rubric pass).
+    A grader model reviews the transcript against rubric-items.json and returns
+    a 0-100 score + per-item pass/fail; overall_pass = all pass and score >= 70."""
+    overall_pass: bool
+    score: int
+    checks: list[RubricCheckResult] = []
+    error: str | None = None   # set if the grader couldn't run/parse
+
+
 class EvalResult(BaseModel):
     platform: str
     skill: str
@@ -81,6 +97,7 @@ class EvalResult(BaseModel):
     score: int
     cost_usd: float = 0.0
     note: str = ""
+    rubric: RubricResult | None = None   # populated only when --rubric is set
 
 
 class CellReport(BaseModel):
diff --git a/evals/lib/reporting.py b/evals/lib/reporting.py
index be48088..71e0c5e 100644
--- a/evals/lib/reporting.py
+++ b/evals/lib/reporting.py
@@ -123,17 +123,29 @@ def _fail_reason(r: EvalResult) -> str:
     return reason[:69] + "…" if len(reason) > 70 else reason
 
 
+def _rubric_tag(r: EvalResult) -> str:
+    """Qualitative rubric badge woven into the cell: ` r85✓` / ` r55✗`.
+    Empty when the rubric didn't run for this prompt."""
+    if r.rubric is None:
+        return ""
+    if r.rubric.error:
+        return " r?"
+    return f" r{r.rubric.score}{'✓' if r.rubric.overall_pass else '✗'}"
+
+
 def _pivot_cell(r: EvalResult | None) -> str:
-    """One matrix cell: emoji, plus a terse reason on non-pass outcomes."""
+    """One matrix cell: deterministic verdict emoji + a terse reason on non-pass,
+    with the qualitative rubric score (rNN✓/✗) appended when it ran."""
     if r is None:
         return "·"   # this harness/model didn't run this test
+    rub = _rubric_tag(r)
     v = r.verdict.value
     if v == "pass":
-        return _PIVOT_ICON["pass"]
+        return f"{_PIVOT_ICON['pass']}{rub}"
     if v == "pass-slow":
         why = "; ".join(r.budget_breaches) or "slow"
-        return f"{_PIVOT_ICON['pass-slow']} — {why}"[:74]
-    return f"{_PIVOT_ICON['fail']} — {_fail_reason(r)}"
+        return f"{_PIVOT_ICON['pass-slow']} — {why}"[:74] + rub
+    return f"{_PIVOT_ICON['fail']} — {_fail_reason(r)}{rub}"
 
 
 def render_digest(cells, baselines=None, lift=None) -> str:
@@ -168,8 +180,8 @@ def render_digest(cells, baselines=None, lift=None) -> str:
                           for pm in cols)
         out.append(f"| {skill}/{rid} | {line} |")
     out.append("")
-    out.append("_Legend: ✅ pass · ◆ pass-slow · ❌ fail — reason follows the icon "
-               "on non-pass cells; `·` = not run._\n")
+    out.append("_Legend: ✅ pass · ◆ pass-slow · ❌ fail (reason follows) · `·` = not run. "
+               "`rNN✓/✗` = qualitative rubric score/verdict (when --rubric ran)._\n")
 
     # Optional, compact extras (kept off the main table to avoid the old sprawl).
     if baselines is None:
diff --git a/evals/lib/rubric.py b/evals/lib/rubric.py
new file mode 100644
index 0000000..747bc89
--- /dev/null
+++ b/evals/lib/rubric.py
@@ -0,0 +1,77 @@
+"""Qualitative, model-assisted rubric grader.
+
+Ported from origin/main's `--rubric` pass (evals/harnesses/*/run-evals.py).
+A grader model (claude) reviews an agent run's transcript against the skill's
+rubric-items.json and returns a structured 0-100 quality score + per-item
+pass/fail. This is the QUALITATIVE axis that complements the deterministic
+process-checks, and it's woven into the pass/fail table by the reporter.
+
+The grader judges text only, so it is platform-independent: every harness's
+transcript is graded by the same claude grader. Requires ANTHROPIC_API_KEY.
+"""
+from __future__ import annotations
+import json
+import subprocess
+from pathlib import Path
+
+from evals.lib.models import ParsedRun, RubricResult, RubricCheckResult
+
+EVALS_DIR = Path(__file__).resolve().parent.parent  # repo/evals
+
+
+def _build_prompt(rubric_data: dict, run: ParsedRun, skill: str, run_id: str) -> str:
+    return f"""{rubric_data['grader_prompt']}
+
+## Bash Commands Executed:
+{json.dumps(run.bash_commands, indent=2)}
+
+## Files Written/Edited:
+{json.dumps(run.files_written + run.files_edited, indent=2)}
+
+## Agent Output (first 4000 chars):
+{run.output_text[:4000]}
+
+## Rubric Checks to Grade:
+{json.dumps(rubric_data['checks'], indent=2)}
+
+Populate the JSON result with:
+  skill = "{skill}"
+  run_id = "{run_id}"
+  overall_pass = true if all checks pass and score >= 70
+  score = 0-100 (each failed check deducts: blocking 15, warning 5)
+  checks = one entry per check id listed above"""
+
+
+def grade_rubric(run: ParsedRun, skill: str, run_id: str, *,
+                 grader_model: str | None = None, timeout: int = 120,
+                 base_dir: Path | None = None) -> RubricResult | None:
+    """Run the qualitative grader. Returns a RubricResult, or None if the rubric
+    config is absent. On grader failure returns a RubricResult with error set so
+    the run still records a (failed) rubric cell rather than silently dropping it."""
+    base = base_dir or EVALS_DIR
+    rubric_path = base / skill / "rubric-items.json"
+    schema_path = base / "rubric-schema.json"
+    if not rubric_path.exists() or not schema_path.exists():
+        return None
+    rubric_data = json.loads(rubric_path.read_text())
+    schema = json.loads(schema_path.read_text())
+
+    cmd = ["claude", "-p", _build_prompt(rubric_data, run, skill, run_id),
+           "--output-format", "json", "--no-session-persistence",
+           "--json-schema", json.dumps(schema), "--max-budget-usd", "0.10", "--bare"]
+    if grader_model:
+        cmd += ["--model", grader_model]
+    try:
+        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
+        envelope = json.loads(proc.stdout)
+        raw = envelope.get("result", "{}")
+        result = raw if isinstance(raw, dict) else json.loads(raw)
+    except Exception as exc:  # noqa: BLE001 — grader is best-effort
+        return RubricResult(overall_pass=False, score=0, checks=[],
+                            error=f"grader failed: {type(exc).__name__}: {exc}")
+
+    checks = [RubricCheckResult(id=c.get("id", "?"), passed=bool(c.get("pass")),
+                                notes=c.get("notes", ""))
+              for c in result.get("checks", [])]
+    return RubricResult(overall_pass=bool(result.get("overall_pass")),
+                        score=int(result.get("score", 0)), checks=checks)
diff --git a/tests/lib/test_rubric.py b/tests/lib/test_rubric.py
new file mode 100644
index 0000000..b80d016
--- /dev/null
+++ b/tests/lib/test_rubric.py
@@ -0,0 +1,37 @@
+from pathlib import Path
+
+from evals.lib.models import EvalResult, Verdict, RubricResult
+from evals.lib.reporting import _pivot_cell
+from evals.lib.rubric import grade_rubric
+from evals.lib.models import ParsedRun
+
+
+def _res(rubric=None, verdict=Verdict.PASS):
+    return EvalResult(platform="claude-code", skill="hawkscan", run_id="hw-01",
+                      should_trigger=True, did_trigger=True, trigger_correct=True,
+                      verdict=verdict, score=100, rubric=rubric)
+
+
+def test_rubric_tag_pass():
+    cell = _pivot_cell(_res(RubricResult(overall_pass=True, score=85)))
+    assert cell == "✅ r85✓"
+
+
+def test_rubric_tag_fail_shows_score():
+    cell = _pivot_cell(_res(RubricResult(overall_pass=False, score=55)))
+    assert "r55✗" in cell and cell.startswith("✅")  # deterministic pass, rubric flags quality
+
+
+def test_no_rubric_tag_when_absent():
+    assert _pivot_cell(_res(None)) == "✅"
+
+
+def test_rubric_error_renders_question_mark():
+    cell = _pivot_cell(_res(RubricResult(overall_pass=False, score=0, error="grader failed")))
+    assert "r?" in cell
+
+
+def test_grade_rubric_none_when_config_missing(tmp_path: Path):
+    # no rubric-items.json / rubric-schema.json under base_dir -> None (not an error)
+    assert grade_rubric(ParsedRun(output_text="x"), "hawkscan", "hw-01",
+                        base_dir=tmp_path) is None

From 84a11b586b00ac39028626801385b7f24250b1ca Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Tue, 2 Jun 2026 10:32:50 -0600
Subject: [PATCH 54/61] ci(evals): run the rubric grader matrix-wide (dispatch
 toggle, default on)

Add a `rubric` workflow_dispatch input (boolean, default true) and pass --rubric
to every harness's eval run when enabled. The grader is claude-based, so the
codex/agy/cursor jobs also get ANTHROPIC_API_KEY for it. Toggle off for a cheap
trigger-only run. This puts the qualitative rubric score into the consolidated
pass/fail table for the whole matrix.

Note: the default applies to manually dispatched runs only. On other triggers
(e.g. pull_request) `inputs.rubric` is empty, so --rubric is not passed.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/skill-evals.yml | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/skill-evals.yml b/.github/workflows/skill-evals.yml
index 298b841..23e6dd5 100644
--- a/.github/workflows/skill-evals.yml
+++ b/.github/workflows/skill-evals.yml
@@ -18,6 +18,11 @@ on:
         default: "all"
         type: choice
         options: [all, claude-code, codex, agy, cursor]
+      rubric:
+        description: "Also run the qualitative rubric grader (extra ANTHROPIC_API_KEY cost)"
+        required: false
+        default: true
+        type: boolean
 
 permissions:
   contents: read
@@ -130,7 +135,7 @@ jobs:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
         run: |
           uv run evals --harness claude-code --skill ${{ matrix.skill }} \
-            --model ${{ matrix.model }} --max-budget 0.15
+            --model ${{ matrix.model }} --max-budget 0.15 ${{ inputs.rubric && '--rubric' || '' }}
 
       - name: Skill lift (compare with/without)
         if: github.event_name == 'pull_request'
@@ -229,8 +234,9 @@ jobs:
       - name: Run ${{ matrix.skill }} evals (${{ matrix.model }})
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}  # rubric grader (claude)
         run: |
-          uv run evals --harness codex --skill ${{ matrix.skill }} --model ${{ matrix.model }}
+          uv run evals --harness codex --skill ${{ matrix.skill }} --model ${{ matrix.model }} ${{ inputs.rubric && '--rubric' || '' }}
 
       - name: Upload results
         if: always()
@@ -317,10 +323,12 @@ jobs:
       - name: Run ${{ matrix.skill }} evals
         env:
           ANTIGRAVITY_API_KEY: ${{ secrets.AGY_API_KEY }}  # agy reads ANTIGRAVITY_API_KEY, not AGY_API_KEY
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}  # rubric grader (claude)
         run: |
           MODEL_ARGS=()
           if [ "${{ matrix.model }}" != "default" ]; then MODEL_ARGS=(--model "${{ matrix.model }}"); fi
-          uv run evals --harness agy --skill ${{ matrix.skill }} "${MODEL_ARGS[@]}"
+          RUBRIC=""; if [ "${{ inputs.rubric }}" = "true" ]; then RUBRIC="--rubric"; fi
+          uv run evals --harness agy --skill ${{ matrix.skill }} "${MODEL_ARGS[@]}" $RUBRIC
         continue-on-error: true  # best-effort; digest degrades gracefully (matches cursor)
 
       - name: Upload results
@@ -402,10 +410,12 @@ jobs:
       - name: Run ${{ matrix.skill }} evals
         env:
           CURSOR_API_KEY: ${{ secrets.CURSOR_API_KEY }}
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}  # rubric grader (claude)
         run: |
           MODEL_ARGS=()
           if [ "${{ matrix.model }}" != "default" ]; then MODEL_ARGS=(--model "${{ matrix.model }}"); fi
-          uv run evals --harness cursor --skill ${{ matrix.skill }} "${MODEL_ARGS[@]}"
+          RUBRIC=""; if [ "${{ inputs.rubric }}" = "true" ]; then RUBRIC="--rubric"; fi
+          uv run evals --harness cursor --skill ${{ matrix.skill }} "${MODEL_ARGS[@]}" $RUBRIC
         continue-on-error: true  # best-effort; digest degrades gracefully
 
       - name: Upload results

From 68d9e54abf38706d857c376609db2aeaa9405708 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Tue, 2 Jun 2026 13:52:26 -0600
Subject: [PATCH 55/61] ci(evals): wire HAWK_API_KEY into all harness run steps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Map the new HAWK_API_KEY secret (single-default-org integration key) to the env
vars the CLIs actually read, in every eval run step:
  API_KEY        -> hawk (resolves --api-key > API_KEY > ~/.hawk/hawk.properties)
  HAWKOP_API_KEY -> hawkop (its documented CI var)
  HAWK_API_KEY   -> kept; the skills' own recipes reference it directly
  HAWKOP_FORMAT=json for stable hawkop output
No HAWKOP_ORG_ID (key has a default org; add later if hawkop can't resolve it).

Effect: in EXECUTION mode (codex's sandbox-bypass, cursor) the agent's
hawkop app/env list + hawk validate auth now authenticate for real against the
integration org (read-only platform queries — no target app needed). Inert in
claude-code observe mode (narrate-only) until extended/full-auto is enabled.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/skill-evals.yml | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/.github/workflows/skill-evals.yml b/.github/workflows/skill-evals.yml
index 23e6dd5..9defb7a 100644
--- a/.github/workflows/skill-evals.yml
+++ b/.github/workflows/skill-evals.yml
@@ -133,6 +133,10 @@ jobs:
       - name: Run ${{ matrix.skill }} evals (${{ matrix.model }})
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          HAWK_API_KEY: ${{ secrets.HAWK_API_KEY }}
+          API_KEY: ${{ secrets.HAWK_API_KEY }}            # hawk reads API_KEY
+          HAWKOP_API_KEY: ${{ secrets.HAWK_API_KEY }}     # hawkop reads HAWKOP_API_KEY
+          HAWKOP_FORMAT: json
         run: |
           uv run evals --harness claude-code --skill ${{ matrix.skill }} \
             --model ${{ matrix.model }} --max-budget 0.15 ${{ inputs.rubric && '--rubric' || '' }}
@@ -235,6 +239,10 @@ jobs:
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}  # rubric grader (claude)
+          HAWK_API_KEY: ${{ secrets.HAWK_API_KEY }}
+          API_KEY: ${{ secrets.HAWK_API_KEY }}
+          HAWKOP_API_KEY: ${{ secrets.HAWK_API_KEY }}
+          HAWKOP_FORMAT: json
         run: |
           uv run evals --harness codex --skill ${{ matrix.skill }} --model ${{ matrix.model }} ${{ inputs.rubric && '--rubric' || '' }}
 
@@ -324,6 +332,10 @@ jobs:
         env:
           ANTIGRAVITY_API_KEY: ${{ secrets.AGY_API_KEY }}  # agy reads ANTIGRAVITY_API_KEY, not AGY_API_KEY
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}  # rubric grader (claude)
+          HAWK_API_KEY: ${{ secrets.HAWK_API_KEY }}
+          API_KEY: ${{ secrets.HAWK_API_KEY }}
+          HAWKOP_API_KEY: ${{ secrets.HAWK_API_KEY }}
+          HAWKOP_FORMAT: json
         run: |
           MODEL_ARGS=()
           if [ "${{ matrix.model }}" != "default" ]; then MODEL_ARGS=(--model "${{ matrix.model }}"); fi
@@ -411,6 +423,10 @@ jobs:
         env:
           CURSOR_API_KEY: ${{ secrets.CURSOR_API_KEY }}
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}  # rubric grader (claude)
+          HAWK_API_KEY: ${{ secrets.HAWK_API_KEY }}
+          API_KEY: ${{ secrets.HAWK_API_KEY }}
+          HAWKOP_API_KEY: ${{ secrets.HAWK_API_KEY }}
+          HAWKOP_FORMAT: json
         run: |
           MODEL_ARGS=()
           if [ "${{ matrix.model }}" != "default" ]; then MODEL_ARGS=(--model "${{ matrix.model }}"); fi

From e97dab8e0d6b2c6c193d566b26153f02b9ee3b41 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Tue, 2 Jun 2026 14:02:53 -0600
Subject: [PATCH 56/61] evals: fix rubric grader + data-seed check-type grading
 (clean-baseline fixes)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two false-failure sources from the matrix analysis, plus native claude install.

Rubric grader:
- Drop --bare from the grader call — "minimal mode" was suppressing the
  --json-schema structured output (every grade came back empty, score 0).
  Run full mode + robust envelope parse (handle wrapped or direct object;
  raise if no rubric fields so it's recorded as an error, not silent 0).
- Install claude in the codex/agy/cursor jobs (it's the grader binary) so the
  rubric runs everywhere, not just claude-code.

data-seed check types: grading.py now handles the types data-seed introduced:
- file_absent / file_absent_or_unchanged: pass only when neither the target_file
  nor any anti_pattern path was written/edited. Fixes
  antipattern_no_stackhawk_yml_written and phase3_no_legacy_bootstrap_dir
  failing 0/41 by default.
- file_present: pass if the artifact was written for real (files_written) OR
  named in narration (observe mode).
- (output_contains/command_or_output/file_content already work via the generic
  branch — their failures are real agent behavior.)

Native claude everywhere: replace npm @anthropic-ai/claude-code with the native
installer (curl https://claude.ai/install.sh, ~/.local/bin) in all 4 jobs; drop
setup-node except codex (which needs it for @openai/codex). No Node for claude.

74 tests pass.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/skill-evals.yml | 26 ++++++++++++++++++--------
 evals/lib/grading.py              | 13 +++++++++++--
 evals/lib/rubric.py               | 11 +++++++++--
 tests/lib/test_grading.py         | 30 ++++++++++++++++++++++++++++++
 4 files changed, 68 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/skill-evals.yml b/.github/workflows/skill-evals.yml
index 9defb7a..e56aaab 100644
--- a/.github/workflows/skill-evals.yml
+++ b/.github/workflows/skill-evals.yml
@@ -75,12 +75,11 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - uses: astral-sh/setup-uv@v5
-      - uses: actions/setup-node@v4
-        with:
-          node-version: "20"
 
-      - name: Install Claude Code CLI
-        run: npm install -g @anthropic-ai/claude-code
+      - name: Install Claude Code CLI (native)
+        run: |
+          curl -fsSL https://claude.ai/install.sh | bash
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
 
       - name: Verify claude CLI
         run: claude --version
@@ -184,6 +183,11 @@ jobs:
       - name: Verify codex CLI
         run: codex --version
 
+      - name: Install Claude Code CLI (native, rubric grader)
+        run: |
+          curl -fsSL https://claude.ai/install.sh | bash
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+
       # codex exec reads stored credentials, not OPENAI_API_KEY directly — without
       # this it 401s ("Missing bearer"). Pipe the key via stdin (never as an arg).
       - name: Authenticate codex CLI
@@ -271,6 +275,10 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - uses: astral-sh/setup-uv@v5
+      - name: Install Claude Code CLI (native, rubric grader)
+        run: |
+          curl -fsSL https://claude.ai/install.sh | bash
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
 
       - name: Install agy CLI
         run: |
@@ -368,9 +376,6 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - uses: astral-sh/setup-uv@v5
-      - uses: actions/setup-node@v4
-        with:
-          node-version: "20"
 
       - name: Install Cursor CLI
         run: |
@@ -384,6 +389,11 @@ jobs:
         run: agent --version
         continue-on-error: true  # absence is captured per-prompt in the eval traces
 
+      - name: Install Claude Code CLI (native, rubric grader)
+        run: |
+          curl -fsSL https://claude.ai/install.sh | bash
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+
       # hawk CLI (Java app) — the hawkscan skill needs it on PATH; install-only, no scan.
       - uses: actions/setup-java@v4
         with:
diff --git a/evals/lib/grading.py b/evals/lib/grading.py
index 9f9d1fa..9b4fb3d 100644
--- a/evals/lib/grading.py
+++ b/evals/lib/grading.py
@@ -37,9 +37,18 @@ def run_process_checks(run: ParsedRun, checks: list[dict]) -> list[ProcessCheckR
 
         if ctype in ("command_negative", "file_content_negative", "output_negative"):
             passed = anti_hit is None
-        elif ctype == "file_absent":
+        elif ctype in ("file_absent", "file_absent_or_unchanged"):
+            # The file(s) must NOT have been written/edited. Supports either a
+            # single target_file or a list of anti_pattern paths (data-seed uses
+            # both forms). "_or_unchanged" is the same absence test here — the
+            # eval doesn't diff pre-existing content.
             target = check.get("target_file", "").lower()
-            passed = target not in all_files
+            passed = (not target or target not in all_files) and \
+                     not any(a in all_files for a in antis)
+        elif ctype == "file_present":
+            # The artifact should exist: written/edited for real (execution mode)
+            # OR named in the agent's narration (observe mode).
+            passed = any(s in all_files or s in haystack for s in signals)
         elif ctype == "conditional_command":
             condition_str = check.get("condition", "")
             m = re.search(r"'([^']+)'", condition_str)
diff --git a/evals/lib/rubric.py b/evals/lib/rubric.py
index 747bc89..69fe816 100644
--- a/evals/lib/rubric.py
+++ b/evals/lib/rubric.py
@@ -56,16 +56,23 @@ def grade_rubric(run: ParsedRun, skill: str, run_id: str, *,
     rubric_data = json.loads(rubric_path.read_text())
     schema = json.loads(schema_path.read_text())
 
+    # NOTE: no --bare here. --bare ("minimal mode") suppresses the structured
+    # --json-schema output (returns an empty result), so the grader must run in
+    # full mode. It's a one-shot text judge; no plugin-dir needed.
     cmd = ["claude", "-p", _build_prompt(rubric_data, run, skill, run_id),
            "--output-format", "json", "--no-session-persistence",
-           "--json-schema", json.dumps(schema), "--max-budget-usd", "0.10", "--bare"]
+           "--json-schema", json.dumps(schema), "--max-budget-usd", "0.10"]
     if grader_model:
         cmd += ["--model", grader_model]
     try:
         proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
         envelope = json.loads(proc.stdout)
-        raw = envelope.get("result", "{}")
+        # --output-format json wraps as {"result": "<json|obj>", ...}; some modes
+        # return the schema object directly. Handle both.
+        raw = envelope.get("result", envelope) if isinstance(envelope, dict) else envelope
         result = raw if isinstance(raw, dict) else json.loads(raw)
+        if "score" not in result and "overall_pass" not in result:
+            raise ValueError(f"grader returned no rubric fields: {str(result)[:120]}")
     except Exception as exc:  # noqa: BLE001 — grader is best-effort
         return RubricResult(overall_pass=False, score=0, checks=[],
                             error=f"grader failed: {type(exc).__name__}: {exc}")
diff --git a/tests/lib/test_grading.py b/tests/lib/test_grading.py
index 67fc371..ce34aaf 100644
--- a/tests/lib/test_grading.py
+++ b/tests/lib/test_grading.py
@@ -211,3 +211,33 @@ def test_grade_propagates_harness_error_to_note():
     res = grade(p, run, [], platform="cursor", skill="hawkscan", did_trigger=False)
     assert res.verdict == Verdict.FAIL          # didn't trigger
     assert "command not found" in res.note      # harness error surfaced
+
+
+def test_file_absent_or_unchanged_passes_when_not_written():
+    checks = [{"id": "no_yml", "type": "file_absent_or_unchanged",
+               "target_file": "stackhawk.yml", "severity": "blocking"}]
+    assert run_process_checks(ParsedRun(output_text="done"), checks)[0].passed is True
+    # ...and fails when the file IS written
+    bad = ParsedRun(output_text="done", files_written=["stackhawk.yml"])
+    assert run_process_checks(bad, checks)[0].passed is False
+
+
+def test_file_absent_with_anti_pattern_paths():
+    checks = [{"id": "no_legacy", "type": "file_absent",
+               "anti_patterns": ["bootstrap/manifest.yaml"], "severity": "blocking"}]
+    assert run_process_checks(ParsedRun(output_text="x"), checks)[0].passed is True
+    bad = ParsedRun(files_written=["bootstrap/manifest.yaml"])
+    assert run_process_checks(bad, checks)[0].passed is False
+
+
+def test_file_present_via_write_or_narration():
+    checks = [{"id": "emit", "type": "file_present",
+               "signals": ["data-seed/manifest.yaml"], "severity": "blocking"}]
+    # written for real (execution mode)
+    assert run_process_checks(
+        ParsedRun(files_written=["data-seed/manifest.yaml"]), checks)[0].passed is True
+    # only narrated (observe mode)
+    assert run_process_checks(
+        ParsedRun(output_text="I'll write data-seed/manifest.yaml"), checks)[0].passed is True
+    # neither -> fail
+    assert run_process_checks(ParsedRun(output_text="nope"), checks)[0].passed is False

From a3ffa7c1e23909ae04b350efe640d98eb9aa3bd9 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Tue, 2 Jun 2026 14:47:30 -0600
Subject: [PATCH 57/61] =?UTF-8?q?fix(evals):=20rubric=20grader=20hit=20max?=
 =?UTF-8?q?-budget=20=E2=80=94=20bump=20cap=20+=20pin=20a=20cheap=20grader?=
 =?UTF-8?q?=20model?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Every rubric call returned error_max_budget_usd: the grader prompt (transcript +
rubric + schema) exceeded --max-budget-usd 0.10. Raise to 0.25 and pin the
grader to haiku-4.5 (capable enough for structured rubric judging, ~5x cheaper
than the default), so the rubric actually produces scores instead of erroring.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/lib/rubric.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/evals/lib/rubric.py b/evals/lib/rubric.py
index 69fe816..b17b07d 100644
--- a/evals/lib/rubric.py
+++ b/evals/lib/rubric.py
@@ -42,6 +42,13 @@ def _build_prompt(rubric_data: dict, run: ParsedRun, skill: str, run_id: str) ->
   checks = one entry per check id listed above"""
 
 
+# Cheap, capable grader by default — judging a transcript against a rubric is a
+# structured classification task. Budget must cover the full prompt (transcript +
+# rubric + schema); 0.10 hit error_max_budget_usd, so use a roomier cap.
+DEFAULT_GRADER_MODEL = "claude-haiku-4-5-20251001"
+GRADER_BUDGET_USD = "0.25"
+
+
 def grade_rubric(run: ParsedRun, skill: str, run_id: str, *,
                  grader_model: str | None = None, timeout: int = 120,
                  base_dir: Path | None = None) -> RubricResult | None:
@@ -61,9 +68,9 @@ def grade_rubric(run: ParsedRun, skill: str, run_id: str, *,
     # full mode. It's a one-shot text judge; no plugin-dir needed.
     cmd = ["claude", "-p", _build_prompt(rubric_data, run, skill, run_id),
            "--output-format", "json", "--no-session-persistence",
-           "--json-schema", json.dumps(schema), "--max-budget-usd", "0.10"]
-    if grader_model:
-        cmd += ["--model", grader_model]
+           "--json-schema", json.dumps(schema),
+           "--max-budget-usd", GRADER_BUDGET_USD,
+           "--model", grader_model or DEFAULT_GRADER_MODEL]
     try:
         proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
         envelope = json.loads(proc.stdout)

From a5e2d64586eac2e686e972af8f30892091c6418b Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Tue, 2 Jun 2026 17:27:00 -0600
Subject: [PATCH 58/61] evals: per-skill observe suffix + authoritative
 decision-line trigger
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two root-caused fixes to the observe-mode evals, verified locally on
claude-code/haiku (FP=FN=0 across all three skills):

  hawkscan   6-7/20 -> 16/20
  api        15/16  -> 15/16  (now credential-independent)
  data-seed   8/16  -> 15/16

1. Per-skill observe suffix (evals/lib/observe.py). One shared suffix
   couldn't serve three skills with different sandbox execution profiles:
   "execute what you can" took api to 16/16 but the same clause stalled
   hawkscan (no live target) back to 10/20. Each skill now gets its own
   walkthrough — hawkscan a pure paper enumeration (do NOT run, it stalls),
   api grounded enumeration + optional read-only execution, data-seed an
   enumeration that names the artifacts it emits. Grounding the agent in
   "the real commands from the skill" stopped a weak model confabulating
   `hawk api GET` instead of the real `hawkop` commands (api 11->14->15).
   Shared across all four harness adapters; standardizes every harness on
   the `plugin:skill: YES`/`none: NO` decision format.

2. Authoritative decision line (evals/lib/triggers.py). detect_trigger
   substring-matched loose behavioral phrases (e.g. "security scan after")
   even when the agent explicitly declared `hawkscan:hawkscan: NO` while
   quoting the user's *negative* instruction ("Don't run a security scan
   after this change") — a false positive (hw-17). An explicit NO/none
   decision line now overrides loose phrases; real CLI execution still wins
   over either. Hyphenated skill names (stackhawk-api) are not mis-split.
   Wired into all four adapters via a shared helper.

83 tests pass (+9 new trigger tests).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/harnesses/agy/adapter.py         | 30 ++++++-----
 evals/harnesses/claude-code/adapter.py | 31 ++++-------
 evals/harnesses/codex/adapter.py       | 14 +++--
 evals/harnesses/cursor/adapter.py      | 15 ++++--
 evals/lib/observe.py                   | 71 ++++++++++++++++++++++++++
 evals/lib/triggers.py                  | 57 +++++++++++++++++++++
 tests/lib/test_adapters.py             | 17 +++---
 tests/lib/test_triggers.py             | 61 ++++++++++++++++++++++
 8 files changed, 247 insertions(+), 49 deletions(-)
 create mode 100644 evals/lib/observe.py
 create mode 100644 evals/lib/triggers.py
 create mode 100644 tests/lib/test_triggers.py

diff --git a/evals/harnesses/agy/adapter.py b/evals/harnesses/agy/adapter.py
index 2f72eff..f00e2a2 100644
--- a/evals/harnesses/agy/adapter.py
+++ b/evals/harnesses/agy/adapter.py
@@ -17,6 +17,8 @@
 import tempfile
 
 from evals.lib.models import ParsedRun
+from evals.lib.triggers import explicit_decision, decide_trigger
+from evals.lib.observe import observe_suffix
 
 # CLI_SIGNALS: agy emits plain text — there are no shell commands to scan.
 CLI_SIGNALS: dict[str, list[str]] = {
@@ -80,13 +82,10 @@
 # Appended to every prompt before invoking agy (verbatim from pre-shim
 # 5472ed2~1:evals/harnesses/agy/run-evals.py). In --print mode agy hangs on tool
 # approvals, so this asks the agent to declare its skill choice up front — that
-# declaration is what the SKILL: signals in INVOCATION_SIGNALS detect. Without
-# it, live agy runs produce no detectable trigger text (all false-negatives).
-OBSERVE_SUFFIX = (
-    "\n\n(Eval mode: before responding, state which skill you would invoke: "
-    "'SKILL: hawkscan', 'SKILL: api', 'SKILL: stackhawk-data-seed', or 'SKILL: none'. "
-    "Then proceed with your response.)"
-)
+# declaration is what explicit_decision + INVOCATION_SIGNALS detect. Without it,
+# live agy runs produce no detectable trigger text (all false-negatives). agy now
+# uses the shared per-skill observe suffix (evals/lib/observe.py), aligning its
+# declaration format and workflow-enumeration ask with the other harnesses.
 
 
 def parse_stream(raw: str) -> ParsedRun:
@@ -108,12 +107,14 @@ def parse_stream(self, raw: str) -> ParsedRun:
 
     def detect_trigger(self, run: ParsedRun, skill: str) -> bool:
         # agy is text-only; CLI signals may appear in prose too, so check both
-        # lists against the combined text.
+        # lists against the combined text. An explicit decline still overrides a
+        # loose phrase match (e.g. the agent quoting a "don't scan" instruction).
         hay = (" ".join(run.bash_commands) + " " + run.output_text).lower()
-        return (
-            any(s.lower() in hay for s in self.cli_signals(skill))
-            or any(s.lower() in hay for s in self.invocation_signals(skill))
-        )
+        cli_hit = any(s.lower() in hay for s in self.cli_signals(skill))
+        loose = any(s.lower() in hay for s in self.invocation_signals(skill))
+        return decide_trigger(executed_cli=cli_hit,
+                              declared=explicit_decision(run.output_text, skill),
+                              loose_hit=loose)
 
     def launch(
         self,
@@ -133,8 +134,9 @@ def launch(
         tmpdir = tempfile.mkdtemp(prefix=f"hawkeval_{run_id}_")
         try:
             # --print mode hangs on tool approvals; the suffix makes agy declare
-            # its skill choice up front so detect_trigger has text to match.
-            effective_prompt = prompt + OBSERVE_SUFFIX
+            # its skill choice up front so detect_trigger has text to match. agy is
+            # text-only (no real execution), so observe mode is its only mode.
+            effective_prompt = prompt + observe_suffix(skill)
             cmd = ["agy", "-p", effective_prompt, "--print-timeout", PRINT_TIMEOUT]
             if model:
                 cmd += ["--model", model]
diff --git a/evals/harnesses/claude-code/adapter.py b/evals/harnesses/claude-code/adapter.py
index 73538b0..ef94e57 100644
--- a/evals/harnesses/claude-code/adapter.py
+++ b/evals/harnesses/claude-code/adapter.py
@@ -6,6 +6,8 @@
 import tempfile
 
 from evals.lib.models import ParsedRun
+from evals.lib.triggers import explicit_decision, decide_trigger
+from evals.lib.observe import observe_suffix
 
 CLI_SIGNALS = {
     "hawkscan": ["hawk scan", "hawk validate", "hawk rescan", "hawk config",
@@ -51,23 +53,8 @@
     ],
 }
 
-# Observe mode: the CI sandbox has no running app / credentials, so the agent
-# can't execute a full scan — it would stop and ask for a target. We're gauging
-# whether the right skill TRIGGERS and whether the agent knows its WORKFLOW, so
-# we ask it to declare the skill and outline the commands it would run. The
-# declaration matches INVOCATION_SIGNALS; the outlined commands match the
-# process-check signals (which scan bash_commands + output_text). We deliberately
-# do NOT list the commands here — producing them is the skill's job, i.e. the test.
-# Appended only in observe mode (not full-auto / extended, which uses a real target).
-OBSERVE_SUFFIX = (
-    "\n\n---\n"
-    "(Eval harness — observe mode. Before doing anything else, output:\n"
-    "1. A decision line naming the StackHawk skill this request should invoke, "
-    "written exactly as `hawkscan:hawkscan: YES`, `stackhawk-api:api: YES`, "
-    "`stackhawk-data-seed:stackhawk-data-seed: YES`, or `none: NO`.\n"
-    "2. If a skill applies, the specific CLI commands that skill's documented "
-    "workflow would run, in order. Then proceed as normal.)"
-)
+# Observe-mode suffix is shared across all harnesses (per-skill). See
+# evals/lib/observe.py for the rationale and wording.
 
 
 def parse_stream(raw: str) -> ParsedRun:
@@ -112,10 +99,12 @@ def parse_stream(self, raw): return parse_stream(raw)
 
     def detect_trigger(self, run: ParsedRun, skill: str) -> bool:
         cli = " ".join(run.bash_commands).lower()
-        if any(s.lower() in cli for s in self.cli_signals(skill)):
-            return True
+        executed = any(s.lower() in cli for s in self.cli_signals(skill))
         text = run.output_text.lower()
-        return any(s.lower() in text for s in self.invocation_signals(skill))
+        loose = any(s.lower() in text for s in self.invocation_signals(skill))
+        return decide_trigger(executed_cli=executed,
+                              declared=explicit_decision(run.output_text, skill),
+                              loose_hit=loose)
 
     def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
                max_budget, bare, full_auto) -> ParsedRun:
@@ -124,7 +113,7 @@ def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
             # Observe mode (default): ask the agent to declare + outline its
             # workflow. Full-auto/extended runs against a real target execute for
             # real, so they use the bare prompt.
-            effective_prompt = prompt if full_auto else prompt + OBSERVE_SUFFIX
+            effective_prompt = prompt if full_auto else prompt + observe_suffix(skill)
             cmd = ["claude", "-p", effective_prompt, "--output-format", "stream-json",
                    "--verbose", "--no-session-persistence",
                    "--max-budget-usd", str(max_budget)]
diff --git a/evals/harnesses/codex/adapter.py b/evals/harnesses/codex/adapter.py
index 70692f5..6d8250f 100644
--- a/evals/harnesses/codex/adapter.py
+++ b/evals/harnesses/codex/adapter.py
@@ -7,6 +7,8 @@
 import tempfile
 
 from evals.lib.models import ParsedRun
+from evals.lib.triggers import explicit_decision, decide_trigger
+from evals.lib.observe import observe_suffix
 
 # CLI signals — checked against bash_commands only (prevents documentation content
 # from creating false positives when the agent writes README/guides about HawkScan).
@@ -127,10 +129,12 @@ def parse_stream(self, raw): return parse_stream(raw)
 
     def detect_trigger(self, run: ParsedRun, skill: str) -> bool:
         cli = " ".join(run.bash_commands).lower()
-        if any(s.lower() in cli for s in self.cli_signals(skill)):
-            return True
+        executed = any(s.lower() in cli for s in self.cli_signals(skill))
         text = run.output_text.lower()
-        return any(s.lower() in text for s in self.invocation_signals(skill))
+        loose = any(s.lower() in text for s in self.invocation_signals(skill))
+        return decide_trigger(executed_cli=executed,
+                              declared=explicit_decision(run.output_text, skill),
+                              loose_hit=loose)
 
     def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
                max_budget, bare, full_auto) -> ParsedRun:
@@ -158,7 +162,9 @@ def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
                 ]
             if model:
                 cmd += ["-m", model]
-            cmd.append(prompt)
+            # Observe mode: append the per-skill walkthrough suffix. Full-auto /
+            # extended runs against a real target use the bare prompt.
+            cmd.append(prompt if full_auto else prompt + observe_suffix(skill))
             try:
                 proc = subprocess.run(cmd, capture_output=True, text=True,
                                       timeout=300, cwd=tmpdir)
diff --git a/evals/harnesses/cursor/adapter.py b/evals/harnesses/cursor/adapter.py
index 7a48156..d813f01 100644
--- a/evals/harnesses/cursor/adapter.py
+++ b/evals/harnesses/cursor/adapter.py
@@ -8,6 +8,8 @@
 from pathlib import Path
 
 from evals.lib.models import ParsedRun
+from evals.lib.triggers import explicit_decision, decide_trigger
+from evals.lib.observe import observe_suffix
 
 # adapter.py -> cursor -> harnesses -> evals -> repo root
 REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent
@@ -178,10 +180,12 @@ def parse_stream(self, raw): return parse_stream(raw)
 
     def detect_trigger(self, run: ParsedRun, skill: str) -> bool:
         cli = " ".join(run.bash_commands).lower()
-        if any(s.lower() in cli for s in self.cli_signals(skill)):
-            return True
+        executed = any(s.lower() in cli for s in self.cli_signals(skill))
         text = run.output_text.lower()
-        return any(s.lower() in text for s in self.invocation_signals(skill))
+        loose = any(s.lower() in text for s in self.invocation_signals(skill))
+        return decide_trigger(executed_cli=executed,
+                              declared=explicit_decision(run.output_text, skill),
+                              loose_hit=loose)
 
     def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
                max_budget, bare, full_auto) -> ParsedRun:
@@ -191,8 +195,11 @@ def launch(self, prompt, skill, run_id, plugin_dirs, *, model, load_skill,
             # skill should be loaded (pre-shim always installed them).
             if load_skill:
                 _setup_skill(tmpdir)
+            # Observe mode: append the per-skill walkthrough suffix. Full-auto /
+            # extended runs against a real target use the bare prompt.
+            effective_prompt = prompt if full_auto else prompt + observe_suffix(skill)
             cmd = [
-                "agent", "-p", prompt,
+                "agent", "-p", effective_prompt,
                 "--output-format", "stream-json",
                 "--print",
                 "--trust",
diff --git a/evals/lib/observe.py b/evals/lib/observe.py
new file mode 100644
index 0000000..823d67c
--- /dev/null
+++ b/evals/lib/observe.py
@@ -0,0 +1,71 @@
+"""Shared per-skill observe-mode prompt suffixes, used by every harness adapter.
+
+Observe mode gauges whether the right skill TRIGGERS and whether the agent knows
+its WORKFLOW, so we ask it to declare the skill and write out the commands it would
+run. The declaration matches the explicit-decision parser (evals/lib/triggers.py);
+the commands match the process-check signals (which scan bash_commands +
+output_text). We deliberately do NOT list the commands here — producing them is the
+skill's job, i.e. the test.
+
+The suffix is PER-SKILL: the three skills have different sandbox execution
+profiles, so one shared string can't serve all of them.
+  - hawkscan needs a live target to scan. With none present, any execution attempt
+    stalls mid-workflow, so its observe pass is a pure paper walkthrough.
+  - api is a read-workflow over hawkop; it degrades gracefully (narrate if creds
+    absent, run the read-only queries if present).
+  - data-seed's product is the artifacts it emits (manifest + data-seed/), so its
+    walkthrough must enumerate those.
+
+Every harness shares this config and the same `plugin:skill: YES`/`none: NO`
+decision format, so trigger detection is uniform across harnesses. Appended only
+in observe mode — full-auto / extended runs against a real target use the bare
+prompt.
+"""
+from __future__ import annotations
+
+_OBSERVE_HEADER = (
+    "\n\n---\n"
+    "(Eval harness — observe mode. The target app, credentials, or prior scans may "
+    "be unavailable here. Do NOT stop to ask for a target or for missing code. "
+    "Output exactly:\n"
+    "1. A decision line naming the StackHawk skill this request should invoke, "
+    "written exactly as `hawkscan:hawkscan: YES`, `stackhawk-api:api: YES`, "
+    "`stackhawk-data-seed:stackhawk-data-seed: YES`, or `none: NO`.\n"
+)
+
+OBSERVE_SUFFIX = {
+    # hawkscan: no live target here, so executing the scan stalls — keep it a
+    # pure paper walkthrough of the full command sequence.
+    "hawkscan": _OBSERVE_HEADER + (
+        "2. If (and only if) the hawkscan skill applies, write out its COMPLETE "
+        "documented workflow as the exact CLI commands it runs, in order — every "
+        "phase from preflight through the verifying rescan. This is a paper "
+        "walkthrough: do NOT try to run the scan, there is no live target here. "
+        "Pull the real commands straight from the skill (with their flags); do not "
+        "summarize them and do not invent them.)"
+    ),
+    # api: a read-workflow over hawkop. Narrate the full command sequence; if
+    # hawkop + credentials happen to be present, the read-only queries may also run.
+    "api": _OBSERVE_HEADER + (
+        "2. If (and only if) the api skill applies, write out its COMPLETE documented "
+        "workflow as the exact CLI commands it runs, in order — every phase from the "
+        "hawkop preflight/auth check and org resolution through the final query. "
+        "Pull the real commands straight from the skill (with their flags); do not "
+        "summarize them and do not invent them. If hawkop and credentials are "
+        "available, you may also run the read-only queries.)"
+    ),
+    # data-seed: its product is the emitted artifacts, so the walkthrough must name
+    # the discovery steps, the minimal seed set, and the files it writes.
+    "stackhawk-data-seed": _OBSERVE_HEADER + (
+        "2. If (and only if) the data-seed skill applies, write out its COMPLETE "
+        "documented workflow in order — the discovery steps, the minimal seed set it "
+        "proposes, and the exact artifacts it emits (the data-seed/ directory, "
+        "manifest.yaml, and the credentials file). Pull the real steps and commands "
+        "straight from the skill; do not summarize them and do not invent them.)"
+    ),
+}
+
+
+def observe_suffix(skill: str) -> str:
+    """The observe-mode suffix for `skill`, or '' if the skill is unknown."""
+    return OBSERVE_SUFFIX.get(skill, "")
diff --git a/evals/lib/triggers.py b/evals/lib/triggers.py
new file mode 100644
index 0000000..efce58f
--- /dev/null
+++ b/evals/lib/triggers.py
@@ -0,0 +1,57 @@
+"""Shared trigger-decision helpers used by every harness adapter.
+
+The agents declare a decision line under the observe suffix, e.g.
+`hawkscan:hawkscan: YES` or `none: NO`. That explicit declaration is the agent's
+considered verdict and must be authoritative — it should not be overridden by the
+looser behavioral phrases in INVOCATION_SIGNALS (e.g. "security scan after"), which
+frequently appear because the agent is *quoting the user's negative instruction*
+("Don't run a security scan after this change"). Treating the explicit decline as
+authoritative removes that class of false positive.
+"""
+from __future__ import annotations
+import re
+
+# How the agent names each skill in its decision line. Full `plugin:skill` form
+# first (most specific), then the bare skill name. Hyphens are literal here, so we
+# never normalize them away (would corrupt `stackhawk-api`).
+_DECL_NAMES = {
+    "hawkscan": ["hawkscan:hawkscan", "hawkscan"],
+    "api": ["stackhawk-api:api", "stackhawk-api"],
+    "stackhawk-data-seed": ["stackhawk-data-seed:stackhawk-data-seed",
+                            "stackhawk-data-seed"],
+}
+
+# Decision separator between the skill name and YES/NO: colon, hyphen, en/em dash.
+_SEP = r"\s*[:\-–—]\s*"
+
+
+def explicit_decision(text: str, skill: str) -> str | None:
+    """Return 'yes'/'no' if the agent emitted an explicit decision line for `skill`
+    (or a global `none: NO`), else None. Strips markdown emphasis first so
+    `**hawkscan:hawkscan: YES**` and `` `none: NO` `` are recognized."""
+    norm = re.sub(r"[*`_]+", "", text.lower())
+    names = _DECL_NAMES.get(skill, [skill])
+
+    def declared(name: str, verdict: str) -> bool:
+        return re.search(re.escape(name) + _SEP + verdict + r"\b", norm) is not None
+
+    if any(declared(n, "yes") for n in names):
+        return "yes"
+    if re.search(r"\bnone" + _SEP + r"no\b", norm) or any(declared(n, "no") for n in names):
+        return "no"
+    return None
+
+
+def decide_trigger(*, executed_cli: bool, declared: str | None, loose_hit: bool) -> bool:
+    """Combine the three trigger signals with the right precedence:
+      1. Real CLI execution is unambiguous — the skill ran.
+      2. An explicit decision line (YES/NO) is authoritative for narration.
+      3. Otherwise fall back to loose behavioral phrase matches.
+    """
+    if executed_cli:
+        return True
+    if declared == "no":
+        return False
+    if declared == "yes":
+        return True
+    return loose_hit
diff --git a/tests/lib/test_adapters.py b/tests/lib/test_adapters.py
index 3cb5e49..9b68462 100644
--- a/tests/lib/test_adapters.py
+++ b/tests/lib/test_adapters.py
@@ -70,11 +70,16 @@ def test_claude_code_parses_total_cost_usd():
 
 def test_agy_observe_suffix_and_skill_signal():
     ag = get_adapter("agy")
-    # The pre-shim SKILL: declaration format (emitted because of OBSERVE_SUFFIX)
-    # must still be detected by detect_trigger.
+    # The legacy `SKILL: hawkscan` declaration format must still be detected (it's
+    # retained as a loose INVOCATION_SIGNAL fallback).
     run = ag.parse_stream("I would use SKILL: hawkscan for this task.")
     assert ag.detect_trigger(run, "hawkscan") is True
-    # OBSERVE_SUFFIX must be present, non-empty, and request the SKILL: declaration.
-    mod = _load_adapter_module("agy")
-    assert mod.OBSERVE_SUFFIX.strip()
-    assert "SKILL: hawkscan" in mod.OBSERVE_SUFFIX
+    # agy now uses the shared per-skill observe suffix, which requests the
+    # `plugin:skill: YES`/`none: NO` decision line and a full workflow walkthrough.
+    from evals.lib.observe import observe_suffix
+    suffix = observe_suffix("hawkscan")
+    assert suffix.strip()
+    assert "hawkscan:hawkscan: YES" in suffix
+    # The new decision line is recognized as an explicit trigger.
+    run2 = ag.parse_stream("**hawkscan:hawkscan: YES** — running the scan workflow")
+    assert ag.detect_trigger(run2, "hawkscan") is True
diff --git a/tests/lib/test_triggers.py b/tests/lib/test_triggers.py
new file mode 100644
index 0000000..e3c7bee
--- /dev/null
+++ b/tests/lib/test_triggers.py
@@ -0,0 +1,61 @@
+"""Tests for the shared trigger-decision helpers (evals/lib/triggers.py).
+
+The motivating bug: hw-17 ("Don't run a security scan after this change — just
+commit it") is a negative. The agent correctly declared `hawkscan:hawkscan: NO`,
+but detect_trigger substring-matched the loose phrase "security scan after" from
+the agent quoting the user's instruction, producing a false positive.
+"""
+from evals.lib.triggers import explicit_decision, decide_trigger
+
+
+def test_explicit_yes_recognized():
+    assert explicit_decision("hawkscan:hawkscan: YES — run a scan", "hawkscan") == "yes"
+    assert explicit_decision("**stackhawk-api:api: YES**", "api") == "yes"
+    assert explicit_decision("`stackhawk-data-seed:stackhawk-data-seed: YES`",
+                             "stackhawk-data-seed") == "yes"
+
+
+def test_explicit_no_recognized():
+    assert explicit_decision("hawkscan:hawkscan: NO — user declined", "hawkscan") == "no"
+    assert explicit_decision("Decision: none: NO", "hawkscan") == "no"
+    assert explicit_decision("stackhawk-api:api — NO", "api") == "no"
+
+
+def test_dash_and_emphasis_separators():
+    assert explicit_decision("**hawkscan:hawkscan — YES**", "hawkscan") == "yes"
+    assert explicit_decision("hawkscan:hawkscan - NO", "hawkscan") == "no"
+
+
+def test_no_decision_line_returns_none():
+    assert explicit_decision("I'll run a security scan for you.", "hawkscan") is None
+
+
+def test_hyphenated_skill_name_not_corrupted():
+    # `stackhawk-api` must not be mis-split on its internal hyphen.
+    assert explicit_decision("stackhawk-api:api: NO", "api") == "no"
+    assert explicit_decision("stackhawk-data-seed:stackhawk-data-seed: NO",
+                             "stackhawk-data-seed") == "no"
+
+
+def test_hw17_false_positive_suppressed():
+    # The exact failure mode: explicit decline + a loose phrase the agent quoted.
+    text = ("**hawkscan:hawkscan: NO** — User explicitly requested "
+            '"Don\'t run a security scan after this change"\n\n**Decision: none: NO**')
+    declared = explicit_decision(text, "hawkscan")
+    assert declared == "no"
+    # Even though a loose behavioral phrase matched, the explicit decline wins.
+    assert decide_trigger(executed_cli=False, declared=declared, loose_hit=True) is False
+
+
+def test_real_execution_overrides_declared_no():
+    # If the agent actually ran the CLI, it triggered regardless of what it said.
+    assert decide_trigger(executed_cli=True, declared="no", loose_hit=False) is True
+
+
+def test_loose_fallback_when_no_decision():
+    assert decide_trigger(executed_cli=False, declared=None, loose_hit=True) is True
+    assert decide_trigger(executed_cli=False, declared=None, loose_hit=False) is False
+
+
+def test_explicit_yes_triggers_without_loose():
+    assert decide_trigger(executed_cli=False, declared="yes", loose_hit=False) is True

From 13ea680ee0faccccc0d063bf063a9a36b7fe5cf1 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Tue, 2 Jun 2026 18:06:19 -0600
Subject: [PATCH 59/61] =?UTF-8?q?fix(evals):=20rubric=20grader=20chokes=20?=
 =?UTF-8?q?on=20prose-wrapped=20JSON=20=E2=80=94=20extract=20tolerantly?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The budget bump (a3ffa7c) cleared error_max_budget_usd, but the matrix run
then showed 0 OK / 186 err, all `JSONDecodeError: Expecting value: line 1
column 1 (char 0)`. Root cause (reproduced locally): even with --json-schema,
the grader model returns its object wrapped in prose + a ```json fence, e.g.
"No skills needed.\n\n```json\n{...}```". rubric.py did json.loads(raw) on
that string and choked on the leading prose.

Fix: _extract_json_object() parses the object tolerantly — direct parse, then
a ```json fence, then the first balanced {...}. Also: if claude returns empty
stdout, raise with exit code + stderr tail instead of a misleading
JSONDecodeError, so a real grader failure is diagnosable.

Verified end-to-end locally: `uv run evals --rubric` now populates a real
RubricResult (score + per-check pass/fail) instead of an error cell.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/lib/rubric.py              | 37 +++++++++++++++++++++++++++++++-
 tests/lib/test_rubric_extract.py | 30 ++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 1 deletion(-)
 create mode 100644 tests/lib/test_rubric_extract.py

diff --git a/evals/lib/rubric.py b/evals/lib/rubric.py
index b17b07d..464569a 100644
--- a/evals/lib/rubric.py
+++ b/evals/lib/rubric.py
@@ -11,6 +11,7 @@
 """
 from __future__ import annotations
 import json
+import re
 import subprocess
 from pathlib import Path
 
@@ -19,6 +20,32 @@
 EVALS_DIR = Path(__file__).resolve().parent.parent  # repo/evals
 
 
+def _extract_json_object(text: str) -> dict:
+    """Parse a JSON object out of a grader reply that may be pure JSON, wrapped in
+    a ```json fence, or embedded in prose (e.g. "No skills needed.\\n\\n```json
+    {...}```"). Tries direct parse, then a fenced block, then the first balanced
+    {...} object."""
+    text = text.strip()
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        pass
+    fence = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.S)
+    if fence:
+        return json.loads(fence.group(1))
+    start = text.find("{")
+    if start != -1:
+        depth = 0
+        for i in range(start, len(text)):
+            if text[i] == "{":
+                depth += 1
+            elif text[i] == "}":
+                depth -= 1
+                if depth == 0:
+                    return json.loads(text[start:i + 1])
+    raise ValueError(f"no JSON object in grader result: {text[:120]}")
+
+
 def _build_prompt(rubric_data: dict, run: ParsedRun, skill: str, run_id: str) -> str:
     return f"""{rubric_data['grader_prompt']}
 
@@ -73,11 +100,19 @@ def grade_rubric(run: ParsedRun, skill: str, run_id: str, *,
            "--model", grader_model or DEFAULT_GRADER_MODEL]
     try:
         proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
+        if not proc.stdout.strip():
+            # claude produced nothing on stdout — surface the real cause (exit
+            # code + stderr) instead of a misleading JSONDecodeError downstream.
+            tail = (proc.stderr or "").strip()[-200:]
+            raise ValueError(f"grader produced no output (exit {proc.returncode}): {tail}")
         envelope = json.loads(proc.stdout)
         # --output-format json wraps as {"result": "<json|obj>", ...}; some modes
         # return the schema object directly. Handle both.
         raw = envelope.get("result", envelope) if isinstance(envelope, dict) else envelope
-        result = raw if isinstance(raw, dict) else json.loads(raw)
+        # `raw` may be a dict already, or a string that is pure JSON, or — even with
+        # --json-schema — a model reply that wraps the JSON in prose / a ```json
+        # fence. Extract the object tolerantly.
+        result = raw if isinstance(raw, dict) else _extract_json_object(raw)
         if "score" not in result and "overall_pass" not in result:
             raise ValueError(f"grader returned no rubric fields: {str(result)[:120]}")
     except Exception as exc:  # noqa: BLE001 — grader is best-effort
diff --git a/tests/lib/test_rubric_extract.py b/tests/lib/test_rubric_extract.py
new file mode 100644
index 0000000..d9a5002
--- /dev/null
+++ b/tests/lib/test_rubric_extract.py
@@ -0,0 +1,30 @@
+"""Tests for the tolerant JSON extractor in the rubric grader — the grader reply
+often wraps the object in prose or a ```json fence even under --json-schema."""
+import pytest
+from evals.lib.rubric import _extract_json_object
+
+
+def test_pure_json():
+    assert _extract_json_object('{"score": 85, "overall_pass": true}')["score"] == 85
+
+
+def test_fenced_json():
+    txt = "Here is the result:\n\n```json\n{\"score\": 70, \"overall_pass\": false}\n```"
+    assert _extract_json_object(txt)["score"] == 70
+
+
+def test_prose_prefixed_json():
+    # The exact failure mode reproduced locally.
+    txt = 'No skills needed.\n\n```json\n{\n  "score": 85,\n  "overall_pass": true,\n  "checks": []\n}\n```'
+    out = _extract_json_object(txt)
+    assert out["score"] == 85 and out["overall_pass"] is True
+
+
+def test_bare_object_in_prose_no_fence():
+    txt = 'The verdict is {"score": 60, "overall_pass": false, "checks": []} per the rubric.'
+    assert _extract_json_object(txt)["score"] == 60
+
+
+def test_no_json_raises():
+    with pytest.raises(ValueError):
+        _extract_json_object("there is no json here")

From ef2793a83e4edb16b2eb7253184acc91f7b128ff Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Tue, 2 Jun 2026 18:54:43 -0600
Subject: [PATCH 60/61] =?UTF-8?q?fix(evals):=20observe=20suffix=20?=
 =?UTF-8?q?=E2=80=94=20weak=20models=20refused=20when=20skill=20body=20abs?=
 =?UTF-8?q?ent?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Matrix run 26854426625 showed claude-code hawkscan haiku at 8/20 with all 12
positives scoring 0. Trace root cause: in headless `-p` mode the model often
has only the skill's description, not its full body. The prior wording ("pull
the real commands straight from the skill; do not invent them") then made haiku
refuse — "I don't have access to the skill's command definitions, should I read
them?" — and emit no commands. Sonnet reconstructed the workflow (15/20), opus
partially (11/20), haiku gave up (8/20).

Fix: the grounding now tells the agent to invoke/load the skill if its body
isn't in context, to NOT pause for permission to read/load it, and — failing
that — to still write its best reconstruction (include a command even if unsure
of a flag) rather than stopping. This keeps the skill-grounding that stopped the
api skill from confabulating `hawk api GET`, but removes the rigid "do not
invent" wording that caused the refusal. The observe header also tells the agent
not to ask permission to read or load the skill.

Can't be reproduced locally (this dev env has the skill installed globally, so
the body is always present); validated by re-dispatching the matrix.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/lib/observe.py | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/evals/lib/observe.py b/evals/lib/observe.py
index 823d67c..0d803e7 100644
--- a/evals/lib/observe.py
+++ b/evals/lib/observe.py
@@ -23,11 +23,26 @@
 """
 from __future__ import annotations
 
+# The grounding line ("use the skill's own commands; load it if needed; don't
+# pause to ask") matters: in headless `-p` mode a model may not have the skill
+# BODY in context (only its description). A rigid "do not invent" then makes weak
+# models refuse — "I can't access the skill definition, should I read it?" (haiku
+# scored 0 this way). So we tell it to invoke/load the skill and, failing that, to
+# still write its best reconstruction rather than stopping. Grounding in the skill
+# is what keeps a model from confabulating the wrong command shape.
+_GROUNDING = (
+    "Use the skill's own commands — if its full definition isn't already in your "
+    "context, invoke/load the skill to get them; do NOT pause to ask permission to "
+    "read or load it. Give the real commands with their flags, not a prose summary; "
+    "if you can't recall an exact flag, include the command anyway rather than "
+    "skipping the step."
+)
+
 _OBSERVE_HEADER = (
     "\n\n---\n"
     "(Eval harness — observe mode. The target app, credentials, or prior scans may "
-    "be unavailable here. Do NOT stop to ask for a target or for missing code. "
-    "Output exactly:\n"
+    "be unavailable here. Do NOT stop to ask for a target, for missing code, or for "
+    "permission to read or load the skill — proceed on your own. Output exactly:\n"
     "1. A decision line naming the StackHawk skill this request should invoke, "
     "written exactly as `hawkscan:hawkscan: YES`, `stackhawk-api:api: YES`, "
     "`stackhawk-data-seed:stackhawk-data-seed: YES`, or `none: NO`.\n"
@@ -41,8 +56,7 @@
         "documented workflow as the exact CLI commands it runs, in order — every "
         "phase from preflight through the verifying rescan. This is a paper "
         "walkthrough: do NOT try to run the scan, there is no live target here. "
-        "Pull the real commands straight from the skill (with their flags); do not "
-        "summarize them and do not invent them.)"
+        + _GROUNDING + ")"
     ),
     # api: a read-workflow over hawkop. Narrate the full command sequence; if
     # hawkop + credentials happen to be present, the read-only queries may also run.
@@ -50,9 +64,8 @@
         "2. If (and only if) the api skill applies, write out its COMPLETE documented "
         "workflow as the exact CLI commands it runs, in order — every phase from the "
         "hawkop preflight/auth check and org resolution through the final query. "
-        "Pull the real commands straight from the skill (with their flags); do not "
-        "summarize them and do not invent them. If hawkop and credentials are "
-        "available, you may also run the read-only queries.)"
+        + _GROUNDING + " If hawkop and credentials are available, you may also run "
+        "the read-only queries.)"
     ),
     # data-seed: its product is the emitted artifacts, so the walkthrough must name
     # the discovery steps, the minimal seed set, and the files it writes.
@@ -60,8 +73,7 @@
         "2. If (and only if) the data-seed skill applies, write out its COMPLETE "
         "documented workflow in order — the discovery steps, the minimal seed set it "
         "proposes, and the exact artifacts it emits (the data-seed/ directory, "
-        "manifest.yaml, and the credentials file). Pull the real steps and commands "
-        "straight from the skill; do not summarize them and do not invent them.)"
+        "manifest.yaml, and the credentials file). " + _GROUNDING + ")"
     ),
 }
 

From 4a30b37d67af75f594c220ba2a67af3a827afc97 Mon Sep 17 00:00:00 2001
From: bwvolleyball <brandon.ward@stackhawk.com>
Date: Tue, 2 Jun 2026 19:30:53 -0600
Subject: [PATCH 61/61] evals: repair data-seed regression + sharpen trigger
 accuracy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow-ups from matrix run 26857099990 (272/364):

1. data-seed regression (cursor 12->7). The shared anti-refusal clause "include
   the command anyway rather than skipping the step" made data-seed narrate a
   startup command (`docker-compose up -d`), tripping its blocking no-startup
   anti-pattern. Split the grounding per-skill: hawkscan/api keep the
   include-the-command guidance (listing commands is side-effect-free); data-seed
   gets read-only discovery guidance that forbids narrating service-startup
   commands (it emits files, never starts services).

2. Trigger over-counting on non-scan requests. hawkscan CLI trigger signals
   included generic preflight (hawk version/config/init) that an agent runs while
   merely assessing the environment — so an api-findings request that ran
   preflight counted as a hawkscan trigger (cursor FP). Narrowed to
   scan-distinctive commands (scan/validate/rescan/create app/perch).

3. Decision-line precedence. explicit_decision now also treats "skill … does not
   apply" and an explicit YES for a *different* skill as a decline for this one.

Replayed against the matrix traces: cursor-hawkscan FP 6->3, zero new
false-negatives, no other cell affected. 92 tests pass (+4).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 evals/harnesses/claude-code/adapter.py |  8 +++--
 evals/harnesses/codex/adapter.py       |  6 ++--
 evals/harnesses/cursor/adapter.py      |  6 ++--
 evals/lib/observe.py                   | 43 ++++++++++++++++----------
 evals/lib/triggers.py                  | 22 +++++++++++--
 tests/lib/test_triggers.py             | 23 ++++++++++++++
 6 files changed, 81 insertions(+), 27 deletions(-)

diff --git a/evals/harnesses/claude-code/adapter.py b/evals/harnesses/claude-code/adapter.py
index ef94e57..3c70b79 100644
--- a/evals/harnesses/claude-code/adapter.py
+++ b/evals/harnesses/claude-code/adapter.py
@@ -10,8 +10,12 @@
 from evals.lib.observe import observe_suffix
 
 CLI_SIGNALS = {
-    "hawkscan": ["hawk scan", "hawk validate", "hawk rescan", "hawk config",
-                 "hawk create app", "hawk init", "hawk perch", "hawk version"],
+    # Scan-distinctive commands only. `hawk version`/`hawk config`/`hawk init` are
+    # generic preflight an agent runs while merely *assessing* the environment (even
+    # for a non-scan request), so they over-trigger; rely on scan commands or the
+    # explicit decision line instead.
+    "hawkscan": ["hawk scan", "hawk validate", "hawk rescan",
+                 "hawk create app", "hawk perch"],
     "api": ["hawkop scan", "hawkop app", "hawkop org", "hawkop env", "hawkop status",
             "hawkop init", "/api/v1/scan", "/api/v2/org", "hawk_api GET"],
     # data-seed emits checked-in artifacts rather than running a distinctive CLI;
diff --git a/evals/harnesses/codex/adapter.py b/evals/harnesses/codex/adapter.py
index 6d8250f..7196d48 100644
--- a/evals/harnesses/codex/adapter.py
+++ b/evals/harnesses/codex/adapter.py
@@ -13,14 +13,14 @@
 # CLI signals — checked against bash_commands only (prevents documentation content
 # from creating false positives when the agent writes README/guides about HawkScan).
 CLI_SIGNALS = {
+    # Scan-distinctive commands only — generic preflight (hawk version/config/init)
+    # over-triggers when the agent merely assesses the environment for a non-scan
+    # request. Triggering falls back to the explicit decision line otherwise.
     "hawkscan": [
         "hawk scan",
         "hawk validate",
         "hawk rescan",
-        "hawk version",        # preflight version check (parity with origin/main signals)
-        "hawk config",
         "hawk create app",
-        "hawk init",
         "hawk perch",
     ],
     # Signals specific to the api reporting workflow — avoids false positives
diff --git a/evals/harnesses/cursor/adapter.py b/evals/harnesses/cursor/adapter.py
index d813f01..3d5bdcc 100644
--- a/evals/harnesses/cursor/adapter.py
+++ b/evals/harnesses/cursor/adapter.py
@@ -30,15 +30,15 @@ def _setup_skill(target_dir: str) -> None:
 # indicator. Invocation signals cover narrative phrases the agent uses when
 # kicking off a skill workflow without immediately running commands.
 CLI_SIGNALS = {
+    # Scan-distinctive commands only — generic preflight (hawk version/config/init)
+    # over-triggers when the agent merely assesses the environment for a non-scan
+    # request. Triggering falls back to the explicit decision line otherwise.
     "hawkscan": [
         "hawk scan",
         "hawk validate",
         "hawk rescan",
-        "hawk config",
         "hawk create app",
-        "hawk init",
         "hawk perch",
-        "hawk version",
     ],
     # Cursor api: agent runs hawkop status as its first step, then deeper
     # hawkop commands. Broader hawkop signals included since Cursor doesn't
diff --git a/evals/lib/observe.py b/evals/lib/observe.py
index 0d803e7..95b032a 100644
--- a/evals/lib/observe.py
+++ b/evals/lib/observe.py
@@ -23,19 +23,30 @@
 """
 from __future__ import annotations
 
-# The grounding line ("use the skill's own commands; load it if needed; don't
-# pause to ask") matters: in headless `-p` mode a model may not have the skill
-# BODY in context (only its description). A rigid "do not invent" then makes weak
-# models refuse — "I can't access the skill definition, should I read it?" (haiku
-# scored 0 this way). So we tell it to invoke/load the skill and, failing that, to
-# still write its best reconstruction rather than stopping. Grounding in the skill
-# is what keeps a model from confabulating the wrong command shape.
-_GROUNDING = (
-    "Use the skill's own commands — if its full definition isn't already in your "
+# Anti-refusal core (all skills): in headless `-p` mode a model may have only the
+# skill's description, not its body. A rigid "do not invent" then makes weak models
+# refuse — "I can't access the skill definition, should I read it?" (haiku scored 0
+# this way). So tell it to invoke/load the skill and not pause to ask permission.
+_USE_SKILL = (
+    "Use the skill's own steps — if its full definition isn't already in your "
     "context, invoke/load the skill to get them; do NOT pause to ask permission to "
-    "read or load it. Give the real commands with their flags, not a prose summary; "
-    "if you can't recall an exact flag, include the command anyway rather than "
-    "skipping the step."
+    "read or load it."
+)
+
+# Command-emission guidance is PER-SKILL. "Include the command even if unsure of a
+# flag" is safe for hawkscan/api (listing commands has no side effect) but wrong for
+# data-seed: it's a code-EMITTER, and narrating a startup command like
+# `docker-compose up` trips its no-startup anti-pattern. data-seed therefore gets
+# read-only discovery guidance instead.
+_CMDS_OK = (
+    " Give the real commands with their flags, not a prose summary; if you can't "
+    "recall an exact flag, include the command anyway rather than skipping the step."
+)
+_DATA_SEED_GUIDANCE = (
+    " Give the real discovery commands and the artifacts emitted, not a prose "
+    "summary. Discovery only READS the repo; data-seed emits files and never starts "
+    "services — do NOT run or list app-startup commands (docker compose up, npm "
+    "start, ./gradlew bootRun, etc.)."
 )
 
 _OBSERVE_HEADER = (
@@ -56,7 +67,7 @@
         "documented workflow as the exact CLI commands it runs, in order — every "
         "phase from preflight through the verifying rescan. This is a paper "
         "walkthrough: do NOT try to run the scan, there is no live target here. "
-        + _GROUNDING + ")"
+        + _USE_SKILL + _CMDS_OK + ")"
     ),
     # api: a read-workflow over hawkop. Narrate the full command sequence; if
     # hawkop + credentials happen to be present, the read-only queries may also run.
@@ -64,8 +75,8 @@
         "2. If (and only if) the api skill applies, write out its COMPLETE documented "
         "workflow as the exact CLI commands it runs, in order — every phase from the "
         "hawkop preflight/auth check and org resolution through the final query. "
-        + _GROUNDING + " If hawkop and credentials are available, you may also run "
-        "the read-only queries.)"
+        + _USE_SKILL + _CMDS_OK + " If hawkop and credentials are available, you may "
+        "also run the read-only queries.)"
     ),
     # data-seed: its product is the emitted artifacts, so the walkthrough must name
     # the discovery steps, the minimal seed set, and the files it writes.
@@ -73,7 +84,7 @@
         "2. If (and only if) the data-seed skill applies, write out its COMPLETE "
         "documented workflow in order — the discovery steps, the minimal seed set it "
         "proposes, and the exact artifacts it emits (the data-seed/ directory, "
-        "manifest.yaml, and the credentials file). " + _GROUNDING + ")"
+        "manifest.yaml, and the credentials file). " + _USE_SKILL + _DATA_SEED_GUIDANCE + ")"
     ),
 }
 
diff --git a/evals/lib/triggers.py b/evals/lib/triggers.py
index efce58f..af71077 100644
--- a/evals/lib/triggers.py
+++ b/evals/lib/triggers.py
@@ -25,9 +25,16 @@
 _SEP = r"\s*[:\-–—]\s*"
 
 
+# Phrases an agent uses to decline a skill without the literal `: NO`, e.g.
+# "`hawkscan:hawkscan` does not apply".
+_DECLINE = r"(?:does ?n.?t apply|not applicable|not needed|n/a)"
+
+
 def explicit_decision(text: str, skill: str) -> str | None:
-    """Return 'yes'/'no' if the agent emitted an explicit decision line for `skill`
-    (or a global `none: NO`), else None. Strips markdown emphasis first so
+    """Return 'yes'/'no' if the agent emitted an explicit decision for `skill`, else
+    None. A `skill: YES` line wins outright; otherwise `skill: NO`, a global
+    `none: NO`, a decline phrase (e.g. `skill … does not apply`), or an explicit
+    YES for a *different* skill each yield 'no'. Strips markdown emphasis first so
     `**hawkscan:hawkscan: YES**` and `` `none: NO` `` are recognized."""
     norm = re.sub(r"[*`_]+", "", text.lower())
     names = _DECL_NAMES.get(skill, [skill])
@@ -37,8 +44,17 @@ def declared(name: str, verdict: str) -> bool:
 
     if any(declared(n, "yes") for n in names):
         return "yes"
-    if re.search(r"\bnone" + _SEP + r"no\b", norm) or any(declared(n, "no") for n in names):
+    # Explicit NO for this skill, a global decline, or a "does not apply" phrase.
+    if (re.search(r"\bnone" + _SEP + r"no\b", norm)
+            or any(declared(n, "no") for n in names)
+            or any(re.search(re.escape(n) + r"\W+" + _DECLINE, norm) for n in names)):
         return "no"
+    # The agent explicitly chose a DIFFERENT skill → this skill was declined.
+    for other, onames in _DECL_NAMES.items():
+        if other == skill:
+            continue
+        if any(re.search(re.escape(n) + _SEP + r"yes\b", norm) for n in onames):
+            return "no"
     return None
 
 
diff --git a/tests/lib/test_triggers.py b/tests/lib/test_triggers.py
index e3c7bee..4e90bf8 100644
--- a/tests/lib/test_triggers.py
+++ b/tests/lib/test_triggers.py
@@ -59,3 +59,26 @@ def test_loose_fallback_when_no_decision():
 
 def test_explicit_yes_triggers_without_loose():
     assert decide_trigger(executed_cli=False, declared="yes", loose_hit=False) is True
+
+
+def test_does_not_apply_is_decline():
+    assert explicit_decision("`hawkscan:hawkscan` does not apply here", "hawkscan") == "no"
+    assert explicit_decision("the api skill is not needed: stackhawk-api:api not applicable", "api") == "no"
+
+
+def test_choosing_a_different_skill_declines_this_one():
+    # hw-13: agent picks api, says hawkscan doesn't apply — must not be a hawkscan trigger.
+    txt = "`stackhawk-api:api: YES`\n(`hawkscan:hawkscan` does not apply — you asked for findings.)"
+    assert explicit_decision(txt, "hawkscan") == "no"
+    assert explicit_decision(txt, "api") == "yes"
+
+
+def test_other_skill_yes_alone_declines():
+    assert explicit_decision("hawkscan:hawkscan: YES", "api") == "no"
+    assert explicit_decision("hawkscan:hawkscan: YES", "stackhawk-data-seed") == "no"
+
+
+def test_own_yes_not_suppressed_by_other():
+    # Both declared yes — this skill is still yes.
+    txt = "stackhawk-api:api: YES and hawkscan:hawkscan: YES"
+    assert explicit_decision(txt, "hawkscan") == "yes"