From a2be23caccb27a66d7a638063fc086445771a80b Mon Sep 17 00:00:00 2001 From: Ismail Mehdi Date: Mon, 11 May 2026 19:42:06 +0000 Subject: [PATCH 01/11] feat: add Codex to agent filters in viewer - Add 'Codex' option to agent toggle buttons in status and list views. - Implement filtering for 'codex_cli' generator in both views. - Add missing 'mesop' and 'gunicorn' dependencies to pyproject.toml. TAG=agy CONV=5c0ca3b4-cd35-4f4b-aa14-bc902aaaf0c7 --- pyproject.toml | 2 ++ uv.lock | 4 ++++ viewer/main.py | 8 ++++++++ 3 files changed, 14 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index a61e6e3f..4e2d904b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,8 @@ dependencies = [ "dbt-core", "dbt-bigquery", "dbt-postgres", + "mesop", + "gunicorn", ] [tool.setuptools.packages.find] diff --git a/uv.lock b/uv.lock index 3f8d4ac0..d556b35e 100644 --- a/uv.lock +++ b/uv.lock @@ -904,7 +904,9 @@ dependencies = [ { name = "google-genai" }, { name = "grpcio" }, { name = "grpcio-tools" }, + { name = "gunicorn" }, { name = "jsonschema" }, + { name = "mesop" }, { name = "mongomock" }, { name = "pandas", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "pandas", version = "3.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, @@ -955,7 +957,9 @@ requires-dist = [ { name = "google-genai" }, { name = "grpcio", specifier = ">=1.80.0" }, { name = "grpcio-tools", specifier = ">=1.80.0" }, + { name = "gunicorn" }, { name = "jsonschema" }, + { name = "mesop" }, { name = "mongomock" }, { name = "pandas" }, { name = "pandas-gbq" }, diff --git a/viewer/main.py b/viewer/main.py index 6835fb43..7a265bb0 100644 --- a/viewer/main.py +++ b/viewer/main.py @@ -295,6 +295,7 @@ def on_agent_tab_change(e): buttons=[ me.ButtonToggleButton(label="Gemini", value="Gemini"), me.ButtonToggleButton(label="Claude", value="Claude"), + me.ButtonToggleButton(label="Codex", value="Codex"), ], on_change=on_agent_tab_change, ) @@ -362,6 +363,10 @@ def on_agent_tab_change(e): # Filter by agent tab if state.status_agent_tab == "Gemini": summary_df = summary_df[(summary_df['model_config.generator'] == 'gemini_cli') | (summary_df['model_config.generator'] == 'unknown') | (summary_df['model_config.generator'] == 'N/A') | summary_df['Product'].isin(default_products)] + elif state.status_agent_tab == "Claude": + summary_df = summary_df[(summary_df['model_config.generator'] == 'claude_code') | (summary_df['model_config.generator'] == 'unknown') | (summary_df['model_config.generator'] == 'N/A')] + elif state.status_agent_tab == "Codex": + summary_df = summary_df[(summary_df['model_config.generator'] == 'codex_cli')] # Render table similar to lists tab with me.box( @@ -574,6 +579,7 @@ def on_list_agent_tab_change(e): buttons=[ me.ButtonToggleButton(label="Gemini", value="Gemini"), me.ButtonToggleButton(label="Claude", value="Claude"), + me.ButtonToggleButton(label="Codex", value="Codex"), ], on_change=on_list_agent_tab_change, ) @@ -739,6 +745,8 @@ def get_sort_key(x): summaries = [x for x in summaries if x.get("model_config.generator") == "gemini_cli" or x.get("model_config.generator") == "unknown" or x.get("model_config.generator") == "N/A" or x.get("product") in ['spanner', 'bigtable', 'alloydb', 'memorystore', 'dms', 'datastream']] elif state.list_agent_tab == "Claude": summaries = [x for x in summaries if x.get("model_config.generator") == "claude_code" or (x.get("model_config.generator") == "unknown" and 'claude' in str(x.get("product")).lower())] + elif state.list_agent_tab == "Codex": + summaries = [x for x in summaries if x.get("model_config.generator") == "codex_cli"] logging.info(f"Number of summaries after tab filter: {len(summaries)}") if state.eval_id_filter: From 023d1505a73e77835f75746dfc5624a921ce2832 Mon Sep 17 00:00:00 2001 From: Ismail Mehdi Date: Mon, 11 May 2026 19:58:35 +0000 Subject: [PATCH 02/11] fix: fix Mesop event routing bug in trends dropdown --- viewer/trends.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/viewer/trends.py b/viewer/trends.py index c0bc5c99..065515d3 100644 --- a/viewer/trends.py +++ b/viewer/trends.py @@ -184,9 +184,12 @@ def trends_component(): df = df[df['model_config.generator'].str.contains('gemini', case=False) | (df['model_config.generator'] == 'unknown') | (df['model_config.generator'] == 'N/A') | df['product'].isin(['spanner', 'bigtable', 'alloydb', 'memorystore', 'dms', 'datastream'])] elif state.trends_agent_tab == "Claude": df = df[df['model_config.generator'].str.contains('claude', case=False) | ((df['model_config.generator'] == 'unknown') & df['product'].str.contains('claude', case=False))] + elif state.trends_agent_tab == "Codex": + df = df[df['model_config.generator'].str.contains('codex', case=False)] # Extract unique products for dropdown all_products = sorted(df['product'].unique().tolist()) + logging.info(f"All products found in trends: {all_products}") # Apply filter if selected if state.trends_product_filter: @@ -229,6 +232,7 @@ def on_trends_agent_tab_change(e): buttons=[ me.ButtonToggleButton(label="Gemini", value="Gemini"), me.ButtonToggleButton(label="Claude", value="Claude"), + me.ButtonToggleButton(label="Codex", value="Codex"), ], on_change=on_trends_agent_tab_change, ) @@ -245,8 +249,14 @@ def toggle_trends_product_dropdown(e: me.ClickEvent): def make_product_handler(val): def handler(e: me.ClickEvent): st = me.state(State) + logging.info(f"Product handler triggered for: {val}") st.trends_product_filter = val st.open_dropdown = "" + + safe_val = str(val).replace(" ", "_").replace(".", "_").replace("-", "_") + handler_name = f"click_trends_prod_{safe_val}" + handler.__name__ = handler_name + globals()[handler_name] = handler return handler with me.box(style=me.Style(position="relative", width="300px")): From 196cf0f7583f16db41ef034af9b79a2fe1683727 Mon Sep 17 00:00:00 2001 From: Ismail Mehdi Date: Mon, 11 May 2026 21:05:44 +0000 Subject: [PATCH 03/11] Fix Cloud Run deployment by ensuring all workspace dependencies are installed. - Updated pyproject.toml to include viewer as a workspace dependency. - Updated Dockerfile to copy viewer/pyproject.toml before running uv sync. - Updated supervisord configs to use uv run to ensure correct environment is used. - Updated viewer/run_frontend.sh to use uv run gunicorn. - Added .dockerignore to prevent copying local .venv. TAG=agy CONV=5c0ca3b4-cd35-4f4b-aa14-bc902aaaf0c7 --- .dockerignore | 2 +- evalbench_service/Dockerfile | 1 + evalbench_service/supervisord_cloudrun.conf | 2 +- evalbench_service/supervisord_combined.conf | 4 ++-- evalbench_service/supervisord_evalbench.conf | 2 +- pyproject.toml | 4 ++++ uv.lock | 4 +++- viewer/run_frontend.sh | 2 +- viewer/version.txt | 2 +- 9 files changed, 15 insertions(+), 8 deletions(-) diff --git a/.dockerignore b/.dockerignore index c5d724a4..e06167f4 100644 --- a/.dockerignore +++ b/.dockerignore @@ -2,4 +2,4 @@ .git __pycache__ *.pyc -results +.pytest_cache diff --git a/evalbench_service/Dockerfile b/evalbench_service/Dockerfile index b3424f0a..bf0a42db 100644 --- a/evalbench_service/Dockerfile +++ b/evalbench_service/Dockerfile @@ -23,6 +23,7 @@ RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor - WORKDIR /evalbench COPY pyproject.toml uv.lock ./ +COPY viewer/pyproject.toml viewer/ RUN uv sync --frozen COPY . . diff --git a/evalbench_service/supervisord_cloudrun.conf b/evalbench_service/supervisord_cloudrun.conf index 83fae017..8889686e 100644 --- a/evalbench_service/supervisord_cloudrun.conf +++ b/evalbench_service/supervisord_cloudrun.conf @@ -15,7 +15,7 @@ stderr_logfile=/dev/stderr stderr_logfile_maxbytes=0 [program:precompute_trends] -command=python /evalbench/viewer/run_precompute.py +command=uv run python /evalbench/viewer/run_precompute.py directory=/evalbench/viewer autostart=true autorestart=true diff --git a/evalbench_service/supervisord_combined.conf b/evalbench_service/supervisord_combined.conf index 8ec25a89..71d6bb67 100644 --- a/evalbench_service/supervisord_combined.conf +++ b/evalbench_service/supervisord_combined.conf @@ -15,7 +15,7 @@ stderr_logfile=/dev/stderr stderr_logfile_maxbytes=0 [program:evalbench_server] -command=python evalbench/eval_server.py +command=uv run python evalbench/eval_server.py directory=/evalbench autostart=true autorestart=true @@ -25,7 +25,7 @@ stderr_logfile=/dev/stderr stderr_logfile_maxbytes=0 [program:precompute_trends] -command=python /evalbench/viewer/run_precompute.py +command=uv run python /evalbench/viewer/run_precompute.py directory=/evalbench/viewer autostart=true autorestart=true diff --git a/evalbench_service/supervisord_evalbench.conf b/evalbench_service/supervisord_evalbench.conf index a569935b..3748b9f5 100644 --- a/evalbench_service/supervisord_evalbench.conf +++ b/evalbench_service/supervisord_evalbench.conf @@ -5,7 +5,7 @@ logfile_maxbytes = 50MB logfile_backups=5 [program:evalbench_server] -command=python evalbench/eval_server.py +command=uv run python evalbench/eval_server.py directory=/evalbench autostart=true autorestart=true diff --git a/pyproject.toml b/pyproject.toml index 748c12bc..4056c354 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,8 +52,12 @@ dependencies = [ "dbt-postgres", "mesop", "gunicorn", + "viewer", ] +[tool.uv.sources] +viewer = { workspace = true } + [project.scripts] google-evalbench = "evalbench.evalbench:run" diff --git a/uv.lock b/uv.lock index d556b35e..7489ab9c 100644 --- a/uv.lock +++ b/uv.lock @@ -925,6 +925,7 @@ dependencies = [ { name = "sqlglot" }, { name = "sqlparse" }, { name = "tabulate" }, + { name = "viewer" }, ] [package.dev-dependencies] @@ -977,6 +978,7 @@ requires-dist = [ { name = "sqlglot" }, { name = "sqlparse" }, { name = "tabulate" }, + { name = "viewer", editable = "viewer" }, ] [package.metadata.requires-dev] @@ -4892,7 +4894,7 @@ wheels = [ [[package]] name = "viewer" version = "0.1.0" -source = { virtual = "viewer" } +source = { editable = "viewer" } dependencies = [ { name = "gevent" }, { name = "gunicorn" }, diff --git a/viewer/run_frontend.sh b/viewer/run_frontend.sh index b7e8ff68..b6f9a7e9 100755 --- a/viewer/run_frontend.sh +++ b/viewer/run_frontend.sh @@ -1,2 +1,2 @@ #!/bin/bash -exec gunicorn -w 12 -k sync main:me --bind :${PORT:-3000} --forwarded-allow-ips="*" --timeout 120 +exec uv run gunicorn -w 12 -k sync main:me --bind :${PORT:-3000} --forwarded-allow-ips="*" --timeout 120 diff --git a/viewer/version.txt b/viewer/version.txt index 41c5857d..5ef1b5f8 100644 --- a/viewer/version.txt +++ b/viewer/version.txt @@ -1 +1 @@ -01802cf +023d150 From ac5ee8413b7f7b944d71fa08691091b9546d7457 Mon Sep 17 00:00:00 2001 From: Ismail Mehdi Date: Mon, 11 May 2026 22:52:47 +0000 Subject: [PATCH 04/11] chore: ignore .jetskicli/project.json in .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 17318955..dde5ebcb 100644 --- a/.gitignore +++ b/.gitignore @@ -192,3 +192,4 @@ evalbench/db_connections/bat/db_blog.db # Autogenerated version file viewer/version.txt +.jetskicli/project.json From 0fcab945fa2342368b03e98197ab9b401b23e522 Mon Sep 17 00:00:00 2001 From: Ismail Mehdi Date: Tue, 12 May 2026 16:24:47 +0000 Subject: [PATCH 05/11] Fix Cloud Run startup crash by ensuring workspace dependencies are fully built. - Added `RUN uv sync --frozen` after `COPY . .` in the Dockerfile. This ensures that the `viewer` workspace member is fully built and installed during the Docker build phase (with internet access), preventing `uv run` from trying to download `setuptools` at runtime in the restricted Cloud Run environment. TAG=agy CONV=5c0ca3b4-cd35-4f4b-aa14-bc902aaaf0c7 --- evalbench_service/Dockerfile | 1 + viewer/version.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/evalbench_service/Dockerfile b/evalbench_service/Dockerfile index bf0a42db..9ae57aa1 100644 --- a/evalbench_service/Dockerfile +++ b/evalbench_service/Dockerfile @@ -26,6 +26,7 @@ COPY pyproject.toml uv.lock ./ COPY viewer/pyproject.toml viewer/ RUN uv sync --frozen COPY . . +RUN uv sync --frozen # Create a non-root user for Claude Code. It refuses # --dangerously-skip-permissions when running as root. diff --git a/viewer/version.txt b/viewer/version.txt index 5ef1b5f8..d697fb2c 100644 --- a/viewer/version.txt +++ b/viewer/version.txt @@ -1 +1 @@ -023d150 +5ae3209 From c6680aecedabd21e33b920772a88612847cf49b2 Mon Sep 17 00:00:00 2001 From: Ismail Mehdi Date: Tue, 12 May 2026 18:56:04 +0000 Subject: [PATCH 06/11] chore: update Dockerfile to use --all-packages for uv sync This ensures that all workspace packages (including the viewer UI) are fully built and installed during the Docker build phase, supporting clean decoupling of viewer dependencies in the core package. TAG=agy CONV=5c0ca3b4-cd35-4f4b-aa14-bc902aaaf0c7 --- evalbench_service/Dockerfile | 4 ++-- viewer/version.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/evalbench_service/Dockerfile b/evalbench_service/Dockerfile index 9ae57aa1..5eb43485 100644 --- a/evalbench_service/Dockerfile +++ b/evalbench_service/Dockerfile @@ -24,9 +24,9 @@ RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor - WORKDIR /evalbench COPY pyproject.toml uv.lock ./ COPY viewer/pyproject.toml viewer/ -RUN uv sync --frozen +RUN uv sync --frozen --all-packages COPY . . -RUN uv sync --frozen +RUN uv sync --frozen --all-packages # Create a non-root user for Claude Code. It refuses # --dangerously-skip-permissions when running as root. diff --git a/viewer/version.txt b/viewer/version.txt index d697fb2c..7b97e225 100644 --- a/viewer/version.txt +++ b/viewer/version.txt @@ -1 +1 @@ -5ae3209 +95990d4 From e3d0cfd1cb6f0dfb54f0de059d2a75d4c6650cb0 Mon Sep 17 00:00:00 2001 From: Ismail Mehdi Date: Tue, 12 May 2026 19:16:53 +0000 Subject: [PATCH 07/11] chore: declare experiment_config as key flag in evalbench.py This ensures that --experiment_config shows up in the basic --help output (instead of being hidden in --helpfull), making the standalone CLI interface much more user-friendly. TAG=agy CONV=5c0ca3b4-cd35-4f4b-aa14-bc902aaaf0c7 --- evalbench/evalbench.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/evalbench/evalbench.py b/evalbench/evalbench.py index 943a3240..c96ab6ab 100644 --- a/evalbench/evalbench.py +++ b/evalbench/evalbench.py @@ -34,6 +34,7 @@ None, "Path to a suite configuration file to run multiple experiments.", ) +flags.declare_key_flag('experiment_config') def eval(experiment_config: str): @@ -208,6 +209,11 @@ def main(argv: Sequence[str]): def run(): """Starting function for the uvx package entrypoint.""" + # Fix absl help output when run via uvx/launcher + if '__main__' in sys.modules: + main_module = sys.modules['__main__'] + if main_module.__doc__ and 'exec' in main_module.__doc__: + main_module.__doc__ = sys.modules[__name__].__doc__ app.run(main) From 9a41da0fd38fad7bf42071d8868c17d1a11874ea Mon Sep 17 00:00:00 2001 From: Ismail Mehdi Date: Tue, 12 May 2026 19:23:02 +0000 Subject: [PATCH 08/11] chore: fix absl key flags display when running via uvx/launcher - Updated run() in evalbench.py to register key flags under both '__main__' and `sys.argv[0]`. This bypasses the absl-py translation bug where it looks up key flags for `sys.argv[0]` instead of '__main__' when rendering help for the entrypoint script. TAG=agy CONV=5c0ca3b4-cd35-4f4b-aa14-bc902aaaf0c7 --- evalbench/evalbench.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/evalbench/evalbench.py b/evalbench/evalbench.py index c96ab6ab..292d12c3 100644 --- a/evalbench/evalbench.py +++ b/evalbench/evalbench.py @@ -214,6 +214,11 @@ def run(): main_module = sys.modules['__main__'] if main_module.__doc__ and 'exec' in main_module.__doc__: main_module.__doc__ = sys.modules[__name__].__doc__ + # Register key flags for __main__ and sys.argv[0] so they show up in launcher's short help + flags.FLAGS.register_key_flag_for_module('__main__', flags.FLAGS['experiment_config']) + flags.FLAGS.register_key_flag_for_module('__main__', flags.FLAGS['suite_config']) + flags.FLAGS.register_key_flag_for_module(sys.argv[0], flags.FLAGS['experiment_config']) + flags.FLAGS.register_key_flag_for_module(sys.argv[0], flags.FLAGS['suite_config']) app.run(main) From 85048e2760d7951351423facbf0c6fa2106b439e Mon Sep 17 00:00:00 2001 From: Ismail Mehdi Date: Tue, 12 May 2026 19:30:46 +0000 Subject: [PATCH 09/11] fix: patch absl help output when running via uvx/launcher --- evalbench/evalbench.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/evalbench/evalbench.py b/evalbench/evalbench.py index 292d12c3..c80910f3 100644 --- a/evalbench/evalbench.py +++ b/evalbench/evalbench.py @@ -214,6 +214,8 @@ def run(): main_module = sys.modules['__main__'] if main_module.__doc__ and 'exec' in main_module.__doc__: main_module.__doc__ = sys.modules[__name__].__doc__ + # Clean up sys.argv[0] to hide the full temporary path + sys.argv[0] = os.path.basename(sys.argv[0]) # Register key flags for __main__ and sys.argv[0] so they show up in launcher's short help flags.FLAGS.register_key_flag_for_module('__main__', flags.FLAGS['experiment_config']) flags.FLAGS.register_key_flag_for_module('__main__', flags.FLAGS['suite_config']) From 7b44d1ba59d0209940bae8dd0857aeb9f8008aae Mon Sep 17 00:00:00 2001 From: Ismail Mehdi Date: Tue, 12 May 2026 19:33:22 +0000 Subject: [PATCH 10/11] chore: update version.txt for merge commit TAG=agy CONV=5c0ca3b4-cd35-4f4b-aa14-bc902aaaf0c7 --- viewer/version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viewer/version.txt b/viewer/version.txt index 7b97e225..cbf16756 100644 --- a/viewer/version.txt +++ b/viewer/version.txt @@ -1 +1 @@ -95990d4 +dbfbafa From 109f897f46599efaf6a05b7ea5f62568b0c65091 Mon Sep 17 00:00:00 2001 From: Ismail Mehdi Date: Tue, 12 May 2026 19:44:21 +0000 Subject: [PATCH 11/11] chore: ignore results/ and .jetskicli/ in .dockerignore This prevents baking local evaluation results and Jetski agent state into the built Docker image, keeping it clean and reducing image size. TAG=agy CONV=5c0ca3b4-cd35-4f4b-aa14-bc902aaaf0c7 --- .dockerignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.dockerignore b/.dockerignore index e06167f4..3ea5b1ba 100644 --- a/.dockerignore +++ b/.dockerignore @@ -3,3 +3,6 @@ __pycache__ *.pyc .pytest_cache +results +.jetskicli +