From a98fd31a3437c3db4fcdbdac95ae00802255ce17 Mon Sep 17 00:00:00 2001 From: using-system Date: Fri, 17 Apr 2026 10:31:22 +0200 Subject: [PATCH 1/8] chore(benchmark): add latest OpenAI and Anthropic models to defaults Adds openai/gpt-5.4, openai/gpt-5.4-nano, openai/gpt-5.3-codex, openai/gpt-oss-120b, openai/gpt-oss-20b and anthropic/claude-opus-4.7 to the default benchmark matrix. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/benchmark.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 2b48c1a..43cd080 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -7,7 +7,7 @@ on: description: "JSON array of models to benchmark (OpenRouter model IDs)" required: false # yamllint disable-line rule:line-length - default: '["openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]' + default: 
'["openai/gpt-5.4","openai/gpt-5.4-nano","openai/gpt-5.3-codex","openai/gpt-oss-120b","openai/gpt-oss-20b","openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.7","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]' judge_model: description: "Model used as LLM judge" required: false @@ -29,7 +29,7 @@ jobs: max-parallel: 5 # yamllint disable-line rule:line-length matrix: - model: ${{ fromJson(inputs.models || '["openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]') }} + model: ${{ fromJson(inputs.models || 
'["openai/gpt-5.4","openai/gpt-5.4-nano","openai/gpt-5.3-codex","openai/gpt-oss-120b","openai/gpt-oss-20b","openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.7","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]') }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 From 46272b27de87dea7dcd4ac0be08dd5063ddfa45e Mon Sep 17 00:00:00 2001 From: using-system Date: Fri, 17 Apr 2026 10:44:33 +0200 Subject: [PATCH 2/8] chore(benchmark): remove openai/gpt-5.3-codex from defaults The model is unavailable on OpenRouter and caused the matrix job to fail. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/benchmark.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 43cd080..12d722c 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -7,7 +7,7 @@ on: description: "JSON array of models to benchmark (OpenRouter model IDs)" required: false # yamllint disable-line rule:line-length - default: 
'["openai/gpt-5.4","openai/gpt-5.4-nano","openai/gpt-5.3-codex","openai/gpt-oss-120b","openai/gpt-oss-20b","openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.7","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]' + default: '["openai/gpt-5.4","openai/gpt-5.4-nano","openai/gpt-oss-120b","openai/gpt-oss-20b","openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.7","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]' judge_model: description: "Model used as LLM judge" required: false @@ -29,7 +29,7 @@ jobs: max-parallel: 5 # yamllint disable-line rule:line-length matrix: - model: ${{ fromJson(inputs.models || 
'["openai/gpt-5.4","openai/gpt-5.4-nano","openai/gpt-5.3-codex","openai/gpt-oss-120b","openai/gpt-oss-20b","openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.7","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]') }} + model: ${{ fromJson(inputs.models || '["openai/gpt-5.4","openai/gpt-5.4-nano","openai/gpt-oss-120b","openai/gpt-oss-20b","openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.7","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]') }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 From 600137961f14408c8c45215681b37b13a67e987d Mon Sep 17 00:00:00 2001 From: using-system Date: Fri, 17 Apr 2026 10:44:42 +0200 Subject: [PATCH 3/8] fix(benchmark): use shared extract_json helper for LLM responses The benchmark and aggregate scripts still parsed LLM output with a local _strip_code_fences + json.loads pair, which fails when models wrap JSON in explanatory text or extra content. Production code switched to extract_json (regex-based) in #13; the benchmark scripts were missed, which caused most models to score N/A and rank with composite 0.0/10. 
Replace the local helper with reddit_digest.nodes.llm_utils.extract_json in both bench_model.py and aggregate.py. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/aggregate.py | 16 +++------------- benchmarks/bench_model.py | 21 +++++---------------- 2 files changed, 8 insertions(+), 29 deletions(-) diff --git a/benchmarks/aggregate.py b/benchmarks/aggregate.py index 2c524d8..b8aa0e0 100644 --- a/benchmarks/aggregate.py +++ b/benchmarks/aggregate.py @@ -16,22 +16,12 @@ from langchain_openai import ChatOpenAI +from reddit_digest.nodes.llm_utils import extract_json + logger = logging.getLogger(__name__) DEFAULT_BASE_URL = "https://openrouter.ai/api/v1" - -def _strip_code_fences(text: str) -> str: - """Strip markdown code fences and whitespace from LLM output.""" - text = text.strip() - if text.startswith("```"): - first_newline = text.index("\n") if "\n" in text else len(text) - text = text[first_newline + 1 :] - if text.endswith("```"): - text = text[:-3] - return text.strip() - - DEFAULT_JUDGE_MODEL = "openai/gpt-4o" JUDGE_PROMPT = """You are evaluating the quality of AI-generated summaries of Reddit posts. 
@@ -151,7 +141,7 @@ async def run_judge( try: response = await llm.ainvoke(prompt) - data = json.loads(_strip_code_fences(response.content)) + data = extract_json(response.content) evals = data.get("evaluations", {}) all_evaluations.update(evals) except Exception: diff --git a/benchmarks/bench_model.py b/benchmarks/bench_model.py index 7a319b4..23eabba 100644 --- a/benchmarks/bench_model.py +++ b/benchmarks/bench_model.py @@ -20,6 +20,7 @@ from langchain_openai import ChatOpenAI from reddit_digest.models import RedditPost +from reddit_digest.nodes.llm_utils import extract_json from reddit_digest.nodes.scorer import SCORE_PROMPT from reddit_digest.nodes.scorer import _build_post_block as scorer_build_block from reddit_digest.nodes.summarizer import PROMPT_TEMPLATE @@ -35,18 +36,6 @@ def load_fixture(path: str) -> dict: return json.loads(Path(path).read_text()) -def _strip_code_fences(text: str) -> str: - """Strip markdown code fences and whitespace from LLM output.""" - text = text.strip() - if text.startswith("```"): - # Remove opening fence (```json or ```) - first_newline = text.index("\n") if "\n" in text else len(text) - text = text[first_newline + 1 :] - if text.endswith("```"): - text = text[:-3] - return text.strip() - - def _extract_cost(response) -> float: """Extract cost from OpenRouter response metadata (usage.cost).""" metadata = getattr(response, "response_metadata", {}) @@ -110,11 +99,11 @@ async def run_benchmark( total_tokens_completion += usage.get("completion_tokens", 0) total_cost += _extract_cost(response) - data = json.loads(_strip_code_fences(response.content)) + data = extract_json(response.content) scores = data.get("scores", {}) all_scores.update(scores) json_valid_count += 1 - except json.JSONDecodeError as e: + except (ValueError, json.JSONDecodeError) as e: elapsed_ms = (time.monotonic() - start) * 1000 latencies.append(elapsed_ms) errors.append(f"scorer/{subreddit}: JSON parse error: {e}") @@ -141,11 +130,11 @@ async def run_benchmark( 
total_tokens_completion += usage.get("completion_tokens", 0) total_cost += _extract_cost(response) - data = json.loads(_strip_code_fences(response.content)) + data = extract_json(response.content) summaries = data.get("summaries", {}) all_summaries.update(summaries) json_valid_count += 1 - except json.JSONDecodeError as e: + except (ValueError, json.JSONDecodeError) as e: elapsed_ms = (time.monotonic() - start) * 1000 latencies.append(elapsed_ms) errors.append(f"summarizer/{subreddit}: JSON parse error: {e}") From 2ae7200d8243e008d570cdae7e2895744f96987b Mon Sep 17 00:00:00 2001 From: using-system Date: Fri, 17 Apr 2026 10:46:08 +0200 Subject: [PATCH 4/8] test(benchmark): drop tests for removed _strip_code_fences helper The helper was inlined in bench_model.py / aggregate.py and was replaced by reddit_digest.nodes.llm_utils.extract_json. Drop the four obsolete tests that imported the removed function. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_benchmark.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 07accda..8e1c8d1 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -1,25 +1,9 @@ from unittest.mock import MagicMock -from benchmarks.bench_model import _extract_cost, _strip_code_fences +from benchmarks.bench_model import _extract_cost from benchmarks.aggregate import normalize_min_max, compute_composite, generate_report -def test_strip_code_fences_json(): - assert _strip_code_fences('```json\n{"a": 1}\n```') == '{"a": 1}' - - -def test_strip_code_fences_plain(): - assert _strip_code_fences('```\n{"a": 1}\n```') == '{"a": 1}' - - -def test_strip_code_fences_no_fences(): - assert _strip_code_fences('{"a": 1}') == '{"a": 1}' - - -def test_strip_code_fences_whitespace(): - assert _strip_code_fences('\n {"a": 1} \n') == '{"a": 1}' - - def test_extract_cost_from_response(): response = MagicMock() response.response_metadata = {"token_usage": 
{"cost": 0.00042}} From 132421e6f29dc887ee6cfca4630a2ab818701d6f Mon Sep 17 00:00:00 2001 From: using-system Date: Fri, 17 Apr 2026 10:57:15 +0200 Subject: [PATCH 5/8] docs(benchmark): refresh ranking with 2026-04-17 results Update the README with the run after #17 (5 new models, fixed JSON parsing). Recommend google/gemma-3-12b-it as the best quality/price self-hostable option (composite 0.9627 at $0.0009 per run, 100% JSON valid). Note anomalies: gpt-oss-120b summary at 0.0/10, gpt-4.1-nano 17% JSON, phi-4 67% JSON. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/README.md | 67 +++++++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 63fe992..3d90257 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -2,44 +2,53 @@ Automated benchmark system to compare LLM models for the reddit-digest-agent pipeline. Tests each model's ability to score and summarize Reddit posts using the project's real prompts. 
-## Latest Results (2026-04-12) +## Latest Results (2026-04-17) -> Run: [GitHub Actions #24308576896](https://github.com/using-system/reddit-digest-agent/actions/runs/24308576896) -> 20 models tested across 7 providers via OpenRouter +> Run: [GitHub Actions #24556347098](https://github.com/using-system/reddit-digest-agent/actions/runs/24556347098) +> 25 models tested across 7 providers via OpenRouter > Judge: openai/gpt-4o ### Ranking | Model | Cost | JSON OK | Latency (avg) | Summaries | Scoring MAE | **Composite** | |-------|------|---------|---------------|-----------|-------------|---------------| -| openai/gpt-4o-mini | $0.0014 | 100% | 1818ms | 7.8/10 | 1.1 | **0.9813** | -| google/gemma-3-12b-it | $0.0005 | 100% | 4206ms | 8.1/10 | 1.4 | **0.9793** | -| google/gemma-3-27b-it | $0.0012 | 100% | 5178ms | 8.0/10 | 1.3 | **0.9679** | -| openai/gpt-4.1-mini | $0.0044 | 100% | 1852ms | 7.7/10 | 1.6 | **0.9638** | -| mistralai/mistral-medium-3.1 | $0.0064 | 100% | 4052ms | 7.6/10 | 1.5 | **0.9391** | -| mistralai/mistral-small-3.1-24b-instruct | $0.0037 | 100% | 5478ms | 7.4/10 | 1.3 | **0.938** | -| anthropic/claude-haiku-4.5 | $0.0153 | 100% | 2788ms | 7.6/10 | 0.6 | **0.9078** | -| openai/gpt-4.1 | $0.0183 | 100% | 2128ms | 7.9/10 | 1.1 | **0.9054** | -| google/gemma-4-31b-it | $0.0016 | 100% | 18314ms | 7.9/10 | 0.9 | **0.8882** | -| x-ai/grok-3-mini | $0.0048 | 100% | 15890ms | 7.5/10 | 1.3 | **0.8759** | -| deepseek/deepseek-chat-v3-0324 | $0.0030 | 83% | 10821ms | 7.3/10 | 0.8 | **0.8588** | -| deepseek/deepseek-v3.2 | $0.0046 | 100% | 6031ms | 7.3/10 | 45.6 | **0.8297** | -| google/gemma-3-4b-it | $0.0005 | 100% | 4327ms | 1.0/10 | 1.2 | **0.7794** | -| x-ai/grok-4-fast | $0.0043 | 100% | 4731ms | 1.0/10 | 0.7 | **0.7607** | -| meta-llama/llama-4-maverick | $0.0041 | 50% | 8480ms | 7.5/10 | 10.0 | **0.7526** | -| microsoft/phi-4 | $0.0009 | 33% | 4598ms | 7.9/10 | 10.0 | **0.7505** | -| openai/gpt-4o | $0.0253 | 100% | 1424ms | 1.0/10 | 0.7 | **0.6844** | 
-| anthropic/claude-opus-4.6 | $0.0775 | 100% | 5674ms | 7.7/10 | 0.7 | **0.6115** | -| openai/gpt-4.1-nano | $0.0009 | 33% | 986ms | 1.0/10 | 10.0 | **0.577** | -| meta-llama/llama-4-scout | $0.0019 | 17% | 5013ms | 1.0/10 | 10.0 | **0.4991** | +| openai/gpt-5.4-nano | $0.0026 | 100% | 1519ms | 7.1/10 | 1.2 | **0.9734** | +| openai/gpt-4o-mini | $0.0016 | 100% | 2207ms | 7.1/10 | 1.3 | **0.9715** | +| x-ai/grok-4-fast | $0.0041 | 100% | 3586ms | 7.6/10 | 1.4 | **0.9684** | +| google/gemma-4-31b-it | $0.0016 | 100% | 4869ms | 7.2/10 | 0.9 | **0.9649** | +| google/gemma-3-27b-it | $0.0012 | 100% | 4790ms | 7.2/10 | 1.2 | **0.9632** | +| google/gemma-3-12b-it | $0.0009 | 100% | 3980ms | 7.1/10 | 1.5 | **0.9627** | +| google/gemma-3-4b-it | $0.0004 | 100% | 1958ms | 6.4/10 | 1.2 | **0.9597** | +| meta-llama/llama-4-maverick | $0.0027 | 100% | 3784ms | 7.0/10 | 1.3 | **0.9569** | +| mistralai/mistral-medium-3.1 | $0.0066 | 100% | 4272ms | 7.6/10 | 1.9 | **0.9516** | +| openai/gpt-4.1-mini | $0.0042 | 100% | 4014ms | 7.1/10 | 1.7 | **0.949** | +| meta-llama/llama-4-scout | $0.0017 | 100% | 5388ms | 6.7/10 | 1.1 | **0.9457** | +| mistralai/mistral-small-3.1-24b-instruct | $0.0037 | 100% | 5398ms | 7.0/10 | 1.3 | **0.9456** | +| anthropic/claude-haiku-4.5 | $0.0156 | 100% | 2846ms | 7.2/10 | 0.6 | **0.9327** | +| openai/gpt-5.4 | $0.0179 | 100% | 2870ms | 7.3/10 | 1.1 | **0.9224** | +| deepseek/deepseek-v3.2 | $0.0047 | 100% | 11709ms | 7.5/10 | 1.3 | **0.9207** | +| openai/gpt-4.1 | $0.0189 | 100% | 2571ms | 7.1/10 | 1.1 | **0.9148** | +| openai/gpt-4o | $0.0252 | 100% | 1886ms | 7.1/10 | 0.6 | **0.9037** | +| deepseek/deepseek-chat-v3-0324 | $0.0027 | 100% | 18976ms | 7.4/10 | 1.3 | **0.8865** | +| openai/gpt-oss-20b | $0.0013 | 100% | 20130ms | 7.1/10 | 1.2 | **0.8776** | +| x-ai/grok-3-mini | $0.0041 | 100% | 18508ms | 6.9/10 | 1.4 | **0.8697** | +| microsoft/phi-4 | $0.0009 | 67% | 6152ms | 7.0/10 | 10.0 | **0.7578** | +| openai/gpt-oss-120b | $0.0029 | 100% | 
16742ms | 0.0/10 | 0.6 | **0.7103** | +| anthropic/claude-opus-4.6 | $0.0779 | 100% | 7131ms | 7.3/10 | 0.6 | **0.7075** | +| anthropic/claude-opus-4.7 | $0.1070 | 100% | 4497ms | 7.5/10 | 0.7 | **0.6306** | +| openai/gpt-4.1-nano | $0.0010 | 17% | 1619ms | 0.0/10 | 1.0 | **0.5434** | + +### Recommandation + +**Meilleur qualité/prix self-hostable : `google/gemma-3-12b-it`** — composite 0.9627 à $0.0009 par run, 100 % de JSON valide, latence ~4 s, qualité de résumé 7.1/10. Quasi à égalité avec les meilleurs modèles propriétaires tout en restant exécutable sur du matériel grand public. ### Key takeaways -- **Best overall**: `openai/gpt-4o-mini` — cheapest top-tier option with 100% JSON reliability and good summary quality -- **Best self-hostable**: `google/gemma-3-12b-it` — nearly identical composite score (0.9793 vs 0.9813), highest summary quality (8.1/10), lowest cost ($0.0005), and can run on consumer hardware -- **Gemma 3 family**: The 12B variant hits the sweet spot. The 27B is marginally worse and slower, the 4B drops dramatically in summary quality -- **Cost vs quality**: Models costing >$0.01 per run (Claude Opus, GPT-4o, GPT-4.1) don't justify the price — cheaper models match or beat them on summary quality -- **JSON reliability**: Most models achieve 100%. 
Notable failures: Llama 4 Scout (17%), Phi-4 (33%), GPT-4.1-nano (33%) +- **Best overall**: `openai/gpt-5.4-nano` — premier composite (0.9734), 100 % JSON, latence la plus basse (~1.5 s) +- **Best self-hostable**: `google/gemma-3-12b-it` — meilleur compromis hostable : coût le plus bas de la famille hostable utile, latence raisonnable, qualité comparable aux variantes 27B / 31B sans le besoin de GPU haut de gamme +- **Famille Gemma**: 4B / 12B / 27B / 4-31B sont toutes en haut du classement ; le 12B reste le sweet spot entre coût, latence et qualité +- **Coût vs qualité**: les modèles à >$0.01 par run (Claude Opus 4.6/4.7, GPT-4o, GPT-4.1, GPT-5.4) n'apportent pas un gain qui justifie le prix — `claude-opus-4.7` est même le plus cher du panel ($0.107) sans dépasser les modèles légers +- **Anomalies à surveiller**: `openai/gpt-oss-120b` (résumé évalué 0.0/10 par le judge malgré 100 % JSON), `openai/gpt-4.1-nano` (17 % JSON seulement), `microsoft/phi-4` (67 % JSON, MAE de scoring à 10.0) ### Composite score formula @@ -51,7 +60,7 @@ Each metric is min-max normalized across all models. For cost, latency, and MAE, ### From GitHub Actions (recommended) -Go to **Actions > LLM Benchmark > Run workflow**. The default runs all 20 models. You can pass a custom JSON array of model IDs. +Go to **Actions > LLM Benchmark > Run workflow**. The default runs all 25 models. You can pass a custom JSON array of model IDs. ### Locally From 9779b218f8d97e4415cf25cad4c535dd522728e7 Mon Sep 17 00:00:00 2001 From: using-system Date: Fri, 17 Apr 2026 10:59:04 +0200 Subject: [PATCH 6/8] docs(benchmark): translate recommendation section to English MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Keep README consistent — the rest of the file is in English. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 3d90257..1acc4e6 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -38,17 +38,17 @@ Automated benchmark system to compare LLM models for the reddit-digest-agent pip | anthropic/claude-opus-4.7 | $0.1070 | 100% | 4497ms | 7.5/10 | 0.7 | **0.6306** | | openai/gpt-4.1-nano | $0.0010 | 17% | 1619ms | 0.0/10 | 1.0 | **0.5434** | -### Recommandation +### Recommendation -**Meilleur qualité/prix self-hostable : `google/gemma-3-12b-it`** — composite 0.9627 à $0.0009 par run, 100 % de JSON valide, latence ~4 s, qualité de résumé 7.1/10. Quasi à égalité avec les meilleurs modèles propriétaires tout en restant exécutable sur du matériel grand public. +**Best quality/price self-hostable: `google/gemma-3-12b-it`** — composite 0.9627 at $0.0009 per run, 100% valid JSON, ~4s latency, 7.1/10 summary quality. Nearly on par with the top proprietary models while remaining runnable on consumer hardware. 
### Key takeaways -- **Best overall**: `openai/gpt-5.4-nano` — premier composite (0.9734), 100 % JSON, latence la plus basse (~1.5 s) -- **Best self-hostable**: `google/gemma-3-12b-it` — meilleur compromis hostable : coût le plus bas de la famille hostable utile, latence raisonnable, qualité comparable aux variantes 27B / 31B sans le besoin de GPU haut de gamme -- **Famille Gemma**: 4B / 12B / 27B / 4-31B sont toutes en haut du classement ; le 12B reste le sweet spot entre coût, latence et qualité -- **Coût vs qualité**: les modèles à >$0.01 par run (Claude Opus 4.6/4.7, GPT-4o, GPT-4.1, GPT-5.4) n'apportent pas un gain qui justifie le prix — `claude-opus-4.7` est même le plus cher du panel ($0.107) sans dépasser les modèles légers -- **Anomalies à surveiller**: `openai/gpt-oss-120b` (résumé évalué 0.0/10 par le judge malgré 100 % JSON), `openai/gpt-4.1-nano` (17 % JSON seulement), `microsoft/phi-4` (67 % JSON, MAE de scoring à 10.0) +- **Best overall**: `openai/gpt-5.4-nano` — top composite (0.9734), 100% JSON, lowest latency (~1.5s) +- **Best self-hostable**: `google/gemma-3-12b-it` — best hostable trade-off: lowest cost in the usable hostable family, reasonable latency, quality comparable to the 27B / 31B variants without needing high-end GPUs +- **Gemma family**: 4B / 12B / 27B / 4-31B all sit at the top of the ranking; the 12B remains the sweet spot between cost, latency and quality +- **Cost vs quality**: models above $0.01 per run (Claude Opus 4.6/4.7, GPT-4o, GPT-4.1, GPT-5.4) don't deliver a gain that justifies the price — `claude-opus-4.7` is even the most expensive of the panel ($0.107) without beating the light models +- **Anomalies to watch**: `openai/gpt-oss-120b` (judge rated summary 0.0/10 despite 100% JSON), `openai/gpt-4.1-nano` (only 17% JSON), `microsoft/phi-4` (67% JSON, scoring MAE at 10.0) ### Composite score formula From 83c1fe13879d57c2d0fb0c167e5f71eaa7dabfb8 Mon Sep 17 00:00:00 2001 From: using-system Date: Fri, 17 Apr 2026 11:01:55 +0200 
Subject: [PATCH 7/8] docs(readme): refresh recommended-model section with 2026-04-17 benchmark Update the table to reflect the 25-model run, add gpt-5.4-nano and the Gemma 4-31B / 3-27B variants, and reframe the recommendation around gemma-3-12b-it as the best self-hostable option (no longer top-2 on composite, but still cheapest at $0.0009/run with 100% JSON validity). Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 2cc37c1..729ddda 100644 --- a/README.md +++ b/README.md @@ -131,15 +131,17 @@ All configuration is done via environment variables (`.env` file). ## Recommended model -Based on our [benchmark of 20 models](benchmarks/README.md), we recommend **`google/gemma-3-12b-it`** as the default LLM: +Based on our [benchmark of 25 models](benchmarks/README.md), we recommend **`google/gemma-3-12b-it`** as the default LLM for self-hosted deployments: | Model | Cost/run | JSON OK | Summary quality | Composite | |-------|----------|---------|-----------------|-----------| -| openai/gpt-4o-mini | $0.0014 | 100% | 7.8/10 | 0.9813 | -| **google/gemma-3-12b-it** | **$0.0005** | **100%** | **8.1/10** | **0.9793** | -| google/gemma-3-27b-it | $0.0012 | 100% | 8.0/10 | 0.9679 | +| openai/gpt-5.4-nano | $0.0026 | 100% | 7.1/10 | 0.9734 | +| openai/gpt-4o-mini | $0.0016 | 100% | 7.1/10 | 0.9715 | +| google/gemma-4-31b-it | $0.0016 | 100% | 7.2/10 | 0.9649 | +| google/gemma-3-27b-it | $0.0012 | 100% | 7.2/10 | 0.9632 | +| **google/gemma-3-12b-it** | **$0.0009** | **100%** | **7.1/10** | **0.9627** | -While `gpt-4o-mini` scores marginally higher on the composite metric, `gemma-3-12b-it` is the best self-hostable alternative: highest summary quality (8.1/10), lowest cost, 100% JSON reliability, and it runs on consumer hardware (12B parameters). It is the default value for `LLM_MODEL`. 
+While `gpt-5.4-nano` and `gpt-4o-mini` lead the composite ranking, `gemma-3-12b-it` is the best self-hostable option: lowest cost ($0.0009/run), 100% JSON reliability, summary quality on par with the top proprietary models, and it runs on consumer hardware (12B parameters). It is the default value for `LLM_MODEL`. See [benchmarks/README.md](benchmarks/README.md) for the full ranking and methodology. From 386ad249e6ca33c0363fb66e4770e7f83d143e23 Mon Sep 17 00:00:00 2001 From: using-system Date: Fri, 17 Apr 2026 11:17:47 +0200 Subject: [PATCH 8/8] docs(benchmark): refresh both READMEs with run #24556985945 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Latest benchmark run is much more conclusive for the Gemma 3 family — gemma-3-27b-it tops the composite ranking (0.9819) and gemma-3-12b-it moves up to #2 (0.9781) with the same 7.9/10 summary quality. Two Gemma 3 variants plus the Gemma 4-31B fill 3 of the top 4 spots. Conclusion preserved: gemma-3-12b-it remains the recommended self-hostable default — same summary quality as the 27B at less than half the cost ($0.0005 vs $0.0012/run) and runnable on a single consumer GPU. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 11 ++++---- benchmarks/README.md | 64 ++++++++++++++++++++++---------------------- 2 files changed, 37 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 729ddda..8212da6 100644 --- a/README.md +++ b/README.md @@ -135,13 +135,12 @@ Based on our [benchmark of 25 models](benchmarks/README.md), we recommend **`goo | Model | Cost/run | JSON OK | Summary quality | Composite | |-------|----------|---------|-----------------|-----------| -| openai/gpt-5.4-nano | $0.0026 | 100% | 7.1/10 | 0.9734 | -| openai/gpt-4o-mini | $0.0016 | 100% | 7.1/10 | 0.9715 | -| google/gemma-4-31b-it | $0.0016 | 100% | 7.2/10 | 0.9649 | -| google/gemma-3-27b-it | $0.0012 | 100% | 7.2/10 | 0.9632 | -| **google/gemma-3-12b-it** | **$0.0009** | **100%** | **7.1/10** | **0.9627** | +| google/gemma-3-27b-it | $0.0012 | 100% | 7.9/10 | 0.9819 | +| **google/gemma-3-12b-it** | **$0.0005** | **100%** | **7.9/10** | **0.9781** | +| openai/gpt-4o-mini | $0.0015 | 100% | 7.6/10 | 0.9746 | +| google/gemma-4-31b-it | $0.0016 | 100% | 8.0/10 | 0.9693 | -While `gpt-5.4-nano` and `gpt-4o-mini` lead the composite ranking, `gemma-3-12b-it` is the best self-hostable option: lowest cost ($0.0009/run), 100% JSON reliability, summary quality on par with the top proprietary models, and it runs on consumer hardware (12B parameters). It is the default value for `LLM_MODEL`. +While `gemma-3-27b-it` tops the composite ranking, `gemma-3-12b-it` is the best self-hostable option: same summary quality (7.9/10), less than half the cost ($0.0005 vs $0.0012/run), 100% JSON reliability, and it runs on a single consumer GPU (12B parameters). It is the default value for `LLM_MODEL`. See [benchmarks/README.md](benchmarks/README.md) for the full ranking and methodology. 
diff --git a/benchmarks/README.md b/benchmarks/README.md index 1acc4e6..151a9bd 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -4,7 +4,7 @@ Automated benchmark system to compare LLM models for the reddit-digest-agent pip ## Latest Results (2026-04-17) -> Run: [GitHub Actions #24556347098](https://github.com/using-system/reddit-digest-agent/actions/runs/24556347098) +> Run: [GitHub Actions #24556985945](https://github.com/using-system/reddit-digest-agent/actions/runs/24556985945) > 25 models tested across 7 providers via OpenRouter > Judge: openai/gpt-4o @@ -12,43 +12,43 @@ Automated benchmark system to compare LLM models for the reddit-digest-agent pip | Model | Cost | JSON OK | Latency (avg) | Summaries | Scoring MAE | **Composite** | |-------|------|---------|---------------|-----------|-------------|---------------| -| openai/gpt-5.4-nano | $0.0026 | 100% | 1519ms | 7.1/10 | 1.2 | **0.9734** | -| openai/gpt-4o-mini | $0.0016 | 100% | 2207ms | 7.1/10 | 1.3 | **0.9715** | -| x-ai/grok-4-fast | $0.0041 | 100% | 3586ms | 7.6/10 | 1.4 | **0.9684** | -| google/gemma-4-31b-it | $0.0016 | 100% | 4869ms | 7.2/10 | 0.9 | **0.9649** | -| google/gemma-3-27b-it | $0.0012 | 100% | 4790ms | 7.2/10 | 1.2 | **0.9632** | -| google/gemma-3-12b-it | $0.0009 | 100% | 3980ms | 7.1/10 | 1.5 | **0.9627** | -| google/gemma-3-4b-it | $0.0004 | 100% | 1958ms | 6.4/10 | 1.2 | **0.9597** | -| meta-llama/llama-4-maverick | $0.0027 | 100% | 3784ms | 7.0/10 | 1.3 | **0.9569** | -| mistralai/mistral-medium-3.1 | $0.0066 | 100% | 4272ms | 7.6/10 | 1.9 | **0.9516** | -| openai/gpt-4.1-mini | $0.0042 | 100% | 4014ms | 7.1/10 | 1.7 | **0.949** | -| meta-llama/llama-4-scout | $0.0017 | 100% | 5388ms | 6.7/10 | 1.1 | **0.9457** | -| mistralai/mistral-small-3.1-24b-instruct | $0.0037 | 100% | 5398ms | 7.0/10 | 1.3 | **0.9456** | -| anthropic/claude-haiku-4.5 | $0.0156 | 100% | 2846ms | 7.2/10 | 0.6 | **0.9327** | -| openai/gpt-5.4 | $0.0179 | 100% | 2870ms | 7.3/10 | 1.1 | **0.9224** 
| -| deepseek/deepseek-v3.2 | $0.0047 | 100% | 11709ms | 7.5/10 | 1.3 | **0.9207** | -| openai/gpt-4.1 | $0.0189 | 100% | 2571ms | 7.1/10 | 1.1 | **0.9148** | -| openai/gpt-4o | $0.0252 | 100% | 1886ms | 7.1/10 | 0.6 | **0.9037** | -| deepseek/deepseek-chat-v3-0324 | $0.0027 | 100% | 18976ms | 7.4/10 | 1.3 | **0.8865** | -| openai/gpt-oss-20b | $0.0013 | 100% | 20130ms | 7.1/10 | 1.2 | **0.8776** | -| x-ai/grok-3-mini | $0.0041 | 100% | 18508ms | 6.9/10 | 1.4 | **0.8697** | -| microsoft/phi-4 | $0.0009 | 67% | 6152ms | 7.0/10 | 10.0 | **0.7578** | -| openai/gpt-oss-120b | $0.0029 | 100% | 16742ms | 0.0/10 | 0.6 | **0.7103** | -| anthropic/claude-opus-4.6 | $0.0779 | 100% | 7131ms | 7.3/10 | 0.6 | **0.7075** | -| anthropic/claude-opus-4.7 | $0.1070 | 100% | 4497ms | 7.5/10 | 0.7 | **0.6306** | -| openai/gpt-4.1-nano | $0.0010 | 17% | 1619ms | 0.0/10 | 1.0 | **0.5434** | +| google/gemma-3-27b-it | $0.0012 | 100% | 3336ms | 7.9/10 | 1.1 | **0.9819** | +| google/gemma-3-12b-it | $0.0005 | 100% | 4296ms | 7.9/10 | 1.4 | **0.9781** | +| openai/gpt-4o-mini | $0.0015 | 100% | 2156ms | 7.6/10 | 0.9 | **0.9746** | +| google/gemma-4-31b-it | $0.0016 | 100% | 8232ms | 8.0/10 | 1.0 | **0.9693** | +| x-ai/grok-4-fast | $0.0042 | 100% | 3791ms | 7.9/10 | 1.5 | **0.966** | +| meta-llama/llama-4-maverick | $0.0033 | 100% | 3196ms | 7.7/10 | 1.2 | **0.9657** | +| openai/gpt-4.1-mini | $0.0045 | 100% | 3130ms | 7.8/10 | 1.5 | **0.9636** | +| openai/gpt-5.4-nano | $0.0026 | 100% | 1951ms | 7.4/10 | 1.1 | **0.9606** | +| mistralai/mistral-medium-3.1 | $0.0064 | 100% | 4736ms | 8.0/10 | 1.5 | **0.96** | +| mistralai/mistral-small-3.1-24b-instruct | $0.0037 | 100% | 5644ms | 7.6/10 | 1.3 | **0.9506** | +| meta-llama/llama-4-scout | $0.0022 | 100% | 7142ms | 7.6/10 | 1.3 | **0.9505** | +| deepseek/deepseek-chat-v3-0324 | $0.0029 | 100% | 13163ms | 7.9/10 | 1.0 | **0.9432** | +| anthropic/claude-haiku-4.5 | $0.0156 | 100% | 2846ms | 7.7/10 | 1.0 | **0.9271** | +| openai/gpt-4.1 | $0.0192 | 
100% | 2684ms | 7.8/10 | 1.1 | **0.9185** | +| openai/gpt-5.4 | $0.0181 | 100% | 3066ms | 7.2/10 | 1.1 | **0.8947** | +| openai/gpt-oss-120b | $0.0029 | 100% | 29202ms | 7.7/10 | 0.9 | **0.8794** | +| google/gemma-3-4b-it | $0.0004 | 100% | 2302ms | 3.5/10 | 1.2 | **0.7919** | +| microsoft/phi-4 | $0.0009 | 67% | 7588ms | 7.8/10 | 10.0 | **0.7686** | +| deepseek/deepseek-v3.2 | $0.0027 | 100% | 12455ms | 3.8/10 | 1.3 | **0.7612** | +| openai/gpt-4o | $0.0245 | 100% | 2358ms | 3.8/10 | 1.0 | **0.7252** | +| anthropic/claude-opus-4.6 | $0.0778 | 100% | 5298ms | 7.5/10 | 0.7 | **0.7014** | +| x-ai/grok-3-mini | $0.0057 | 100% | 30198ms | 3.7/10 | 1.5 | **0.6821** | +| openai/gpt-oss-20b | $0.0013 | 67% | 10371ms | 3.8/10 | 0.7 | **0.6795** | +| anthropic/claude-opus-4.7 | $0.1031 | 100% | 4006ms | 7.7/10 | 0.7 | **0.6283** | +| openai/gpt-4.1-nano | $0.0011 | 17% | 1614ms | 3.8/10 | 10.0 | **0.4611** | ### Recommendation -**Best quality/price self-hostable: `google/gemma-3-12b-it`** — composite 0.9627 at $0.0009 per run, 100% valid JSON, ~4s latency, 7.1/10 summary quality. Nearly on par with the top proprietary models while remaining runnable on consumer hardware. +**Best quality/price self-hostable: `google/gemma-3-12b-it`** — composite 0.9781 at $0.0005 per run, 100% valid JSON, ~4.3s latency, 7.9/10 summary quality. Second overall in the ranking, trailing only its bigger sibling `gemma-3-27b-it` while costing less than half and running on consumer hardware. 
### Key takeaways -- **Best overall**: `openai/gpt-5.4-nano` — top composite (0.9734), 100% JSON, lowest latency (~1.5s) -- **Best self-hostable**: `google/gemma-3-12b-it` — best hostable trade-off: lowest cost in the usable hostable family, reasonable latency, quality comparable to the 27B / 31B variants without needing high-end GPUs -- **Gemma family**: 4B / 12B / 27B / 4-31B all sit at the top of the ranking; the 12B remains the sweet spot between cost, latency and quality -- **Cost vs quality**: models above $0.01 per run (Claude Opus 4.6/4.7, GPT-4o, GPT-4.1, GPT-5.4) don't deliver a gain that justifies the price — `claude-opus-4.7` is even the most expensive of the panel ($0.107) without beating the light models -- **Anomalies to watch**: `openai/gpt-oss-120b` (judge rated summary 0.0/10 despite 100% JSON), `openai/gpt-4.1-nano` (only 17% JSON), `microsoft/phi-4` (67% JSON, scoring MAE at 10.0) +- **Best overall**: `google/gemma-3-27b-it` — top composite (0.9819), 100% JSON, 7.9/10 summary, fast at ~3.3s and only $0.0012 per run +- **Best self-hostable**: `google/gemma-3-12b-it` — same summary quality as the 27B (7.9/10), less than half the cost ($0.0005 vs $0.0012), runs on a single consumer GPU +- **Gemma family dominates**: `gemma-3-27b-it`, `gemma-3-12b-it` and `gemma-4-31b-it` take 3 of the top 4 spots, with `gpt-4o-mini` the only proprietary model among them; the exception is `gemma-3-4b-it`, which drops to 0.7919 because of its 3.5/10 summary score +- **Cost vs quality**: models above $0.01 per run (Claude Opus 4.6/4.7, Claude Haiku 4.5, GPT-4o, GPT-4.1, GPT-5.4) all score below Gemma 3-12B on composite; `claude-opus-4.7` is the most expensive of the panel ($0.103) and still ranks second to last on composite, ahead of only `gpt-4.1-nano` +- **Anomalies to watch**: judge gave a 3.5–3.8/10 summary score to `gemma-3-4b-it`, `deepseek-v3.2`, `gpt-4o` and `grok-3-mini` despite 100% valid JSON, and to `gpt-oss-20b` and `gpt-4.1-nano`, which also fail JSON validation — likely judge variance worth re-running. Hard failures: `gpt-4.1-nano` (17% JSON), `microsoft/phi-4` and `gpt-oss-20b` (67% JSON) ### Composite score formula