From a98fd31a3437c3db4fcdbdac95ae00802255ce17 Mon Sep 17 00:00:00 2001
From: using-system <mnicolescu@gmail.com>
Date: Fri, 17 Apr 2026 10:31:22 +0200
Subject: [PATCH 1/8] chore(benchmark): add latest OpenAI and Anthropic models
 to defaults

Adds openai/gpt-5.4, openai/gpt-5.4-nano, openai/gpt-5.3-codex,
openai/gpt-oss-120b, openai/gpt-oss-20b and anthropic/claude-opus-4.7
to the default benchmark matrix.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/benchmark.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 2b48c1a..43cd080 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -7,7 +7,7 @@ on:
         description: "JSON array of models to benchmark (OpenRouter model IDs)"
         required: false
         # yamllint disable-line rule:line-length
-        default: '["openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]'
+        default: '["openai/gpt-5.4","openai/gpt-5.4-nano","openai/gpt-5.3-codex","openai/gpt-oss-120b","openai/gpt-oss-20b","openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.7","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]'
       judge_model:
         description: "Model used as LLM judge"
         required: false
@@ -29,7 +29,7 @@ jobs:
       max-parallel: 5
       # yamllint disable-line rule:line-length
       matrix:
-        model: ${{ fromJson(inputs.models || '["openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]') }}
+        model: ${{ fromJson(inputs.models || '["openai/gpt-5.4","openai/gpt-5.4-nano","openai/gpt-5.3-codex","openai/gpt-oss-120b","openai/gpt-oss-20b","openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.7","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]') }}
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
 

From 46272b27de87dea7dcd4ac0be08dd5063ddfa45e Mon Sep 17 00:00:00 2001
From: using-system <mnicolescu@gmail.com>
Date: Fri, 17 Apr 2026 10:44:33 +0200
Subject: [PATCH 2/8] chore(benchmark): remove openai/gpt-5.3-codex from
 defaults

The model is unavailable on OpenRouter and caused the matrix job to fail.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/benchmark.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 43cd080..12d722c 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -7,7 +7,7 @@ on:
         description: "JSON array of models to benchmark (OpenRouter model IDs)"
         required: false
         # yamllint disable-line rule:line-length
-        default: '["openai/gpt-5.4","openai/gpt-5.4-nano","openai/gpt-5.3-codex","openai/gpt-oss-120b","openai/gpt-oss-20b","openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.7","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]'
+        default: '["openai/gpt-5.4","openai/gpt-5.4-nano","openai/gpt-oss-120b","openai/gpt-oss-20b","openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.7","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]'
       judge_model:
         description: "Model used as LLM judge"
         required: false
@@ -29,7 +29,7 @@ jobs:
       max-parallel: 5
       # yamllint disable-line rule:line-length
       matrix:
-        model: ${{ fromJson(inputs.models || '["openai/gpt-5.4","openai/gpt-5.4-nano","openai/gpt-5.3-codex","openai/gpt-oss-120b","openai/gpt-oss-20b","openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.7","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]') }}
+        model: ${{ fromJson(inputs.models || '["openai/gpt-5.4","openai/gpt-5.4-nano","openai/gpt-oss-120b","openai/gpt-oss-20b","openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.7","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]') }}
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
 

From 600137961f14408c8c45215681b37b13a67e987d Mon Sep 17 00:00:00 2001
From: using-system <mnicolescu@gmail.com>
Date: Fri, 17 Apr 2026 10:44:42 +0200
Subject: [PATCH 3/8] fix(benchmark): use shared extract_json helper for LLM
 responses

The benchmark and aggregate scripts still parsed LLM output with a local
_strip_code_fences + json.loads pair, which fails when models wrap JSON
in explanatory text or extra content. Production code switched to
extract_json (regex-based) in #13; the benchmark scripts were missed,
which caused most models to score N/A and rank with composite 0.0/10.

Replace the local helper with reddit_digest.nodes.llm_utils.extract_json
in both bench_model.py and aggregate.py.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/aggregate.py   | 16 +++-------------
 benchmarks/bench_model.py | 21 +++++----------------
 2 files changed, 8 insertions(+), 29 deletions(-)

diff --git a/benchmarks/aggregate.py b/benchmarks/aggregate.py
index 2c524d8..b8aa0e0 100644
--- a/benchmarks/aggregate.py
+++ b/benchmarks/aggregate.py
@@ -16,22 +16,12 @@
 
 from langchain_openai import ChatOpenAI
 
+from reddit_digest.nodes.llm_utils import extract_json
+
 logger = logging.getLogger(__name__)
 
 DEFAULT_BASE_URL = "https://openrouter.ai/api/v1"
 
-
-def _strip_code_fences(text: str) -> str:
-    """Strip markdown code fences and whitespace from LLM output."""
-    text = text.strip()
-    if text.startswith("```"):
-        first_newline = text.index("\n") if "\n" in text else len(text)
-        text = text[first_newline + 1 :]
-    if text.endswith("```"):
-        text = text[:-3]
-    return text.strip()
-
-
 DEFAULT_JUDGE_MODEL = "openai/gpt-4o"
 
 JUDGE_PROMPT = """You are evaluating the quality of AI-generated summaries of Reddit posts.
@@ -151,7 +141,7 @@ async def run_judge(
 
         try:
             response = await llm.ainvoke(prompt)
-            data = json.loads(_strip_code_fences(response.content))
+            data = extract_json(response.content)
             evals = data.get("evaluations", {})
             all_evaluations.update(evals)
         except Exception:
diff --git a/benchmarks/bench_model.py b/benchmarks/bench_model.py
index 7a319b4..23eabba 100644
--- a/benchmarks/bench_model.py
+++ b/benchmarks/bench_model.py
@@ -20,6 +20,7 @@
 from langchain_openai import ChatOpenAI
 
 from reddit_digest.models import RedditPost
+from reddit_digest.nodes.llm_utils import extract_json
 from reddit_digest.nodes.scorer import SCORE_PROMPT
 from reddit_digest.nodes.scorer import _build_post_block as scorer_build_block
 from reddit_digest.nodes.summarizer import PROMPT_TEMPLATE
@@ -35,18 +36,6 @@ def load_fixture(path: str) -> dict:
     return json.loads(Path(path).read_text())
 
 
-def _strip_code_fences(text: str) -> str:
-    """Strip markdown code fences and whitespace from LLM output."""
-    text = text.strip()
-    if text.startswith("```"):
-        # Remove opening fence (```json or ```)
-        first_newline = text.index("\n") if "\n" in text else len(text)
-        text = text[first_newline + 1 :]
-    if text.endswith("```"):
-        text = text[:-3]
-    return text.strip()
-
-
 def _extract_cost(response) -> float:
     """Extract cost from OpenRouter response metadata (usage.cost)."""
     metadata = getattr(response, "response_metadata", {})
@@ -110,11 +99,11 @@ async def run_benchmark(
             total_tokens_completion += usage.get("completion_tokens", 0)
             total_cost += _extract_cost(response)
 
-            data = json.loads(_strip_code_fences(response.content))
+            data = extract_json(response.content)
             scores = data.get("scores", {})
             all_scores.update(scores)
             json_valid_count += 1
-        except json.JSONDecodeError as e:
+        except (ValueError, json.JSONDecodeError) as e:
             elapsed_ms = (time.monotonic() - start) * 1000
             latencies.append(elapsed_ms)
             errors.append(f"scorer/{subreddit}: JSON parse error: {e}")
@@ -141,11 +130,11 @@ async def run_benchmark(
             total_tokens_completion += usage.get("completion_tokens", 0)
             total_cost += _extract_cost(response)
 
-            data = json.loads(_strip_code_fences(response.content))
+            data = extract_json(response.content)
             summaries = data.get("summaries", {})
             all_summaries.update(summaries)
             json_valid_count += 1
-        except json.JSONDecodeError as e:
+        except (ValueError, json.JSONDecodeError) as e:
             elapsed_ms = (time.monotonic() - start) * 1000
             latencies.append(elapsed_ms)
             errors.append(f"summarizer/{subreddit}: JSON parse error: {e}")

From 2ae7200d8243e008d570cdae7e2895744f96987b Mon Sep 17 00:00:00 2001
From: using-system <mnicolescu@gmail.com>
Date: Fri, 17 Apr 2026 10:46:08 +0200
Subject: [PATCH 4/8] test(benchmark): drop tests for removed
 _strip_code_fences helper

The helper was inlined in bench_model.py / aggregate.py and was replaced
by reddit_digest.nodes.llm_utils.extract_json. Drop the four obsolete
tests that imported the removed function.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tests/test_benchmark.py | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 07accda..8e1c8d1 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -1,25 +1,9 @@
 from unittest.mock import MagicMock
 
-from benchmarks.bench_model import _extract_cost, _strip_code_fences
+from benchmarks.bench_model import _extract_cost
 from benchmarks.aggregate import normalize_min_max, compute_composite, generate_report
 
 
-def test_strip_code_fences_json():
-    assert _strip_code_fences('```json\n{"a": 1}\n```') == '{"a": 1}'
-
-
-def test_strip_code_fences_plain():
-    assert _strip_code_fences('```\n{"a": 1}\n```') == '{"a": 1}'
-
-
-def test_strip_code_fences_no_fences():
-    assert _strip_code_fences('{"a": 1}') == '{"a": 1}'
-
-
-def test_strip_code_fences_whitespace():
-    assert _strip_code_fences('\n  {"a": 1}  \n') == '{"a": 1}'
-
-
 def test_extract_cost_from_response():
     response = MagicMock()
     response.response_metadata = {"token_usage": {"cost": 0.00042}}

From 132421e6f29dc887ee6cfca4630a2ab818701d6f Mon Sep 17 00:00:00 2001
From: using-system <mnicolescu@gmail.com>
Date: Fri, 17 Apr 2026 10:57:15 +0200
Subject: [PATCH 5/8] docs(benchmark): refresh ranking with 2026-04-17 results

Update the README with the run after #17 (5 new models, fixed JSON
parsing). Recommend google/gemma-3-12b-it as the best quality/price
self-hostable option (composite 0.9627 at $0.0009 per run, 100% JSON
valid). Note anomalies: gpt-oss-120b summary at 0.0/10, gpt-4.1-nano
17% JSON, phi-4 67% JSON.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/README.md | 67 +++++++++++++++++++++++++-------------------
 1 file changed, 38 insertions(+), 29 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 63fe992..3d90257 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -2,44 +2,53 @@
 
 Automated benchmark system to compare LLM models for the reddit-digest-agent pipeline. Tests each model's ability to score and summarize Reddit posts using the project's real prompts.
 
-## Latest Results (2026-04-12)
+## Latest Results (2026-04-17)
 
-> Run: [GitHub Actions #24308576896](https://github.com/using-system/reddit-digest-agent/actions/runs/24308576896)
-> 20 models tested across 7 providers via OpenRouter
+> Run: [GitHub Actions #24556347098](https://github.com/using-system/reddit-digest-agent/actions/runs/24556347098)
+> 25 models tested across 7 providers via OpenRouter
 > Judge: openai/gpt-4o
 
 ### Ranking
 
 | Model | Cost | JSON OK | Latency (avg) | Summaries | Scoring MAE | **Composite** |
 |-------|------|---------|---------------|-----------|-------------|---------------|
-| openai/gpt-4o-mini | $0.0014 | 100% | 1818ms | 7.8/10 | 1.1 | **0.9813** |
-| google/gemma-3-12b-it | $0.0005 | 100% | 4206ms | 8.1/10 | 1.4 | **0.9793** |
-| google/gemma-3-27b-it | $0.0012 | 100% | 5178ms | 8.0/10 | 1.3 | **0.9679** |
-| openai/gpt-4.1-mini | $0.0044 | 100% | 1852ms | 7.7/10 | 1.6 | **0.9638** |
-| mistralai/mistral-medium-3.1 | $0.0064 | 100% | 4052ms | 7.6/10 | 1.5 | **0.9391** |
-| mistralai/mistral-small-3.1-24b-instruct | $0.0037 | 100% | 5478ms | 7.4/10 | 1.3 | **0.938** |
-| anthropic/claude-haiku-4.5 | $0.0153 | 100% | 2788ms | 7.6/10 | 0.6 | **0.9078** |
-| openai/gpt-4.1 | $0.0183 | 100% | 2128ms | 7.9/10 | 1.1 | **0.9054** |
-| google/gemma-4-31b-it | $0.0016 | 100% | 18314ms | 7.9/10 | 0.9 | **0.8882** |
-| x-ai/grok-3-mini | $0.0048 | 100% | 15890ms | 7.5/10 | 1.3 | **0.8759** |
-| deepseek/deepseek-chat-v3-0324 | $0.0030 | 83% | 10821ms | 7.3/10 | 0.8 | **0.8588** |
-| deepseek/deepseek-v3.2 | $0.0046 | 100% | 6031ms | 7.3/10 | 45.6 | **0.8297** |
-| google/gemma-3-4b-it | $0.0005 | 100% | 4327ms | 1.0/10 | 1.2 | **0.7794** |
-| x-ai/grok-4-fast | $0.0043 | 100% | 4731ms | 1.0/10 | 0.7 | **0.7607** |
-| meta-llama/llama-4-maverick | $0.0041 | 50% | 8480ms | 7.5/10 | 10.0 | **0.7526** |
-| microsoft/phi-4 | $0.0009 | 33% | 4598ms | 7.9/10 | 10.0 | **0.7505** |
-| openai/gpt-4o | $0.0253 | 100% | 1424ms | 1.0/10 | 0.7 | **0.6844** |
-| anthropic/claude-opus-4.6 | $0.0775 | 100% | 5674ms | 7.7/10 | 0.7 | **0.6115** |
-| openai/gpt-4.1-nano | $0.0009 | 33% | 986ms | 1.0/10 | 10.0 | **0.577** |
-| meta-llama/llama-4-scout | $0.0019 | 17% | 5013ms | 1.0/10 | 10.0 | **0.4991** |
+| openai/gpt-5.4-nano | $0.0026 | 100% | 1519ms | 7.1/10 | 1.2 | **0.9734** |
+| openai/gpt-4o-mini | $0.0016 | 100% | 2207ms | 7.1/10 | 1.3 | **0.9715** |
+| x-ai/grok-4-fast | $0.0041 | 100% | 3586ms | 7.6/10 | 1.4 | **0.9684** |
+| google/gemma-4-31b-it | $0.0016 | 100% | 4869ms | 7.2/10 | 0.9 | **0.9649** |
+| google/gemma-3-27b-it | $0.0012 | 100% | 4790ms | 7.2/10 | 1.2 | **0.9632** |
+| google/gemma-3-12b-it | $0.0009 | 100% | 3980ms | 7.1/10 | 1.5 | **0.9627** |
+| google/gemma-3-4b-it | $0.0004 | 100% | 1958ms | 6.4/10 | 1.2 | **0.9597** |
+| meta-llama/llama-4-maverick | $0.0027 | 100% | 3784ms | 7.0/10 | 1.3 | **0.9569** |
+| mistralai/mistral-medium-3.1 | $0.0066 | 100% | 4272ms | 7.6/10 | 1.9 | **0.9516** |
+| openai/gpt-4.1-mini | $0.0042 | 100% | 4014ms | 7.1/10 | 1.7 | **0.949** |
+| meta-llama/llama-4-scout | $0.0017 | 100% | 5388ms | 6.7/10 | 1.1 | **0.9457** |
+| mistralai/mistral-small-3.1-24b-instruct | $0.0037 | 100% | 5398ms | 7.0/10 | 1.3 | **0.9456** |
+| anthropic/claude-haiku-4.5 | $0.0156 | 100% | 2846ms | 7.2/10 | 0.6 | **0.9327** |
+| openai/gpt-5.4 | $0.0179 | 100% | 2870ms | 7.3/10 | 1.1 | **0.9224** |
+| deepseek/deepseek-v3.2 | $0.0047 | 100% | 11709ms | 7.5/10 | 1.3 | **0.9207** |
+| openai/gpt-4.1 | $0.0189 | 100% | 2571ms | 7.1/10 | 1.1 | **0.9148** |
+| openai/gpt-4o | $0.0252 | 100% | 1886ms | 7.1/10 | 0.6 | **0.9037** |
+| deepseek/deepseek-chat-v3-0324 | $0.0027 | 100% | 18976ms | 7.4/10 | 1.3 | **0.8865** |
+| openai/gpt-oss-20b | $0.0013 | 100% | 20130ms | 7.1/10 | 1.2 | **0.8776** |
+| x-ai/grok-3-mini | $0.0041 | 100% | 18508ms | 6.9/10 | 1.4 | **0.8697** |
+| microsoft/phi-4 | $0.0009 | 67% | 6152ms | 7.0/10 | 10.0 | **0.7578** |
+| openai/gpt-oss-120b | $0.0029 | 100% | 16742ms | 0.0/10 | 0.6 | **0.7103** |
+| anthropic/claude-opus-4.6 | $0.0779 | 100% | 7131ms | 7.3/10 | 0.6 | **0.7075** |
+| anthropic/claude-opus-4.7 | $0.1070 | 100% | 4497ms | 7.5/10 | 0.7 | **0.6306** |
+| openai/gpt-4.1-nano | $0.0010 | 17% | 1619ms | 0.0/10 | 1.0 | **0.5434** |
+
+### Recommandation
+
+**Meilleur qualité/prix self-hostable : `google/gemma-3-12b-it`** — composite 0.9627 à $0.0009 par run, 100 % de JSON valide, latence ~4 s, qualité de résumé 7.1/10. Quasi à égalité avec les meilleurs modèles propriétaires tout en restant exécutable sur du matériel grand public.
 
 ### Key takeaways
 
-- **Best overall**: `openai/gpt-4o-mini` — cheapest top-tier option with 100% JSON reliability and good summary quality
-- **Best self-hostable**: `google/gemma-3-12b-it` — nearly identical composite score (0.9793 vs 0.9813), highest summary quality (8.1/10), lowest cost ($0.0005), and can run on consumer hardware
-- **Gemma 3 family**: The 12B variant hits the sweet spot. The 27B is marginally worse and slower, the 4B drops dramatically in summary quality
-- **Cost vs quality**: Models costing >$0.01 per run (Claude Opus, GPT-4o, GPT-4.1) don't justify the price — cheaper models match or beat them on summary quality
-- **JSON reliability**: Most models achieve 100%. Notable failures: Llama 4 Scout (17%), Phi-4 (33%), GPT-4.1-nano (33%)
+- **Best overall**: `openai/gpt-5.4-nano` — premier composite (0.9734), 100 % JSON, latence la plus basse (~1.5 s)
+- **Best self-hostable**: `google/gemma-3-12b-it` — meilleur compromis hostable : coût le plus bas de la famille hostable utile, latence raisonnable, qualité comparable aux variantes 27B / 31B sans le besoin de GPU haut de gamme
+- **Famille Gemma**: 4B / 12B / 27B / 4-31B sont toutes en haut du classement ; le 12B reste le sweet spot entre coût, latence et qualité
+- **Coût vs qualité**: les modèles à >$0.01 par run (Claude Opus 4.6/4.7, GPT-4o, GPT-4.1, GPT-5.4) n'apportent pas un gain qui justifie le prix — `claude-opus-4.7` est même le plus cher du panel ($0.107) sans dépasser les modèles légers
+- **Anomalies à surveiller**: `openai/gpt-oss-120b` (résumé évalué 0.0/10 par le judge malgré 100 % JSON), `openai/gpt-4.1-nano` (17 % JSON seulement), `microsoft/phi-4` (67 % JSON, MAE de scoring à 10.0)
 
 ### Composite score formula
 
@@ -51,7 +60,7 @@ Each metric is min-max normalized across all models. For cost, latency, and MAE,
 
 ### From GitHub Actions (recommended)
 
-Go to **Actions > LLM Benchmark > Run workflow**. The default runs all 20 models. You can pass a custom JSON array of model IDs.
+Go to **Actions > LLM Benchmark > Run workflow**. The default runs all 25 models. You can pass a custom JSON array of model IDs.
 
 ### Locally
 

From 9779b218f8d97e4415cf25cad4c535dd522728e7 Mon Sep 17 00:00:00 2001
From: using-system <mnicolescu@gmail.com>
Date: Fri, 17 Apr 2026 10:59:04 +0200
Subject: [PATCH 6/8] docs(benchmark): translate recommendation section to
 English
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Keep README consistent — the rest of the file is in English.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/README.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 3d90257..1acc4e6 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -38,17 +38,17 @@ Automated benchmark system to compare LLM models for the reddit-digest-agent pip
 | anthropic/claude-opus-4.7 | $0.1070 | 100% | 4497ms | 7.5/10 | 0.7 | **0.6306** |
 | openai/gpt-4.1-nano | $0.0010 | 17% | 1619ms | 0.0/10 | 1.0 | **0.5434** |
 
-### Recommandation
+### Recommendation
 
-**Meilleur qualité/prix self-hostable : `google/gemma-3-12b-it`** — composite 0.9627 à $0.0009 par run, 100 % de JSON valide, latence ~4 s, qualité de résumé 7.1/10. Quasi à égalité avec les meilleurs modèles propriétaires tout en restant exécutable sur du matériel grand public.
+**Best quality/price self-hostable: `google/gemma-3-12b-it`** — composite 0.9627 at $0.0009 per run, 100% valid JSON, ~4s latency, 7.1/10 summary quality. Nearly on par with the top proprietary models while remaining runnable on consumer hardware.
 
 ### Key takeaways
 
-- **Best overall**: `openai/gpt-5.4-nano` — premier composite (0.9734), 100 % JSON, latence la plus basse (~1.5 s)
-- **Best self-hostable**: `google/gemma-3-12b-it` — meilleur compromis hostable : coût le plus bas de la famille hostable utile, latence raisonnable, qualité comparable aux variantes 27B / 31B sans le besoin de GPU haut de gamme
-- **Famille Gemma**: 4B / 12B / 27B / 4-31B sont toutes en haut du classement ; le 12B reste le sweet spot entre coût, latence et qualité
-- **Coût vs qualité**: les modèles à >$0.01 par run (Claude Opus 4.6/4.7, GPT-4o, GPT-4.1, GPT-5.4) n'apportent pas un gain qui justifie le prix — `claude-opus-4.7` est même le plus cher du panel ($0.107) sans dépasser les modèles légers
-- **Anomalies à surveiller**: `openai/gpt-oss-120b` (résumé évalué 0.0/10 par le judge malgré 100 % JSON), `openai/gpt-4.1-nano` (17 % JSON seulement), `microsoft/phi-4` (67 % JSON, MAE de scoring à 10.0)
+- **Best overall**: `openai/gpt-5.4-nano` — top composite (0.9734), 100% JSON, lowest latency (~1.5s)
+- **Best self-hostable**: `google/gemma-3-12b-it` — best hostable trade-off: lowest cost in the usable hostable family, reasonable latency, quality comparable to the 27B / 31B variants without needing high-end GPUs
+- **Gemma family**: 4B / 12B / 27B / 4-31B all sit at the top of the ranking; the 12B remains the sweet spot between cost, latency and quality
+- **Cost vs quality**: models above $0.01 per run (Claude Opus 4.6/4.7, GPT-4o, GPT-4.1, GPT-5.4) don't deliver a gain that justifies the price — `claude-opus-4.7` is even the most expensive of the panel ($0.107) without beating the light models
+- **Anomalies to watch**: `openai/gpt-oss-120b` (judge rated summary 0.0/10 despite 100% JSON), `openai/gpt-4.1-nano` (only 17% JSON), `microsoft/phi-4` (67% JSON, scoring MAE at 10.0)
 
 ### Composite score formula
 

From 83c1fe13879d57c2d0fb0c167e5f71eaa7dabfb8 Mon Sep 17 00:00:00 2001
From: using-system <mnicolescu@gmail.com>
Date: Fri, 17 Apr 2026 11:01:55 +0200
Subject: [PATCH 7/8] docs(readme): refresh recommended-model section with
 2026-04-17 benchmark

Update the table to reflect the 25-model run, add gpt-5.4-nano and the
Gemma 4-31B / 3-27B variants, and reframe the recommendation around
gemma-3-12b-it as the best self-hostable option (no longer top-2 on
composite, but still cheapest at $0.0009/run with 100% JSON validity).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 README.md | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 2cc37c1..729ddda 100644
--- a/README.md
+++ b/README.md
@@ -131,15 +131,17 @@ All configuration is done via environment variables (`.env` file).
 
 ## Recommended model
 
-Based on our [benchmark of 20 models](benchmarks/README.md), we recommend **`google/gemma-3-12b-it`** as the default LLM:
+Based on our [benchmark of 25 models](benchmarks/README.md), we recommend **`google/gemma-3-12b-it`** as the default LLM for self-hosted deployments:
 
 | Model | Cost/run | JSON OK | Summary quality | Composite |
 |-------|----------|---------|-----------------|-----------|
-| openai/gpt-4o-mini | $0.0014 | 100% | 7.8/10 | 0.9813 |
-| **google/gemma-3-12b-it** | **$0.0005** | **100%** | **8.1/10** | **0.9793** |
-| google/gemma-3-27b-it | $0.0012 | 100% | 8.0/10 | 0.9679 |
+| openai/gpt-5.4-nano | $0.0026 | 100% | 7.1/10 | 0.9734 |
+| openai/gpt-4o-mini | $0.0016 | 100% | 7.1/10 | 0.9715 |
+| google/gemma-4-31b-it | $0.0016 | 100% | 7.2/10 | 0.9649 |
+| google/gemma-3-27b-it | $0.0012 | 100% | 7.2/10 | 0.9632 |
+| **google/gemma-3-12b-it** | **$0.0009** | **100%** | **7.1/10** | **0.9627** |
 
-While `gpt-4o-mini` scores marginally higher on the composite metric, `gemma-3-12b-it` is the best self-hostable alternative: highest summary quality (8.1/10), lowest cost, 100% JSON reliability, and it runs on consumer hardware (12B parameters). It is the default value for `LLM_MODEL`.
+While `gpt-5.4-nano` and `gpt-4o-mini` lead the composite ranking, `gemma-3-12b-it` is the best self-hostable option: lowest cost ($0.0009/run), 100% JSON reliability, summary quality on par with the top proprietary models, and it runs on consumer hardware (12B parameters). It is the default value for `LLM_MODEL`.
 
 See [benchmarks/README.md](benchmarks/README.md) for the full ranking and methodology.
 

From 386ad249e6ca33c0363fb66e4770e7f83d143e23 Mon Sep 17 00:00:00 2001
From: using-system <mnicolescu@gmail.com>
Date: Fri, 17 Apr 2026 11:17:47 +0200
Subject: [PATCH 8/8] docs(benchmark): refresh both READMEs with run
 #24556985945
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Latest benchmark run is much more conclusive for the Gemma 3 family —
gemma-3-27b-it tops the composite ranking (0.9819) and gemma-3-12b-it
moves up to #2 (0.9781) with the same 7.9/10 summary quality. Three
Gemma 3 variants plus the Gemma 4-31B fill 4 of the top 5 spots.

Conclusion preserved: gemma-3-12b-it remains the recommended
self-hostable default — same summary quality as the 27B at less than
half the cost ($0.0005 vs $0.0012/run) and runnable on a single
consumer GPU.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 README.md            | 11 ++++----
 benchmarks/README.md | 64 ++++++++++++++++++++++----------------------
 2 files changed, 37 insertions(+), 38 deletions(-)

diff --git a/README.md b/README.md
index 729ddda..8212da6 100644
--- a/README.md
+++ b/README.md
@@ -135,13 +135,12 @@ Based on our [benchmark of 25 models](benchmarks/README.md), we recommend **`goo
 
 | Model | Cost/run | JSON OK | Summary quality | Composite |
 |-------|----------|---------|-----------------|-----------|
-| openai/gpt-5.4-nano | $0.0026 | 100% | 7.1/10 | 0.9734 |
-| openai/gpt-4o-mini | $0.0016 | 100% | 7.1/10 | 0.9715 |
-| google/gemma-4-31b-it | $0.0016 | 100% | 7.2/10 | 0.9649 |
-| google/gemma-3-27b-it | $0.0012 | 100% | 7.2/10 | 0.9632 |
-| **google/gemma-3-12b-it** | **$0.0009** | **100%** | **7.1/10** | **0.9627** |
+| google/gemma-3-27b-it | $0.0012 | 100% | 7.9/10 | 0.9819 |
+| **google/gemma-3-12b-it** | **$0.0005** | **100%** | **7.9/10** | **0.9781** |
+| openai/gpt-4o-mini | $0.0015 | 100% | 7.6/10 | 0.9746 |
+| google/gemma-4-31b-it | $0.0016 | 100% | 8.0/10 | 0.9693 |
 
-While `gpt-5.4-nano` and `gpt-4o-mini` lead the composite ranking, `gemma-3-12b-it` is the best self-hostable option: lowest cost ($0.0009/run), 100% JSON reliability, summary quality on par with the top proprietary models, and it runs on consumer hardware (12B parameters). It is the default value for `LLM_MODEL`.
+While `gemma-3-27b-it` tops the composite ranking, `gemma-3-12b-it` is the best self-hostable option: same summary quality (7.9/10), less than half the cost ($0.0005 vs $0.0012/run), 100% JSON reliability, and it runs on a single consumer GPU (12B parameters). It is the default value for `LLM_MODEL`.
 
 See [benchmarks/README.md](benchmarks/README.md) for the full ranking and methodology.
 
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 1acc4e6..151a9bd 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -4,7 +4,7 @@ Automated benchmark system to compare LLM models for the reddit-digest-agent pip
 
 ## Latest Results (2026-04-17)
 
-> Run: [GitHub Actions #24556347098](https://github.com/using-system/reddit-digest-agent/actions/runs/24556347098)
+> Run: [GitHub Actions #24556985945](https://github.com/using-system/reddit-digest-agent/actions/runs/24556985945)
 > 25 models tested across 7 providers via OpenRouter
 > Judge: openai/gpt-4o
 
@@ -12,43 +12,43 @@ Automated benchmark system to compare LLM models for the reddit-digest-agent pip
 
 | Model | Cost | JSON OK | Latency (avg) | Summaries | Scoring MAE | **Composite** |
 |-------|------|---------|---------------|-----------|-------------|---------------|
-| openai/gpt-5.4-nano | $0.0026 | 100% | 1519ms | 7.1/10 | 1.2 | **0.9734** |
-| openai/gpt-4o-mini | $0.0016 | 100% | 2207ms | 7.1/10 | 1.3 | **0.9715** |
-| x-ai/grok-4-fast | $0.0041 | 100% | 3586ms | 7.6/10 | 1.4 | **0.9684** |
-| google/gemma-4-31b-it | $0.0016 | 100% | 4869ms | 7.2/10 | 0.9 | **0.9649** |
-| google/gemma-3-27b-it | $0.0012 | 100% | 4790ms | 7.2/10 | 1.2 | **0.9632** |
-| google/gemma-3-12b-it | $0.0009 | 100% | 3980ms | 7.1/10 | 1.5 | **0.9627** |
-| google/gemma-3-4b-it | $0.0004 | 100% | 1958ms | 6.4/10 | 1.2 | **0.9597** |
-| meta-llama/llama-4-maverick | $0.0027 | 100% | 3784ms | 7.0/10 | 1.3 | **0.9569** |
-| mistralai/mistral-medium-3.1 | $0.0066 | 100% | 4272ms | 7.6/10 | 1.9 | **0.9516** |
-| openai/gpt-4.1-mini | $0.0042 | 100% | 4014ms | 7.1/10 | 1.7 | **0.949** |
-| meta-llama/llama-4-scout | $0.0017 | 100% | 5388ms | 6.7/10 | 1.1 | **0.9457** |
-| mistralai/mistral-small-3.1-24b-instruct | $0.0037 | 100% | 5398ms | 7.0/10 | 1.3 | **0.9456** |
-| anthropic/claude-haiku-4.5 | $0.0156 | 100% | 2846ms | 7.2/10 | 0.6 | **0.9327** |
-| openai/gpt-5.4 | $0.0179 | 100% | 2870ms | 7.3/10 | 1.1 | **0.9224** |
-| deepseek/deepseek-v3.2 | $0.0047 | 100% | 11709ms | 7.5/10 | 1.3 | **0.9207** |
-| openai/gpt-4.1 | $0.0189 | 100% | 2571ms | 7.1/10 | 1.1 | **0.9148** |
-| openai/gpt-4o | $0.0252 | 100% | 1886ms | 7.1/10 | 0.6 | **0.9037** |
-| deepseek/deepseek-chat-v3-0324 | $0.0027 | 100% | 18976ms | 7.4/10 | 1.3 | **0.8865** |
-| openai/gpt-oss-20b | $0.0013 | 100% | 20130ms | 7.1/10 | 1.2 | **0.8776** |
-| x-ai/grok-3-mini | $0.0041 | 100% | 18508ms | 6.9/10 | 1.4 | **0.8697** |
-| microsoft/phi-4 | $0.0009 | 67% | 6152ms | 7.0/10 | 10.0 | **0.7578** |
-| openai/gpt-oss-120b | $0.0029 | 100% | 16742ms | 0.0/10 | 0.6 | **0.7103** |
-| anthropic/claude-opus-4.6 | $0.0779 | 100% | 7131ms | 7.3/10 | 0.6 | **0.7075** |
-| anthropic/claude-opus-4.7 | $0.1070 | 100% | 4497ms | 7.5/10 | 0.7 | **0.6306** |
-| openai/gpt-4.1-nano | $0.0010 | 17% | 1619ms | 0.0/10 | 1.0 | **0.5434** |
+| google/gemma-3-27b-it | $0.0012 | 100% | 3336ms | 7.9/10 | 1.1 | **0.9819** |
+| google/gemma-3-12b-it | $0.0005 | 100% | 4296ms | 7.9/10 | 1.4 | **0.9781** |
+| openai/gpt-4o-mini | $0.0015 | 100% | 2156ms | 7.6/10 | 0.9 | **0.9746** |
+| google/gemma-4-31b-it | $0.0016 | 100% | 8232ms | 8.0/10 | 1.0 | **0.9693** |
+| x-ai/grok-4-fast | $0.0042 | 100% | 3791ms | 7.9/10 | 1.5 | **0.9660** |
+| meta-llama/llama-4-maverick | $0.0033 | 100% | 3196ms | 7.7/10 | 1.2 | **0.9657** |
+| openai/gpt-4.1-mini | $0.0045 | 100% | 3130ms | 7.8/10 | 1.5 | **0.9636** |
+| openai/gpt-5.4-nano | $0.0026 | 100% | 1951ms | 7.4/10 | 1.1 | **0.9606** |
+| mistralai/mistral-medium-3.1 | $0.0064 | 100% | 4736ms | 8.0/10 | 1.5 | **0.9600** |
+| mistralai/mistral-small-3.1-24b-instruct | $0.0037 | 100% | 5644ms | 7.6/10 | 1.3 | **0.9506** |
+| meta-llama/llama-4-scout | $0.0022 | 100% | 7142ms | 7.6/10 | 1.3 | **0.9505** |
+| deepseek/deepseek-chat-v3-0324 | $0.0029 | 100% | 13163ms | 7.9/10 | 1.0 | **0.9432** |
+| anthropic/claude-haiku-4.5 | $0.0156 | 100% | 2846ms | 7.7/10 | 1.0 | **0.9271** |
+| openai/gpt-4.1 | $0.0192 | 100% | 2684ms | 7.8/10 | 1.1 | **0.9185** |
+| openai/gpt-5.4 | $0.0181 | 100% | 3066ms | 7.2/10 | 1.1 | **0.8947** |
+| openai/gpt-oss-120b | $0.0029 | 100% | 29202ms | 7.7/10 | 0.9 | **0.8794** |
+| google/gemma-3-4b-it | $0.0004 | 100% | 2302ms | 3.5/10 | 1.2 | **0.7919** |
+| microsoft/phi-4 | $0.0009 | 67% | 7588ms | 7.8/10 | 10.0 | **0.7686** |
+| deepseek/deepseek-v3.2 | $0.0027 | 100% | 12455ms | 3.8/10 | 1.3 | **0.7612** |
+| openai/gpt-4o | $0.0245 | 100% | 2358ms | 3.8/10 | 1.0 | **0.7252** |
+| anthropic/claude-opus-4.6 | $0.0778 | 100% | 5298ms | 7.5/10 | 0.7 | **0.7014** |
+| x-ai/grok-3-mini | $0.0057 | 100% | 30198ms | 3.7/10 | 1.5 | **0.6821** |
+| openai/gpt-oss-20b | $0.0013 | 67% | 10371ms | 3.8/10 | 0.7 | **0.6795** |
+| anthropic/claude-opus-4.7 | $0.1031 | 100% | 4006ms | 7.7/10 | 0.7 | **0.6283** |
+| openai/gpt-4.1-nano | $0.0011 | 17% | 1614ms | 3.8/10 | 10.0 | **0.4611** |
 
 ### Recommendation
 
-**Best quality/price self-hostable: `google/gemma-3-12b-it`** — composite 0.9627 at $0.0009 per run, 100% valid JSON, ~4s latency, 7.1/10 summary quality. Nearly on par with the top proprietary models while remaining runnable on consumer hardware.
+**Best quality/price self-hostable: `google/gemma-3-12b-it`** — composite 0.9781 at $0.0005 per run, 100% valid JSON, ~4.3s latency, 7.9/10 summary quality. Second overall in the ranking, trailing only its bigger sibling `gemma-3-27b-it` while costing less than half and running on consumer hardware.
 
 ### Key takeaways
 
-- **Best overall**: `openai/gpt-5.4-nano` — top composite (0.9734), 100% JSON, lowest latency (~1.5s)
-- **Best self-hostable**: `google/gemma-3-12b-it` — best hostable trade-off: lowest cost in the usable hostable family, reasonable latency, quality comparable to the 27B / 31B variants without needing high-end GPUs
-- **Gemma family**: 4B / 12B / 27B / 4-31B all sit at the top of the ranking; the 12B remains the sweet spot between cost, latency and quality
-- **Cost vs quality**: models above $0.01 per run (Claude Opus 4.6/4.7, GPT-4o, GPT-4.1, GPT-5.4) don't deliver a gain that justifies the price — `claude-opus-4.7` is even the most expensive of the panel ($0.107) without beating the light models
-- **Anomalies to watch**: `openai/gpt-oss-120b` (judge rated summary 0.0/10 despite 100% JSON), `openai/gpt-4.1-nano` (only 17% JSON), `microsoft/phi-4` (67% JSON, scoring MAE at 10.0)
+- **Best overall**: `google/gemma-3-27b-it` — top composite (0.9819), 100% JSON, 7.9/10 summary, fast at ~3.3s and only $0.0012 per run
+- **Best self-hostable**: `google/gemma-3-12b-it` — same summary quality as the 27B (7.9/10), half the cost ($0.0005 vs $0.0012), runs on a single consumer GPU
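+- **Gemma family dominates**: `gemma-3-27b-it`, `gemma-3-12b-it` and `gemma-4-31b-it` take 3 of the top 4 spots, with `gpt-4o-mini` the only proprietary model among them; the small `gemma-3-4b-it` is the exception (composite 0.7919, pulled down by a 3.5/10 summary score)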
+- **Cost vs quality**: models above $0.01 per run (Claude Opus 4.6/4.7, GPT-4o, GPT-4.1, GPT-5.4) deliver no benefit over Gemma 3-12B; `claude-opus-4.7` is the most expensive of the panel ($0.103) and still ranks second to last on composite, ahead of only `gpt-4.1-nano`
+- **Anomalies to watch**: judge gave a 3.5–3.8/10 summary score to `gemma-3-4b-it`, `deepseek-v3.2`, `gpt-4o` and `grok-3-mini` despite 100% valid JSON, and to `gpt-oss-20b` and `gpt-4.1-nano`, which also failed JSON validation — likely judge variance worth re-running. Hard failures: `gpt-4.1-nano` (17% JSON, scoring MAE at 10.0), `microsoft/phi-4` (67% JSON, scoring MAE at 10.0) and `gpt-oss-20b` (67% JSON)
 
 ### Composite score formula