diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 2b48c1a..12d722c 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -7,7 +7,7 @@ on:
         description: "JSON array of models to benchmark (OpenRouter model IDs)"
         required: false
         # yamllint disable-line rule:line-length
-        default: '["openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]'
+        default: '["openai/gpt-5.4","openai/gpt-5.4-nano","openai/gpt-oss-120b","openai/gpt-oss-20b","openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.7","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]'
       judge_model:
         description: "Model used as LLM judge"
         required: false
@@ -29,7 +29,7 @@ jobs:
       max-parallel: 5
       # yamllint disable-line rule:line-length
       matrix:
-        model: ${{ fromJson(inputs.models || '["openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]') }}
+        model: ${{ fromJson(inputs.models || '["openai/gpt-5.4","openai/gpt-5.4-nano","openai/gpt-oss-120b","openai/gpt-oss-20b","openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.7","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]') }}
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
 
diff --git a/README.md b/README.md
index 2cc37c1..8212da6 100644
--- a/README.md
+++ b/README.md
@@ -131,15 +131,16 @@ All configuration is done via environment variables (`.env` file).
 
 ## Recommended model
 
-Based on our [benchmark of 20 models](benchmarks/README.md), we recommend **`google/gemma-3-12b-it`** as the default LLM:
+Based on our [benchmark of 25 models](benchmarks/README.md), we recommend **`google/gemma-3-12b-it`** as the default LLM for self-hosted deployments:
 
 | Model | Cost/run | JSON OK | Summary quality | Composite |
 |-------|----------|---------|-----------------|-----------|
-| openai/gpt-4o-mini | $0.0014 | 100% | 7.8/10 | 0.9813 |
-| **google/gemma-3-12b-it** | **$0.0005** | **100%** | **8.1/10** | **0.9793** |
-| google/gemma-3-27b-it | $0.0012 | 100% | 8.0/10 | 0.9679 |
+| google/gemma-3-27b-it | $0.0012 | 100% | 7.9/10 | 0.9819 |
+| **google/gemma-3-12b-it** | **$0.0005** | **100%** | **7.9/10** | **0.9781** |
+| openai/gpt-4o-mini | $0.0015 | 100% | 7.6/10 | 0.9746 |
+| google/gemma-4-31b-it | $0.0016 | 100% | 8.0/10 | 0.9693 |
 
-While `gpt-4o-mini` scores marginally higher on the composite metric, `gemma-3-12b-it` is the best self-hostable alternative: highest summary quality (8.1/10), lowest cost, 100% JSON reliability, and it runs on consumer hardware (12B parameters). It is the default value for `LLM_MODEL`.
+While `gemma-3-27b-it` tops the composite ranking, `gemma-3-12b-it` is the best self-hostable option: same summary quality (7.9/10), less than half the cost ($0.0005 vs $0.0012/run), 100% JSON reliability, and it runs on a single consumer GPU (12B parameters). It is the default value for `LLM_MODEL`.
 
 See [benchmarks/README.md](benchmarks/README.md) for the full ranking and methodology.
 
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 63fe992..151a9bd 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -2,44 +2,53 @@
 
 Automated benchmark system to compare LLM models for the reddit-digest-agent pipeline. Tests each model's ability to score and summarize Reddit posts using the project's real prompts.
 
-## Latest Results (2026-04-12)
+## Latest Results (2026-04-17)
 
-> Run: [GitHub Actions #24308576896](https://github.com/using-system/reddit-digest-agent/actions/runs/24308576896)
-> 20 models tested across 7 providers via OpenRouter
+> Run: [GitHub Actions #24556985945](https://github.com/using-system/reddit-digest-agent/actions/runs/24556985945)
+> 25 models tested across 8 providers via OpenRouter
 > Judge: openai/gpt-4o
 
 ### Ranking
 
 | Model | Cost | JSON OK | Latency (avg) | Summaries | Scoring MAE | **Composite** |
 |-------|------|---------|---------------|-----------|-------------|---------------|
-| openai/gpt-4o-mini | $0.0014 | 100% | 1818ms | 7.8/10 | 1.1 | **0.9813** |
-| google/gemma-3-12b-it | $0.0005 | 100% | 4206ms | 8.1/10 | 1.4 | **0.9793** |
-| google/gemma-3-27b-it | $0.0012 | 100% | 5178ms | 8.0/10 | 1.3 | **0.9679** |
-| openai/gpt-4.1-mini | $0.0044 | 100% | 1852ms | 7.7/10 | 1.6 | **0.9638** |
-| mistralai/mistral-medium-3.1 | $0.0064 | 100% | 4052ms | 7.6/10 | 1.5 | **0.9391** |
-| mistralai/mistral-small-3.1-24b-instruct | $0.0037 | 100% | 5478ms | 7.4/10 | 1.3 | **0.938** |
-| anthropic/claude-haiku-4.5 | $0.0153 | 100% | 2788ms | 7.6/10 | 0.6 | **0.9078** |
-| openai/gpt-4.1 | $0.0183 | 100% | 2128ms | 7.9/10 | 1.1 | **0.9054** |
-| google/gemma-4-31b-it | $0.0016 | 100% | 18314ms | 7.9/10 | 0.9 | **0.8882** |
-| x-ai/grok-3-mini | $0.0048 | 100% | 15890ms | 7.5/10 | 1.3 | **0.8759** |
-| deepseek/deepseek-chat-v3-0324 | $0.0030 | 83% | 10821ms | 7.3/10 | 0.8 | **0.8588** |
-| deepseek/deepseek-v3.2 | $0.0046 | 100% | 6031ms | 7.3/10 | 45.6 | **0.8297** |
-| google/gemma-3-4b-it | $0.0005 | 100% | 4327ms | 1.0/10 | 1.2 | **0.7794** |
-| x-ai/grok-4-fast | $0.0043 | 100% | 4731ms | 1.0/10 | 0.7 | **0.7607** |
-| meta-llama/llama-4-maverick | $0.0041 | 50% | 8480ms | 7.5/10 | 10.0 | **0.7526** |
-| microsoft/phi-4 | $0.0009 | 33% | 4598ms | 7.9/10 | 10.0 | **0.7505** |
-| openai/gpt-4o | $0.0253 | 100% | 1424ms | 1.0/10 | 0.7 | **0.6844** |
-| anthropic/claude-opus-4.6 | $0.0775 | 100% | 5674ms | 7.7/10 | 0.7 | **0.6115** |
-| openai/gpt-4.1-nano | $0.0009 | 33% | 986ms | 1.0/10 | 10.0 | **0.577** |
-| meta-llama/llama-4-scout | $0.0019 | 17% | 5013ms | 1.0/10 | 10.0 | **0.4991** |
+| google/gemma-3-27b-it | $0.0012 | 100% | 3336ms | 7.9/10 | 1.1 | **0.9819** |
+| google/gemma-3-12b-it | $0.0005 | 100% | 4296ms | 7.9/10 | 1.4 | **0.9781** |
+| openai/gpt-4o-mini | $0.0015 | 100% | 2156ms | 7.6/10 | 0.9 | **0.9746** |
+| google/gemma-4-31b-it | $0.0016 | 100% | 8232ms | 8.0/10 | 1.0 | **0.9693** |
+| x-ai/grok-4-fast | $0.0042 | 100% | 3791ms | 7.9/10 | 1.5 | **0.966** |
+| meta-llama/llama-4-maverick | $0.0033 | 100% | 3196ms | 7.7/10 | 1.2 | **0.9657** |
+| openai/gpt-4.1-mini | $0.0045 | 100% | 3130ms | 7.8/10 | 1.5 | **0.9636** |
+| openai/gpt-5.4-nano | $0.0026 | 100% | 1951ms | 7.4/10 | 1.1 | **0.9606** |
+| mistralai/mistral-medium-3.1 | $0.0064 | 100% | 4736ms | 8.0/10 | 1.5 | **0.96** |
+| mistralai/mistral-small-3.1-24b-instruct | $0.0037 | 100% | 5644ms | 7.6/10 | 1.3 | **0.9506** |
+| meta-llama/llama-4-scout | $0.0022 | 100% | 7142ms | 7.6/10 | 1.3 | **0.9505** |
+| deepseek/deepseek-chat-v3-0324 | $0.0029 | 100% | 13163ms | 7.9/10 | 1.0 | **0.9432** |
+| anthropic/claude-haiku-4.5 | $0.0156 | 100% | 2846ms | 7.7/10 | 1.0 | **0.9271** |
+| openai/gpt-4.1 | $0.0192 | 100% | 2684ms | 7.8/10 | 1.1 | **0.9185** |
+| openai/gpt-5.4 | $0.0181 | 100% | 3066ms | 7.2/10 | 1.1 | **0.8947** |
+| openai/gpt-oss-120b | $0.0029 | 100% | 29202ms | 7.7/10 | 0.9 | **0.8794** |
+| google/gemma-3-4b-it | $0.0004 | 100% | 2302ms | 3.5/10 | 1.2 | **0.7919** |
+| microsoft/phi-4 | $0.0009 | 67% | 7588ms | 7.8/10 | 10.0 | **0.7686** |
+| deepseek/deepseek-v3.2 | $0.0027 | 100% | 12455ms | 3.8/10 | 1.3 | **0.7612** |
+| openai/gpt-4o | $0.0245 | 100% | 2358ms | 3.8/10 | 1.0 | **0.7252** |
+| anthropic/claude-opus-4.6 | $0.0778 | 100% | 5298ms | 7.5/10 | 0.7 | **0.7014** |
+| x-ai/grok-3-mini | $0.0057 | 100% | 30198ms | 3.7/10 | 1.5 | **0.6821** |
+| openai/gpt-oss-20b | $0.0013 | 67% | 10371ms | 3.8/10 | 0.7 | **0.6795** |
+| anthropic/claude-opus-4.7 | $0.1031 | 100% | 4006ms | 7.7/10 | 0.7 | **0.6283** |
+| openai/gpt-4.1-nano | $0.0011 | 17% | 1614ms | 3.8/10 | 10.0 | **0.4611** |
+
+### Recommendation
+
+**Best quality/price self-hostable: `google/gemma-3-12b-it`** — composite 0.9781 at $0.0005 per run, 100% valid JSON, ~4.3s latency, 7.9/10 summary quality. Second overall in the ranking, trailing only its bigger sibling `gemma-3-27b-it` while costing less than half and running on consumer hardware.
 
 ### Key takeaways
 
-- **Best overall**: `openai/gpt-4o-mini` — cheapest top-tier option with 100% JSON reliability and good summary quality
-- **Best self-hostable**: `google/gemma-3-12b-it` — nearly identical composite score (0.9793 vs 0.9813), highest summary quality (8.1/10), lowest cost ($0.0005), and can run on consumer hardware
-- **Gemma 3 family**: The 12B variant hits the sweet spot. The 27B is marginally worse and slower, the 4B drops dramatically in summary quality
-- **Cost vs quality**: Models costing >$0.01 per run (Claude Opus, GPT-4o, GPT-4.1) don't justify the price — cheaper models match or beat them on summary quality
-- **JSON reliability**: Most models achieve 100%. Notable failures: Llama 4 Scout (17%), Phi-4 (33%), GPT-4.1-nano (33%)
+- **Best overall**: `google/gemma-3-27b-it` — top composite (0.9819), 100% JSON, 7.9/10 summary, fast at ~3.3s and only $0.0012 per run
+- **Best self-hostable**: `google/gemma-3-12b-it` — same summary quality as the 27B (7.9/10), less than half the cost ($0.0005 vs $0.0012), runs on a single consumer GPU
+- **Gemma family dominates**: two Gemma 3 variants (27B, 12B) and the Gemma 4-31B take 3 of the top 4 spots, with `gpt-4o-mini` (3rd) the only proprietary model among them; the Gemma 3-4B falls to 17th on weak summary quality (3.5/10)
+- **Cost vs quality**: models above $0.01 per run (Claude Opus 4.6/4.7, Claude Haiku 4.5, GPT-4o, GPT-4.1, GPT-5.4) deliver no benefit over Gemma 3-12B; `claude-opus-4.7` is the most expensive of the panel ($0.103) and still ranks second to last on composite, ahead of only `gpt-4.1-nano`
+- **Anomalies to watch**: judge gave a 3.5–3.8/10 summary score to `gemma-3-4b-it`, `deepseek-v3.2`, `gpt-4o` and `grok-3-mini` despite 100% valid JSON, and to `gpt-oss-20b` and `gpt-4.1-nano`, which also had JSON failures — likely judge variance worth re-running. Hard failures: `gpt-4.1-nano` (17% JSON), `microsoft/phi-4` and `gpt-oss-20b` (67% JSON)
 
 ### Composite score formula
 
@@ -51,7 +60,7 @@ Each metric is min-max normalized across all models. For cost, latency, and MAE,
 
 ### From GitHub Actions (recommended)
 
-Go to **Actions > LLM Benchmark > Run workflow**. The default runs all 20 models. You can pass a custom JSON array of model IDs.
+Go to **Actions > LLM Benchmark > Run workflow**. The default runs all 25 models. You can pass a custom JSON array of model IDs.
 
 ### Locally
 
diff --git a/benchmarks/aggregate.py b/benchmarks/aggregate.py
index 2c524d8..b8aa0e0 100644
--- a/benchmarks/aggregate.py
+++ b/benchmarks/aggregate.py
@@ -16,22 +16,12 @@
 
 from langchain_openai import ChatOpenAI
 
+from reddit_digest.nodes.llm_utils import extract_json
+
 logger = logging.getLogger(__name__)
 
 DEFAULT_BASE_URL = "https://openrouter.ai/api/v1"
 
-
-def _strip_code_fences(text: str) -> str:
-    """Strip markdown code fences and whitespace from LLM output."""
-    text = text.strip()
-    if text.startswith("```"):
-        first_newline = text.index("\n") if "\n" in text else len(text)
-        text = text[first_newline + 1 :]
-    if text.endswith("```"):
-        text = text[:-3]
-    return text.strip()
-
-
 DEFAULT_JUDGE_MODEL = "openai/gpt-4o"
 
 JUDGE_PROMPT = """You are evaluating the quality of AI-generated summaries of Reddit posts.
@@ -151,7 +141,7 @@ async def run_judge(
 
         try:
             response = await llm.ainvoke(prompt)
-            data = json.loads(_strip_code_fences(response.content))
+            data = extract_json(response.content)
             evals = data.get("evaluations", {})
             all_evaluations.update(evals)
         except Exception:
diff --git a/benchmarks/bench_model.py b/benchmarks/bench_model.py
index 7a319b4..23eabba 100644
--- a/benchmarks/bench_model.py
+++ b/benchmarks/bench_model.py
@@ -20,6 +20,7 @@
 from langchain_openai import ChatOpenAI
 
 from reddit_digest.models import RedditPost
+from reddit_digest.nodes.llm_utils import extract_json
 from reddit_digest.nodes.scorer import SCORE_PROMPT
 from reddit_digest.nodes.scorer import _build_post_block as scorer_build_block
 from reddit_digest.nodes.summarizer import PROMPT_TEMPLATE
@@ -35,18 +36,6 @@ def load_fixture(path: str) -> dict:
     return json.loads(Path(path).read_text())
 
 
-def _strip_code_fences(text: str) -> str:
-    """Strip markdown code fences and whitespace from LLM output."""
-    text = text.strip()
-    if text.startswith("```"):
-        # Remove opening fence (```json or ```)
-        first_newline = text.index("\n") if "\n" in text else len(text)
-        text = text[first_newline + 1 :]
-    if text.endswith("```"):
-        text = text[:-3]
-    return text.strip()
-
-
 def _extract_cost(response) -> float:
     """Extract cost from OpenRouter response metadata (usage.cost)."""
     metadata = getattr(response, "response_metadata", {})
@@ -110,11 +99,11 @@ async def run_benchmark(
             total_tokens_completion += usage.get("completion_tokens", 0)
             total_cost += _extract_cost(response)
 
-            data = json.loads(_strip_code_fences(response.content))
+            data = extract_json(response.content)
             scores = data.get("scores", {})
             all_scores.update(scores)
             json_valid_count += 1
-        except json.JSONDecodeError as e:
+        except (ValueError, json.JSONDecodeError) as e:
             elapsed_ms = (time.monotonic() - start) * 1000
             latencies.append(elapsed_ms)
             errors.append(f"scorer/{subreddit}: JSON parse error: {e}")
@@ -141,11 +130,11 @@ async def run_benchmark(
             total_tokens_completion += usage.get("completion_tokens", 0)
             total_cost += _extract_cost(response)
 
-            data = json.loads(_strip_code_fences(response.content))
+            data = extract_json(response.content)
             summaries = data.get("summaries", {})
             all_summaries.update(summaries)
             json_valid_count += 1
-        except json.JSONDecodeError as e:
+        except (ValueError, json.JSONDecodeError) as e:
             elapsed_ms = (time.monotonic() - start) * 1000
             latencies.append(elapsed_ms)
             errors.append(f"summarizer/{subreddit}: JSON parse error: {e}")
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 07accda..8e1c8d1 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -1,25 +1,9 @@
 from unittest.mock import MagicMock
 
-from benchmarks.bench_model import _extract_cost, _strip_code_fences
+from benchmarks.bench_model import _extract_cost
 from benchmarks.aggregate import normalize_min_max, compute_composite, generate_report
 
 
-def test_strip_code_fences_json():
-    assert _strip_code_fences('```json\n{"a": 1}\n```') == '{"a": 1}'
-
-
-def test_strip_code_fences_plain():
-    assert _strip_code_fences('```\n{"a": 1}\n```') == '{"a": 1}'
-
-
-def test_strip_code_fences_no_fences():
-    assert _strip_code_fences('{"a": 1}') == '{"a": 1}'
-
-
-def test_strip_code_fences_whitespace():
-    assert _strip_code_fences('\n  {"a": 1}  \n') == '{"a": 1}'
-
-
 def test_extract_cost_from_response():
     response = MagicMock()
     response.response_metadata = {"token_usage": {"cost": 0.00042}}