diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 2b48c1a..12d722c 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -7,7 +7,7 @@ on: description: "JSON array of models to benchmark (OpenRouter model IDs)" required: false # yamllint disable-line rule:line-length - default: '["openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]' + default: '["openai/gpt-5.4","openai/gpt-5.4-nano","openai/gpt-oss-120b","openai/gpt-oss-20b","openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.7","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]' judge_model: description: "Model used as LLM judge" required: false @@ -29,7 +29,7 @@ jobs: max-parallel: 5 # yamllint disable-line rule:line-length matrix: - model: ${{ fromJson(inputs.models || 
'["openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]') }} + model: ${{ fromJson(inputs.models || '["openai/gpt-5.4","openai/gpt-5.4-nano","openai/gpt-oss-120b","openai/gpt-oss-20b","openai/gpt-4o","openai/gpt-4o-mini","openai/gpt-4.1","openai/gpt-4.1-mini","openai/gpt-4.1-nano","anthropic/claude-opus-4.7","anthropic/claude-opus-4.6","anthropic/claude-haiku-4.5","google/gemma-3-4b-it","google/gemma-3-12b-it","google/gemma-3-27b-it","google/gemma-4-31b-it","mistralai/mistral-small-3.1-24b-instruct","mistralai/mistral-medium-3.1","meta-llama/llama-4-scout","meta-llama/llama-4-maverick","deepseek/deepseek-chat-v3-0324","deepseek/deepseek-v3.2","microsoft/phi-4","x-ai/grok-3-mini","x-ai/grok-4-fast"]') }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 diff --git a/README.md b/README.md index 2cc37c1..8212da6 100644 --- a/README.md +++ b/README.md @@ -131,15 +131,16 @@ All configuration is done via environment variables (`.env` file). 
## Recommended model -Based on our [benchmark of 20 models](benchmarks/README.md), we recommend **`google/gemma-3-12b-it`** as the default LLM: +Based on our [benchmark of 25 models](benchmarks/README.md), we recommend **`google/gemma-3-12b-it`** as the default LLM for self-hosted deployments: | Model | Cost/run | JSON OK | Summary quality | Composite | |-------|----------|---------|-----------------|-----------| -| openai/gpt-4o-mini | $0.0014 | 100% | 7.8/10 | 0.9813 | -| **google/gemma-3-12b-it** | **$0.0005** | **100%** | **8.1/10** | **0.9793** | -| google/gemma-3-27b-it | $0.0012 | 100% | 8.0/10 | 0.9679 | +| google/gemma-3-27b-it | $0.0012 | 100% | 7.9/10 | 0.9819 | +| **google/gemma-3-12b-it** | **$0.0005** | **100%** | **7.9/10** | **0.9781** | +| openai/gpt-4o-mini | $0.0015 | 100% | 7.6/10 | 0.9746 | +| google/gemma-4-31b-it | $0.0016 | 100% | 8.0/10 | 0.9693 | -While `gpt-4o-mini` scores marginally higher on the composite metric, `gemma-3-12b-it` is the best self-hostable alternative: highest summary quality (8.1/10), lowest cost, 100% JSON reliability, and it runs on consumer hardware (12B parameters). It is the default value for `LLM_MODEL`. +While `gemma-3-27b-it` tops the composite ranking, `gemma-3-12b-it` is the best self-hostable option: same summary quality (7.9/10), less than half the cost ($0.0005 vs $0.0012/run), 100% JSON reliability, and it runs on a single consumer GPU (12B parameters). It is the default value for `LLM_MODEL`. See [benchmarks/README.md](benchmarks/README.md) for the full ranking and methodology. diff --git a/benchmarks/README.md b/benchmarks/README.md index 63fe992..151a9bd 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -2,44 +2,53 @@ Automated benchmark system to compare LLM models for the reddit-digest-agent pipeline. Tests each model's ability to score and summarize Reddit posts using the project's real prompts. 
-## Latest Results (2026-04-12) +## Latest Results (2026-04-17) -> Run: [GitHub Actions #24308576896](https://github.com/using-system/reddit-digest-agent/actions/runs/24308576896) -> 20 models tested across 7 providers via OpenRouter +> Run: [GitHub Actions #24556985945](https://github.com/using-system/reddit-digest-agent/actions/runs/24556985945) +> 25 models tested across 8 providers via OpenRouter > Judge: openai/gpt-4o ### Ranking | Model | Cost | JSON OK | Latency (avg) | Summaries | Scoring MAE | **Composite** | |-------|------|---------|---------------|-----------|-------------|---------------| -| openai/gpt-4o-mini | $0.0014 | 100% | 1818ms | 7.8/10 | 1.1 | **0.9813** | -| google/gemma-3-12b-it | $0.0005 | 100% | 4206ms | 8.1/10 | 1.4 | **0.9793** | -| google/gemma-3-27b-it | $0.0012 | 100% | 5178ms | 8.0/10 | 1.3 | **0.9679** | -| openai/gpt-4.1-mini | $0.0044 | 100% | 1852ms | 7.7/10 | 1.6 | **0.9638** | -| mistralai/mistral-medium-3.1 | $0.0064 | 100% | 4052ms | 7.6/10 | 1.5 | **0.9391** | -| mistralai/mistral-small-3.1-24b-instruct | $0.0037 | 100% | 5478ms | 7.4/10 | 1.3 | **0.938** | -| anthropic/claude-haiku-4.5 | $0.0153 | 100% | 2788ms | 7.6/10 | 0.6 | **0.9078** | -| openai/gpt-4.1 | $0.0183 | 100% | 2128ms | 7.9/10 | 1.1 | **0.9054** | -| google/gemma-4-31b-it | $0.0016 | 100% | 18314ms | 7.9/10 | 0.9 | **0.8882** | -| x-ai/grok-3-mini | $0.0048 | 100% | 15890ms | 7.5/10 | 1.3 | **0.8759** | -| deepseek/deepseek-chat-v3-0324 | $0.0030 | 83% | 10821ms | 7.3/10 | 0.8 | **0.8588** | -| deepseek/deepseek-v3.2 | $0.0046 | 100% | 6031ms | 7.3/10 | 45.6 | **0.8297** | -| google/gemma-3-4b-it | $0.0005 | 100% | 4327ms | 1.0/10 | 1.2 | **0.7794** | -| x-ai/grok-4-fast | $0.0043 | 100% | 4731ms | 1.0/10 | 0.7 | **0.7607** | -| meta-llama/llama-4-maverick | $0.0041 | 50% | 8480ms | 7.5/10 | 10.0 | **0.7526** | -| microsoft/phi-4 | $0.0009 | 33% | 4598ms | 7.9/10 | 10.0 | **0.7505** | -| openai/gpt-4o | $0.0253 | 100% | 1424ms | 1.0/10 | 0.7 | **0.6844** | 
-| anthropic/claude-opus-4.6 | $0.0775 | 100% | 5674ms | 7.7/10 | 0.7 | **0.6115** | -| openai/gpt-4.1-nano | $0.0009 | 33% | 986ms | 1.0/10 | 10.0 | **0.577** | -| meta-llama/llama-4-scout | $0.0019 | 17% | 5013ms | 1.0/10 | 10.0 | **0.4991** | +| google/gemma-3-27b-it | $0.0012 | 100% | 3336ms | 7.9/10 | 1.1 | **0.9819** | +| google/gemma-3-12b-it | $0.0005 | 100% | 4296ms | 7.9/10 | 1.4 | **0.9781** | +| openai/gpt-4o-mini | $0.0015 | 100% | 2156ms | 7.6/10 | 0.9 | **0.9746** | +| google/gemma-4-31b-it | $0.0016 | 100% | 8232ms | 8.0/10 | 1.0 | **0.9693** | +| x-ai/grok-4-fast | $0.0042 | 100% | 3791ms | 7.9/10 | 1.5 | **0.966** | +| meta-llama/llama-4-maverick | $0.0033 | 100% | 3196ms | 7.7/10 | 1.2 | **0.9657** | +| openai/gpt-4.1-mini | $0.0045 | 100% | 3130ms | 7.8/10 | 1.5 | **0.9636** | +| openai/gpt-5.4-nano | $0.0026 | 100% | 1951ms | 7.4/10 | 1.1 | **0.9606** | +| mistralai/mistral-medium-3.1 | $0.0064 | 100% | 4736ms | 8.0/10 | 1.5 | **0.96** | +| mistralai/mistral-small-3.1-24b-instruct | $0.0037 | 100% | 5644ms | 7.6/10 | 1.3 | **0.9506** | +| meta-llama/llama-4-scout | $0.0022 | 100% | 7142ms | 7.6/10 | 1.3 | **0.9505** | +| deepseek/deepseek-chat-v3-0324 | $0.0029 | 100% | 13163ms | 7.9/10 | 1.0 | **0.9432** | +| anthropic/claude-haiku-4.5 | $0.0156 | 100% | 2846ms | 7.7/10 | 1.0 | **0.9271** | +| openai/gpt-4.1 | $0.0192 | 100% | 2684ms | 7.8/10 | 1.1 | **0.9185** | +| openai/gpt-5.4 | $0.0181 | 100% | 3066ms | 7.2/10 | 1.1 | **0.8947** | +| openai/gpt-oss-120b | $0.0029 | 100% | 29202ms | 7.7/10 | 0.9 | **0.8794** | +| google/gemma-3-4b-it | $0.0004 | 100% | 2302ms | 3.5/10 | 1.2 | **0.7919** | +| microsoft/phi-4 | $0.0009 | 67% | 7588ms | 7.8/10 | 10.0 | **0.7686** | +| deepseek/deepseek-v3.2 | $0.0027 | 100% | 12455ms | 3.8/10 | 1.3 | **0.7612** | +| openai/gpt-4o | $0.0245 | 100% | 2358ms | 3.8/10 | 1.0 | **0.7252** | +| anthropic/claude-opus-4.6 | $0.0778 | 100% | 5298ms | 7.5/10 | 0.7 | **0.7014** | +| x-ai/grok-3-mini | $0.0057 | 100% | 
30198ms | 3.7/10 | 1.5 | **0.6821** | +| openai/gpt-oss-20b | $0.0013 | 67% | 10371ms | 3.8/10 | 0.7 | **0.6795** | +| anthropic/claude-opus-4.7 | $0.1031 | 100% | 4006ms | 7.7/10 | 0.7 | **0.6283** | +| openai/gpt-4.1-nano | $0.0011 | 17% | 1614ms | 3.8/10 | 10.0 | **0.4611** | + +### Recommendation + +**Best quality/price self-hostable: `google/gemma-3-12b-it`** — composite 0.9781 at $0.0005 per run, 100% valid JSON, ~4.3s latency, 7.9/10 summary quality. Second overall in the ranking, trailing only its bigger sibling `gemma-3-27b-it` while costing less than half and running on consumer hardware. ### Key takeaways -- **Best overall**: `openai/gpt-4o-mini` — cheapest top-tier option with 100% JSON reliability and good summary quality -- **Best self-hostable**: `google/gemma-3-12b-it` — nearly identical composite score (0.9793 vs 0.9813), highest summary quality (8.1/10), lowest cost ($0.0005), and can run on consumer hardware -- **Gemma 3 family**: The 12B variant hits the sweet spot. The 27B is marginally worse and slower, the 4B drops dramatically in summary quality -- **Cost vs quality**: Models costing >$0.01 per run (Claude Opus, GPT-4o, GPT-4.1) don't justify the price — cheaper models match or beat them on summary quality -- **JSON reliability**: Most models achieve 100%. 
Notable failures: Llama 4 Scout (17%), Phi-4 (33%), GPT-4.1-nano (33%) +- **Best overall**: `google/gemma-3-27b-it` — top composite (0.9819), 100% JSON, 7.9/10 summary, fast at ~3.3s and only $0.0012 per run +- **Best self-hostable**: `google/gemma-3-12b-it` — same summary quality as the 27B (7.9/10), less than half the cost ($0.0005 vs $0.0012), runs on a single consumer GPU +- **Gemma family dominates**: `gemma-3-27b-it`, `gemma-3-12b-it` and `gemma-4-31b-it` fill 3 of the top 4 spots, beating every proprietary model except `gpt-4o-mini`; only the small `gemma-3-4b-it` falls behind (17th) +- **Cost vs quality**: models above $0.01 per run (Claude Opus 4.6/4.7, GPT-4o, GPT-4.1, GPT-5.4) deliver no benefit over Gemma 3-12B; `claude-opus-4.7` is the most expensive of the panel ($0.103) and still ranks second to last on composite (0.6283), ahead of only `gpt-4.1-nano` +- **Anomalies to watch**: judge gave a 3.5–3.8/10 summary score to `gemma-3-4b-it`, `deepseek-v3.2`, `gpt-4o`, `grok-3-mini`, `gpt-oss-20b` and `gpt-4.1-nano` — the first four despite 100% valid JSON — likely judge variance worth re-running. Hard failures: `gpt-4.1-nano` (17% JSON), `microsoft/phi-4` and `gpt-oss-20b` (67% JSON) ### Composite score formula @@ -51,7 +60,7 @@ Each metric is min-max normalized across all models. For cost, latency, and MAE, ### From GitHub Actions (recommended) -Go to **Actions > LLM Benchmark > Run workflow**. The default runs all 20 models. You can pass a custom JSON array of model IDs. +Go to **Actions > LLM Benchmark > Run workflow**. The default runs all 25 models. You can pass a custom JSON array of model IDs. 
### Locally diff --git a/benchmarks/aggregate.py b/benchmarks/aggregate.py index 2c524d8..b8aa0e0 100644 --- a/benchmarks/aggregate.py +++ b/benchmarks/aggregate.py @@ -16,22 +16,12 @@ from langchain_openai import ChatOpenAI +from reddit_digest.nodes.llm_utils import extract_json + logger = logging.getLogger(__name__) DEFAULT_BASE_URL = "https://openrouter.ai/api/v1" - -def _strip_code_fences(text: str) -> str: - """Strip markdown code fences and whitespace from LLM output.""" - text = text.strip() - if text.startswith("```"): - first_newline = text.index("\n") if "\n" in text else len(text) - text = text[first_newline + 1 :] - if text.endswith("```"): - text = text[:-3] - return text.strip() - - DEFAULT_JUDGE_MODEL = "openai/gpt-4o" JUDGE_PROMPT = """You are evaluating the quality of AI-generated summaries of Reddit posts. @@ -151,7 +141,7 @@ async def run_judge( try: response = await llm.ainvoke(prompt) - data = json.loads(_strip_code_fences(response.content)) + data = extract_json(response.content) evals = data.get("evaluations", {}) all_evaluations.update(evals) except Exception: diff --git a/benchmarks/bench_model.py b/benchmarks/bench_model.py index 7a319b4..23eabba 100644 --- a/benchmarks/bench_model.py +++ b/benchmarks/bench_model.py @@ -20,6 +20,7 @@ from langchain_openai import ChatOpenAI from reddit_digest.models import RedditPost +from reddit_digest.nodes.llm_utils import extract_json from reddit_digest.nodes.scorer import SCORE_PROMPT from reddit_digest.nodes.scorer import _build_post_block as scorer_build_block from reddit_digest.nodes.summarizer import PROMPT_TEMPLATE @@ -35,18 +36,6 @@ def load_fixture(path: str) -> dict: return json.loads(Path(path).read_text()) -def _strip_code_fences(text: str) -> str: - """Strip markdown code fences and whitespace from LLM output.""" - text = text.strip() - if text.startswith("```"): - # Remove opening fence (```json or ```) - first_newline = text.index("\n") if "\n" in text else len(text) - text = 
text[first_newline + 1 :] - if text.endswith("```"): - text = text[:-3] - return text.strip() - - def _extract_cost(response) -> float: """Extract cost from OpenRouter response metadata (usage.cost).""" metadata = getattr(response, "response_metadata", {}) @@ -110,11 +99,11 @@ async def run_benchmark( total_tokens_completion += usage.get("completion_tokens", 0) total_cost += _extract_cost(response) - data = json.loads(_strip_code_fences(response.content)) + data = extract_json(response.content) scores = data.get("scores", {}) all_scores.update(scores) json_valid_count += 1 - except json.JSONDecodeError as e: + except (ValueError, json.JSONDecodeError) as e: elapsed_ms = (time.monotonic() - start) * 1000 latencies.append(elapsed_ms) errors.append(f"scorer/{subreddit}: JSON parse error: {e}") @@ -141,11 +130,11 @@ async def run_benchmark( total_tokens_completion += usage.get("completion_tokens", 0) total_cost += _extract_cost(response) - data = json.loads(_strip_code_fences(response.content)) + data = extract_json(response.content) summaries = data.get("summaries", {}) all_summaries.update(summaries) json_valid_count += 1 - except json.JSONDecodeError as e: + except (ValueError, json.JSONDecodeError) as e: elapsed_ms = (time.monotonic() - start) * 1000 latencies.append(elapsed_ms) errors.append(f"summarizer/{subreddit}: JSON parse error: {e}") diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 07accda..8e1c8d1 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -1,25 +1,9 @@ from unittest.mock import MagicMock -from benchmarks.bench_model import _extract_cost, _strip_code_fences +from benchmarks.bench_model import _extract_cost from benchmarks.aggregate import normalize_min_max, compute_composite, generate_report -def test_strip_code_fences_json(): - assert _strip_code_fences('```json\n{"a": 1}\n```') == '{"a": 1}' - - -def test_strip_code_fences_plain(): - assert _strip_code_fences('```\n{"a": 1}\n```') == '{"a": 1}' - - -def 
test_strip_code_fences_no_fences(): - assert _strip_code_fences('{"a": 1}') == '{"a": 1}' - - -def test_strip_code_fences_whitespace(): - assert _strip_code_fences('\n {"a": 1} \n') == '{"a": 1}' - - def test_extract_cost_from_response(): response = MagicMock() response.response_metadata = {"token_usage": {"cost": 0.00042}}