From 0982cde6c19ade55e2d122151cd8a33c10c9e269 Mon Sep 17 00:00:00 2001
From: orin <zaqwerss@outlook.com>
Date: Fri, 22 May 2026 11:49:34 +0800
Subject: [PATCH] add benchmark thresholds and positioning docs

---
 .gitignore                                |  8 ++
 README.md                                 | 36 ++++++---
 benchmarks/README.md                      | 27 +++++++
 benchmarks/manifest.schema.json           | 96 +++++++++++++++++++++++
 benchmarks/sample_manifest.json           | 41 ++++++++++
 benchmarks/thresholds/golden_minimum.json | 12 +++
 docs/financial_fact_platform_roadmap.md   | 22 +++---
 scripts/eval.py                           | 57 ++++++++++++++
 tests/test_benchmark_manifest.py          | 14 ++++
 tests/test_eval_script.py                 | 69 +++++++++++++++-
 10 files changed, 357 insertions(+), 25 deletions(-)
 create mode 100644 benchmarks/README.md
 create mode 100644 benchmarks/manifest.schema.json
 create mode 100644 benchmarks/sample_manifest.json
 create mode 100644 benchmarks/thresholds/golden_minimum.json
 create mode 100644 tests/test_benchmark_manifest.py

diff --git a/.gitignore b/.gitignore
index b02013e..eb00336 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,14 @@ __pycache__/
 data/
 tmpclaude-*
 
+# Benchmark data policy: commit manifests/schemas/thresholds, keep raw/private data local.
+benchmarks/raw/
+benchmarks/private/
+benchmarks/**/*.pdf
+benchmarks/**/*.xlsx
+benchmarks/**/*.xls
+benchmarks/**/labels.private.json
+
 # Example runtime artifacts
 examples/**/output/
 examples/**/fixtures/*.pdf
diff --git a/README.md b/README.md
index ab0d696..7c515a1 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,14 @@
 # Jetbot
 
-Jetbot is a financial report analysis platform that turns PDF filings into structured financial statements, key notes, risk signals, event-study outputs, and trader-style summaries. It combines PDF extraction, validation, LLM orchestration, a FastAPI backend, and a Vue dashboard in one repository.
+Jetbot is a Filing-to-Model Copilot and Financial Fact Platform for evidence-backed financial report extraction. It turns PDF filings into canonical financial facts, structured statements, key notes, risk signals, event-study outputs, and analyst-ready summaries.
 
-It is designed for teams that need a single workflow to ingest reports, inspect extracted evidence, and ship the results through an API, a CLI, or a browser UI.
+It is designed for teams that need a single workflow to ingest reports, inspect source evidence, review and correct extracted facts, and ship the results through an API, a CLI, exports, or a browser UI.
 
 ## Highlights
 
-- End-to-end PDF pipeline for raw text, tables, statements, notes, and report generation.
+- End-to-end PDF pipeline for raw text, tables, statements, notes, facts, and report generation.
+- Canonical financial fact layer with page/table/cell evidence metadata for review and downstream exports.
+- Evaluation runner with machine-readable reports and configurable quality thresholds.
 - Works in mock mode out of the box, with optional OpenAI and Anthropic model routing.
 - Vue 3 dashboard for reviewing original PDFs alongside extraction and analysis outputs.
 - Docker-first local stack with API, worker, Redis, PostgreSQL, and MinIO.
@@ -17,13 +19,13 @@ It is designed for teams that need a single workflow to ingest reports, inspect
 
 ```mermaid
 flowchart LR
-    A[Financial PDF] --> B[PDF extraction and OCR]
-    B --> C[Normalization and validation]
-    C --> D[LLM enrichment and report generation]
-    C --> E[Risk signals and event study]
-    D --> F[FastAPI and CLI]
-    E --> F
-    F --> G[Vue dashboard at /ui]
+  A[Financial Filing PDF] --> B[PDF extraction and OCR]
+  B --> C[Statements and canonical facts]
+  C --> D[Evidence and validation]
+  D --> E[Review, API, and exports]
+  D --> F[Risk signals and analyst reports]
+  E --> G[Vue dashboard at /ui]
+  F --> G
 ```
 
 ## Quick Start
@@ -81,7 +83,7 @@ After startup, the main entry points are:
 | Surface | URL / Command | Notes |
 | --- | --- | --- |
 | Web UI | `http://127.0.0.1:18000/ui/` | Review uploaded PDFs, tables, statements, signals, and generated reports |
-| API | `http://127.0.0.1:18000/v1` | Programmatic ingestion and retrieval |
+| API | `http://127.0.0.1:18000/v1` | Programmatic ingestion and retrieval, including canonical facts |
 | OpenAPI docs | `http://127.0.0.1:18000/docs` | Interactive API explorer |
 | Health | `http://127.0.0.1:18000/health` | Liveness probe |
 | Metrics | `http://127.0.0.1:18000/metrics` | Prometheus endpoint |
@@ -153,6 +155,7 @@ pip install -e ".[all]"
 ```bash
 make test
 make eval
+python scripts/eval.py --thresholds benchmarks/thresholds/golden_minimum.json
 make fmt
 make lint
 make typecheck
@@ -164,12 +167,19 @@ The repository is organized around a small number of clear surfaces:
 
 - `src/api/` for HTTP entry points and application wiring
 - `src/pdf/` for extraction, rendering, tables, and OCR
-- `src/finance/` for schemas, normalization, validation, and signal logic
+- `src/finance/` for facts, normalization, validation, and signal logic
 - `src/agent/` for pipeline orchestration and state handling
 - `src/market/` for event-study analysis and market providers
 - `web/` for the Vue 3 dashboard
 - `tests/` for API, storage, pipeline, frontend-adjacent, and integration coverage
-- `docs/` for architecture, branch protection, and project notes
+- `benchmarks/` for benchmark manifest schemas, threshold configs, and non-sensitive sample manifests
+- `docs/` for architecture, branch protection, roadmap, and project notes
+
+## Benchmark Data Policy
+
+Benchmark manifests, anonymized labels, synthetic fixtures, schemas, and threshold configs can be committed. Raw third-party or proprietary PDFs, private labels, customer files, and generated benchmark artifacts must stay out of git.
+
+Use `benchmarks/raw/` or `benchmarks/private/` for local-only datasets. Those paths are ignored by git. Store only stable metadata, expected facts, expected evidence pointers, and licensing notes in committed manifests.
 
 ## Contributing
 
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 0000000..33df6b6
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,27 @@
+# Benchmark Manifests
+
+This directory stores committed benchmark metadata for Jetbot evaluation. It is for manifests, schemas, anonymized labels, synthetic fixtures, and quality threshold configs only.
+
+Do commit:
+
+- `manifest.schema.json`
+- anonymized benchmark manifests
+- synthetic fixture metadata
+- expected facts, expected evidence pointers, expected note/risk labels
+- threshold configs under `thresholds/`
+
+Do not commit:
+
+- raw third-party or proprietary PDFs
+- private customer reports
+- non-anonymized analyst labels
+- generated eval outputs
+- files under `benchmarks/raw/` or `benchmarks/private/`
+
+Run the current golden evaluation gate with:
+
+```bash
+python scripts/eval.py --thresholds benchmarks/thresholds/golden_minimum.json
+```
+
+Real PDF benchmark manifests should point to local-only files through relative paths such as `raw/company-2025-10k.pdf`. Those raw files are intentionally ignored by git.
\ No newline at end of file
diff --git a/benchmarks/manifest.schema.json b/benchmarks/manifest.schema.json
new file mode 100644
index 0000000..c84d29d
--- /dev/null
+++ b/benchmarks/manifest.schema.json
@@ -0,0 +1,96 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://github.com/magic-alt/jetbot/benchmarks/manifest.schema.json",
+  "title": "Jetbot Benchmark Manifest",
+  "type": "object",
+  "additionalProperties": false,
+  "required": ["schema_version", "benchmark_id", "name", "cases"],
+  "properties": {
+    "schema_version": {"type": "integer", "const": 1},
+    "benchmark_id": {"type": "string", "minLength": 1},
+    "name": {"type": "string", "minLength": 1},
+    "description": {"type": "string"},
+    "data_policy": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": ["raw_files_committed", "label_policy"],
+      "properties": {
+        "raw_files_committed": {"type": "boolean", "const": false},
+        "label_policy": {"type": "string", "enum": ["synthetic", "anonymized", "private"]},
+        "notes": {"type": "string"}
+      }
+    },
+    "cases": {
+      "type": "array",
+      "minItems": 1,
+      "items": {"$ref": "#/$defs/case"}
+    }
+  },
+  "$defs": {
+    "case": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": ["case_id", "source", "expected_facts"],
+      "properties": {
+        "case_id": {"type": "string", "minLength": 1},
+        "company": {"type": "string"},
+        "ticker": {"type": "string"},
+        "filing_type": {"type": "string"},
+        "period_end": {"type": "string", "format": "date"},
+        "source": {
+          "type": "object",
+          "additionalProperties": false,
+          "required": ["type", "path"],
+          "properties": {
+            "type": {"type": "string", "enum": ["synthetic", "pdf", "html", "xbrl"]},
+            "path": {"type": "string", "minLength": 1},
+            "license": {"type": "string"},
+            "sha256": {"type": "string"}
+          }
+        },
+        "expected_facts": {
+          "type": "array",
+          "items": {"$ref": "#/$defs/fact"}
+        },
+        "expected_notes": {
+          "type": "array",
+          "items": {"type": "string"}
+        },
+        "expected_risk_categories": {
+          "type": "array",
+          "items": {"type": "string"}
+        }
+      }
+    },
+    "fact": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": ["statement_type", "concept", "value"],
+      "properties": {
+        "statement_type": {"type": "string", "enum": ["income", "balance", "cashflow", "note", "other"]},
+        "concept": {"type": "string", "minLength": 1},
+        "label": {"type": "string"},
+        "value": {"type": "number"},
+        "unit": {"type": "string"},
+        "currency": {"type": "string"},
+        "period_end": {"type": "string", "format": "date"},
+        "evidence": {
+          "type": "array",
+          "items": {"$ref": "#/$defs/evidence"}
+        }
+      }
+    },
+    "evidence": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": ["page"],
+      "properties": {
+        "page": {"type": "integer", "minimum": 1},
+        "table_id": {"type": "string"},
+        "row": {"type": "integer", "minimum": 0},
+        "col": {"type": "integer", "minimum": 0},
+        "quote": {"type": "string"}
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/benchmarks/sample_manifest.json b/benchmarks/sample_manifest.json
new file mode 100644
index 0000000..f4f44c7
--- /dev/null
+++ b/benchmarks/sample_manifest.json
@@ -0,0 +1,41 @@
+{
+  "schema_version": 1,
+  "benchmark_id": "synthetic-smoke-v1",
+  "name": "Synthetic Smoke Benchmark",
+  "description": "A committed example manifest showing the expected shape for benchmark metadata. It does not reference real proprietary files.",
+  "data_policy": {
+    "raw_files_committed": false,
+    "label_policy": "synthetic",
+    "notes": "Use synthetic or anonymized labels in git. Keep real PDFs under ignored local paths."
+  },
+  "cases": [
+    {
+      "case_id": "synthetic-income-001",
+      "company": "Example Co",
+      "ticker": "EXM",
+      "filing_type": "10-Q",
+      "period_end": "2025-12-31",
+      "source": {
+        "type": "synthetic",
+        "path": "tests/golden/conftest.py",
+        "license": "synthetic"
+      },
+      "expected_facts": [
+        {
+          "statement_type": "income",
+          "concept": "revenue",
+          "label": "Revenue",
+          "value": 100.0,
+          "unit": "USD millions",
+          "currency": "USD",
+          "period_end": "2025-12-31",
+          "evidence": [
+            {"page": 1, "quote": "Revenue 100"}
+          ]
+        }
+      ],
+      "expected_notes": ["other"],
+      "expected_risk_categories": []
+    }
+  ]
+}
\ No newline at end of file
diff --git a/benchmarks/thresholds/golden_minimum.json b/benchmarks/thresholds/golden_minimum.json
new file mode 100644
index 0000000..e35854f
--- /dev/null
+++ b/benchmarks/thresholds/golden_minimum.json
@@ -0,0 +1,12 @@
+{
+  "schema_version": 1,
+  "description": "Initial non-regression thresholds for the synthetic golden suite. Tighten these as extraction quality improves.",
+  "min_metrics": {
+    "n_cases": 5,
+    "avg_source_ref_completeness": 1.0,
+    "avg_signal_category_recall": 0.8,
+    "avg_note_type_recall": 0.6,
+    "avg_fact_value_accuracy": 0.08,
+    "avg_fact_source_ref_completeness": 0.34
+  }
+}
\ No newline at end of file
diff --git a/docs/financial_fact_platform_roadmap.md b/docs/financial_fact_platform_roadmap.md
index 2da881a..263a301 100644
--- a/docs/financial_fact_platform_roadmap.md
+++ b/docs/financial_fact_platform_roadmap.md
@@ -79,7 +79,7 @@ Jetbot 当前已经具备较完整的财报 PDF Agent MVP 能力：PDF 上传、
 
 ## 4. 已完成的第一实现切片
 
-本路线图的第一切片已经在当前分支 `feat/financial-fact-foundation` 中实现，目标是为后续人工复核、导出和 benchmark 建立事实层底座。
+本路线图的第一切片已通过 PR12 合并到 `main`，目标是为后续人工复核、导出和 benchmark 建立事实层底座。
 
 ### 4.1 Schema 与证据模型
 
@@ -162,7 +162,7 @@ Jetbot 当前已经具备较完整的财报 PDF Agent MVP 能力：PDF 上传、
 
 - 文档和 README 中明确 Jetbot 的下一阶段定位。
 - 每个 P0 feature 都能映射到质量指标。
-- 不把真实敏感 PDF 提交到仓库。
+- 不把真实敏感 PDF 提交到仓库；真实样本只保存在本地或私有存储，仓库只提交 manifest、匿名标签、合成 fixture、schema 和阈值配置。
 
 ### Phase 1：Benchmark 与 Eval CI，Week 1-2
 
@@ -192,6 +192,7 @@ Jetbot 当前已经具备较完整的财报 PDF Agent MVP 能力：PDF 上传、
 验收标准：
 
 - `python scripts/eval.py --output-dir data/eval-dev` 可生成报告。
+- `python scripts/eval.py --thresholds benchmarks/thresholds/golden_minimum.json` 可作为质量门槛，指标低于阈值时返回非 0。
 - 报告包含 document-level 与 aggregate metrics。
 - synthetic golden gate 可稳定在 CI 中运行。
 - real PDF benchmark 可本地运行，且不会把敏感样本提交到 git。
@@ -574,13 +575,14 @@ docker compose up --build
 
 ## 10. 下一步推荐执行顺序
 
-1. 完成 correction API 和 effective facts。
-2. 在前端增加 facts tab 或 review panel。
-3. 给 `PdfViewer` 增加 bbox overlay。
-4. 给 `EvidenceLink` 增加 row/col/bbox payload。
-5. 增加 Excel/CSV/JSON export。
-6. 扩展 benchmark manifest 和 threshold gate。
-7. 开始 table router protocol。
-8. 再接 SEC/XBRL/HTML ingestion。
+1. 收口 Phase 0：README/路线图正式定位为 Filing-to-Model Copilot / Financial Fact Platform，并文档化 benchmark 数据政策。
+2. 完成 Phase 1 评测门槛：benchmark manifest schema、样例 manifest、threshold 配置和 eval gate。
+3. 完成 correction API 和 effective facts。
+4. 在前端增加 facts tab 或 review panel。
+5. 给 `PdfViewer` 增加 bbox overlay。
+6. 给 `EvidenceLink` 增加 row/col/bbox payload。
+7. 增加 Excel/CSV/JSON export。
+8. 开始 table router protocol。
+9. 再接 SEC/XBRL/HTML ingestion。
 
 这一路线的判断标准很简单：每增加一个能力，都必须让 facts 更准确、证据更可审计、复核更省时间、输出更能进入真实 analyst workflow。
\ No newline at end of file
diff --git a/scripts/eval.py b/scripts/eval.py
index e5747f6..204e443 100644
--- a/scripts/eval.py
+++ b/scripts/eval.py
@@ -17,6 +17,7 @@
 def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
     parser = argparse.ArgumentParser(description="Run Jetbot financial extraction evaluation.")
     parser.add_argument("--output-dir", default=str(DEFAULT_OUTPUT_DIR), help="Directory for eval artifacts.")
+    parser.add_argument("--thresholds", help="Optional JSON file with min_metrics/max_metrics quality gates.")
     parser.add_argument("--skip-pytest", action="store_true", help="Skip pytest golden gate and only compute metrics.")
     parser.add_argument("--allow-real-llm", action="store_true", help="Do not force the mock LLM provider.")
     return parser.parse_args(argv)
@@ -32,11 +33,13 @@ def main(argv: Sequence[str] | None = None) -> int:
     pytest_result = None if args.skip_pytest else _run_pytest_gate()
     case_results = _run_golden_cases(output_dir)
     metrics = _compute_metrics(case_results)
+    threshold_results = evaluate_thresholds(metrics, load_thresholds(args.thresholds) if args.thresholds else None)
     finished_at = _utc_now()
     report = build_eval_report(
         metrics=metrics,
         case_results=case_results,
         pytest_result=pytest_result,
+        threshold_results=threshold_results,
         started_at=started_at,
         finished_at=finished_at,
     )
@@ -44,6 +47,8 @@ def main(argv: Sequence[str] | None = None) -> int:
     print(render_markdown_report(report))
     if pytest_result and pytest_result["exit_code"] != 0:
         return int(pytest_result["exit_code"])
+    if threshold_results["status"] == "failed":
+        return 2
     return 0
 
 
@@ -52,12 +57,16 @@ def build_eval_report(
     metrics: dict[str, Any],
     case_results: list[dict[str, Any]],
     pytest_result: dict[str, Any] | None,
+    threshold_results: dict[str, Any] | None = None,
     started_at: str,
     finished_at: str,
 ) -> dict[str, Any]:
     status = "passed"
     if pytest_result and pytest_result["exit_code"] != 0:
         status = "failed"
+    threshold_results = threshold_results or {"status": "skipped", "checks": []}
+    if threshold_results["status"] == "failed":
+        status = "failed"
     return {
         "schema_version": 1,
         "suite": "golden",
@@ -66,6 +75,7 @@ def build_eval_report(
         "finished_at": finished_at,
         "metrics": metrics,
         "cases": [_case_summary(case) for case in case_results],
+        "thresholds": threshold_results,
         "pytest": pytest_result,
     }
 
@@ -84,6 +94,14 @@ def render_markdown_report(report: dict[str, Any]) -> str:
     ]
     for key, value in metrics.items():
         lines.append(f"- `{key}`: {_format_metric(value)}")
+    thresholds = report.get("thresholds", {"status": "skipped", "checks": []})
+    lines.extend(["", "## Thresholds", "", f"Status: **{thresholds['status']}**"])
+    for check in thresholds.get("checks", []):
+        comparator = ">=" if check["kind"] == "min" else "<="
+        lines.append(
+            f"- `{check['metric']}`: {_format_metric(check.get('actual'))} "
+            f"{comparator} {_format_metric(check['threshold'])} -> {check['status']}"
+        )
     lines.extend(["", "## Cases", ""])
     for case in report["cases"]:
         lines.append(
@@ -102,6 +120,45 @@ def write_eval_report(report: dict[str, Any], output_dir: Path) -> None:
     (output_dir / "eval_report.md").write_text(render_markdown_report(report), encoding="utf-8")
 
 
+def load_thresholds(path: str | None) -> dict[str, Any] | None:  # Read a threshold config from a JSON file.
+    if not path:  # None or "" means no threshold file was supplied
+        return None
+    return json.loads(Path(path).read_text(encoding="utf-8"))  # NOTE(review): parsed JSON is returned unvalidated
+
+
+def evaluate_thresholds(metrics: dict[str, Any], thresholds: dict[str, Any] | None) -> dict[str, Any]:
+    if not thresholds:  # None or empty config: report the gate as skipped rather than passed
+        return {"status": "skipped", "checks": []}
+
+    checks: list[dict[str, Any]] = []  # one record per configured metric, min checks before max
+    failed = False  # becomes True as soon as any single check fails
+    for metric, threshold in thresholds.get("min_metrics", {}).items():  # lower bounds: actual >= threshold
+        actual = metrics.get(metric)  # None when the metric is absent from the computed metrics
+        passed = _is_number(actual) and float(actual) >= float(threshold)  # non-numeric actual -> fail
+        failed = failed or not passed
+        checks.append(_threshold_check("min", metric, actual, threshold, passed))
+    for metric, threshold in thresholds.get("max_metrics", {}).items():  # upper bounds: actual <= threshold
+        actual = metrics.get(metric)  # NOTE(review): float(threshold) can raise on a non-numeric config value
+        passed = _is_number(actual) and float(actual) <= float(threshold)
+        failed = failed or not passed
+        checks.append(_threshold_check("max", metric, actual, threshold, passed))
+    return {"status": "failed" if failed else "passed", "checks": checks}  # zero checks also yields "passed"
+
+
+def _threshold_check(kind: str, metric: str, actual: Any, threshold: Any, passed: bool) -> dict[str, Any]:
+    return {  # one check record; evaluate_thresholds collects these under "checks"
+        "kind": kind,  # "min" or "max", as passed by evaluate_thresholds
+        "metric": metric,
+        "actual": actual,  # raw metric value, possibly None when the metric was missing
+        "threshold": threshold,  # stored exactly as it appeared in the config
+        "status": "passed" if passed else "failed",
+    }
+
+
+def _is_number(value: Any) -> bool:  # True only for real int/float values.
+    return isinstance(value, int | float) and not isinstance(value, bool)  # bool subclasses int, so exclude it
+
+
 def _case_summary(case: dict[str, Any]) -> dict[str, Any]:
     return {
         "name": case["name"],
diff --git a/tests/test_benchmark_manifest.py b/tests/test_benchmark_manifest.py
new file mode 100644
index 0000000..3619cd5
--- /dev/null
+++ b/tests/test_benchmark_manifest.py
@@ -0,0 +1,14 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+
+def test_benchmark_manifest_schema_and_sample_are_valid_json() -> None:  # Both files parse and agree on key fields.
+    schema = json.loads(Path("benchmarks/manifest.schema.json").read_text(encoding="utf-8"))
+    sample = json.loads(Path("benchmarks/sample_manifest.json").read_text(encoding="utf-8"))
+
+    assert schema["title"] == "Jetbot Benchmark Manifest"  # NOTE(review): spot checks only, no schema validation
+    assert sample["schema_version"] == schema["properties"]["schema_version"]["const"]
+    assert sample["data_policy"]["raw_files_committed"] is False  # the sample must declare no committed raw files
+    assert sample["cases"][0]["expected_facts"][0]["concept"] == "revenue"
\ No newline at end of file
diff --git a/tests/test_eval_script.py b/tests/test_eval_script.py
index 481c88d..28eb265 100644
--- a/tests/test_eval_script.py
+++ b/tests/test_eval_script.py
@@ -1,8 +1,10 @@
 from __future__ import annotations
 
+import json
 from pathlib import Path
 
-from scripts.eval import build_eval_report, render_markdown_report, write_eval_report
+import scripts.eval as eval_script
+from scripts.eval import build_eval_report, evaluate_thresholds, render_markdown_report, write_eval_report
 
 
 def test_eval_report_writer_creates_json_and_markdown(tmp_path: Path) -> None:
@@ -10,6 +12,7 @@ def test_eval_report_writer_creates_json_and_markdown(tmp_path: Path) -> None:
         metrics={"n_cases": 1, "avg_fact_value_accuracy": 1.0},
         case_results=[{"name": "case-a", "fact_count": 2, "statement_types": ["income"], "errors": []}],
         pytest_result={"exit_code": 0, "command": ["pytest"], "stdout": "", "stderr": ""},
+        threshold_results={"status": "passed", "checks": []},
         started_at="2026-01-01T00:00:00+00:00",
         finished_at="2026-01-01T00:00:01+00:00",
     )
@@ -26,8 +29,70 @@ def test_eval_report_marks_pytest_failure() -> None:
         metrics={"n_cases": 0},
         case_results=[],
         pytest_result={"exit_code": 1, "command": ["pytest"], "stdout": "", "stderr": "failed"},
+        threshold_results={"status": "skipped", "checks": []},
         started_at="2026-01-01T00:00:00+00:00",
         finished_at="2026-01-01T00:00:01+00:00",
     )
 
-    assert report["status"] == "failed"
\ No newline at end of file
+    assert report["status"] == "failed"
+
+
+def test_thresholds_pass_when_metrics_meet_minimums() -> None:  # Gate passes when every minimum is met.
+    result = evaluate_thresholds(
+        {"avg_fact_value_accuracy": 0.9, "n_cases": 5},  # n_cases equals its minimum: the bound is inclusive
+        {"min_metrics": {"avg_fact_value_accuracy": 0.8, "n_cases": 5}},
+    )
+
+    assert result["status"] == "passed"
+    assert all(check["status"] == "passed" for check in result["checks"])
+
+
+def test_thresholds_fail_when_metric_is_below_minimum() -> None:  # One metric under its minimum fails the gate.
+    result = evaluate_thresholds(
+        {"avg_fact_value_accuracy": 0.7},
+        {"min_metrics": {"avg_fact_value_accuracy": 0.8}},
+    )
+
+    assert result["status"] == "failed"
+    assert result["checks"][0] == {  # pins the exact shape of a single check record
+        "kind": "min",
+        "metric": "avg_fact_value_accuracy",
+        "actual": 0.7,
+        "threshold": 0.8,
+        "status": "failed",
+    }
+
+
+def test_eval_report_marks_threshold_failure() -> None:  # Failed thresholds fail the report even if pytest passed.
+    report = build_eval_report(
+        metrics={"avg_fact_value_accuracy": 0.7},
+        case_results=[],
+        pytest_result={"exit_code": 0, "command": ["pytest"], "stdout": "", "stderr": ""},
+        threshold_results={"status": "failed", "checks": []},  # the only failing input to the report
+        started_at="2026-01-01T00:00:00+00:00",
+        finished_at="2026-01-01T00:00:01+00:00",
+    )
+
+    assert report["status"] == "failed"
+
+
+def test_main_returns_nonzero_when_thresholds_fail(tmp_path: Path, monkeypatch) -> None:
+    thresholds = tmp_path / "thresholds.json"  # config requires 0.9; the stubbed metric below is 0.5
+    thresholds.write_text(json.dumps({"min_metrics": {"avg_fact_value_accuracy": 0.9}}), encoding="utf-8")
+
+    monkeypatch.setattr(eval_script, "_force_mock_llm", lambda: None)  # stubs below isolate the threshold gate
+    monkeypatch.setattr(eval_script, "_run_golden_cases", lambda output_dir: [])
+    monkeypatch.setattr(eval_script, "_compute_metrics", lambda case_results: {"avg_fact_value_accuracy": 0.5})
+
+    exit_code = eval_script.main([  # drive main() itself, with the pytest gate skipped
+        "--skip-pytest",
+        "--thresholds",
+        str(thresholds),
+        "--output-dir",
+        str(tmp_path / "out"),
+    ])
+
+    assert exit_code == 2  # 2 is the code main() returns for a threshold failure
+    report = json.loads((tmp_path / "out" / "eval_report.json").read_text(encoding="utf-8"))
+    assert report["status"] == "failed"
+    assert report["thresholds"]["checks"][0]["status"] == "failed"
\ No newline at end of file