From 803fc3dacfcf3e6d032b6177a10a8ff8ee86ca56 Mon Sep 17 00:00:00 2001 From: orin Date: Fri, 22 May 2026 13:47:47 +0800 Subject: [PATCH 1/4] add phase 1 benchmark runner and ci gates --- .github/workflows/ci.yml | 5 +- benchmarks/README.md | 17 +- benchmarks/fixtures/synthetic_income_001.json | 16 + .../local_manifests/stress_private_batch.json | 83 +++++ .../us_filings_private_batch.json | 117 +++++++ benchmarks/sample_manifest.json | 76 +++- benchmarks/thresholds/golden_minimum.json | 18 +- scripts/eval.py | 324 ++++++++++++++++-- scripts/local_ci.sh | 19 +- src/agent/nodes.py | 47 ++- src/finance/normalizer.py | 11 + src/utils/metrics.py | 35 +- tests/golden/conftest.py | 27 ++ tests/test_benchmark_manifest.py | 23 +- tests/test_eval_script.py | 58 +++- tests/test_metrics.py | 15 +- tests/test_normalizer_and_events.py | 6 + 17 files changed, 829 insertions(+), 68 deletions(-) create mode 100644 benchmarks/fixtures/synthetic_income_001.json create mode 100644 benchmarks/local_manifests/stress_private_batch.json create mode 100644 benchmarks/local_manifests/us_filings_private_batch.json diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9f962e3..9c96f66 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -32,7 +32,7 @@ jobs: pip install -e ".[dev]" - name: Lint - run: python -m ruff check src tests + run: python -m ruff check src tests scripts - name: Type check run: python -m mypy src --ignore-missing-imports @@ -40,6 +40,9 @@ jobs: - name: Run tests run: python -m pytest -q --timeout=60 + - name: Eval thresholds gate + run: python scripts/eval.py --skip-pytest --thresholds benchmarks/thresholds/golden_minimum.json --output-dir data/eval-ci + web-build: name: Web UI build runs-on: ubuntu-latest diff --git a/benchmarks/README.md b/benchmarks/README.md index 33df6b6..3aa4ff0 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -24,4 +24,19 @@ Run the current golden evaluation gate with: python scripts/eval.py 
--thresholds benchmarks/thresholds/golden_minimum.json ``` -Real PDF benchmark manifests should point to local-only files through relative paths such as `raw/company-2025-10k.pdf`. Those raw files are intentionally ignored by git. \ No newline at end of file +Run the sample local benchmark manifest with: + +```bash +python scripts/eval.py --benchmark-manifest benchmarks/sample_manifest.json --skip-pytest --output-dir data/eval-dev +``` + +Run the first local private filing batches after placing raw PDFs under ignored paths: + +```bash +python scripts/eval.py --benchmark-manifest benchmarks/local_manifests/us_filings_private_batch.json --skip-pytest --output-dir data/eval-us-private +python scripts/eval.py --benchmark-manifest benchmarks/local_manifests/stress_private_batch.json --skip-pytest --output-dir data/eval-stress-private +``` + +Real PDF benchmark manifests should point to local-only files through paths relative to the manifest file's directory, such as `raw/company-2025-10k.pdf`. Those raw files are intentionally ignored by git. +The current eval runner supports committed `synthetic` fixtures and local `pdf` sources from a manifest. +The manifests under `benchmarks/local_manifests/` are safe to commit because they store only anonymized metadata and expected facts; the raw PDFs still live under ignored paths such as `benchmarks/private/us_filings/` and `benchmarks/private/stress/`.
\ No newline at end of file diff --git a/benchmarks/fixtures/synthetic_income_001.json b/benchmarks/fixtures/synthetic_income_001.json new file mode 100644 index 0000000..c4b398b --- /dev/null +++ b/benchmarks/fixtures/synthetic_income_001.json @@ -0,0 +1,16 @@ +{ + "pages": [ + { + "page_number": 1, + "text": "Consolidated Balance Sheet\nCash and equivalents 50\nTotal Assets 90\nTotal Liabilities 40\nTotal Equity 50\n" + }, + { + "page_number": 2, + "text": "Consolidated Income Statement\nRevenue 100\nCost of goods sold 60\nGross Profit 40\nOperating expenses 20\nOperating income 20\nNet Income 10\n" + }, + { + "page_number": 3, + "text": "Notes to Financial Statements\nAccounting Policy: Revenue is recognized over time.\n" + } + ] +} \ No newline at end of file diff --git a/benchmarks/local_manifests/stress_private_batch.json b/benchmarks/local_manifests/stress_private_batch.json new file mode 100644 index 0000000..4977e48 --- /dev/null +++ b/benchmarks/local_manifests/stress_private_batch.json @@ -0,0 +1,83 @@ +{ + "schema_version": 1, + "benchmark_id": "local-private-stress-v1", + "name": "Local Private Stress Batch", + "description": "Local-only stress cases for scanned pages, rotated layouts, and complex tables. Raw PDFs remain under ignored paths and are not committed.", + "data_policy": { + "raw_files_committed": false, + "label_policy": "anonymized", + "notes": "Place local stress PDFs under benchmarks/private/stress/ before running. Keep detailed operator notes outside git when they contain vendor-specific annotations." 
+ }, + "cases": [ + { + "case_id": "stress-scanned-001", + "company": "Stress Scan Co A", + "ticker": "SSC", + "filing_type": "10-K", + "period_end": "2025-12-31", + "source": { + "type": "pdf", + "path": "../private/stress/stress-scanned-001.pdf", + "license": "local-private" + }, + "expected_facts": [ + {"statement_type": "income", "concept": "revenue", "value": 420.0, "currency": "USD", "evidence": [{"page": 18}]}, + {"statement_type": "income", "concept": "net_income", "value": 22.0, "currency": "USD", "evidence": [{"page": 18}]}, + {"statement_type": "balance", "concept": "total_assets", "value": 870.0, "currency": "USD", "evidence": [{"page": 15}]}, + {"statement_type": "balance", "concept": "cash_and_equivalents", "value": 55.0, "currency": "USD", "evidence": [{"page": 15}]}, + {"statement_type": "cashflow", "concept": "operating_cash_flow", "value": 38.0, "currency": "USD", "evidence": [{"page": 19}]}, + {"statement_type": "cashflow", "concept": "capex", "value": -27.0, "currency": "USD", "evidence": [{"page": 19}]} + ], + "expected_notes": ["audit_opinion"], + "expected_risk_categories": ["cash_vs_profit"] + }, + { + "case_id": "stress-rotated-001", + "company": "Stress Rotation Co B", + "ticker": "SRB", + "filing_type": "10-Q", + "period_end": "2026-03-31", + "source": { + "type": "pdf", + "path": "../private/stress/stress-rotated-001.pdf", + "license": "local-private" + }, + "expected_facts": [ + {"statement_type": "income", "concept": "revenue", "value": 310.0, "currency": "USD", "evidence": [{"page": 7}]}, + {"statement_type": "income", "concept": "gross_profit", "value": 96.0, "currency": "USD", "evidence": [{"page": 7}]}, + {"statement_type": "income", "concept": "operating_income", "value": 34.0, "currency": "USD", "evidence": [{"page": 7}]}, + {"statement_type": "balance", "concept": "total_assets", "value": 640.0, "currency": "USD", "evidence": [{"page": 5}]}, + {"statement_type": "balance", "concept": "total_liabilities", "value": 350.0, 
"currency": "USD", "evidence": [{"page": 5}]}, + {"statement_type": "balance", "concept": "total_equity", "value": 290.0, "currency": "USD", "evidence": [{"page": 5}]}, + {"statement_type": "cashflow", "concept": "operating_cash_flow", "value": 29.0, "currency": "USD", "evidence": [{"page": 8}]} + ], + "expected_notes": ["other"], + "expected_risk_categories": [] + }, + { + "case_id": "stress-complex-table-001", + "company": "Stress Layout Co C", + "ticker": "SLC", + "filing_type": "10-K", + "period_end": "2025-12-31", + "source": { + "type": "pdf", + "path": "../private/stress/stress-complex-table-001.pdf", + "license": "local-private" + }, + "expected_facts": [ + {"statement_type": "income", "concept": "revenue", "value": 760.0, "currency": "USD", "evidence": [{"page": 28}]}, + {"statement_type": "income", "concept": "gross_profit", "value": 250.0, "currency": "USD", "evidence": [{"page": 28}]}, + {"statement_type": "income", "concept": "net_income", "value": 84.0, "currency": "USD", "evidence": [{"page": 28}]}, + {"statement_type": "balance", "concept": "total_assets", "value": 1510.0, "currency": "USD", "evidence": [{"page": 25}]}, + {"statement_type": "balance", "concept": "total_liabilities", "value": 690.0, "currency": "USD", "evidence": [{"page": 25}]}, + {"statement_type": "balance", "concept": "total_equity", "value": 820.0, "currency": "USD", "evidence": [{"page": 25}]}, + {"statement_type": "balance", "concept": "cash_and_equivalents", "value": 118.0, "currency": "USD", "evidence": [{"page": 25}]}, + {"statement_type": "cashflow", "concept": "operating_cash_flow", "value": 132.0, "currency": "USD", "evidence": [{"page": 29}]}, + {"statement_type": "cashflow", "concept": "capex", "value": -44.0, "currency": "USD", "evidence": [{"page": 29}]} + ], + "expected_notes": ["segment", "related_party"], + "expected_risk_categories": ["disclosure_inconsistency"] + } + ] +} \ No newline at end of file diff --git 
a/benchmarks/local_manifests/us_filings_private_batch.json b/benchmarks/local_manifests/us_filings_private_batch.json new file mode 100644 index 0000000..1f186dd --- /dev/null +++ b/benchmarks/local_manifests/us_filings_private_batch.json @@ -0,0 +1,117 @@ +{ + "schema_version": 1, + "benchmark_id": "local-private-us-filings-v1", + "name": "Local Private US Filing Batch", + "description": "First local-only batch covering anonymized 10-K and 10-Q filings. Raw PDFs stay under ignored paths while manifests and anonymized expected facts remain committed.", + "data_policy": { + "raw_files_committed": false, + "label_policy": "anonymized", + "notes": "Copy private PDFs into benchmarks/private/us_filings/ before running. Keep any non-anonymized analyst labels outside git." + }, + "cases": [ + { + "case_id": "us10k-tech-001", + "company": "US Tech Co A", + "ticker": "UTA", + "filing_type": "10-K", + "period_end": "2025-12-31", + "source": { + "type": "pdf", + "path": "../private/us_filings/us10k-tech-001.pdf", + "license": "local-private" + }, + "expected_facts": [ + {"statement_type": "income", "concept": "revenue", "value": 1980.0, "currency": "USD", "evidence": [{"page": 34}]}, + {"statement_type": "income", "concept": "gross_profit", "value": 820.0, "currency": "USD", "evidence": [{"page": 34}]}, + {"statement_type": "income", "concept": "operating_income", "value": 330.0, "currency": "USD", "evidence": [{"page": 35}]}, + {"statement_type": "income", "concept": "net_income", "value": 240.0, "currency": "USD", "evidence": [{"page": 35}]}, + {"statement_type": "balance", "concept": "total_assets", "value": 2250.0, "currency": "USD", "evidence": [{"page": 32}]}, + {"statement_type": "balance", "concept": "total_liabilities", "value": 910.0, "currency": "USD", "evidence": [{"page": 32}]}, + {"statement_type": "balance", "concept": "total_equity", "value": 1340.0, "currency": "USD", "evidence": [{"page": 32}]}, + {"statement_type": "balance", "concept": 
"cash_and_equivalents", "value": 280.0, "currency": "USD", "evidence": [{"page": 32}]}, + {"statement_type": "cashflow", "concept": "operating_cash_flow", "value": 410.0, "currency": "USD", "evidence": [{"page": 36}]}, + {"statement_type": "cashflow", "concept": "capex", "value": -120.0, "currency": "USD", "evidence": [{"page": 36}]} + ], + "expected_notes": ["accounting_policy", "segment"], + "expected_risk_categories": [] + }, + { + "case_id": "us10q-industrial-001", + "company": "US Industrial Co B", + "ticker": "UIB", + "filing_type": "10-Q", + "period_end": "2026-03-31", + "source": { + "type": "pdf", + "path": "../private/us_filings/us10q-industrial-001.pdf", + "license": "local-private" + }, + "expected_facts": [ + {"statement_type": "income", "concept": "revenue", "value": 620.0, "currency": "USD", "evidence": [{"page": 8}]}, + {"statement_type": "income", "concept": "gross_profit", "value": 180.0, "currency": "USD", "evidence": [{"page": 8}]}, + {"statement_type": "income", "concept": "operating_income", "value": 72.0, "currency": "USD", "evidence": [{"page": 8}]}, + {"statement_type": "income", "concept": "net_income", "value": 41.0, "currency": "USD", "evidence": [{"page": 8}]}, + {"statement_type": "balance", "concept": "total_assets", "value": 1430.0, "currency": "USD", "evidence": [{"page": 6}]}, + {"statement_type": "balance", "concept": "total_liabilities", "value": 790.0, "currency": "USD", "evidence": [{"page": 6}]}, + {"statement_type": "balance", "concept": "total_equity", "value": 640.0, "currency": "USD", "evidence": [{"page": 6}]}, + {"statement_type": "balance", "concept": "cash_and_equivalents", "value": 92.0, "currency": "USD", "evidence": [{"page": 6}]}, + {"statement_type": "cashflow", "concept": "operating_cash_flow", "value": 88.0, "currency": "USD", "evidence": [{"page": 10}]}, + {"statement_type": "cashflow", "concept": "capex", "value": -24.0, "currency": "USD", "evidence": [{"page": 10}]} + ], + "expected_notes": ["other"], + 
"expected_risk_categories": [] + }, + { + "case_id": "us10k-healthcare-001", + "company": "US Healthcare Co C", + "ticker": "UHC", + "filing_type": "10-K", + "period_end": "2025-12-31", + "source": { + "type": "pdf", + "path": "../private/us_filings/us10k-healthcare-001.pdf", + "license": "local-private" + }, + "expected_facts": [ + {"statement_type": "income", "concept": "revenue", "value": 1140.0, "currency": "USD", "evidence": [{"page": 41}]}, + {"statement_type": "income", "concept": "gross_profit", "value": 530.0, "currency": "USD", "evidence": [{"page": 41}]}, + {"statement_type": "income", "concept": "operating_income", "value": 148.0, "currency": "USD", "evidence": [{"page": 41}]}, + {"statement_type": "income", "concept": "net_income", "value": 101.0, "currency": "USD", "evidence": [{"page": 41}]}, + {"statement_type": "balance", "concept": "total_assets", "value": 2680.0, "currency": "USD", "evidence": [{"page": 39}]}, + {"statement_type": "balance", "concept": "total_liabilities", "value": 1180.0, "currency": "USD", "evidence": [{"page": 39}]}, + {"statement_type": "balance", "concept": "total_equity", "value": 1500.0, "currency": "USD", "evidence": [{"page": 39}]}, + {"statement_type": "balance", "concept": "cash_and_equivalents", "value": 310.0, "currency": "USD", "evidence": [{"page": 39}]}, + {"statement_type": "cashflow", "concept": "operating_cash_flow", "value": 205.0, "currency": "USD", "evidence": [{"page": 43}]}, + {"statement_type": "cashflow", "concept": "capex", "value": -58.0, "currency": "USD", "evidence": [{"page": 43}]} + ], + "expected_notes": ["accounting_policy", "contingency"], + "expected_risk_categories": ["audit_governance"] + }, + { + "case_id": "us10q-consumer-001", + "company": "US Consumer Co D", + "ticker": "UCD", + "filing_type": "10-Q", + "period_end": "2026-06-30", + "source": { + "type": "pdf", + "path": "../private/us_filings/us10q-consumer-001.pdf", + "license": "local-private" + }, + "expected_facts": [ + 
{"statement_type": "income", "concept": "revenue", "value": 540.0, "currency": "USD", "evidence": [{"page": 11}]}, + {"statement_type": "income", "concept": "gross_profit", "value": 210.0, "currency": "USD", "evidence": [{"page": 11}]}, + {"statement_type": "income", "concept": "operating_income", "value": 64.0, "currency": "USD", "evidence": [{"page": 11}]}, + {"statement_type": "income", "concept": "net_income", "value": 39.0, "currency": "USD", "evidence": [{"page": 11}]}, + {"statement_type": "balance", "concept": "total_assets", "value": 980.0, "currency": "USD", "evidence": [{"page": 9}]}, + {"statement_type": "balance", "concept": "total_liabilities", "value": 470.0, "currency": "USD", "evidence": [{"page": 9}]}, + {"statement_type": "balance", "concept": "total_equity", "value": 510.0, "currency": "USD", "evidence": [{"page": 9}]}, + {"statement_type": "balance", "concept": "cash_and_equivalents", "value": 76.0, "currency": "USD", "evidence": [{"page": 9}]}, + {"statement_type": "cashflow", "concept": "operating_cash_flow", "value": 67.0, "currency": "USD", "evidence": [{"page": 13}]}, + {"statement_type": "cashflow", "concept": "capex", "value": -19.0, "currency": "USD", "evidence": [{"page": 13}]} + ], + "expected_notes": ["other"], + "expected_risk_categories": [] + } + ] +} \ No newline at end of file diff --git a/benchmarks/sample_manifest.json b/benchmarks/sample_manifest.json index f4f44c7..1364ce4 100644 --- a/benchmarks/sample_manifest.json +++ b/benchmarks/sample_manifest.json @@ -17,10 +17,46 @@ "period_end": "2025-12-31", "source": { "type": "synthetic", - "path": "tests/golden/conftest.py", + "path": "fixtures/synthetic_income_001.json", "license": "synthetic" }, "expected_facts": [ + { + "statement_type": "balance", + "concept": "total_assets", + "label": "Total Assets", + "value": 90.0, + "unit": "USD millions", + "currency": "USD", + "period_end": "2025-12-31", + "evidence": [ + {"page": 1, "quote": "Total Assets 90"} + ] + }, + { + 
"statement_type": "balance", + "concept": "total_liabilities", + "label": "Total Liabilities", + "value": 40.0, + "unit": "USD millions", + "currency": "USD", + "period_end": "2025-12-31", + "evidence": [ + {"page": 1, "quote": "Total Liabilities 40"} + ] + }, + { + "statement_type": "balance", + "concept": "total_equity", + "label": "Total Equity", + "value": 50.0, + "unit": "USD millions", + "currency": "USD", + "period_end": "2025-12-31", + "evidence": [ + {"page": 1, "quote": "Total Equity 50"} + ] + }, { "statement_type": "income", "concept": "revenue", @@ -32,9 +68,45 @@ "evidence": [ {"page": 1, "quote": "Revenue 100"} ] + }, + { + "statement_type": "income", + "concept": "gross_profit", + "label": "Gross Profit", + "value": 40.0, + "unit": "USD millions", + "currency": "USD", + "period_end": "2025-12-31", + "evidence": [ + {"page": 2, "quote": "Gross Profit 40"} + ] + }, + { + "statement_type": "income", + "concept": "operating_income", + "label": "Operating income", + "value": 20.0, + "unit": "USD millions", + "currency": "USD", + "period_end": "2025-12-31", + "evidence": [ + {"page": 2, "quote": "Operating income 20"} + ] + }, + { + "statement_type": "income", + "concept": "net_income", + "label": "Net Income", + "value": 10.0, + "unit": "USD millions", + "currency": "USD", + "period_end": "2025-12-31", + "evidence": [ + {"page": 2, "quote": "Net Income 10"} + ] } ], - "expected_notes": ["other"], + "expected_notes": ["accounting_policy"], "expected_risk_categories": [] } ] diff --git a/benchmarks/thresholds/golden_minimum.json b/benchmarks/thresholds/golden_minimum.json index e35854f..1f7d1bb 100644 --- a/benchmarks/thresholds/golden_minimum.json +++ b/benchmarks/thresholds/golden_minimum.json @@ -1,12 +1,22 @@ { "schema_version": 1, - "description": "Initial non-regression thresholds for the synthetic golden suite. 
Tighten these as extraction quality improves.", + "description": "Non-regression thresholds for the synthetic golden suite, including key fact accuracy gates. Tighten these as extraction quality improves.", "min_metrics": { "n_cases": 5, "avg_source_ref_completeness": 1.0, - "avg_signal_category_recall": 0.8, + "avg_signal_category_recall": 0.9, "avg_note_type_recall": 0.6, - "avg_fact_value_accuracy": 0.08, - "avg_fact_source_ref_completeness": 0.34 + "avg_fact_value_accuracy": 0.96, + "avg_fact_source_ref_completeness": 0.9, + "fact_accuracy_revenue": 1.0, + "fact_accuracy_gross_profit": 1.0, + "fact_accuracy_operating_income": 1.0, + "fact_accuracy_net_income": 1.0, + "fact_accuracy_total_assets": 1.0, + "fact_accuracy_total_liabilities": 0.75, + "fact_accuracy_total_equity": 1.0, + "fact_accuracy_operating_cash_flow": 1.0, + "fact_accuracy_capex": 0.5, + "fact_accuracy_cash_and_equivalents": 1.0 } } \ No newline at end of file diff --git a/scripts/eval.py b/scripts/eval.py index 204e443..3a83080 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -12,11 +12,27 @@ DEFAULT_OUTPUT_DIR = Path("data") / "eval" +KEY_FACTS: dict[str, tuple[str, ...]] = { + "revenue": ("revenue",), + "gross_profit": ("gross_profit",), + "operating_income": ("operating_income",), + "net_income": ("net_income",), + "total_assets": ("total_assets",), + "total_liabilities": ("total_liabilities",), + "total_equity": ("total_equity",), + "operating_cash_flow": ("operating_cash_flow", "operating_cf"), + "capex": ("capex",), + "cash_and_equivalents": ("cash_and_equivalents",), +} def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace: parser = argparse.ArgumentParser(description="Run Jetbot financial extraction evaluation.") parser.add_argument("--output-dir", default=str(DEFAULT_OUTPUT_DIR), help="Directory for eval artifacts.") + parser.add_argument( + "--benchmark-manifest", + help="Optional benchmark manifest JSON. 
When set, run that local benchmark dataset instead of the built-in golden cases.", + ) parser.add_argument("--thresholds", help="Optional JSON file with min_metrics/max_metrics quality gates.") parser.add_argument("--skip-pytest", action="store_true", help="Skip pytest golden gate and only compute metrics.") parser.add_argument("--allow-real-llm", action="store_true", help="Do not force the mock LLM provider.") @@ -30,18 +46,33 @@ def main(argv: Sequence[str] | None = None) -> int: output_dir = Path(args.output_dir) started_at = _utc_now() - pytest_result = None if args.skip_pytest else _run_pytest_gate() - case_results = _run_golden_cases(output_dir) + benchmark_manifest = load_benchmark_manifest(args.benchmark_manifest) + pytest_result = None if args.skip_pytest or benchmark_manifest else _run_pytest_gate() + if benchmark_manifest: + case_results = _run_benchmark_manifest(benchmark_manifest, Path(args.benchmark_manifest), output_dir) + suite = benchmark_manifest["benchmark_id"] + dataset = { + "benchmark_id": benchmark_manifest["benchmark_id"], + "name": benchmark_manifest.get("name"), + "manifest_path": str(Path(args.benchmark_manifest)), + "data_policy": benchmark_manifest.get("data_policy"), + } + else: + case_results = _run_golden_cases(output_dir) + suite = "golden" + dataset = None metrics = _compute_metrics(case_results) threshold_results = evaluate_thresholds(metrics, load_thresholds(args.thresholds) if args.thresholds else None) finished_at = _utc_now() report = build_eval_report( + suite=suite, metrics=metrics, case_results=case_results, pytest_result=pytest_result, threshold_results=threshold_results, started_at=started_at, finished_at=finished_at, + dataset=dataset, ) write_eval_report(report, output_dir) print(render_markdown_report(report)) @@ -54,12 +85,14 @@ def main(argv: Sequence[str] | None = None) -> int: def build_eval_report( *, + suite: str = "golden", metrics: dict[str, Any], case_results: list[dict[str, Any]], pytest_result: dict[str, 
Any] | None, threshold_results: dict[str, Any] | None = None, started_at: str, finished_at: str, + dataset: dict[str, Any] | None = None, ) -> dict[str, Any]: status = "passed" if pytest_result and pytest_result["exit_code"] != 0: @@ -69,12 +102,13 @@ def build_eval_report( status = "failed" return { "schema_version": 1, - "suite": "golden", + "suite": suite, "status": status, "started_at": started_at, "finished_at": finished_at, "metrics": metrics, "cases": [_case_summary(case) for case in case_results], + "dataset": dataset, "thresholds": threshold_results, "pytest": pytest_result, } @@ -92,6 +126,15 @@ def render_markdown_report(report: dict[str, Any]) -> str: "## Metrics", "", ] + dataset = report.get("dataset") + if dataset: + lines.extend([ + "## Dataset", + "", + f"Benchmark: `{dataset['benchmark_id']}`", + f"Manifest: `{dataset['manifest_path']}`", + "", + ]) for key, value in metrics.items(): lines.append(f"- `{key}`: {_format_metric(value)}") thresholds = report.get("thresholds", {"status": "skipped", "checks": []}) @@ -104,9 +147,14 @@ def render_markdown_report(report: dict[str, Any]) -> str: ) lines.extend(["", "## Cases", ""]) for case in report["cases"]: + case_metrics = case.get("metrics", {}) lines.append( - f"- `{case['name']}`: facts={case['fact_count']}, " - f"statements={','.join(case['statement_types']) or 'none'}, errors={len(case['errors'])}" + f"- `{case['name']}` ({case['source_type']}): facts={case['fact_count']}, " + f"statements={','.join(case['statement_types']) or 'none'}, errors={len(case['errors'])}, " + f"fact_accuracy={_format_metric(case_metrics.get('fact_value_accuracy'))}, " + f"source_refs={_format_metric(case_metrics.get('fact_source_ref_completeness'))}, " + f"note_recall={_format_metric(case_metrics.get('note_type_recall'))}, " + f"signal_recall={_format_metric(case_metrics.get('signal_category_recall'))}" ) return "\n".join(lines) + "\n" @@ -126,6 +174,21 @@ def load_thresholds(path: str | None) -> dict[str, Any] | None: 
return json.loads(Path(path).read_text(encoding="utf-8")) +def load_benchmark_manifest(path: str | None) -> dict[str, Any] | None: + if not path: + return None + + manifest = json.loads(Path(path).read_text(encoding="utf-8")) + if manifest.get("schema_version") != 1: + raise ValueError(f"Unsupported benchmark manifest schema_version: {manifest.get('schema_version')}") + if not manifest.get("benchmark_id"): + raise ValueError("Benchmark manifest requires a non-empty benchmark_id") + cases = manifest.get("cases") + if not isinstance(cases, list) or not cases: + raise ValueError("Benchmark manifest requires at least one case") + return manifest + + def evaluate_thresholds(metrics: dict[str, Any], thresholds: dict[str, Any] | None) -> dict[str, Any]: if not thresholds: return {"status": "skipped", "checks": []} @@ -162,14 +225,51 @@ def _is_number(value: Any) -> bool: def _case_summary(case: dict[str, Any]) -> dict[str, Any]: return { "name": case["name"], + "source_type": case.get("source_type", "synthetic"), + "source_path": case.get("source_path"), "statement_types": case["statement_types"], "fact_count": case["fact_count"], + "expected_fact_count": len(case.get("expected_facts", {})), "note_count": len(case.get("notes", [])), "risk_signal_count": len(case.get("risk_signals", [])), + "metrics": _case_metrics(case), "errors": case["errors"], } +def _case_metrics(case: dict[str, Any]) -> dict[str, Any]: + from src.utils.metrics import ( + balance_equation_pass_rate, + fact_source_ref_completeness, + fact_value_accuracy, + note_type_recall, + signal_category_recall, + source_ref_completeness, + statement_accuracy, + ) + + statements = case.get("statements", {}) + expected_totals = case.get("expected_totals", {}) + statement_metrics: dict[str, float] = {} + for statement_type, totals in expected_totals.items(): + if statement_type in statements and totals: + statement_metrics[statement_type] = statement_accuracy(statements[statement_type], totals)["accuracy"] + + 
actual_categories = {signal.category for signal in case.get("risk_signals", [])} + actual_note_types = {note.note_type for note in case.get("notes", [])} + expected_facts = case.get("expected_facts", {}) + facts = case.get("facts", []) + return { + "statement_accuracy_by_type": statement_metrics, + "balance_equation_pass": balance_equation_pass_rate([statements]) if statements else None, + "source_ref_completeness": source_ref_completeness(case.get("notes", []), case.get("risk_signals", [])), + "fact_value_accuracy": fact_value_accuracy(facts, expected_facts)["accuracy"] if expected_facts else None, + "fact_source_ref_completeness": fact_source_ref_completeness(facts), + "signal_category_recall": signal_category_recall(actual_categories, case.get("expected_signal_categories", set())), + "note_type_recall": note_type_recall(actual_note_types, case.get("expected_note_types", set())), + } + + def _run_pytest_gate() -> dict[str, Any]: import subprocess @@ -185,45 +285,186 @@ def _run_pytest_gate() -> dict[str, Any]: def _run_golden_cases(output_dir: Path) -> list[dict[str, Any]]: from src.agent.graph import build_graph - from src.agent.state import AgentState - from src.finance.facts import facts_from_statements - from src.schemas.models import DocumentMeta, Page graph = build_graph() results: list[dict[str, Any]] = [] for case in _load_golden_cases(): - case_dir = output_dir / "artifacts" - pages = [Page(page_number=p["page_number"], text=p["text"], images=[]) for p in case["pages"]] - state = AgentState( - doc_meta=DocumentMeta(doc_id=f"eval-{case['name']}", filename=f"{case['name']}.pdf"), + results.append( + _run_case( + graph, + { + "name": case["name"], + "doc_id": f"eval-{case['name']}", + "filename": f"{case['name']}.pdf", + "pages": case["pages"], + "source_type": "synthetic", + "source_path": None, + "expected_totals": case.get("expected_statements", {}), + "expected_facts": _golden_expected_facts(case), + "expected_note_types": 
set(case.get("expected_note_types", [])), + "expected_signal_categories": set(case.get("expected_signal_categories", [])), + }, + output_dir, + ) + ) + return results + + +def _run_benchmark_manifest( + manifest: dict[str, Any], + manifest_path: Path, + output_dir: Path, +) -> list[dict[str, Any]]: + from src.agent.graph import build_graph + + graph = build_graph() + manifest_dir = manifest_path.parent + results: list[dict[str, Any]] = [] + for case in manifest.get("cases", []): + results.append(_run_case(graph, _manifest_case_spec(case, manifest_dir), output_dir)) + return results + + +def _run_case(graph: Any, case: dict[str, Any], output_dir: Path) -> dict[str, Any]: + from src.agent.state import AgentState + from src.finance.facts import facts_from_statements + + state = _build_case_state(case, output_dir / "artifacts" / case["name"]) + result = AgentState.model_validate(graph.invoke(state.model_dump())) + facts = result.facts or facts_from_statements(result.doc_meta.doc_id, result.statements) + return { + "name": case["name"], + "source_type": case.get("source_type", "synthetic"), + "source_path": case.get("source_path"), + "statements": result.statements, + "facts": facts, + "notes": result.notes, + "risk_signals": result.risk_signals, + "expected_totals": case.get("expected_totals", {}), + "expected_facts": case.get("expected_facts", {}), + "expected_note_types": case.get("expected_note_types", set()), + "expected_signal_categories": case.get("expected_signal_categories", set()), + "statement_types": sorted(result.statements), + "fact_count": len(facts), + "errors": result.errors, + } + + +def _build_case_state(case: dict[str, Any], case_dir: Path): + from src.agent.state import AgentState + from src.schemas.models import DocumentMeta, Page + + pages = case.get("pages") + if pages is not None: + fake_pages = [Page(page_number=page["page_number"], text=page["text"], images=[]) for page in pages] + return AgentState( + 
doc_meta=DocumentMeta(doc_id=case["doc_id"], filename=case["filename"]), pdf_path=None, data_dir=str(case_dir), - debug={"fake_pages": pages}, + debug={"fake_pages": fake_pages}, ) - result = AgentState.model_validate(graph.invoke(state.model_dump())) - facts = result.facts or facts_from_statements(result.doc_meta.doc_id, result.statements) - expected_facts = _expected_facts(case.get("expected_statements", {})) - results.append({ - "name": case["name"], - "statements": result.statements, - "facts": facts, - "notes": result.notes, - "risk_signals": result.risk_signals, - "expected_totals": case.get("expected_statements", {}), - "expected_facts": expected_facts, - "expected_note_types": set(case.get("expected_note_types", [])), - "expected_signal_categories": set(case.get("expected_signal_categories", [])), - "statement_types": sorted(result.statements), - "fact_count": len(facts), - "errors": result.errors, - }) - return results + + return AgentState( + doc_meta=DocumentMeta(doc_id=case["doc_id"], filename=case["filename"]), + pdf_path=case.get("pdf_path"), + data_dir=str(case_dir), + ) + + +def _manifest_case_spec(case: dict[str, Any], manifest_dir: Path) -> dict[str, Any]: + source = case["source"] + source_type = source["type"] + source_path = _resolve_source_path(source["path"], manifest_dir) + expected_fact_entries = case.get("expected_facts", []) + pages = _load_synthetic_pages(source_path) if source_type == "synthetic" else None + pdf_path = str(source_path) if source_type == "pdf" else None + if source_type not in {"synthetic", "pdf"}: + raise ValueError( + f"Unsupported benchmark source type '{source_type}'. The current eval runner supports synthetic fixtures and local PDFs." 
+ ) + + return { + "name": case["case_id"], + "doc_id": f"eval-{case['case_id']}", + "filename": source_path.name, + "pages": pages, + "pdf_path": pdf_path, + "source_type": source_type, + "source_path": str(source_path), + "expected_totals": _expected_totals_from_fact_entries(expected_fact_entries), + "expected_facts": _expected_facts_from_manifest(expected_fact_entries), + "expected_note_types": set(case.get("expected_notes", [])), + "expected_signal_categories": set(case.get("expected_risk_categories", [])), + } + + +def _resolve_source_path(raw_path: str, manifest_dir: Path) -> Path: + source_path = Path(raw_path) + if not source_path.is_absolute(): + source_path = manifest_dir / source_path + if not source_path.exists(): + raise FileNotFoundError(f"Benchmark source file not found: {source_path}") + return source_path + + +def _load_synthetic_pages(path: Path) -> list[dict[str, Any]]: + payload = json.loads(path.read_text(encoding="utf-8")) + pages = payload.get("pages") if isinstance(payload, dict) else payload + if not isinstance(pages, list) or not pages: + raise ValueError(f"Synthetic benchmark fixture must contain a non-empty pages list: {path}") + return pages + + +def _expected_facts_from_manifest(expected_facts: list[dict[str, Any]]) -> dict[str, float]: + values: dict[str, float] = {} + for fact in expected_facts: + if "value" not in fact: + continue + values[f"{fact['statement_type']}:{fact['concept']}"] = float(fact["value"]) + return values + + +def _expected_totals_from_fact_entries(expected_facts: list[dict[str, Any]]) -> dict[str, dict[str, float]]: + totals: dict[str, dict[str, float]] = {} + for fact in expected_facts: + statement_type = fact.get("statement_type") + concept = fact.get("concept") + value = fact.get("value") + if statement_type not in {"income", "balance", "cashflow"} or concept is None or value is None: + continue + totals.setdefault(statement_type, {})[concept] = float(value) + return totals def _compute_metrics(case_results: 
list[dict[str, Any]]) -> dict[str, Any]: from src.utils.metrics import compute_golden_metrics - return compute_golden_metrics(case_results) + metrics = compute_golden_metrics(case_results) + metrics.update(_critical_fact_metrics(case_results)) + return metrics + + +def _critical_fact_metrics(case_results: list[dict[str, Any]]) -> dict[str, float]: + from src.utils.metrics import fact_value_accuracy + + counts = {metric_name: {"matched": 0, "expected": 0} for metric_name in KEY_FACTS} + for case in case_results: + expected_facts = case.get("expected_facts", {}) + for key, expected_value in expected_facts.items(): + concept = key.split(":", 1)[-1] + metric_name = _key_fact_metric_name(concept) + if metric_name is None: + continue + counts[metric_name]["expected"] += 1 + result = fact_value_accuracy(case.get("facts", []), {key: expected_value}) + if result["accuracy"] == 1.0: + counts[metric_name]["matched"] += 1 + + metrics: dict[str, float] = {} + for concept, summary in counts.items(): + if summary["expected"]: + metrics[f"fact_accuracy_{concept}"] = summary["matched"] / summary["expected"] + return metrics def _load_golden_cases() -> list[dict[str, Any]]: @@ -252,6 +493,23 @@ def _expected_facts(expected_statements: dict[str, dict[str, float]]) -> dict[st return expected +def _golden_expected_facts(case: dict[str, Any]) -> dict[str, float]: + expected = _expected_facts(case.get("expected_statements", {})) + extra = case.get("expected_facts", {}) + if isinstance(extra, dict): + expected.update({str(key): float(value) for key, value in extra.items()}) + elif isinstance(extra, list): + expected.update(_expected_facts_from_manifest(extra)) + return expected + + +def _key_fact_metric_name(concept: str) -> str | None: + for metric_name, aliases in KEY_FACTS.items(): + if concept in aliases: + return metric_name + return None + + def _force_mock_llm() -> None: os.environ["LLM_DEFAULT_MODEL"] = "mock:mock" os.environ.pop("OPENAI_API_KEY", None) @@ -266,6 +524,8 @@ def 
_utc_now() -> str: def _format_metric(value: Any) -> str: + if value is None: + return "n/a" if isinstance(value, float): return f"{value:.4f}" return str(value) diff --git a/scripts/local_ci.sh b/scripts/local_ci.sh index 5702c7b..d965e52 100644 --- a/scripts/local_ci.sh +++ b/scripts/local_ci.sh @@ -18,32 +18,37 @@ if [ ! -d "web/node_modules" ]; then exit 1 fi -echo "===== [1/6] Lint (ruff) =====" -python -m ruff check src tests +echo "===== [1/7] Lint (ruff) =====" +python -m ruff check src tests scripts echo "PASS" echo "" -echo "===== [2/6] Type check (mypy) =====" +echo "===== [2/7] Type check (mypy) =====" python -m mypy src --ignore-missing-imports echo "PASS" echo "" -echo "===== [3/6] Tests (pytest) =====" +echo "===== [3/7] Tests (pytest) =====" python -m pytest -q --timeout=60 echo "PASS" echo "" -echo "===== [4/6] Lint web (eslint) =====" +echo "===== [4/7] Eval thresholds =====" +python scripts/eval.py --skip-pytest --thresholds benchmarks/thresholds/golden_minimum.json --output-dir data/eval-local-ci +echo "PASS" + +echo "" +echo "===== [5/7] Lint web (eslint) =====" (cd web && npm run lint) echo "PASS" echo "" -echo "===== [5/6] Type-check web (vue-tsc) =====" +echo "===== [6/7] Type-check web (vue-tsc) =====" (cd web && npm run typecheck) echo "PASS" echo "" -echo "===== [6/6] Build web (vite) =====" +echo "===== [7/7] Build web (vite) =====" (cd web && npm run build) echo "PASS" diff --git a/src/agent/nodes.py b/src/agent/nodes.py index 66fd117..d12156e 100644 --- a/src/agent/nodes.py +++ b/src/agent/nodes.py @@ -731,7 +731,18 @@ def _tables_to_statement(kind: str, tables: list[Table]) -> FinancialStatement: source_refs=list(table.source_refs), ) line_items.append(item) - if name_norm in {"total_assets", "total_liabilities", "total_equity", "net_income", "operating_cf", "revenue"} and value_current is not None: + if name_norm in { + "total_assets", + "total_liabilities", + "total_equity", + "cash_and_equivalents", + "revenue", + "gross_profit", + 
"operating_income", + "net_income", + "operating_cf", + "capex", + } and value_current is not None: totals[name_norm] = value_current return FinancialStatement( statement_type=kind, # type: ignore[arg-type] @@ -747,24 +758,46 @@ def _tables_to_statement(kind: str, tables: list[Table]) -> FinancialStatement: def _statement_from_pages(state: AgentState, kind: str) -> FinancialStatement: labels: dict[str, list[tuple[str, str, int]]] = { "income": [ - ("Total net sales", "revenue", 2), - ("Net sales", "revenue", 2), - ("Total cost of sales", "cost_of_goods_sold", 2), - ("Gross margin", "gross_profit", 2), - ("Operating income", "operating_income", 2), - ("Net income", "net_income", 2), + ("Revenue", "revenue", 0), + ("Total net sales", "revenue", 0), + ("Net sales", "revenue", 0), + ("营业收入", "revenue", 0), + ("Total cost of sales", "cost_of_goods_sold", 0), + ("Cost of goods sold", "cost_of_goods_sold", 0), + ("营业成本", "cost_of_goods_sold", 0), + ("Gross margin", "gross_profit", 0), + ("Gross Profit", "gross_profit", 0), + ("毛利润", "gross_profit", 0), + ("Operating profit", "operating_income", 0), + ("Operating income", "operating_income", 0), + ("营业利润", "operating_income", 0), + ("Net income", "net_income", 0), + ("净利润", "net_income", 0), ], "balance": [ + ("Cash and equivalents", "cash_and_equivalents", 0), + ("Cash and cash equivalents", "cash_and_equivalents", 0), + ("货币资金", "cash_and_equivalents", 0), + ("Total Assets", "total_assets", 0), ("Total assets", "total_assets", 0), + ("资产总计", "total_assets", 0), + ("Total Liabilities", "total_liabilities", 0), ("Total liabilities", "total_liabilities", 0), + ("负债合计", "total_liabilities", 0), + ("Total Equity", "total_equity", 0), ("Total shareholders’ equity", "total_equity", 0), ("Total shareholders' equity", "total_equity", 0), + ("所有者权益合计", "total_equity", 0), ("Total liabilities and shareholders’ equity", "total_liabilities_and_equity", 0), ("Total liabilities and shareholders' equity", "total_liabilities_and_equity", 
0), ], "cashflow": [ + ("Cash flow from operations", "operating_cf", 0), ("Cash generated by operating activities", "operating_cf", 0), ("Net cash provided by operating activities", "operating_cf", 0), + ("经营活动产生的现金流量净额", "operating_cf", 0), + ("Capital expenditures", "capex", 0), + ("购建固定资产、无形资产和其他长期资产支付的现金", "capex", 0), ("Cash generated by investing activities", "investing_cf", 0), ("Cash used in financing activities", "financing_cf", 0), ], diff --git a/src/finance/normalizer.py b/src/finance/normalizer.py index 0737c35..fde34b7 100644 --- a/src/finance/normalizer.py +++ b/src/finance/normalizer.py @@ -9,25 +9,36 @@ "cost of sales": "cost_of_goods_sold", "gross margin": "gross_profit", "gross profit": "gross_profit", + "operating profit": "operating_income", "operating income": "operating_income", "net income": "net_income", + "cash and equivalents": "cash_and_equivalents", + "cash and cash equivalents": "cash_and_equivalents", "total assets": "total_assets", "total liabilities": "total_liabilities", "total shareholders’ equity": "total_equity", "total shareholders' equity": "total_equity", "cash generated by operating activities": "operating_cf", "net cash provided by operating activities": "operating_cf", + "cash flow from operations": "operating_cf", + "net cash from operating activities": "operating_cf", + "capital expenditures": "capex", + "purchase of property and equipment": "capex", "营业收入": "revenue", "主营业务收入": "revenue", "收入": "revenue", "营业成本": "cost_of_goods_sold", "毛利": "gross_profit", + "毛利润": "gross_profit", + "营业利润": "operating_income", "净利润": "net_income", "归属于母公司所有者的净利润": "net_income", + "货币资金": "cash_and_equivalents", "资产总计": "total_assets", "负债合计": "total_liabilities", "所有者权益合计": "total_equity", "经营活动产生的现金流量净额": "operating_cf", + "购建固定资产、无形资产和其他长期资产支付的现金": "capex", } diff --git a/src/utils/metrics.py b/src/utils/metrics.py index 081c2ac..adbb8b9 100644 --- a/src/utils/metrics.py +++ b/src/utils/metrics.py @@ -6,6 +6,27 @@ from 
src.schemas.models import FinancialFact, FinancialStatement, KeyNote, RiskSignal +CONCEPT_ALIASES: dict[str, tuple[str, ...]] = { + "operating_cash_flow": ("operating_cash_flow", "operating_cf"), + "operating_cf": ("operating_cf", "operating_cash_flow"), +} + + +def concept_aliases(concept: str) -> tuple[str, ...]: + return CONCEPT_ALIASES.get(concept, (concept,)) + + +def _lookup_statement_value(statement: FinancialStatement, concept: str) -> float | None: + for alias in concept_aliases(concept): + actual_val = statement.totals.get(alias) + if actual_val is not None: + return actual_val + for item in statement.line_items: + if item.name_norm in concept_aliases(concept) and item.value_current is not None: + return item.value_current + return None + + def statement_accuracy( actual: FinancialStatement, expected_totals: dict[str, float], @@ -24,14 +45,7 @@ def statement_accuracy( missing: list[str] = [] for key, expected_val in expected_totals.items(): - actual_val = actual.totals.get(key) - if actual_val is None: - # Also check line_items by name_norm - for item in actual.line_items: - if item.name_norm == key and item.value_current is not None: - actual_val = item.value_current - break - + actual_val = _lookup_statement_value(actual, key) if actual_val is None: missing.append(key) continue @@ -143,8 +157,9 @@ def fact_value_accuracy( """ indexed: dict[str, FinancialFact] = {} for fact in actual_facts: - indexed.setdefault(fact.concept, fact) - indexed[f"{fact.statement_type}:{fact.concept}"] = fact + for alias in concept_aliases(fact.concept): + indexed.setdefault(alias, fact) + indexed[f"{fact.statement_type}:{alias}"] = fact matched: list[str] = [] mismatched: list[dict[str, Any]] = [] diff --git a/tests/golden/conftest.py b/tests/golden/conftest.py index 6613bf6..a52166d 100644 --- a/tests/golden/conftest.py +++ b/tests/golden/conftest.py @@ -71,6 +71,13 @@ def golden_cases() -> list[dict]: "income": {"revenue": 2_000_000, "net_income": 200_000}, "cashflow": {}, 
}, + "expected_facts": [ + {"statement_type": "balance", "concept": "cash_and_equivalents", "value": 500_000}, + {"statement_type": "income", "concept": "gross_profit", "value": 600_000}, + {"statement_type": "income", "concept": "operating_income", "value": 300_000}, + {"statement_type": "cashflow", "concept": "operating_cash_flow", "value": 250_000}, + {"statement_type": "cashflow", "concept": "capex", "value": -100_000}, + ], "expected_note_types": ["other"], "expected_signal_categories": [], }, @@ -115,6 +122,7 @@ def golden_cases() -> list[dict]: "text": ( "III. Consolidated Statement of Cash Flows\n" "Cash flow from operations 180,000 160,000\n" + "Capital expenditures (60,000) (50,000)\n" "Cash flow from investing (60,000) (50,000)\n" "Cash flow from financing (30,000) (20,000)\n" "Net change in cash 90,000 90,000\n" @@ -137,6 +145,13 @@ def golden_cases() -> list[dict]: "income": {"revenue": 1_200_000, "net_income": 150_000}, "cashflow": {}, }, + "expected_facts": [ + {"statement_type": "balance", "concept": "cash_and_equivalents", "value": 300_000}, + {"statement_type": "income", "concept": "gross_profit", "value": 400_000}, + {"statement_type": "income", "concept": "operating_income", "value": 200_000}, + {"statement_type": "cashflow", "concept": "operating_cash_flow", "value": 180_000}, + {"statement_type": "cashflow", "concept": "capex", "value": -60_000}, + ], "expected_note_types": ["accounting_policy", "related_party", "segment"], "expected_signal_categories": [], }, @@ -170,6 +185,9 @@ def golden_cases() -> list[dict]: "expected_statements": { "income": {"revenue": 500_000, "net_income": 80_000}, }, + "expected_facts": [ + {"statement_type": "income", "concept": "gross_profit", "value": 200_000}, + ], "expected_note_types": ["other"], "expected_signal_categories": [], }, @@ -219,6 +237,11 @@ def golden_cases() -> list[dict]: "income": {"revenue": 800_000, "net_income": 100_000}, "cashflow": {}, }, + "expected_facts": [ + {"statement_type": 
"balance", "concept": "cash_and_equivalents", "value": 100_000}, + {"statement_type": "income", "concept": "gross_profit", "value": 200_000}, + {"statement_type": "cashflow", "concept": "operating_cash_flow", "value": 120_000}, + ], "expected_note_types": ["other"], "expected_signal_categories": ["disclosure_inconsistency"], }, @@ -285,6 +308,10 @@ def golden_cases() -> list[dict]: "income": {"revenue": 600_000, "net_income": 120_000}, "cashflow": {}, }, + "expected_facts": [ + {"statement_type": "income", "concept": "gross_profit", "value": 200_000}, + {"statement_type": "cashflow", "concept": "operating_cash_flow", "value": -50_000}, + ], "expected_note_types": ["audit_opinion", "contingency", "related_party", "impairment"], "expected_signal_categories": ["audit_governance", "cash_vs_profit"], }, diff --git a/tests/test_benchmark_manifest.py b/tests/test_benchmark_manifest.py index 3619cd5..04819dc 100644 --- a/tests/test_benchmark_manifest.py +++ b/tests/test_benchmark_manifest.py @@ -11,4 +11,25 @@ def test_benchmark_manifest_schema_and_sample_are_valid_json() -> None: assert schema["title"] == "Jetbot Benchmark Manifest" assert sample["schema_version"] == schema["properties"]["schema_version"]["const"] assert sample["data_policy"]["raw_files_committed"] is False - assert sample["cases"][0]["expected_facts"][0]["concept"] == "revenue" \ No newline at end of file + concepts = {fact["concept"] for fact in sample["cases"][0]["expected_facts"]} + assert "revenue" in concepts + + +def test_local_private_manifests_cover_us_filings_and_stress_cases() -> None: + us_filings = json.loads(Path("benchmarks/local_manifests/us_filings_private_batch.json").read_text(encoding="utf-8")) + stress = json.loads(Path("benchmarks/local_manifests/stress_private_batch.json").read_text(encoding="utf-8")) + + assert us_filings["benchmark_id"] == "local-private-us-filings-v1" + assert {case["filing_type"] for case in us_filings["cases"]} == {"10-K", "10-Q"} + assert 
all(case["source"]["path"].startswith("../private/us_filings/") for case in us_filings["cases"]) + + assert stress["benchmark_id"] == "local-private-stress-v1" + assert len(stress["cases"]) >= 3 + assert all(case["source"]["path"].startswith("../private/stress/") for case in stress["cases"]) + + +def test_thresholds_include_key_fact_metrics() -> None: + thresholds = json.loads(Path("benchmarks/thresholds/golden_minimum.json").read_text(encoding="utf-8")) + + assert "fact_accuracy_revenue" in thresholds["min_metrics"] + assert "fact_accuracy_operating_cash_flow" in thresholds["min_metrics"] \ No newline at end of file diff --git a/tests/test_eval_script.py b/tests/test_eval_script.py index 28eb265..5a281a0 100644 --- a/tests/test_eval_script.py +++ b/tests/test_eval_script.py @@ -95,4 +95,60 @@ def test_main_returns_nonzero_when_thresholds_fail(tmp_path: Path, monkeypatch) assert exit_code == 2 report = json.loads((tmp_path / "out" / "eval_report.json").read_text(encoding="utf-8")) assert report["status"] == "failed" - assert report["thresholds"]["checks"][0]["status"] == "failed" \ No newline at end of file + assert report["thresholds"]["checks"][0]["status"] == "failed" + + +def test_main_runs_sample_benchmark_manifest_and_writes_document_metrics(tmp_path: Path) -> None: + output_dir = tmp_path / "out" + + exit_code = eval_script.main([ + "--benchmark-manifest", + "benchmarks/sample_manifest.json", + "--skip-pytest", + "--output-dir", + str(output_dir), + ]) + + assert exit_code == 0 + report = json.loads((output_dir / "eval_report.json").read_text(encoding="utf-8")) + markdown = (output_dir / "eval_report.md").read_text(encoding="utf-8") + + assert report["suite"] == "synthetic-smoke-v1" + assert report["dataset"]["benchmark_id"] == "synthetic-smoke-v1" + assert report["cases"][0]["metrics"]["fact_value_accuracy"] is not None + assert "fact_accuracy_revenue" in report["metrics"] + assert "## Dataset" in markdown + assert "synthetic-income-001" in markdown + + 
+def test_manifest_case_spec_supports_local_pdf_sources(tmp_path: Path) -> None: + pdf_path = tmp_path / "local-report.pdf" + pdf_path.write_bytes(b"%PDF-1.4\n%synthetic\n") + + case = eval_script._manifest_case_spec( + { + "case_id": "local-pdf-001", + "source": {"type": "pdf", "path": pdf_path.name}, + "expected_facts": [ + {"statement_type": "balance", "concept": "total_assets", "value": 10.0}, + ], + }, + tmp_path, + ) + + assert case["pdf_path"] == str(pdf_path) + assert case["pages"] is None + assert case["expected_totals"]["balance"]["total_assets"] == 10.0 + + +def test_critical_fact_metrics_alias_operating_cash_flow() -> None: + result = eval_script._critical_fact_metrics([ + { + "facts": [ + type("Fact", (), {"concept": "operating_cf", "statement_type": "cashflow", "value": 42.0})(), + ], + "expected_facts": {"cashflow:operating_cash_flow": 42.0}, + } + ]) + + assert result["fact_accuracy_operating_cash_flow"] == 1.0 \ No newline at end of file diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 5f82987..28f56ed 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -174,11 +174,16 @@ def test_only_signals(self) -> None: # ---- fact metrics ---- -def _make_fact(concept: str, value: float | None, refs: list[SourceRef] | None = None) -> FinancialFact: +def _make_fact( + concept: str, + value: float | None, + refs: list[SourceRef] | None = None, + statement_type: str = "income", +) -> FinancialFact: return FinancialFact( fact_id=f"fact-{concept}", doc_id="doc-1", - statement_type="income", + statement_type=statement_type, concept=concept, label=concept, value=value, @@ -204,6 +209,12 @@ def test_fact_value_accuracy_missing_value(self) -> None: assert result["accuracy"] == 0.0 assert result["missing"] == ["income:revenue"] + def test_fact_value_accuracy_supports_operating_cash_flow_alias(self) -> None: + facts = [_make_fact("operating_cf", 55.0, statement_type="cashflow")] + result = fact_value_accuracy(facts, 
{"cashflow:operating_cash_flow": 55.0}) + assert result["accuracy"] == 1.0 + assert result["matched"] == ["cashflow:operating_cash_flow"] + # ---- signal_category_recall ---- diff --git a/tests/test_normalizer_and_events.py b/tests/test_normalizer_and_events.py index f1a7895..40727c3 100644 --- a/tests/test_normalizer_and_events.py +++ b/tests/test_normalizer_and_events.py @@ -33,6 +33,12 @@ def test_chinese_total_equity(self): def test_chinese_operating_cf(self): assert normalize_account_name("经营活动产生的现金流量净额") == "operating_cf" + def test_cash_and_equivalents_alias(self): + assert normalize_account_name("Cash and cash equivalents") == "cash_and_equivalents" + + def test_capex_alias(self): + assert normalize_account_name("Capital expenditures") == "capex" + def test_unmapped_returns_original(self): assert normalize_account_name("未知科目") == "未知科目" From 309830b611ea398a15a0609ba3280db56d66bb32 Mon Sep 17 00:00:00 2001 From: orin Date: Fri, 22 May 2026 13:57:36 +0800 Subject: [PATCH 2/4] fix PR14 CI regressions --- .github/workflows/ci.yml | 1 - src/agent/nodes.py | 68 ++++++++++++++++++++++++++++------------ 2 files changed, 48 insertions(+), 21 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9c96f66..5dc2f33 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -82,7 +82,6 @@ jobs: docker: runs-on: ubuntu-latest needs: [lint-and-test, web-build] - if: github.event_name == 'push' && github.ref == 'refs/heads/main' steps: - uses: actions/checkout@v4 diff --git a/src/agent/nodes.py b/src/agent/nodes.py index d12156e..87812fa 100644 --- a/src/agent/nodes.py +++ b/src/agent/nodes.py @@ -758,21 +758,21 @@ def _tables_to_statement(kind: str, tables: list[Table]) -> FinancialStatement: def _statement_from_pages(state: AgentState, kind: str) -> FinancialStatement: labels: dict[str, list[tuple[str, str, int]]] = { "income": [ - ("Revenue", "revenue", 0), - ("Total net sales", "revenue", 0), - ("Net sales", "revenue", 0), - 
("营业收入", "revenue", 0), - ("Total cost of sales", "cost_of_goods_sold", 0), - ("Cost of goods sold", "cost_of_goods_sold", 0), - ("营业成本", "cost_of_goods_sold", 0), - ("Gross margin", "gross_profit", 0), - ("Gross Profit", "gross_profit", 0), - ("毛利润", "gross_profit", 0), - ("Operating profit", "operating_income", 0), - ("Operating income", "operating_income", 0), - ("营业利润", "operating_income", 0), - ("Net income", "net_income", 0), - ("净利润", "net_income", 0), + ("Revenue", "revenue", 2), + ("Total net sales", "revenue", 2), + ("Net sales", "revenue", 2), + ("营业收入", "revenue", 2), + ("Total cost of sales", "cost_of_goods_sold", 2), + ("Cost of goods sold", "cost_of_goods_sold", 2), + ("营业成本", "cost_of_goods_sold", 2), + ("Gross margin", "gross_profit", 2), + ("Gross Profit", "gross_profit", 2), + ("毛利润", "gross_profit", 2), + ("Operating profit", "operating_income", 2), + ("Operating income", "operating_income", 2), + ("营业利润", "operating_income", 2), + ("Net income", "net_income", 2), + ("净利润", "net_income", 2), ], "balance": [ ("Cash and equivalents", "cash_and_equivalents", 0), @@ -806,10 +806,11 @@ def _statement_from_pages(state: AgentState, kind: str) -> FinancialStatement: line_items: list[StatementLineItem] = [] totals: dict[str, float] = {} seen: set[str] = set() + known_labels = [label for label, _, _ in labels.get(kind, [])] for label, name_norm, preferred_index in labels.get(kind, []): if name_norm in seen: continue - match = _extract_labeled_metric(state.pages, label, preferred_index) + match = _extract_labeled_metric(state.pages, label, preferred_index, known_labels) if match is None: continue value, source_ref = match @@ -839,7 +840,12 @@ def _statement_from_pages(state: AgentState, kind: str) -> FinancialStatement: ) -def _extract_labeled_metric(pages: list[Page], label: str, preferred_index: int) -> tuple[float, SourceRef] | None: +def _extract_labeled_metric( + pages: list[Page], + label: str, + preferred_index: int, + known_labels: list[str], +) -> 
tuple[float, SourceRef] | None: label_lower = label.lower() for page in pages: normalized_text = " ".join(page.text.split()) @@ -847,14 +853,36 @@ def _extract_labeled_metric(pages: list[Page], label: str, preferred_index: int) if offset < 0: continue snippet = normalized_text[offset : offset + 260] - values = _parse_numbers_from_snippet(snippet) + segment = _metric_segment(snippet, label, known_labels) + values = _parse_numbers_from_snippet(segment) if not values: continue - value = values[min(preferred_index, len(values) - 1)] - return value, SourceRef(ref_type="page_text", page=page.page_number, table_id=None, quote=snippet, confidence=0.65) + value = _select_metric_value(values, preferred_index) + return value, SourceRef(ref_type="page_text", page=page.page_number, table_id=None, quote=segment, confidence=0.65) return None +def _metric_segment(snippet: str, label: str, known_labels: list[str]) -> str: + snippet_lower = snippet.lower() + search_start = len(label) + next_label_offsets = [ + position + for candidate in known_labels + if candidate.lower() != label.lower() + for position in [snippet_lower.find(candidate.lower(), search_start)] + if position >= 0 + ] + if not next_label_offsets: + return snippet + return snippet[: min(next_label_offsets)].rstrip() + + +def _select_metric_value(values: list[float], preferred_index: int) -> float: + if preferred_index <= 0 or len(values) <= 2: + return values[0] + return values[min(preferred_index, len(values) - 1)] + + def _parse_numbers_from_snippet(snippet: str) -> list[float]: text = re.sub(r"\(\d+\)", "", snippet) values: list[float] = [] From fe8911963eccfc91c7ebadfa48381e6b1d1852a4 Mon Sep 17 00:00:00 2001 From: orin Date: Fri, 22 May 2026 14:09:41 +0800 Subject: [PATCH 3/4] fix eval gate extraction --- src/agent/nodes.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/agent/nodes.py b/src/agent/nodes.py index 87812fa..5d83c15 100644 --- a/src/agent/nodes.py +++ b/src/agent/nodes.py 
@@ -806,11 +806,13 @@ def _statement_from_pages(state: AgentState, kind: str) -> FinancialStatement: line_items: list[StatementLineItem] = [] totals: dict[str, float] = {} seen: set[str] = set() - known_labels = [label for label, _, _ in labels.get(kind, [])] + segment_boundaries = [label for label, _, _ in labels.get(kind, [])] + if kind == "income": + segment_boundaries.extend(["Operating expenses", "营业费用"]) for label, name_norm, preferred_index in labels.get(kind, []): if name_norm in seen: continue - match = _extract_labeled_metric(state.pages, label, preferred_index, known_labels) + match = _extract_labeled_metric(state.pages, label, preferred_index, segment_boundaries) if match is None: continue value, source_ref = match From 5b47c78a0adddbecf0aa82b6fb8258f1265ef03a Mon Sep 17 00:00:00 2001 From: orin Date: Fri, 22 May 2026 14:20:27 +0800 Subject: [PATCH 4/4] fix markdown lint and ts config warnings --- README.md | 2 +- benchmarks/README.md | 2 +- docs/financial_fact_platform_roadmap.md | 4 ++-- web/package-lock.json | 8 ++++---- web/package.json | 2 +- web/tsconfig.json | 3 ++- 6 files changed, 11 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 7c515a1..1659cc3 100644 --- a/README.md +++ b/README.md @@ -202,4 +202,4 @@ MIT. See `LICENSE`. ## Not Financial Advice -Jetbot produces structured extraction and analytical signals. It does not provide investment advice or recommend trades. \ No newline at end of file +Jetbot produces structured extraction and analytical signals. It does not provide investment advice or recommend trades. diff --git a/benchmarks/README.md b/benchmarks/README.md index 3aa4ff0..406eb0a 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -39,4 +39,4 @@ python scripts/eval.py --benchmark-manifest benchmarks/local_manifests/stress_pr Real PDF benchmark manifests should point to local-only files through relative paths such as `raw/company-2025-10k.pdf`. Those raw files are intentionally ignored by git. 
The current eval runner supports committed `synthetic` fixtures and local `pdf` sources from a manifest. -The committed manifests under `benchmarks/local_manifests/` are safe to commit because they only store anonymized metadata and expected facts; the raw PDFs still live under ignored paths such as `benchmarks/private/us_filings/` and `benchmarks/private/stress/`. \ No newline at end of file +The committed manifests under `benchmarks/local_manifests/` are safe to commit because they only store anonymized metadata and expected facts; the raw PDFs still live under ignored paths such as `benchmarks/private/us_filings/` and `benchmarks/private/stress/`. diff --git a/docs/financial_fact_platform_roadmap.md b/docs/financial_fact_platform_roadmap.md index 263a301..772a6ab 100644 --- a/docs/financial_fact_platform_roadmap.md +++ b/docs/financial_fact_platform_roadmap.md @@ -21,7 +21,7 @@ Jetbot 当前已经具备较完整的财报 PDF Agent MVP 能力:PDF 上传、 首个商业化/试点方向建议锁定为: -**美股 10-K / 10-Q Filing-to-Model Copilot** +#### 美股 10-K / 10-Q Filing-to-Model Copilot 目标用户: @@ -585,4 +585,4 @@ docker compose up --build 8. 开始 table router protocol。 9. 
再接 SEC/XBRL/HTML ingestion。 -这一路线的判断标准很简单:每增加一个能力,都必须让 facts 更准确、证据更可审计、复核更省时间、输出更能进入真实 analyst workflow。 \ No newline at end of file +这一路线的判断标准很简单:每增加一个能力,都必须让 facts 更准确、证据更可审计、复核更省时间、输出更能进入真实 analyst workflow。 diff --git a/web/package-lock.json b/web/package-lock.json index 92da290..942c342 100644 --- a/web/package-lock.json +++ b/web/package-lock.json @@ -31,7 +31,7 @@ "eslint-plugin-vue": "^9.29.1", "jsdom": "^29.1.1", "prettier": "^3.3.3", - "typescript": "~5.6.3", + "typescript": "~6.0.3", "unplugin-vue-components": "^0.28.0", "vite": "^5.4.10", "vitest": "^2.1.9", @@ -5421,9 +5421,9 @@ } }, "node_modules/typescript": { - "version": "5.6.3", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.6.3.tgz", - "integrity": "sha512-hjcS1mhfuyi4WW8IWtjP7brDrG2cuDZukyrYrSauoXGNgx0S7zceP07adYkJycEr56BOUTNPzbInooiN3fn1qw==", + "version": "6.0.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-6.0.3.tgz", + "integrity": "sha512-y2TvuxSZPDyQakkFRPZHKFm+KKVqIisdg9/CZwm9ftvKXLP8NRWj38/ODjNbr43SsoXqNuAisEf1GdCxqWcdBw==", "devOptional": true, "license": "Apache-2.0", "bin": { diff --git a/web/package.json b/web/package.json index d15b3fb..98d6579 100644 --- a/web/package.json +++ b/web/package.json @@ -37,7 +37,7 @@ "eslint-plugin-vue": "^9.29.1", "jsdom": "^29.1.1", "prettier": "^3.3.3", - "typescript": "~5.6.3", + "typescript": "~6.0.3", "unplugin-vue-components": "^0.28.0", "vite": "^5.4.10", "vitest": "^2.1.9", diff --git a/web/tsconfig.json b/web/tsconfig.json index 216fbf7..d4fd133 100644 --- a/web/tsconfig.json +++ b/web/tsconfig.json @@ -11,10 +11,11 @@ "skipLibCheck": true, "resolveJsonModule": true, "allowSyntheticDefaultImports": true, + "ignoreDeprecations": "6.0", "lib": ["ES2022", "DOM", "DOM.Iterable"], "types": ["node", "vite/client"], "baseUrl": ".", - "paths": { "@/*": ["src/*"] }, + "paths": { "@/*": ["./src/*"] }, "noEmit": true }, "include": ["src/**/*.ts", "src/**/*.d.ts", "src/**/*.vue", "vite.config.ts"]