From e64b38fd2d300e6545414561b60b6eadc7b0da0b Mon Sep 17 00:00:00 2001 From: orin Date: Fri, 22 May 2026 14:42:42 +0800 Subject: [PATCH] implement phase 2 fact validation surfaces --- src/agent/nodes.py | 14 +- src/agent/state.py | 2 + src/api/routes.py | 16 ++ src/cli.py | 6 + src/finance/facts.py | 33 ++-- src/finance/validators.py | 257 +++++++++++++++++++++++++++++- src/schemas/models.py | 28 ++++ tests/test_finance_facts.py | 35 +++- tests/test_routes_web.py | 38 +++++ tests/test_validators_extended.py | 102 +++++++++++- web/src/api/docs.ts | 4 + web/src/api/types.ts | 24 +++ 12 files changed, 542 insertions(+), 17 deletions(-) diff --git a/src/agent/nodes.py b/src/agent/nodes.py index 5d83c15..314f9f4 100644 --- a/src/agent/nodes.py +++ b/src/agent/nodes.py @@ -15,7 +15,7 @@ from src.finance.normalizer import normalize_account_name from src.finance.signals import generate_signals from src.finance.utils import table_rows -from src.finance.validators import validate_statements +from src.finance.validators import validate_facts, validate_statements from src.llm.base import StructuredPromptRequest, get_default_llm_client, get_llm_client, get_llm_model_config, langsmith_metadata from src.pdf.extractor import PDFExtractor from src.pdf.tables import extract_tables as extract_tables_from_pdf @@ -259,11 +259,15 @@ def extract_financial_statements(state: AgentState) -> AgentState: def validate_and_reconcile(state: AgentState) -> AgentState: start_ms = monotonic_ms() + if not state.facts: + state.facts = facts_from_statements(state.doc_meta, state.statements) state.validation_results = validate_statements(state.statements) + state.fact_validation_results = validate_facts(state.doc_meta, state.facts, state.statements) severe = any( issue in {"balance_equation_failed", "balance_missing_totals"} or issue.startswith("unit_mismatch") for issue in state.validation_results.get("issues", []) ) + severe = severe or any(issue.severity == "high" for issue in (state.fact_validation_results.issues if state.fact_validation_results else [])) # Remove any previous validation_failed entries before deciding, ensuring idempotency state.errors = [err for err in state.errors if err != "validation_failed"] if severe: @@ -559,8 +563,14 @@ def finalize(state: AgentState) -> AgentState: store.save_json(state.doc_meta.doc_id, "extracted/tables.json", [t.model_dump() for t in state.tables]) store.save_json(state.doc_meta.doc_id, "extracted/statements.json", {k: v.model_dump() for k, v in state.statements.items()}) if not state.facts: - state.facts = facts_from_statements(state.doc_meta.doc_id, state.statements) + state.facts = facts_from_statements(state.doc_meta, state.statements) store.save_json(state.doc_meta.doc_id, "extracted/facts.json", [fact.model_dump(mode="json") for fact in state.facts]) + if state.fact_validation_results: + store.save_json( + state.doc_meta.doc_id, + "extracted/fact_validation.json", + state.fact_validation_results.model_dump(mode="json"), + ) if state.corrections: store.save_json( state.doc_meta.doc_id, diff --git a/src/agent/state.py b/src/agent/state.py index a0de973..93c9edf 100644 --- a/src/agent/state.py +++ b/src/agent/state.py @@ -13,6 +13,7 @@ DocumentMeta, EventStudyResult, ExtractionTrace, + FactValidationResult, FinancialFact, FinancialStatement, KeyNote, @@ -33,6 +34,7 @@ class AgentState(BaseModel): statements: dict[str, FinancialStatement] = Field(default_factory=dict) notes: list[KeyNote] = Field(default_factory=list) validation_results: dict[str, Any] = Field(default_factory=dict) + fact_validation_results: FactValidationResult | None = None risk_signals: list[RiskSignal] = Field(default_factory=list) facts: list[FinancialFact] = Field(default_factory=list) corrections: list[Correction] = Field(default_factory=list) diff --git a/src/api/routes.py b/src/api/routes.py index a3afaf4..72d85d4 100644 --- a/src/api/routes.py +++ b/src/api/routes.py @@ -140,6 +140,9 @@ async def create_document( background_tasks: BackgroundTasks, file: UploadFile = File(...), company: str | None = Form(default=None), + ticker: str | None = Form(default=None), + cik: str | None = Form(default=None), + filing_type: str | None = Form(default=None), period_end: str | None = Form(default=None), report_type: str | None = Form(default=None), language: str | None = Form(default=None), @@ -174,6 +177,9 @@ async def create_document( doc_id=doc_id, filename=safe_filename, company=company, + ticker=ticker, + cik=cik, + filing_type=filing_type, period_end=parsed_period, report_type=report_type, language=language, @@ -274,6 +280,14 @@ async def get_facts(_auth: _AuthDep, doc_id: str): return _ok(data) +@router.get("/documents/{doc_id}/fact-validation") +async def get_fact_validation(_auth: _AuthDep, doc_id: str): + data = store.load_json(doc_id, "extracted/fact_validation.json") + if data is None: + return _err("not_found", "Fact validation not found") + return _ok(data) + + @router.get("/documents/{doc_id}/notes") async def get_notes(_auth: _AuthDep, doc_id: str): data = store.load_json(doc_id, "extracted/notes.json") @@ -532,6 +546,8 @@ def _save_partial_results(doc_id: str) -> None: s.save_json(doc_id, "extracted/statements.json", {k: v.model_dump() for k, v in partial.statements.items()}) if partial.facts: s.save_json(doc_id, "extracted/facts.json", [fact.model_dump(mode="json") for fact in partial.facts]) + if partial.fact_validation_results: + s.save_json(doc_id, "extracted/fact_validation.json", partial.fact_validation_results.model_dump(mode="json")) if partial.notes: s.save_json(doc_id, "extracted/notes.json", [n.model_dump() for n in partial.notes]) if partial.risk_signals: diff --git a/src/cli.py b/src/cli.py index 0f94459..a9bf62d 100644 --- a/src/cli.py +++ b/src/cli.py @@ -22,6 +22,9 @@ def analyze( pdf: str = typer.Option(..., help="Path to PDF file"), out: str = typer.Option("data", help="Output directory"), company: str | None = typer.Option(None, help="Company name"), + ticker: str | None = typer.Option(None, help="Ticker symbol"), + cik: str | None = typer.Option(None, help="CIK identifier"), + filing_type: str | None = typer.Option(None, help="Filing type (for example 10-K or 10-Q)"), period_end: str | None = typer.Option(None, help="Report period end (YYYY-MM-DD)"), report_type: str | None = typer.Option(None, help="Report type"), language: str | None = typer.Option(None, help="Language"), @@ -38,6 +41,9 @@ def analyze( doc_id=doc_id, filename=pdf_path.name, company=company, + ticker=ticker, + cik=cik, + filing_type=filing_type, period_end=parsed_period, report_type=report_type, language=language, diff --git a/src/finance/facts.py b/src/finance/facts.py index 994a7c2..e9ba7e3 100644 --- a/src/finance/facts.py +++ b/src/finance/facts.py @@ -4,10 +4,11 @@ from collections.abc import Iterable from typing import Any -from src.schemas.models import Correction, FinancialFact, FinancialStatement, SourceRef, StatementLineItem +from src.schemas.models import Correction, DocumentMeta, FinancialFact, FinancialStatement, SourceRef, StatementLineItem -def facts_from_statements(doc_id: str, statements: dict[str, FinancialStatement]) -> list[FinancialFact]: +def facts_from_statements(doc_meta: str | DocumentMeta, statements: dict[str, FinancialStatement]) -> list[FinancialFact]: + meta = _coerce_doc_meta(doc_meta) facts: list[FinancialFact] = [] seen: set[tuple[str, str]] = set() @@ -17,7 +18,7 @@ def facts_from_statements(doc_id: str, statements: dict[str, FinancialStatement] for item in statement.line_items: if item.value_current is None and not item.source_refs: continue - fact = _fact_from_line_item(doc_id, statement, item) + fact = _fact_from_line_item(meta, statement, item) facts.append(fact) seen.add((statement.statement_type, item.name_norm)) @@ -27,10 +28,11 @@ def facts_from_statements(doc_id: str, statements: dict[str, FinancialStatement] continue facts.append( _build_fact( - doc_id=doc_id, + doc_meta=meta, statement=statement, concept=concept, label=concept, + raw_label=concept, value=value, unit=None, currency=None, @@ -55,16 +57,17 @@ def apply_corrections(facts: Iterable[FinancialFact], corrections: Iterable[Corr def _fact_from_line_item( - doc_id: str, + doc_meta: DocumentMeta, statement: FinancialStatement, item: StatementLineItem, ) -> FinancialFact: confidence = _evidence_confidence(item.source_refs, statement.extraction_confidence) return _build_fact( - doc_id=doc_id, + doc_meta=doc_meta, statement=statement, concept=item.name_norm, label=item.name_raw, + raw_label=item.name_raw, value=item.value_current, unit=item.unit, currency=item.currency, @@ -76,10 +79,11 @@ def _fact_from_line_item( def _build_fact( *, - doc_id: str, + doc_meta: DocumentMeta, statement: FinancialStatement, concept: str, label: str, + raw_label: str, value: float | None, unit: str | None, currency: str | None, @@ -88,11 +92,16 @@ def _build_fact( metadata: dict[str, Any], ) -> FinancialFact: return FinancialFact( - fact_id=_fact_id(doc_id, statement.statement_type, concept, statement.period_end, label), - doc_id=doc_id, + fact_id=_fact_id(doc_meta.doc_id, statement.statement_type, concept, statement.period_end, raw_label), + doc_id=doc_meta.doc_id, + company=doc_meta.company, + ticker=doc_meta.ticker, + cik=doc_meta.cik, + filing_type=doc_meta.filing_type or doc_meta.report_type, statement_type=statement.statement_type, concept=concept, label=label, + raw_label=raw_label, value=value, unit=unit, scale=_infer_scale(unit), @@ -107,6 +116,12 @@ def _build_fact( ) +def _coerce_doc_meta(doc_meta: str | DocumentMeta) -> DocumentMeta: + if isinstance(doc_meta, DocumentMeta): + return doc_meta + return DocumentMeta(doc_id=doc_meta, filename=f"{doc_meta}.pdf") + + def _fact_id(doc_id: str, statement_type: str, concept: str, period_end: object, label: str) -> str: raw = "|".join([doc_id, statement_type, concept, str(period_end or ""), label]) return "fact_" + hashlib.sha1(raw.encode("utf-8")).hexdigest()[:16] diff --git a/src/finance/validators.py b/src/finance/validators.py index 8794835..3e7a7ec 100644 --- a/src/finance/validators.py +++ b/src/finance/validators.py @@ -1,9 +1,38 @@ from __future__ import annotations +from collections import defaultdict from typing import Any from src.finance.utils import find_line_item, find_total -from src.schemas.models import FinancialStatement +from src.schemas.models import DocumentMeta, FactValidationIssue, FactValidationResult, FinancialFact, FinancialStatement + + +_CRITICAL_FACTS: dict[str, tuple[str, ...]] = { + "income": ("revenue", "gross_profit", "net_income"), + "balance": ("total_assets", "total_liabilities", "total_equity"), + "cashflow": ("operating_cash_flow", "capex"), +} + +_CONCEPT_CANONICAL = { + "operating_cf": "operating_cash_flow", + "operating_cash_flow": "operating_cash_flow", +} + + +def _canonical_concept(concept: str) -> str: + return _CONCEPT_CANONICAL.get(concept, concept) + + +def _fact_value(facts: list[FinancialFact], statement_type: str, concept: str) -> tuple[float | None, list[FinancialFact]]: + matched = [ + fact + for fact in facts + if fact.statement_type == statement_type and _canonical_concept(fact.concept) == _canonical_concept(concept) + ] + for fact in matched: + if fact.value is not None: + return fact.value, matched + return None, matched def _unit_consistency(statement: FinancialStatement) -> list[str]: @@ -86,3 +115,229 @@ def validate_statements(statements: dict[str, FinancialStatement], tolerance: fl rev_growth = (rev_item.value_current - rev_item.value_prior) / max(abs(rev_item.value_prior), 1.0) results["metrics"]["ar_growth_vs_rev_growth"] = ar_growth - rev_growth return results + + +def validate_facts( + doc_meta: DocumentMeta, + facts: list[FinancialFact], + statements: dict[str, FinancialStatement] | None = None, + tolerance: float = 0.02, +) -> FactValidationResult: + result = FactValidationResult() + statements = statements or {} + result.metrics["fact_count"] = len(facts) + result.metrics["facts_with_source_refs"] = sum(1 for fact in facts if fact.source_refs) + result.checks["facts_present"] = bool(facts) + + if not facts: + result.issues.append( + FactValidationIssue( + code="facts_missing", + severity="high", + message=f"No canonical facts were produced for document {doc_meta.doc_id}.", + ) + ) + return result + + available_statement_types = { + statement_type + for statement_type in set(statements) | {fact.statement_type for fact in facts} + if statement_type in _CRITICAL_FACTS + } + + missing_critical = 0 + for statement_type in sorted(available_statement_types): + expected = _CRITICAL_FACTS[statement_type] + missing = [ + concept + for concept in expected + if not any( + _canonical_concept(fact.concept) == _canonical_concept(concept) + for fact in facts + if fact.statement_type == statement_type + ) + ] + if missing: + missing_critical += len(missing) + result.issues.append( + FactValidationIssue( + code="missing_critical_facts", + severity="high", + message=f"Missing critical facts for {statement_type}: {', '.join(missing)}.", + concepts=missing, + statement_type=statement_type, + ) + ) + result.metrics["missing_critical_fact_count"] = missing_critical + result.checks["missing_critical_facts"] = missing_critical == 0 + + duplicates: dict[tuple[str, str, object, object], list[FinancialFact]] = defaultdict(list) + for fact in facts: + duplicates[(fact.statement_type, _canonical_concept(fact.concept), fact.period_start, fact.period_end)].append(fact) + duplicate_groups = [group for group in duplicates.values() if len(group) > 1] + result.metrics["duplicate_fact_groups"] = len(duplicate_groups) + result.checks["duplicate_concepts"] = len(duplicate_groups) == 0 + for group in duplicate_groups: + result.issues.append( + FactValidationIssue( + code="duplicate_concepts", + severity="medium", + message=( + f"Duplicate canonical facts detected for {group[0].statement_type}:{_canonical_concept(group[0].concept)}." + ), + fact_ids=[fact.fact_id for fact in group], + concepts=[group[0].concept], + statement_type=group[0].statement_type, + ) + ) + + expected_period_type = {"balance": "instant", "income": "duration", "cashflow": "duration"} + period_issue_count = 0 + for statement_type in sorted(available_statement_types): + facts_for_type = [fact for fact in facts if fact.statement_type == statement_type] + non_null_period_ends = {fact.period_end for fact in facts_for_type if fact.period_end is not None} + if len(non_null_period_ends) > 1: + period_issue_count += 1 + result.issues.append( + FactValidationIssue( + code="period_inconsistency", + severity="high", + message=f"Facts for {statement_type} use multiple period_end values.", + fact_ids=[fact.fact_id for fact in facts_for_type], + statement_type=statement_type, + ) + ) + + wrong_period_type = [ + fact + for fact in facts_for_type + if fact.period_type != "unknown" and fact.period_type != expected_period_type.get(statement_type) + ] + if wrong_period_type: + period_issue_count += 1 + result.issues.append( + FactValidationIssue( + code="period_type_inconsistency", + severity="high", + message=f"Facts for {statement_type} use an unexpected period_type.", + fact_ids=[fact.fact_id for fact in wrong_period_type], + statement_type=statement_type, + ) + ) + result.metrics["period_issue_count"] = period_issue_count + result.checks["period_consistency"] = period_issue_count == 0 + + currency_scale_issue_count = 0 + for statement_type in sorted(available_statement_types): + facts_for_type = [fact for fact in facts if fact.statement_type == statement_type] + currencies = {fact.currency for fact in facts_for_type if fact.currency} + scales = {fact.scale for fact in facts_for_type if fact.scale is not None} + if len(currencies) > 1: + currency_scale_issue_count += 1 + result.issues.append( + FactValidationIssue( + code="currency_inconsistency", + severity="medium", + message=f"Facts for {statement_type} use multiple currencies: {', '.join(sorted(currencies))}.", + fact_ids=[fact.fact_id for fact in facts_for_type if fact.currency], + statement_type=statement_type, + ) + ) + if len(scales) > 1: + currency_scale_issue_count += 1 + result.issues.append( + FactValidationIssue( + code="scale_inconsistency", + severity="medium", + message=f"Facts for {statement_type} use multiple scales.", + fact_ids=[fact.fact_id for fact in facts_for_type if fact.scale is not None], + statement_type=statement_type, + ) + ) + result.metrics["currency_scale_issue_count"] = currency_scale_issue_count + result.checks["scale_currency_consistency"] = currency_scale_issue_count == 0 + + missing_ref_count = 0 + for statement_type, concepts in _CRITICAL_FACTS.items(): + for concept in concepts: + _, matched = _fact_value(facts, statement_type, concept) + if matched and not any(fact.source_refs for fact in matched): + missing_ref_count += 1 + result.issues.append( + FactValidationIssue( + code="missing_source_refs", + severity="high", + message=f"Critical fact {statement_type}:{concept} is missing source references.", + fact_ids=[fact.fact_id for fact in matched], + concepts=[concept], + statement_type=statement_type, + ) + ) + result.metrics["critical_facts_missing_source_refs"] = missing_ref_count + result.checks["critical_facts_have_source_refs"] = missing_ref_count == 0 + + if "balance" in available_statement_types: + total_assets, asset_facts = _fact_value(facts, "balance", "total_assets") + total_liabilities, liability_facts = _fact_value(facts, "balance", "total_liabilities") + total_equity, equity_facts = _fact_value(facts, "balance", "total_equity") + if total_assets is None or total_liabilities is None or total_equity is None: + result.checks["balance_equation"] = False + result.issues.append( + FactValidationIssue( + code="balance_equation_missing_facts", + severity="high", + message="Balance equation validation is missing one or more required facts.", + fact_ids=[fact.fact_id for fact in asset_facts + liability_facts + equity_facts], + statement_type="balance", + ) + ) + else: + diff_ratio = abs(total_assets - (total_liabilities + total_equity)) / max(abs(total_assets), 1.0) + result.metrics["balance_equation_diff_ratio"] = diff_ratio + result.checks["balance_equation"] = diff_ratio < tolerance + if diff_ratio >= tolerance: + result.issues.append( + FactValidationIssue( + code="balance_equation_failed", + severity="high", + message="Balance facts do not satisfy assets = liabilities + equity.", + fact_ids=[fact.fact_id for fact in asset_facts + liability_facts + equity_facts], + concepts=["total_assets", "total_liabilities", "total_equity"], + statement_type="balance", + metadata={"diff_ratio": diff_ratio}, + ) + ) + + if "cashflow" in available_statement_types: + operating_cash_flow, operating_facts = _fact_value(facts, "cashflow", "operating_cash_flow") + capex, capex_facts = _fact_value(facts, "cashflow", "capex") + if operating_cash_flow is None or capex is None: + result.checks["cashflow_reconciliation"] = False + result.issues.append( + FactValidationIssue( + code="cashflow_reconciliation_missing_facts", + severity="medium", + message="Cashflow reconciliation is missing operating cash flow or capex.", + fact_ids=[fact.fact_id for fact in operating_facts + capex_facts], + concepts=["operating_cash_flow", "capex"], + statement_type="cashflow", + ) + ) + else: + free_cash_flow = operating_cash_flow + capex + result.metrics["free_cash_flow"] = free_cash_flow + result.checks["cashflow_reconciliation"] = capex <= 0 + if capex > 0: + result.issues.append( + FactValidationIssue( + code="cashflow_reconciliation_failed", + severity="medium", + message="Capex is positive; expected a cash outflow value for reconciliation.", + fact_ids=[fact.fact_id for fact in operating_facts + capex_facts], + concepts=["operating_cash_flow", "capex"], + statement_type="cashflow", + metadata={"free_cash_flow": free_cash_flow}, + ) + ) + + return result diff --git a/src/schemas/models.py b/src/schemas/models.py index db06576..77b6a58 100644 --- a/src/schemas/models.py +++ b/src/schemas/models.py @@ -38,6 +38,9 @@ class DocumentMeta(BaseModel): doc_id: str filename: str company: str | None = None + ticker: str | None = None + cik: str | None = None + filing_type: str | None = None period_end: date | None = None report_type: str | None = None language: str | None = None @@ -120,9 +123,14 @@ class FinancialFact(BaseModel): fact_id: str doc_id: str + company: str | None = None + ticker: str | None = None + cik: str | None = None + filing_type: str | None = None statement_type: Literal["income", "balance", "cashflow", "note", "other"] concept: str label: str + raw_label: str | None = None value: float | None = None unit: str | None = None scale: float | None = None @@ -167,6 +175,26 @@ class Correction(BaseModel): created_at: datetime = Field(default_factory=_utc_now) +class FactValidationIssue(BaseModel): + model_config = ConfigDict(extra="forbid") + + code: str + severity: Literal["low", "medium", "high"] = "medium" + message: str + fact_ids: list[str] = Field(default_factory=list) + concepts: list[str] = Field(default_factory=list) + statement_type: str | None = None + metadata: dict[str, Any] = Field(default_factory=dict) + + +class FactValidationResult(BaseModel): + model_config = ConfigDict(extra="forbid") + + issues: list[FactValidationIssue] = Field(default_factory=list) + checks: dict[str, bool | float | int | str] = Field(default_factory=dict) + metrics: dict[str, float | int | str] = Field(default_factory=dict) + + class KeyNote(BaseModel): model_config = ConfigDict(extra="forbid") diff --git a/tests/test_finance_facts.py b/tests/test_finance_facts.py index 3d914c1..041d289 100644 --- a/tests/test_finance_facts.py +++ b/tests/test_finance_facts.py @@ -3,7 +3,7 @@ from datetime import date from src.finance.facts import apply_corrections, facts_from_statements -from src.schemas.models import Correction, FinancialStatement, SourceRef, StatementLineItem +from src.schemas.models import Correction, DocumentMeta, FinancialStatement, SourceRef, StatementLineItem def test_facts_from_statements_preserves_line_item_evidence() -> None: @@ -35,13 +35,28 @@ def test_facts_from_statements_preserves_line_item_evidence() -> None: extraction_confidence=0.7, ) - facts = facts_from_statements("doc-1", {"income": statement}) + facts = facts_from_statements( + DocumentMeta( + doc_id="doc-1", + filename="demo.pdf", + company="ACME Corp", + ticker="ACME", + cik="00001234", + filing_type="10-K", + ), + {"income": statement}, + ) assert len(facts) == 1 fact = facts[0] assert fact.doc_id == "doc-1" + assert fact.company == "ACME Corp" + assert fact.ticker == "ACME" + assert fact.cik == "00001234" + assert fact.filing_type == "10-K" assert fact.statement_type == "income" assert fact.concept == "revenue" + assert fact.raw_label == "Revenue" assert fact.value == 100.0 assert fact.period_type == "duration" assert fact.scale == 1_000_000.0 @@ -85,4 +100,18 @@ def test_apply_corrections_updates_allowed_fact_field() -> None: corrected = apply_corrections([fact], [correction]) assert corrected[0].value == 125.0 - assert fact.value == 100.0 \ No newline at end of file + assert fact.value == 100.0 + + +def test_facts_from_statements_uses_report_type_as_filing_type_fallback() -> None: + statement = FinancialStatement( + statement_type="balance", + line_items=[StatementLineItem(name_raw="Total assets", name_norm="total_assets", value_current=500.0)], + ) + + facts = facts_from_statements( + DocumentMeta(doc_id="doc-2", filename="demo.pdf", report_type="annual-report"), + {"balance": statement}, + ) + + assert facts[0].filing_type == "annual-report" \ No newline at end of file diff --git a/tests/test_routes_web.py b/tests/test_routes_web.py index 083e5d8..3d1bdb5 100644 --- a/tests/test_routes_web.py +++ b/tests/test_routes_web.py @@ -255,12 +255,50 @@ def test_facts_endpoint(client: TestClient, tmp_path: Path) -> None: assert body["data"][0]["concept"] == "revenue" +def test_fact_validation_endpoint(client: TestClient, tmp_path: Path) -> None: + _make_doc(tmp_path / "data", "abc123") + validation_path = tmp_path / "data" / "abc123" / "extracted" / "fact_validation.json" + validation_path.write_text( + json.dumps( + { + "issues": [ + { + "code": "missing_critical_facts", + "severity": "high", + "message": "Missing critical facts for income: gross_profit.", + "fact_ids": [], + "concepts": ["gross_profit"], + "statement_type": "income", + "metadata": {}, + } + ], + "checks": {"missing_critical_facts": False}, + "metrics": {"missing_critical_fact_count": 1}, + } + ), + encoding="utf-8", + ) + + r = client.get("/v1/documents/abc123/fact-validation") + + assert r.status_code == 200 + body = r.json() + assert body["ok"] is True + assert body["data"]["issues"][0]["code"] == "missing_critical_facts" + + def test_facts_missing_returns_404(client: TestClient, tmp_path: Path) -> None: _make_doc(tmp_path / "data", "abc123") r = client.get("/v1/documents/abc123/facts") assert r.status_code == 404 +def test_fact_validation_missing_returns_404(client: TestClient, tmp_path: Path) -> None: + _make_doc(tmp_path / "data", "abc123") + r = client.get("/v1/documents/abc123/fact-validation") + assert r.status_code == 404 + + def test_tables_missing_returns_404(client: TestClient, tmp_path: Path) -> None: _make_doc(tmp_path / "data", "abc123", with_tables=False) r = client.get("/v1/documents/abc123/tables") diff --git a/tests/test_validators_extended.py b/tests/test_validators_extended.py index 33b3a57..192cfa8 100644 --- a/tests/test_validators_extended.py +++ b/tests/test_validators_extended.py @@ -1,8 +1,8 @@ """Tests for validators — balance equation failure, unit mismatch, gross profit check.""" from __future__ import annotations -from src.finance.validators import validate_statements -from src.schemas.models import FinancialStatement, StatementLineItem +from src.finance.validators import validate_facts, validate_statements +from src.schemas.models import DocumentMeta, FinancialFact, FinancialStatement, SourceRef, StatementLineItem def test_balance_equation_failure(): @@ -80,3 +80,101 @@ def test_profit_to_cfo_ratio_negative_income(): ratio = results["metrics"].get("profit_to_cfo_ratio") assert ratio is not None assert abs(ratio - 10.0 / 20.0) < 1e-9 + + +def test_validate_facts_flags_missing_critical_fact_and_missing_source_ref(): + facts = [ + FinancialFact( + fact_id="fact-1", + doc_id="doc-1", + statement_type="income", + concept="revenue", + label="Revenue", + raw_label="Revenue", + value=100.0, + period_type="duration", + confidence=0.8, + source_refs=[SourceRef(ref_type="page_text", page=1, confidence=0.8)], + ), + FinancialFact( + fact_id="fact-2", + doc_id="doc-1", + statement_type="income", + concept="net_income", + label="Net income", + raw_label="Net income", + value=20.0, + period_type="duration", + confidence=0.8, + source_refs=[], + ), + ] + + result = validate_facts(DocumentMeta(doc_id="doc-1", filename="demo.pdf"), facts, {"income": FinancialStatement(statement_type="income")}) + + assert result.checks["missing_critical_facts"] is False + assert result.checks["critical_facts_have_source_refs"] is False + assert any(issue.code == "missing_critical_facts" for issue in result.issues) + assert any(issue.code == "missing_source_refs" for issue in result.issues) + + +def test_validate_facts_flags_duplicates_and_balance_equation_failure(): + facts = [ + FinancialFact( + fact_id="fact-assets-a", + doc_id="doc-1", + statement_type="balance", + concept="total_assets", + label="Total assets", + raw_label="Total assets", + value=100.0, + period_type="instant", + period_end=None, + confidence=0.8, + source_refs=[SourceRef(ref_type="page_text", page=1, confidence=0.8)], + ), + FinancialFact( + fact_id="fact-assets-b", + doc_id="doc-1", + statement_type="balance", + concept="total_assets", + label="Assets", + raw_label="Assets", + value=100.0, + period_type="instant", + period_end=None, + confidence=0.8, + source_refs=[SourceRef(ref_type="page_text", page=1, confidence=0.8)], + ), + FinancialFact( + fact_id="fact-liabilities", + doc_id="doc-1", + statement_type="balance", + concept="total_liabilities", + label="Total liabilities", + raw_label="Total liabilities", + value=70.0, + period_type="instant", + confidence=0.8, + source_refs=[SourceRef(ref_type="page_text", page=1, confidence=0.8)], + ), + FinancialFact( + fact_id="fact-equity", + doc_id="doc-1", + statement_type="balance", + concept="total_equity", + label="Total equity", + raw_label="Total equity", + value=20.0, + period_type="instant", + confidence=0.8, + source_refs=[SourceRef(ref_type="page_text", page=1, confidence=0.8)], + ), + ] + + result = validate_facts(DocumentMeta(doc_id="doc-1", filename="demo.pdf"), facts, {"balance": FinancialStatement(statement_type="balance")}) + + assert result.checks["duplicate_concepts"] is False + assert result.checks["balance_equation"] is False + assert any(issue.code == "duplicate_concepts" for issue in result.issues) + assert any(issue.code == "balance_equation_failed" for issue in result.issues) diff --git a/web/src/api/docs.ts b/web/src/api/docs.ts index c158b59..e4ce40a 100644 --- a/web/src/api/docs.ts +++ b/web/src/api/docs.ts @@ -9,6 +9,7 @@ import type { ExtractedTable, FinancialFact, FinancialStatements, + FactValidationResult, KeyNote, MetricItem, PdfOperationResult, @@ -185,6 +186,9 @@ export const docsApi = { facts(docId: string) { return unwrap(http.get(`/v1/documents/${docId}/facts`)) }, + factValidation(docId: string) { + return unwrap(http.get(`/v1/documents/${docId}/fact-validation`)) + }, riskSignals(docId: string) { return unwrap(http.get(`/v1/documents/${docId}/risk-signals`)).then(normalizeSignals) }, diff --git a/web/src/api/types.ts b/web/src/api/types.ts index 02235a4..d1f30ca 100644 --- a/web/src/api/types.ts +++ b/web/src/api/types.ts @@ -33,9 +33,14 @@ export interface FinancialStatements { export interface FinancialFact { fact_id: string doc_id: string + company?: string | null + ticker?: string | null + cik?: string | null + filing_type?: string | null statement_type: 'income' | 'balance' | 'cashflow' | 'note' | 'other' | string concept: string label: string + raw_label?: string | null value?: number | null unit?: string | null scale?: number | null @@ -49,6 +54,22 @@ export interface FinancialFact { metadata?: Record } +export interface FactValidationIssue { + code: string + severity: 'low' | 'medium' | 'high' | string + message: string + fact_ids?: string[] + concepts?: string[] + statement_type?: string | null + metadata?: Record +} + +export interface FactValidationResult { + issues: FactValidationIssue[] + checks: Record + metrics: Record +} + export interface RiskSignal { id: string category: string @@ -69,6 +90,9 @@ export interface DocumentMeta { doc_id: string filename: string company?: string | null + ticker?: string | null + cik?: string | null + filing_type?: string | null period_end?: string | null report_type?: string | null language?: string | null