diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..2700b07 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,14 @@ +# Enforce byte-exact checkout for binary fixtures. +# Without this, git's autodetection heuristic can misclassify uncompressed +# PDFs (pageCompression=0) as text on Windows (core.autocrlf=true), which +# rewrites LF to CRLF inside the PDF and corrupts xref/trailer offsets. +*.pdf binary +*.wasm binary +*.ttf binary +*.otf binary +*.woff binary +*.woff2 binary +*.ico binary +*.png binary +*.jpg binary +*.jpeg binary diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4b9b76b..826a290 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -102,3 +102,51 @@ jobs: with: name: test-results-${{ matrix.os }}-py${{ matrix.python-version }} path: results.xml + + accuracy: + name: Table Extraction Accuracy + needs: lint + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - name: Set up Python 3.13 + uses: actions/setup-python@v6 + with: + python-version: "3.13" + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - uses: Swatinem/rust-cache@v2.9.1 + with: + save-if: false + + - uses: astral-sh/setup-uv@v7 + with: + cache-dependency-glob: pyproject.toml + + - name: Install dependencies + run: uv sync --extra dev + + - name: Build native extension + uses: PyO3/maturin-action@v1 + with: + command: develop + args: --release + + - name: Verify fixtures are up to date + run: | + cp -r tests/fixtures/tables /tmp/tables_snapshot + uv run python scripts/generate_table_fixtures.py + diff -r /tmp/tables_snapshot tests/fixtures/tables + + - name: Run accuracy harness + run: uv run pytest tests/python/ -m accuracy -v --tb=short + + - name: Upload accuracy report + if: always() + uses: actions/upload-artifact@v7 + with: + name: accuracy-report + path: tests/output/table_accuracy.json diff --git a/.gitignore b/.gitignore index a18592b..249bb50 100644 --- a/.gitignore +++ 
b/.gitignore @@ -11,6 +11,7 @@ dist/ build/ *.so *.dylib +CLAUDE.md *.dll .venv/ venv/ @@ -36,6 +37,9 @@ python/paperjam/libpdfium.so # Test fixtures (generated) tests/fixtures/large_*.pdf +# Per-session test artifacts (accuracy reports, etc.) +tests/output/ + # Sphinx _build diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 457a2df..d9c0aed 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -47,3 +47,11 @@ repos: language: system files: ^docs-site/src/ pass_filenames: false + + - repo: local + hooks: + - id: strip-ai-attribution + name: strip AI attribution from commit messages + entry: .claude/hooks/strip-ai-attribution.sh + language: script + stages: [commit-msg] diff --git a/crates/paperjam-core/Cargo.toml b/crates/paperjam-core/Cargo.toml index c986c85..a3e7ce4 100644 --- a/crates/paperjam-core/Cargo.toml +++ b/crates/paperjam-core/Cargo.toml @@ -45,3 +45,10 @@ render = ["dep:pdfium-render", "dep:image"] signatures = ["dep:x509-parser", "dep:cms", "dep:der", "dep:sha1", "dep:rsa", "dep:p256", "dep:pkcs8", "dep:spki"] ltv = ["signatures", "dep:ureq", "dep:rustls"] validation = ["dep:roxmltree"] + +[dev-dependencies] +criterion = { version = "0.5", features = ["html_reports"] } + +[[bench]] +name = "table_extraction" +harness = false diff --git a/crates/paperjam-core/benches/table_extraction.rs b/crates/paperjam-core/benches/table_extraction.rs new file mode 100644 index 0000000..c6e2f4f --- /dev/null +++ b/crates/paperjam-core/benches/table_extraction.rs @@ -0,0 +1,79 @@ +//! Criterion microbench for table extraction over the fixture corpus. +//! +//! Run with: `cargo bench -p paperjam-core --bench table_extraction` +//! +//! Each synthetic PDF fixture under `tests/fixtures/tables/` becomes one benchmark in the +//! `table_extraction` group. The document and its pages are parsed outside the measured +//! section, so the bench only measures `table::extract_tables` itself — that's the code +//! subsequent phases will change. 
+ +use std::path::PathBuf; + +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use paperjam_core::document::Document; +use paperjam_core::table::{extract_tables, TableExtractionOptions}; + +fn fixtures_dir() -> PathBuf { + let manifest = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + manifest.join("../../tests/fixtures/tables") +} + +fn collect_fixtures() -> Vec<PathBuf> { + let mut out = Vec::new(); + let dir = fixtures_dir(); + let Ok(rd) = std::fs::read_dir(&dir) else { + eprintln!( + "skipping bench: fixtures dir not found at {}", + dir.display() + ); + return out; + }; + for entry in rd.flatten() { + let path = entry.path(); + if path.extension().and_then(|s| s.to_str()) == Some("pdf") { + out.push(path); + } + } + out.sort(); + out +} + +fn bench_extract(c: &mut Criterion) { + let fixtures = collect_fixtures(); + let opts = TableExtractionOptions::default(); + + let mut group = c.benchmark_group("table_extraction"); + for path in &fixtures { + let name = path + .file_stem() + .and_then(|s| s.to_str()) + .unwrap_or("?") + .to_string(); + // Parse the document once and keep its pages alive for the whole bench run. 
+ let doc = match Document::open(path) { + Ok(d) => d, + Err(e) => { + eprintln!("skipping {name}: failed to open ({e:?})"); + continue; + } + }; + let mut pages = Vec::new(); + for n in 1..=doc.page_count() as u32 { + if let Ok(p) = doc.page(n) { + pages.push(p); + } + } + group.throughput(Throughput::Elements(pages.len() as u64)); + group.bench_with_input(BenchmarkId::from_parameter(&name), &pages, |b, pages| { + b.iter(|| { + for page in pages.iter() { + let _ = extract_tables(page, &opts); + } + }); + }); + } + group.finish(); +} + +criterion_group!(benches, bench_extract); +criterion_main!(benches); diff --git a/crates/paperjam-core/src/forms/mod.rs b/crates/paperjam-core/src/forms/mod.rs index f2b6c33..6875ac0 100644 --- a/crates/paperjam-core/src/forms/mod.rs +++ b/crates/paperjam-core/src/forms/mod.rs @@ -93,22 +93,14 @@ pub fn fill_form_fields( for (name, value) in values { match field_name_map.get(name) { - None => { - not_found.push(name.clone()); + Some((false, field_type)) if set_field_value(&mut inner, name, value, field_type)? => { + filled += 1; + filled_fields.push((name.clone(), value.clone(), field_type.clone())); } - Some((true, _)) => { - // Read-only field, skip + // Not in the map, read-only, or set_field_value returned false: record as not-found. + _ => { not_found.push(name.clone()); } - Some((false, field_type)) => { - // Find and update the field object - if set_field_value(&mut inner, name, value, field_type)? 
{ - filled += 1; - filled_fields.push((name.clone(), value.clone(), field_type.clone())); - } else { - not_found.push(name.clone()); - } - } } } diff --git a/crates/paperjam-core/src/manipulation/insert.rs b/crates/paperjam-core/src/manipulation/insert.rs index fbcd5da..6007705 100644 --- a/crates/paperjam-core/src/manipulation/insert.rs +++ b/crates/paperjam-core/src/manipulation/insert.rs @@ -40,7 +40,7 @@ pub fn insert_blank_pages(doc: &Document, positions: &[(u32, f64, f64)]) -> Resu // Sort positions in reverse order so insertions don't shift indices let mut sorted_positions = positions.to_vec(); - sorted_positions.sort_by(|a, b| b.0.cmp(&a.0)); + sorted_positions.sort_by_key(|p| std::cmp::Reverse(p.0)); for (after_page, width, height) in sorted_positions { // Create an empty content stream diff --git a/crates/paperjam-epub/src/parser.rs b/crates/paperjam-epub/src/parser.rs index a452b45..0c621de 100644 --- a/crates/paperjam-epub/src/parser.rs +++ b/crates/paperjam-epub/src/parser.rs @@ -218,22 +218,20 @@ fn parse_opf(xml: &str) -> Result<(OpfMetadata, HashMap, Vec { - if section == Section::Metadata { - let text = e.unescape().unwrap_or_default().trim().to_string(); - if !text.is_empty() { - match current_tag.as_str() { - "title" => metadata.title = Some(text), - "creator" => metadata.creator = Some(text), - "subject" => metadata.subject = Some(text), - "description" => metadata.description = Some(text), - "publisher" => metadata.publisher = Some(text), - "date" => metadata.date = Some(text), - "language" => metadata.language = Some(text), - "identifier" => metadata.identifier = Some(text), - "rights" => metadata.rights = Some(text), - _ => {} - } + Ok(Event::Text(ref e)) if section == Section::Metadata => { + let text = e.unescape().unwrap_or_default().trim().to_string(); + if !text.is_empty() { + match current_tag.as_str() { + "title" => metadata.title = Some(text), + "creator" => metadata.creator = Some(text), + "subject" => metadata.subject = Some(text), 
+ "description" => metadata.description = Some(text), + "publisher" => metadata.publisher = Some(text), + "date" => metadata.date = Some(text), + "language" => metadata.language = Some(text), + "identifier" => metadata.identifier = Some(text), + "rights" => metadata.rights = Some(text), + _ => {} } } } diff --git a/crates/paperjam-epub/src/toc.rs b/crates/paperjam-epub/src/toc.rs index 3f4a5a7..3eb5d01 100644 --- a/crates/paperjam-epub/src/toc.rs +++ b/crates/paperjam-epub/src/toc.rs @@ -76,10 +76,8 @@ fn parse_nav_point(reader: &mut Reader<&[u8]>) -> Option { } } } - Ok(Event::Text(ref e)) => { - if in_text { - title = e.unescape().unwrap_or_default().trim().to_string(); - } + Ok(Event::Text(ref e)) if in_text => { + title = e.unescape().unwrap_or_default().trim().to_string(); } Ok(Event::End(ref e)) => { let local = local_name(e.name().as_ref()); diff --git a/crates/paperjam-pptx/src/parser.rs b/crates/paperjam-pptx/src/parser.rs index 0f6e989..4f6d73f 100644 --- a/crates/paperjam-pptx/src/parser.rs +++ b/crates/paperjam-pptx/src/parser.rs @@ -254,11 +254,9 @@ fn parse_slide(xml: &str, index: usize, notes_xml: Option<&str>) -> Result {} } } - Ok(Event::Empty(ref e)) => { - if parser.in_shape { - let local = local_name_owned(e.name().as_ref()); - handle_shape_empty(&mut parser, e, &local)?; - } + Ok(Event::Empty(ref e)) if parser.in_shape => { + let local = local_name_owned(e.name().as_ref()); + handle_shape_empty(&mut parser, e, &local)?; } Ok(Event::Text(ref e)) => { if parser.in_table_cell { @@ -436,10 +434,8 @@ fn parse_notes_text(xml: &str) -> Result { text.push('\n'); } } - Ok(Event::Text(ref e)) => { - if in_text_body { - text.push_str(&e.unescape().unwrap_or_default()); - } + Ok(Event::Text(ref e)) if in_text_body => { + text.push_str(&e.unescape().unwrap_or_default()); } Ok(Event::Eof) => break, Err(e) => return Err(PptxError::Xml(format!("notes XML error: {e}"))), diff --git a/docs-site/src/pages/index.tsx b/docs-site/src/pages/index.tsx index 
7dd2aaa..c4f3b6b 100644 --- a/docs-site/src/pages/index.tsx +++ b/docs-site/src/pages/index.tsx @@ -137,15 +137,6 @@ export default function Home(): React.JSX.Element { borderRadius: '16px', }} /> -

- paperjam -

=0.10", "mypy>=1.8", "ruff>=0.3", + "reportlab>=4.0,<4.2", + "pandas>=2.0", ] [tool.maturin] @@ -56,6 +58,9 @@ include = [ [tool.pytest.ini_options] testpaths = ["tests/python"] +markers = [ + "accuracy: table extraction accuracy harness (run with -m accuracy)", +] [tool.ruff] target-version = "py312" diff --git a/scripts/generate_table_fixtures.py b/scripts/generate_table_fixtures.py new file mode 100644 index 0000000..5b2feea --- /dev/null +++ b/scripts/generate_table_fixtures.py @@ -0,0 +1,345 @@ +#!/usr/bin/env python3 +"""Generate deterministic synthetic PDF fixtures for the table-extraction accuracy harness. + +Each fixture is described by a ``TableSpec`` that is the single source of truth for BOTH +the rendered PDF and the ground-truth JSON sidecar — by construction they cannot drift. + +Usage: + uv run python scripts/generate_table_fixtures.py +""" + +from __future__ import annotations + +import json +import pathlib +from dataclasses import dataclass + +import reportlab.rl_config + +reportlab.rl_config.invariant = 1 # deterministic PDF metadata (fixed /CreationDate, /ID, etc.) 
+ +from reportlab.lib.pagesizes import landscape, letter # noqa: E402 +from reportlab.pdfgen import canvas # noqa: E402 + +REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent +OUT_DIR = REPO_ROOT / "tests" / "fixtures" / "tables" + +FONT_REGULAR = "Helvetica" +FONT_BOLD = "Helvetica-Bold" +FONT_SIZE = 10 + + +@dataclass(frozen=True) +class CellSpec: + row: int + col: int + text: str + row_span: int = 1 + col_span: int = 1 + + +@dataclass +class TableSpec: + """One source of truth for a single-table single-page fixture.""" + + name: str + page_size: tuple[float, float] + bordered: bool + origin: tuple[float, float] # (x_left, y_top) in PDF coords (bottom-left origin) + col_widths: list[float] + row_heights: list[float] + cells: list[CellSpec] + notes: str + bold_header: bool = False + + +def _simple_cells(data: list[list[str]]) -> list[CellSpec]: + return [CellSpec(row=r, col=c, text=t) for r, row in enumerate(data) for c, t in enumerate(row)] + + +def _render_table(c: canvas.Canvas, spec: TableSpec, *, page_y_top: float | None = None) -> None: + """Render one table onto the current canvas page, using spec.origin (or override y).""" + x_left, y_top = spec.origin + if page_y_top is not None: + y_top = page_y_top + xs = [x_left] + for w in spec.col_widths: + xs.append(xs[-1] + w) + ys = [y_top] + for h in spec.row_heights: + ys.append(ys[-1] - h) + + for cell in spec.cells: + x1 = xs[cell.col] + x2 = xs[cell.col + cell.col_span] + y1 = ys[cell.row + cell.row_span] + y2 = ys[cell.row] + if spec.bordered: + c.setStrokeColorRGB(0, 0, 0) + c.setLineWidth(0.5) + c.rect(x1, y1, x2 - x1, y2 - y1) + font = FONT_BOLD if (spec.bold_header and cell.row == 0) else FONT_REGULAR + c.setFont(font, FONT_SIZE) + text_w = c.stringWidth(cell.text, font, FONT_SIZE) + tx = (x1 + x2) / 2 - text_w / 2 + ty = (y1 + y2) / 2 - FONT_SIZE / 3 # rough vertical centering + c.drawString(tx, ty, cell.text) + + +def _table_bbox(spec: TableSpec) -> list[float]: + x_left, y_top = spec.origin + 
return [ + x_left, + y_top - sum(spec.row_heights), + x_left + sum(spec.col_widths), + y_top, + ] + + +def _gt_for_spec(spec: TableSpec, *, page: int = 1) -> dict: + return { + "page": page, + "bbox": _table_bbox(spec), + "col_count": len(spec.col_widths), + "row_count": len(spec.row_heights), + "cells": [ + { + "row": c.row, + "col": c.col, + "row_span": c.row_span, + "col_span": c.col_span, + "text": c.text, + } + for c in spec.cells + ], + } + + +def _canvas_for(name: str, page_size: tuple[float, float]) -> canvas.Canvas: + pdf_path = OUT_DIR / f"{name}.pdf" + c = canvas.Canvas(str(pdf_path), pagesize=page_size, pageCompression=0) + c.setCreator("paperjam-test-fixtures") + c.setTitle(name) + c.setSubject("Table extraction accuracy fixture") + return c + + +def _write_gt(name: str, notes: str, tables: list[dict]) -> None: + data = { + "source": "synthetic", + "license": "generated", + "notes": notes, + "tables": tables, + } + (OUT_DIR / f"{name}.gt.json").write_text(json.dumps(data, indent=2) + "\n") + + +def emit_single_table_fixture(spec: TableSpec) -> None: + c = _canvas_for(spec.name, spec.page_size) + _render_table(c, spec) + c.showPage() + c.save() + _write_gt(spec.name, spec.notes, [_gt_for_spec(spec)]) + + +def emit_multipage_continuation() -> None: + name = "multipage_continuation" + header = ["Year", "Metric", "Value"] + page1_rows = [[f"201{i}", "Revenue", str(100 * (i + 1))] for i in range(6)] + page2_rows = [[f"202{i}", "Revenue", str(200 * (i + 1))] for i in range(6)] + + col_widths = [100.0, 120.0, 100.0] + row_heights = [24.0] * 7 # header + 6 data rows + origin = (72.0, 720.0) + + c = _canvas_for(name, letter) + for page_data in (page1_rows, page2_rows): + spec = TableSpec( + name=name, + page_size=letter, + bordered=True, + origin=origin, + col_widths=col_widths, + row_heights=row_heights, + cells=_simple_cells([header, *page_data]), + notes="", + ) + _render_table(c, spec) + c.showPage() + c.save() + + def _page_gt(page_idx: int, rows: 
list[list[str]]) -> dict: + cells = [{"row": r, "col": cc, "row_span": 1, "col_span": 1, "text": t} for r, row in enumerate([header, *rows]) for cc, t in enumerate(row)] + return { + "page": page_idx, + "bbox": [origin[0], origin[1] - sum(row_heights), origin[0] + sum(col_widths), origin[1]], + "col_count": len(col_widths), + "row_count": len(row_heights), + "cells": cells, + } + + _write_gt( + name, + "2-page table, header repeats on page 2; Phase 0 scores per-page tables", + [_page_gt(1, page1_rows), _page_gt(2, page2_rows)], + ) + + +def build_fixtures() -> list[TableSpec]: + specs: list[TableSpec] = [ + TableSpec( + name="bordered_simple", + page_size=letter, + bordered=True, + origin=(72.0, 700.0), + col_widths=[100.0, 100.0, 100.0, 100.0], + row_heights=[24.0, 24.0, 24.0], + cells=_simple_cells( + [ + ["Header A", "Header B", "Header C", "Header D"], + ["a1", "b1", "c1", "d1"], + ["a2", "b2", "c2", "d2"], + ] + ), + notes="3x4 fully bordered, single-line cells", + ), + TableSpec( + name="bordered_dense", + page_size=letter, + bordered=True, + origin=(54.0, 720.0), + col_widths=[60.0] * 6, + row_heights=[22.0] * 8, + cells=_simple_cells( + [ + ["Year", "Q1", "Q2", "Q3", "Q4", "Total"], + ["2016", "100", "110", "120", "130", "460"], + ["2017", "105", "115", "125", "135", "480"], + ["2018", "110", "120", "130", "140", "500"], + ["2019", "115", "125", "135", "145", "520"], + ["2020", "120", "130", "140", "150", "540"], + ["2021", "125", "135", "145", "155", "560"], + ["2022", "130", "140", "150", "160", "580"], + ] + ), + notes="8x6 bordered with numeric data, tight spacing", + ), + TableSpec( + name="bordered_merged", + page_size=letter, + bordered=True, + origin=(72.0, 700.0), + col_widths=[80.0, 80.0, 80.0, 80.0], + row_heights=[24.0] * 5, + cells=[ + CellSpec(row=0, col=0, text="Year", row_span=2, col_span=1), + CellSpec(row=0, col=1, text="Revenue", row_span=1, col_span=2), + CellSpec(row=0, col=3, text="Notes", row_span=2, col_span=1), + CellSpec(row=1, 
col=1, text="Q1"), + CellSpec(row=1, col=2, text="Q2"), + CellSpec(row=2, col=0, text="2020"), + CellSpec(row=2, col=1, text="100"), + CellSpec(row=2, col=2, text="120"), + CellSpec(row=2, col=3, text="stable"), + CellSpec(row=3, col=0, text="2021"), + CellSpec(row=3, col=1, text="110"), + CellSpec(row=3, col=2, text="130"), + CellSpec(row=3, col=3, text="up"), + CellSpec(row=4, col=0, text="2022"), + CellSpec(row=4, col=1, text="120"), + CellSpec(row=4, col=2, text="140"), + CellSpec(row=4, col=3, text="up"), + ], + notes="5x4 with row-span merges (Year, Notes) and col-span merge (Revenue)", + ), + TableSpec( + name="borderless_financial", + page_size=letter, + bordered=False, + origin=(72.0, 700.0), + col_widths=[120.0, 80.0, 80.0, 80.0], + row_heights=[24.0] * 6, + cells=_simple_cells( + [ + ["Metric", "2021", "2022", "2023"], + ["Revenue", "1,000", "1,200", "1,500"], + ["COGS", "600", "720", "900"], + ["Gross", "400", "480", "600"], + ["OpEx", "200", "240", "300"], + ["Net", "200", "240", "300"], + ] + ), + notes="6x4 borderless financial statement", + ), + TableSpec( + name="borderless_invoice", + page_size=letter, + bordered=False, + origin=(72.0, 700.0), + col_widths=[160.0, 60.0, 80.0], + row_heights=[24.0] * 5, + cells=_simple_cells( + [ + ["Item", "Qty", "Price"], + ["Widget A", "2", "10.00"], + ["Widget B", "1", "25.00"], + ["Gadget", "3", "15.00"], + ["Gizmo", "5", "5.00"], + ] + ), + notes="5x3 borderless invoice with bold header row", + bold_header=True, + ), + TableSpec( + name="rotated_landscape", + page_size=landscape(letter), + bordered=True, + origin=(72.0, 500.0), + col_widths=[100.0] * 5, + row_heights=[24.0] * 4, + cells=_simple_cells( + [ + ["Col 1", "Col 2", "Col 3", "Col 4", "Col 5"], + ["a", "b", "c", "d", "e"], + ["f", "g", "h", "i", "j"], + ["k", "l", "m", "n", "o"], + ] + ), + notes="4x5 bordered on a landscape-oriented page", + ), + TableSpec( + name="sparse_cells", + page_size=letter, + bordered=False, + origin=(72.0, 700.0), + 
col_widths=[90.0, 60.0, 70.0, 70.0, 70.0], + row_heights=[24.0] * 7, + cells=_simple_cells( + [ + ["Item", "Qty", "Price", "Total", "Note"], + ["Apple", "2", "1.00", "2.00", ""], + ["Banana", "5", "0.50", "2.50", ""], + ["Subtotal", "", "", "4.50", ""], + ["Orange", "3", "1.50", "4.50", ""], + ["Grape", "1", "3.00", "3.00", ""], + ["Total", "", "", "12.00", ""], + ] + ), + notes="7x5 borderless with blank cells on subtotal/total rows", + ), + ] + return specs + + +def main() -> None: + OUT_DIR.mkdir(parents=True, exist_ok=True) + for spec in build_fixtures(): + emit_single_table_fixture(spec) + print(f" wrote {spec.name}.pdf + .gt.json") + emit_multipage_continuation() + print(" wrote multipage_continuation.pdf + .gt.json") + print(f"Done. Fixtures in {OUT_DIR.relative_to(REPO_ROOT)}") + + +if __name__ == "__main__": + main() diff --git a/tests/fixtures/tables/.accuracy_baseline.json b/tests/fixtures/tables/.accuracy_baseline.json new file mode 100644 index 0000000..b1223da --- /dev/null +++ b/tests/fixtures/tables/.accuracy_baseline.json @@ -0,0 +1,8 @@ +{ + "aggregate": { + "table_detection_f1": 1.0, + "avg_cell_f1": 0.9923469387755102, + "avg_grits_top_f1": 0.9557057057057057, + "n_fixtures": 8 + } +} diff --git a/tests/fixtures/tables/bordered_dense.gt.json b/tests/fixtures/tables/bordered_dense.gt.json new file mode 100644 index 0000000..4d7b1e8 --- /dev/null +++ b/tests/fixtures/tables/bordered_dense.gt.json @@ -0,0 +1,356 @@ +{ + "source": "synthetic", + "license": "generated", + "notes": "8x6 bordered with numeric data, tight spacing", + "tables": [ + { + "page": 1, + "bbox": [ + 54.0, + 544.0, + 414.0, + 720.0 + ], + "col_count": 6, + "row_count": 8, + "cells": [ + { + "row": 0, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Year" + }, + { + "row": 0, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Q1" + }, + { + "row": 0, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "Q2" + }, + { + "row": 0, + "col": 3, + "row_span": 1, + 
"col_span": 1, + "text": "Q3" + }, + { + "row": 0, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "Q4" + }, + { + "row": 0, + "col": 5, + "row_span": 1, + "col_span": 1, + "text": "Total" + }, + { + "row": 1, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2016" + }, + { + "row": 1, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "100" + }, + { + "row": 1, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "110" + }, + { + "row": 1, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "120" + }, + { + "row": 1, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "130" + }, + { + "row": 1, + "col": 5, + "row_span": 1, + "col_span": 1, + "text": "460" + }, + { + "row": 2, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2017" + }, + { + "row": 2, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "105" + }, + { + "row": 2, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "115" + }, + { + "row": 2, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "125" + }, + { + "row": 2, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "135" + }, + { + "row": 2, + "col": 5, + "row_span": 1, + "col_span": 1, + "text": "480" + }, + { + "row": 3, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2018" + }, + { + "row": 3, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "110" + }, + { + "row": 3, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "120" + }, + { + "row": 3, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "130" + }, + { + "row": 3, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "140" + }, + { + "row": 3, + "col": 5, + "row_span": 1, + "col_span": 1, + "text": "500" + }, + { + "row": 4, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2019" + }, + { + "row": 4, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "115" + }, + { + "row": 4, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "125" + }, + { + "row": 4, + "col": 3, + "row_span": 1, + 
"col_span": 1, + "text": "135" + }, + { + "row": 4, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "145" + }, + { + "row": 4, + "col": 5, + "row_span": 1, + "col_span": 1, + "text": "520" + }, + { + "row": 5, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2020" + }, + { + "row": 5, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "120" + }, + { + "row": 5, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "130" + }, + { + "row": 5, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "140" + }, + { + "row": 5, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "150" + }, + { + "row": 5, + "col": 5, + "row_span": 1, + "col_span": 1, + "text": "540" + }, + { + "row": 6, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2021" + }, + { + "row": 6, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "125" + }, + { + "row": 6, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "135" + }, + { + "row": 6, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "145" + }, + { + "row": 6, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "155" + }, + { + "row": 6, + "col": 5, + "row_span": 1, + "col_span": 1, + "text": "560" + }, + { + "row": 7, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2022" + }, + { + "row": 7, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "130" + }, + { + "row": 7, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "140" + }, + { + "row": 7, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "150" + }, + { + "row": 7, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "160" + }, + { + "row": 7, + "col": 5, + "row_span": 1, + "col_span": 1, + "text": "580" + } + ] + } + ] +} diff --git a/tests/fixtures/tables/bordered_dense.pdf b/tests/fixtures/tables/bordered_dense.pdf new file mode 100644 index 0000000..1ad86c2 Binary files /dev/null and b/tests/fixtures/tables/bordered_dense.pdf differ diff --git a/tests/fixtures/tables/bordered_merged.gt.json 
b/tests/fixtures/tables/bordered_merged.gt.json new file mode 100644 index 0000000..57713dc --- /dev/null +++ b/tests/fixtures/tables/bordered_merged.gt.json @@ -0,0 +1,139 @@ +{ + "source": "synthetic", + "license": "generated", + "notes": "5x4 with row-span merges (Year, Notes) and col-span merge (Revenue)", + "tables": [ + { + "page": 1, + "bbox": [ + 72.0, + 580.0, + 392.0, + 700.0 + ], + "col_count": 4, + "row_count": 5, + "cells": [ + { + "row": 0, + "col": 0, + "row_span": 2, + "col_span": 1, + "text": "Year" + }, + { + "row": 0, + "col": 1, + "row_span": 1, + "col_span": 2, + "text": "Revenue" + }, + { + "row": 0, + "col": 3, + "row_span": 2, + "col_span": 1, + "text": "Notes" + }, + { + "row": 1, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Q1" + }, + { + "row": 1, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "Q2" + }, + { + "row": 2, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2020" + }, + { + "row": 2, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "100" + }, + { + "row": 2, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "120" + }, + { + "row": 2, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "stable" + }, + { + "row": 3, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2021" + }, + { + "row": 3, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "110" + }, + { + "row": 3, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "130" + }, + { + "row": 3, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "up" + }, + { + "row": 4, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2022" + }, + { + "row": 4, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "120" + }, + { + "row": 4, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "140" + }, + { + "row": 4, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "up" + } + ] + } + ] +} diff --git a/tests/fixtures/tables/bordered_merged.pdf b/tests/fixtures/tables/bordered_merged.pdf new file mode 100644 
index 0000000..8affa22 Binary files /dev/null and b/tests/fixtures/tables/bordered_merged.pdf differ diff --git a/tests/fixtures/tables/bordered_simple.gt.json b/tests/fixtures/tables/bordered_simple.gt.json new file mode 100644 index 0000000..e59a843 --- /dev/null +++ b/tests/fixtures/tables/bordered_simple.gt.json @@ -0,0 +1,104 @@ +{ + "source": "synthetic", + "license": "generated", + "notes": "3x4 fully bordered, single-line cells", + "tables": [ + { + "page": 1, + "bbox": [ + 72.0, + 628.0, + 472.0, + 700.0 + ], + "col_count": 4, + "row_count": 3, + "cells": [ + { + "row": 0, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Header A" + }, + { + "row": 0, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Header B" + }, + { + "row": 0, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "Header C" + }, + { + "row": 0, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "Header D" + }, + { + "row": 1, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "a1" + }, + { + "row": 1, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "b1" + }, + { + "row": 1, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "c1" + }, + { + "row": 1, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "d1" + }, + { + "row": 2, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "a2" + }, + { + "row": 2, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "b2" + }, + { + "row": 2, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "c2" + }, + { + "row": 2, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "d2" + } + ] + } + ] +} diff --git a/tests/fixtures/tables/bordered_simple.pdf b/tests/fixtures/tables/bordered_simple.pdf new file mode 100644 index 0000000..0244bb3 Binary files /dev/null and b/tests/fixtures/tables/bordered_simple.pdf differ diff --git a/tests/fixtures/tables/borderless_financial.gt.json b/tests/fixtures/tables/borderless_financial.gt.json new file mode 100644 index 0000000..c23d338 --- /dev/null +++ 
b/tests/fixtures/tables/borderless_financial.gt.json @@ -0,0 +1,188 @@ +{ + "source": "synthetic", + "license": "generated", + "notes": "6x4 borderless financial statement", + "tables": [ + { + "page": 1, + "bbox": [ + 72.0, + 556.0, + 432.0, + 700.0 + ], + "col_count": 4, + "row_count": 6, + "cells": [ + { + "row": 0, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Metric" + }, + { + "row": 0, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "2021" + }, + { + "row": 0, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "2022" + }, + { + "row": 0, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "2023" + }, + { + "row": 1, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 1, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "1,000" + }, + { + "row": 1, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "1,200" + }, + { + "row": 1, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "1,500" + }, + { + "row": 2, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "COGS" + }, + { + "row": 2, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "600" + }, + { + "row": 2, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "720" + }, + { + "row": 2, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "900" + }, + { + "row": 3, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Gross" + }, + { + "row": 3, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "400" + }, + { + "row": 3, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "480" + }, + { + "row": 3, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "600" + }, + { + "row": 4, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "OpEx" + }, + { + "row": 4, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "200" + }, + { + "row": 4, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "240" + }, + { + "row": 4, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "300" + }, + { + "row": 
5, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Net" + }, + { + "row": 5, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "200" + }, + { + "row": 5, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "240" + }, + { + "row": 5, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "300" + } + ] + } + ] +} diff --git a/tests/fixtures/tables/borderless_financial.pdf b/tests/fixtures/tables/borderless_financial.pdf new file mode 100644 index 0000000..7423cc4 Binary files /dev/null and b/tests/fixtures/tables/borderless_financial.pdf differ diff --git a/tests/fixtures/tables/borderless_invoice.gt.json b/tests/fixtures/tables/borderless_invoice.gt.json new file mode 100644 index 0000000..0b6d809 --- /dev/null +++ b/tests/fixtures/tables/borderless_invoice.gt.json @@ -0,0 +1,125 @@ +{ + "source": "synthetic", + "license": "generated", + "notes": "5x3 borderless invoice with bold header row", + "tables": [ + { + "page": 1, + "bbox": [ + 72.0, + 580.0, + 372.0, + 700.0 + ], + "col_count": 3, + "row_count": 5, + "cells": [ + { + "row": 0, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Item" + }, + { + "row": 0, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Qty" + }, + { + "row": 0, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "Price" + }, + { + "row": 1, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Widget A" + }, + { + "row": 1, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "2" + }, + { + "row": 1, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "10.00" + }, + { + "row": 2, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Widget B" + }, + { + "row": 2, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "1" + }, + { + "row": 2, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "25.00" + }, + { + "row": 3, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Gadget" + }, + { + "row": 3, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "3" + }, + { + "row": 3, 
+ "col": 2, + "row_span": 1, + "col_span": 1, + "text": "15.00" + }, + { + "row": 4, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Gizmo" + }, + { + "row": 4, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "5" + }, + { + "row": 4, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "5.00" + } + ] + } + ] +} diff --git a/tests/fixtures/tables/borderless_invoice.pdf b/tests/fixtures/tables/borderless_invoice.pdf new file mode 100644 index 0000000..75ad82b Binary files /dev/null and b/tests/fixtures/tables/borderless_invoice.pdf differ diff --git a/tests/fixtures/tables/multipage_continuation.gt.json b/tests/fixtures/tables/multipage_continuation.gt.json new file mode 100644 index 0000000..eecbda3 --- /dev/null +++ b/tests/fixtures/tables/multipage_continuation.gt.json @@ -0,0 +1,327 @@ +{ + "source": "synthetic", + "license": "generated", + "notes": "2-page table, header repeats on page 2; Phase 0 scores per-page tables", + "tables": [ + { + "page": 1, + "bbox": [ + 72.0, + 552.0, + 392.0, + 720.0 + ], + "col_count": 3, + "row_count": 7, + "cells": [ + { + "row": 0, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Year" + }, + { + "row": 0, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Metric" + }, + { + "row": 0, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "Value" + }, + { + "row": 1, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2010" + }, + { + "row": 1, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 1, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "100" + }, + { + "row": 2, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2011" + }, + { + "row": 2, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 2, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "200" + }, + { + "row": 3, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2012" + }, + { + "row": 3, + "col": 1, + "row_span": 1, + "col_span": 1, + 
"text": "Revenue" + }, + { + "row": 3, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "300" + }, + { + "row": 4, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2013" + }, + { + "row": 4, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 4, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "400" + }, + { + "row": 5, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2014" + }, + { + "row": 5, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 5, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "500" + }, + { + "row": 6, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2015" + }, + { + "row": 6, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 6, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "600" + } + ] + }, + { + "page": 2, + "bbox": [ + 72.0, + 552.0, + 392.0, + 720.0 + ], + "col_count": 3, + "row_count": 7, + "cells": [ + { + "row": 0, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Year" + }, + { + "row": 0, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Metric" + }, + { + "row": 0, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "Value" + }, + { + "row": 1, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2020" + }, + { + "row": 1, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 1, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "200" + }, + { + "row": 2, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2021" + }, + { + "row": 2, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 2, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "400" + }, + { + "row": 3, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2022" + }, + { + "row": 3, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 3, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": 
"600" + }, + { + "row": 4, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2023" + }, + { + "row": 4, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 4, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "800" + }, + { + "row": 5, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2024" + }, + { + "row": 5, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 5, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "1000" + }, + { + "row": 6, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2025" + }, + { + "row": 6, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 6, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "1200" + } + ] + } + ] +} diff --git a/tests/fixtures/tables/multipage_continuation.pdf b/tests/fixtures/tables/multipage_continuation.pdf new file mode 100644 index 0000000..bb8997d Binary files /dev/null and b/tests/fixtures/tables/multipage_continuation.pdf differ diff --git a/tests/fixtures/tables/rotated_landscape.gt.json b/tests/fixtures/tables/rotated_landscape.gt.json new file mode 100644 index 0000000..2605fe0 --- /dev/null +++ b/tests/fixtures/tables/rotated_landscape.gt.json @@ -0,0 +1,160 @@ +{ + "source": "synthetic", + "license": "generated", + "notes": "4x5 bordered on a landscape-oriented page", + "tables": [ + { + "page": 1, + "bbox": [ + 72.0, + 404.0, + 572.0, + 500.0 + ], + "col_count": 5, + "row_count": 4, + "cells": [ + { + "row": 0, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Col 1" + }, + { + "row": 0, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Col 2" + }, + { + "row": 0, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "Col 3" + }, + { + "row": 0, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "Col 4" + }, + { + "row": 0, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "Col 5" + }, + { + "row": 1, + "col": 0, + "row_span": 1, + "col_span": 1, 
+ "text": "a" + }, + { + "row": 1, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "b" + }, + { + "row": 1, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "c" + }, + { + "row": 1, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "d" + }, + { + "row": 1, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "e" + }, + { + "row": 2, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "f" + }, + { + "row": 2, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "g" + }, + { + "row": 2, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "h" + }, + { + "row": 2, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "i" + }, + { + "row": 2, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "j" + }, + { + "row": 3, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "k" + }, + { + "row": 3, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "l" + }, + { + "row": 3, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "m" + }, + { + "row": 3, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "n" + }, + { + "row": 3, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "o" + } + ] + } + ] +} diff --git a/tests/fixtures/tables/rotated_landscape.pdf b/tests/fixtures/tables/rotated_landscape.pdf new file mode 100644 index 0000000..4dd923f Binary files /dev/null and b/tests/fixtures/tables/rotated_landscape.pdf differ diff --git a/tests/fixtures/tables/sparse_cells.gt.json b/tests/fixtures/tables/sparse_cells.gt.json new file mode 100644 index 0000000..eb7bd91 --- /dev/null +++ b/tests/fixtures/tables/sparse_cells.gt.json @@ -0,0 +1,265 @@ +{ + "source": "synthetic", + "license": "generated", + "notes": "7x5 borderless with blank cells on subtotal/total rows", + "tables": [ + { + "page": 1, + "bbox": [ + 72.0, + 532.0, + 432.0, + 700.0 + ], + "col_count": 5, + "row_count": 7, + "cells": [ + { + "row": 0, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Item" + }, + { + "row": 0, + "col": 1, + "row_span": 
1, + "col_span": 1, + "text": "Qty" + }, + { + "row": 0, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "Price" + }, + { + "row": 0, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "Total" + }, + { + "row": 0, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "Note" + }, + { + "row": 1, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Apple" + }, + { + "row": 1, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "2" + }, + { + "row": 1, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "1.00" + }, + { + "row": 1, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "2.00" + }, + { + "row": 1, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "" + }, + { + "row": 2, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Banana" + }, + { + "row": 2, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "5" + }, + { + "row": 2, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "0.50" + }, + { + "row": 2, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "2.50" + }, + { + "row": 2, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "" + }, + { + "row": 3, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Subtotal" + }, + { + "row": 3, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "" + }, + { + "row": 3, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "" + }, + { + "row": 3, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "4.50" + }, + { + "row": 3, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "" + }, + { + "row": 4, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Orange" + }, + { + "row": 4, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "3" + }, + { + "row": 4, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "1.50" + }, + { + "row": 4, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "4.50" + }, + { + "row": 4, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "" + }, + { + "row": 5, + "col": 0, + "row_span": 1, + 
"col_span": 1, + "text": "Grape" + }, + { + "row": 5, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "1" + }, + { + "row": 5, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "3.00" + }, + { + "row": 5, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "3.00" + }, + { + "row": 5, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "" + }, + { + "row": 6, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Total" + }, + { + "row": 6, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "" + }, + { + "row": 6, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "" + }, + { + "row": 6, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "12.00" + }, + { + "row": 6, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "" + } + ] + } + ] +} diff --git a/tests/fixtures/tables/sparse_cells.pdf b/tests/fixtures/tables/sparse_cells.pdf new file mode 100644 index 0000000..93272f6 Binary files /dev/null and b/tests/fixtures/tables/sparse_cells.pdf differ diff --git a/tests/python/conftest.py b/tests/python/conftest.py index a8e87b4..526b779 100644 --- a/tests/python/conftest.py +++ b/tests/python/conftest.py @@ -1,8 +1,21 @@ +import json import pathlib import pytest FIXTURES = pathlib.Path(__file__).resolve().parent.parent / "fixtures" +TABLES_FIXTURES = FIXTURES / "tables" +OUTPUT_DIR = pathlib.Path(__file__).resolve().parent.parent / "output" +BASELINE_PATH = TABLES_FIXTURES / ".accuracy_baseline.json" + + +def pytest_addoption(parser): + parser.addoption( + "--update-baseline", + action="store_true", + default=False, + help="Write the current accuracy scores to the baseline file (skips the regression gate).", + ) @pytest.fixture @@ -28,3 +41,33 @@ def table_bordered_pdf(): @pytest.fixture def metadata_pdf(): return str(FIXTURES / "with_metadata.pdf") + + +@pytest.fixture(scope="session") +def accuracy_report(request): + """Session-scoped accumulator: per-fixture -> score dict; written to tests/output at teardown.""" + report: 
dict = {} + + def _finalize(): + if not report: + return + OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + aggregate = _aggregate_scores(report) + payload = {"fixtures": report, "aggregate": aggregate} + (OUTPUT_DIR / "table_accuracy.json").write_text(json.dumps(payload, indent=2) + "\n") + if request.config.getoption("--update-baseline"): + BASELINE_PATH.write_text(json.dumps({"aggregate": aggregate}, indent=2) + "\n") + + request.addfinalizer(_finalize) + return report + + +def _aggregate_scores(report: dict) -> dict: + fields = ("table_detection_f1", "avg_cell_f1", "avg_grits_top_f1") + totals = {f: 0.0 for f in fields} + n = 0 + for scores in report.values(): + for f in fields: + totals[f] += float(scores.get(f, 0.0)) + n += 1 + return {f: (totals[f] / n if n else 0.0) for f in fields} | {"n_fixtures": n} diff --git a/tests/python/grits.py b/tests/python/grits.py new file mode 100644 index 0000000..8c2cd19 --- /dev/null +++ b/tests/python/grits.py @@ -0,0 +1,207 @@ +"""Table extraction accuracy scorer. + +Implements: +- Whitespace-normalized text comparison. +- Bbox-IoU matching of predicted tables to ground-truth tables (greedy 1:1, highest IoU first; not optimal Hungarian assignment). +- Cell-level precision/recall/F1 on normalized text. +- GriTS-Top: F1 over the set of topological signatures (row_start, row_end, col_start, col_end) + of every cell, accounting for row/col spans. Robust to missing text; penalizes structural errors. + +The internal representation for a table is a dict matching the .gt.json `tables[]` entry: + + { + "page": int, + "bbox": [x_min, y_min, x_max, y_max], + "col_count": int, + "row_count": int, + "cells": [{"row": int, "col": int, "row_span": int, "col_span": int, "text": str}, ...], + } + +Predicted tables from paperjam.Table are converted into this shape by ``predicted_tables_to_gt_shape``. + +Reference: Smock, Pesala, Abraham, "PubTables-1M" (CVPR 2022), §4.1.
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Iterable + + +def normalize_text(s: str) -> str: + return " ".join(s.split()) + + +def _bbox_iou(a: list[float] | tuple[float, ...], b: list[float] | tuple[float, ...]) -> float: + ax1, ay1, ax2, ay2 = a + bx1, by1, bx2, by2 = b + ix1, iy1 = max(ax1, bx1), max(ay1, by1) + ix2, iy2 = min(ax2, bx2), min(ay2, by2) + iw, ih = max(0.0, ix2 - ix1), max(0.0, iy2 - iy1) + inter = iw * ih + area_a = max(0.0, ax2 - ax1) * max(0.0, ay2 - ay1) + area_b = max(0.0, bx2 - bx1) * max(0.0, by2 - by1) + union = area_a + area_b - inter + return inter / union if union > 0 else 0.0 + + +def match_tables(pred: list[dict], gt: list[dict], iou_threshold: float = 0.5) -> dict: + """Greedy 1:1 matching on bbox IoU restricted to same page. + + Returns {"matches": [(p_idx, g_idx, iou), ...], "unmatched_pred": [...], "unmatched_gt": [...]}. + """ + pairs: list[tuple[float, int, int]] = [] + for pi, p in enumerate(pred): + for gi, g in enumerate(gt): + if p.get("page") != g.get("page"): + continue + iou = _bbox_iou(p["bbox"], g["bbox"]) + if iou >= iou_threshold: + pairs.append((iou, pi, gi)) + pairs.sort(reverse=True) + used_p: set[int] = set() + used_g: set[int] = set() + matches: list[tuple[int, int, float]] = [] + for iou, pi, gi in pairs: + if pi in used_p or gi in used_g: + continue + matches.append((pi, gi, iou)) + used_p.add(pi) + used_g.add(gi) + return { + "matches": matches, + "unmatched_pred": [i for i in range(len(pred)) if i not in used_p], + "unmatched_gt": [i for i in range(len(gt)) if i not in used_g], + } + + +def _cell_text_bag(cells: Iterable[dict]) -> list[str]: + return sorted(normalize_text(c["text"]) for c in cells if normalize_text(c["text"]) != "") + + +def cell_precision_recall(pred_table: dict, gt_table: dict) -> dict: + """Multiset-level precision/recall over cell texts. + + Treats identical strings as interchangeable; ignores position. 
A complementary metric + to GriTS-Top, which scores structure and ignores text. Together they catch both failure modes. + """ + pred_bag = _cell_text_bag(pred_table["cells"]) + gt_bag = _cell_text_bag(gt_table["cells"]) + if not pred_bag and not gt_bag: + return {"precision": 1.0, "recall": 1.0, "f1": 1.0, "matched": 0, "pred_total": 0, "gt_total": 0} + # Multiset intersection size. + from collections import Counter + + inter = sum((Counter(pred_bag) & Counter(gt_bag)).values()) + precision = inter / len(pred_bag) if pred_bag else 0.0 + recall = inter / len(gt_bag) if gt_bag else 0.0 + f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0 + return { + "precision": precision, + "recall": recall, + "f1": f1, + "matched": inter, + "pred_total": len(pred_bag), + "gt_total": len(gt_bag), + } + + +def _topology_signatures(cells: Iterable[dict]) -> set[tuple[int, int, int, int]]: + """Each cell → (row_start, row_end_exclusive, col_start, col_end_exclusive).""" + return {(c["row"], c["row"] + c.get("row_span", 1), c["col"], c["col"] + c.get("col_span", 1)) for c in cells} + + +def grits_top(pred_table: dict, gt_table: dict) -> dict: + """GriTS-Top (topology) F1 over cell signatures. + + Score = F1 over the set of `(row_start, row_end, col_start, col_end)` topological + signatures of every cell. A perfect structural match is 1.0; different row/col + count or missed merges drive it down even when text is correct. 
+ """ + pred_sigs = _topology_signatures(pred_table["cells"]) + gt_sigs = _topology_signatures(gt_table["cells"]) + if not pred_sigs and not gt_sigs: + return {"precision": 1.0, "recall": 1.0, "f1": 1.0} + inter = len(pred_sigs & gt_sigs) + precision = inter / len(pred_sigs) if pred_sigs else 0.0 + recall = inter / len(gt_sigs) if gt_sigs else 0.0 + f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0 + return {"precision": precision, "recall": recall, "f1": f1} + + +def score_document(pred: list[dict], gt: list[dict], iou_threshold: float = 0.5) -> dict: + """Aggregate scores across all tables in one document. + + Returns table-detection F1 (matched vs unmatched), plus average cell F1 and GriTS-Top F1 + over matched pairs. Unmatched predicted tables are not scored for cell/GriTS but do hurt + table-detection precision; unmatched GT tables hurt detection recall. + """ + m = match_tables(pred, gt, iou_threshold=iou_threshold) + n_matched = len(m["matches"]) + n_pred = len(pred) + n_gt = len(gt) + td_p = n_matched / n_pred if n_pred > 0 else (1.0 if n_gt == 0 else 0.0) + td_r = n_matched / n_gt if n_gt > 0 else (1.0 if n_pred == 0 else 0.0) + td_f1 = (2 * td_p * td_r / (td_p + td_r)) if (td_p + td_r) > 0 else 0.0 + + cell_f1s: list[float] = [] + grits_f1s: list[float] = [] + per_table: list[dict] = [] + for pi, gi, iou in m["matches"]: + cell = cell_precision_recall(pred[pi], gt[gi]) + top = grits_top(pred[pi], gt[gi]) + cell_f1s.append(cell["f1"]) + grits_f1s.append(top["f1"]) + per_table.append( + { + "pred_idx": pi, + "gt_idx": gi, + "bbox_iou": iou, + "cell_f1": cell["f1"], + "cell_precision": cell["precision"], + "cell_recall": cell["recall"], + "grits_top_f1": top["f1"], + } + ) + avg_cell_f1 = sum(cell_f1s) / len(cell_f1s) if cell_f1s else 0.0 + avg_grits = sum(grits_f1s) / len(grits_f1s) if grits_f1s else 0.0 + return { + "table_detection_precision": td_p, + "table_detection_recall": td_r, + "table_detection_f1": td_f1, + 
"avg_cell_f1": avg_cell_f1, + "avg_grits_top_f1": avg_grits, + "n_matched": n_matched, + "n_pred": n_pred, + "n_gt": n_gt, + "per_table": per_table, + } + + +def predicted_tables_to_gt_shape(paperjam_tables: list, *, page: int) -> list[dict]: + """Convert paperjam.Table dataclasses into the harness dict shape.""" + out: list[dict] = [] + for t in paperjam_tables: + cells = [ + { + "row": r_idx, + "col": c_idx, + "row_span": int(getattr(cell, "row_span", 1)), + "col_span": int(getattr(cell, "col_span", 1)), + "text": cell.text, + } + for r_idx, row in enumerate(t.rows) + for c_idx, cell in enumerate(row.cells) + ] + out.append( + { + "page": page, + "bbox": list(t.bbox), + "col_count": int(t.col_count), + "row_count": int(t.row_count), + "cells": cells, + } + ) + return out diff --git a/tests/python/test_grits.py b/tests/python/test_grits.py new file mode 100644 index 0000000..f5209c3 --- /dev/null +++ b/tests/python/test_grits.py @@ -0,0 +1,86 @@ +"""Unit tests for the GriTS scorer — independent of the extractor.""" + +from __future__ import annotations + +from grits import ( + cell_precision_recall, + grits_top, + match_tables, + normalize_text, + score_document, +) + + +def _mk_table(cells: list[tuple[int, int, int, int, str]], page: int = 1, bbox=(0, 0, 100, 100), rows=None, cols=None) -> dict: + return { + "page": page, + "bbox": list(bbox), + "row_count": rows or (max((c[0] + c[2] for c in cells), default=0)), + "col_count": cols or (max((c[1] + c[3] for c in cells), default=0)), + "cells": [{"row": r, "col": c, "row_span": rs, "col_span": cs, "text": t} for r, c, rs, cs, t in cells], + } + + +def test_normalize_text_collapses_whitespace(): + assert normalize_text(" hello world\n ") == "hello world" + assert normalize_text("") == "" + + +def test_identical_tables_score_perfect(): + t = _mk_table([(0, 0, 1, 1, "A"), (0, 1, 1, 1, "B"), (1, 0, 1, 1, "x"), (1, 1, 1, 1, "y")]) + assert cell_precision_recall(t, t)["f1"] == 1.0 + assert grits_top(t, t)["f1"] == 1.0 + 
+ +def test_all_wrong_text_grits_perfect_cells_zero(): + gt = _mk_table([(0, 0, 1, 1, "A"), (0, 1, 1, 1, "B")]) + pred = _mk_table([(0, 0, 1, 1, "X"), (0, 1, 1, 1, "Y")]) + assert grits_top(pred, gt)["f1"] == 1.0 # structure identical + assert cell_precision_recall(pred, gt)["f1"] == 0.0 # no shared text + + +def test_missing_column_penalizes_both(): + gt = _mk_table([(0, 0, 1, 1, "A"), (0, 1, 1, 1, "B"), (0, 2, 1, 1, "C")]) + pred = _mk_table([(0, 0, 1, 1, "A"), (0, 1, 1, 1, "B")]) + assert grits_top(pred, gt)["f1"] < 1.0 + cell = cell_precision_recall(pred, gt) + assert cell["precision"] == 1.0 # every pred cell is right + assert cell["recall"] < 1.0 # missed one + + +def test_extra_merge_penalizes_grits(): + gt = _mk_table([(0, 0, 1, 1, "H1"), (0, 1, 1, 1, "H2")], cols=2, rows=1) + pred = _mk_table([(0, 0, 1, 2, "H1 H2")], cols=2, rows=1) # merged header + # Topological signatures differ — pred has one wide cell, gt has two narrow cells + assert grits_top(pred, gt)["f1"] == 0.0 + + +def test_table_detection_f1_counts_extra_tables(): + t = _mk_table([(0, 0, 1, 1, "A")]) + out = score_document([t, t], [t]) # predicted 2, gt 1 + assert out["table_detection_precision"] == 0.5 + assert out["table_detection_recall"] == 1.0 + + +def test_table_detection_f1_missed_table(): + t = _mk_table([(0, 0, 1, 1, "A")]) + out = score_document([], [t]) + assert out["table_detection_recall"] == 0.0 + assert out["n_matched"] == 0 + + +def test_match_tables_respects_page(): + t_p1 = _mk_table([(0, 0, 1, 1, "A")], page=1) + t_p2 = _mk_table([(0, 0, 1, 1, "A")], page=2) + m = match_tables([t_p1], [t_p2]) + assert m["matches"] == [] + assert m["unmatched_pred"] == [0] + assert m["unmatched_gt"] == [0] + + +def test_match_tables_low_iou_unmatched(): + t_far = _mk_table([(0, 0, 1, 1, "A")], bbox=(0, 0, 10, 10)) + t_near = _mk_table([(0, 0, 1, 1, "A")], bbox=(0, 0, 100, 100)) + # The small bbox is fully inside the big one, but the IoU (100 / 10000 = 0.01) is well below threshold. 
+ m = match_tables([t_far], [t_near]) + assert m["matches"] == [] diff --git a/tests/python/test_table_accuracy.py b/tests/python/test_table_accuracy.py new file mode 100644 index 0000000..6298414 --- /dev/null +++ b/tests/python/test_table_accuracy.py @@ -0,0 +1,100 @@ +"""Table extraction accuracy harness. + +Parametrized over every ``.gt.json`` fixture under ``tests/fixtures/tables/``. For each +fixture, opens the paired PDF, runs ``extract_tables`` on every page, converts the result +into the harness shape, and scores it against the ground truth. + +A final ``test_baseline_regression`` compares the aggregate to the committed baseline +and fails when table-detection F1, cell F1, or GriTS-Top F1 drops by more than 0.01 (absolute). Pass +``--update-baseline`` to refresh the committed numbers after an intentional improvement. + +Run: ``uv run pytest tests/python/ -m accuracy -v`` +""" + +from __future__ import annotations + +import json +import pathlib + +import paperjam +import pytest +from grits import predicted_tables_to_gt_shape, score_document + +FIXTURES_DIR = pathlib.Path(__file__).resolve().parent.parent / "fixtures" / "tables" +BASELINE_PATH = FIXTURES_DIR / ".accuracy_baseline.json" +REGRESSION_THRESHOLD = 0.01 # allow 1% noise; anything worse fails CI + +_GT_FILES = sorted(FIXTURES_DIR.glob("*.gt.json")) + + +def _load_pred(pdf_path: pathlib.Path) -> list[dict]: + doc = paperjam.open_pdf(str(pdf_path)) + out: list[dict] = [] + for page_idx, page in enumerate(doc.pages, start=1): + tables = page.extract_tables() + out.extend(predicted_tables_to_gt_shape(tables, page=page_idx)) + return out + + +@pytest.mark.accuracy +@pytest.mark.parametrize("gt_path", _GT_FILES, ids=lambda p: p.stem) +def test_extraction_accuracy(gt_path: pathlib.Path, accuracy_report): + pdf_path = gt_path.with_name(gt_path.name.removesuffix(".gt.json") + ".pdf") + assert pdf_path.exists(), f"fixture PDF missing for {gt_path.name}" + with open(gt_path) as f: + gt_doc = json.load(f) + gt_tables =
gt_doc["tables"] + pred_tables = _load_pred(pdf_path) + scores = score_document(pred_tables, gt_tables) + # Slim the per_table dump before storing so the report stays readable. + accuracy_report[gt_path.stem] = { + "table_detection_precision": scores["table_detection_precision"], + "table_detection_recall": scores["table_detection_recall"], + "table_detection_f1": scores["table_detection_f1"], + "avg_cell_f1": scores["avg_cell_f1"], + "avg_grits_top_f1": scores["avg_grits_top_f1"], + "n_matched": scores["n_matched"], + "n_pred": scores["n_pred"], + "n_gt": scores["n_gt"], + } + + +@pytest.mark.accuracy +def test_baseline_regression(accuracy_report, request): + """Fails if aggregate scores have dropped more than REGRESSION_THRESHOLD vs the committed baseline. + + Skipped when --update-baseline is passed, since we're refreshing the numbers in that run. + Also skipped when no baseline exists yet (first ever run; commit the baseline this run produces). + """ + if request.config.getoption("--update-baseline"): + pytest.skip("--update-baseline: regression gate disabled this run") + if not BASELINE_PATH.exists(): + pytest.skip(f"no baseline at {BASELINE_PATH}; run once with --update-baseline and commit it") + + # The session-scoped accuracy_report may not have been populated yet when this test runs + # in isolation; run it last in the file so the parametrized tests above have fired. + if not accuracy_report: + pytest.skip("no accuracy data collected in this session") + + baseline = json.loads(BASELINE_PATH.read_text())["aggregate"] + # Recompute aggregate from the session-scoped report. 
+ current = _aggregate(accuracy_report) + + failures: list[str] = [] + for field in ("table_detection_f1", "avg_cell_f1", "avg_grits_top_f1"): + b = float(baseline.get(field, 0.0)) + c = float(current.get(field, 0.0)) + if c + REGRESSION_THRESHOLD < b: + failures.append(f"{field}: {c:.4f} < baseline {b:.4f} (drop > {REGRESSION_THRESHOLD:.0%})") + assert not failures, "accuracy regression vs baseline:\n " + "\n ".join(failures) + + +def _aggregate(report: dict) -> dict: + fields = ("table_detection_f1", "avg_cell_f1", "avg_grits_top_f1") + totals = {f: 0.0 for f in fields} + n = 0 + for scores in report.values(): + for f in fields: + totals[f] += float(scores.get(f, 0.0)) + n += 1 + return {f: (totals[f] / n if n else 0.0) for f in fields}