From cff2538f1aad6c89a5f646aef6fca1e9d5ddcd6d Mon Sep 17 00:00:00 2001 From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com> Date: Fri, 17 Apr 2026 00:53:06 +0530 Subject: [PATCH 01/12] fix(docs-site): remove redundant 'paperjam' heading from hero MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The landing page rendered an h1 with the text 'paperjam' directly below the logo image, which itself contains the word 'paperjam'. Drop the duplicated heading — the logo's alt text keeps the product name accessible to screen readers, and Docusaurus already sets the HTML page title. --- docs-site/src/pages/index.tsx | 9 --------- 1 file changed, 9 deletions(-) diff --git a/docs-site/src/pages/index.tsx b/docs-site/src/pages/index.tsx index 7dd2aaa..c4f3b6b 100644 --- a/docs-site/src/pages/index.tsx +++ b/docs-site/src/pages/index.tsx @@ -137,15 +137,6 @@ export default function Home(): React.JSX.Element { borderRadius: '16px', }} /> - <h1 - style={{ - fontSize: '2.5rem', - fontWeight: 800, - marginBottom: '0.5rem', - }} - > - paperjam - </h1> <p style={{ fontSize: '1.25rem', From f3b7fe586ef30d31cf125404f50addf63c500ec1 Mon Sep 17 00:00:00 2001 From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com> Date: Fri, 17 Apr 2026 00:53:18 +0530 Subject: [PATCH 02/12] chore(deps): add reportlab + pandas dev deps and pytest 'accuracy' marker Sets up the foundation for the table extraction accuracy harness: - reportlab for deterministic synthetic PDF fixture generation. - pandas promoted from optional to dev so tests can exercise Table.to_dataframe() in accuracy scoring. - Registers the 'accuracy' pytest marker used by the harness tests. 
--- pyproject.toml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 5884a34..9553cf2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,8 @@ dev = [ "pdfplumber>=0.10", "mypy>=1.8", "ruff>=0.3", + "reportlab>=4.0,<4.2", + "pandas>=2.0", ] [tool.maturin] @@ -56,6 +58,9 @@ include = [ [tool.pytest.ini_options] testpaths = ["tests/python"] +markers = [ + "accuracy: table extraction accuracy harness (run with -m accuracy)", +] [tool.ruff] target-version = "py312" From caf3fe8084e9ad940c8299c10769bc04010e0e62 Mon Sep 17 00:00:00 2001 From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com> Date: Fri, 17 Apr 2026 00:53:27 +0530 Subject: [PATCH 03/12] feat(fixtures): add deterministic synthetic table fixture generator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each fixture is described by a single TableSpec that is the shared source of truth for both the rendered PDF and the ground-truth JSON sidecar — by construction they cannot drift apart. - reportlab.rl_config.invariant = 1 + pageCompression = 0 for bit-identical output across runs (verified via SHA256). - 8 fixture types covering bordered/borderless, merged cells, borderless financial/invoice layouts, a landscape page, sparse subtotal rows, and a 2-page continuation with repeated header. - Output lands under tests/fixtures/tables/, paired <name>.pdf + <name>.gt.json. --- scripts/generate_table_fixtures.py | 345 +++++++++++++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100644 scripts/generate_table_fixtures.py diff --git a/scripts/generate_table_fixtures.py b/scripts/generate_table_fixtures.py new file mode 100644 index 0000000..5b2feea --- /dev/null +++ b/scripts/generate_table_fixtures.py @@ -0,0 +1,345 @@ +#!/usr/bin/env python3 +"""Generate deterministic synthetic PDF fixtures for the table-extraction accuracy harness. 
+ +Each fixture is described by a ``TableSpec`` that is the single source of truth for BOTH +the rendered PDF and the ground-truth JSON sidecar — by construction they cannot drift. + +Usage: + uv run python scripts/generate_table_fixtures.py +""" + +from __future__ import annotations + +import json +import pathlib +from dataclasses import dataclass + +import reportlab.rl_config + +reportlab.rl_config.invariant = 1 # deterministic PDF metadata (fixed /CreationDate, /ID, etc.) + +from reportlab.lib.pagesizes import landscape, letter # noqa: E402 +from reportlab.pdfgen import canvas # noqa: E402 + +REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent +OUT_DIR = REPO_ROOT / "tests" / "fixtures" / "tables" + +FONT_REGULAR = "Helvetica" +FONT_BOLD = "Helvetica-Bold" +FONT_SIZE = 10 + + +@dataclass(frozen=True) +class CellSpec: + row: int + col: int + text: str + row_span: int = 1 + col_span: int = 1 + + +@dataclass +class TableSpec: + """One source of truth for a single-table single-page fixture.""" + + name: str + page_size: tuple[float, float] + bordered: bool + origin: tuple[float, float] # (x_left, y_top) in PDF coords (bottom-left origin) + col_widths: list[float] + row_heights: list[float] + cells: list[CellSpec] + notes: str + bold_header: bool = False + + +def _simple_cells(data: list[list[str]]) -> list[CellSpec]: + return [CellSpec(row=r, col=c, text=t) for r, row in enumerate(data) for c, t in enumerate(row)] + + +def _render_table(c: canvas.Canvas, spec: TableSpec, *, page_y_top: float | None = None) -> None: + """Render one table onto the current canvas page, using spec.origin (or override y).""" + x_left, y_top = spec.origin + if page_y_top is not None: + y_top = page_y_top + xs = [x_left] + for w in spec.col_widths: + xs.append(xs[-1] + w) + ys = [y_top] + for h in spec.row_heights: + ys.append(ys[-1] - h) + + for cell in spec.cells: + x1 = xs[cell.col] + x2 = xs[cell.col + cell.col_span] + y1 = ys[cell.row + cell.row_span] + y2 = ys[cell.row] + if 
spec.bordered: + c.setStrokeColorRGB(0, 0, 0) + c.setLineWidth(0.5) + c.rect(x1, y1, x2 - x1, y2 - y1) + font = FONT_BOLD if (spec.bold_header and cell.row == 0) else FONT_REGULAR + c.setFont(font, FONT_SIZE) + text_w = c.stringWidth(cell.text, font, FONT_SIZE) + tx = (x1 + x2) / 2 - text_w / 2 + ty = (y1 + y2) / 2 - FONT_SIZE / 3 # rough vertical centering + c.drawString(tx, ty, cell.text) + + +def _table_bbox(spec: TableSpec) -> list[float]: + x_left, y_top = spec.origin + return [ + x_left, + y_top - sum(spec.row_heights), + x_left + sum(spec.col_widths), + y_top, + ] + + +def _gt_for_spec(spec: TableSpec, *, page: int = 1) -> dict: + return { + "page": page, + "bbox": _table_bbox(spec), + "col_count": len(spec.col_widths), + "row_count": len(spec.row_heights), + "cells": [ + { + "row": c.row, + "col": c.col, + "row_span": c.row_span, + "col_span": c.col_span, + "text": c.text, + } + for c in spec.cells + ], + } + + +def _canvas_for(name: str, page_size: tuple[float, float]) -> canvas.Canvas: + pdf_path = OUT_DIR / f"{name}.pdf" + c = canvas.Canvas(str(pdf_path), pagesize=page_size, pageCompression=0) + c.setCreator("paperjam-test-fixtures") + c.setTitle(name) + c.setSubject("Table extraction accuracy fixture") + return c + + +def _write_gt(name: str, notes: str, tables: list[dict]) -> None: + data = { + "source": "synthetic", + "license": "generated", + "notes": notes, + "tables": tables, + } + (OUT_DIR / f"{name}.gt.json").write_text(json.dumps(data, indent=2) + "\n") + + +def emit_single_table_fixture(spec: TableSpec) -> None: + c = _canvas_for(spec.name, spec.page_size) + _render_table(c, spec) + c.showPage() + c.save() + _write_gt(spec.name, spec.notes, [_gt_for_spec(spec)]) + + +def emit_multipage_continuation() -> None: + name = "multipage_continuation" + header = ["Year", "Metric", "Value"] + page1_rows = [[f"201{i}", "Revenue", str(100 * (i + 1))] for i in range(6)] + page2_rows = [[f"202{i}", "Revenue", str(200 * (i + 1))] for i in range(6)] + + 
col_widths = [100.0, 120.0, 100.0] + row_heights = [24.0] * 7 # header + 6 data rows + origin = (72.0, 720.0) + + c = _canvas_for(name, letter) + for page_data in (page1_rows, page2_rows): + spec = TableSpec( + name=name, + page_size=letter, + bordered=True, + origin=origin, + col_widths=col_widths, + row_heights=row_heights, + cells=_simple_cells([header, *page_data]), + notes="", + ) + _render_table(c, spec) + c.showPage() + c.save() + + def _page_gt(page_idx: int, rows: list[list[str]]) -> dict: + cells = [{"row": r, "col": cc, "row_span": 1, "col_span": 1, "text": t} for r, row in enumerate([header, *rows]) for cc, t in enumerate(row)] + return { + "page": page_idx, + "bbox": [origin[0], origin[1] - sum(row_heights), origin[0] + sum(col_widths), origin[1]], + "col_count": len(col_widths), + "row_count": len(row_heights), + "cells": cells, + } + + _write_gt( + name, + "2-page table, header repeats on page 2; Phase 0 scores per-page tables", + [_page_gt(1, page1_rows), _page_gt(2, page2_rows)], + ) + + +def build_fixtures() -> list[TableSpec]: + specs: list[TableSpec] = [ + TableSpec( + name="bordered_simple", + page_size=letter, + bordered=True, + origin=(72.0, 700.0), + col_widths=[100.0, 100.0, 100.0, 100.0], + row_heights=[24.0, 24.0, 24.0], + cells=_simple_cells( + [ + ["Header A", "Header B", "Header C", "Header D"], + ["a1", "b1", "c1", "d1"], + ["a2", "b2", "c2", "d2"], + ] + ), + notes="3x4 fully bordered, single-line cells", + ), + TableSpec( + name="bordered_dense", + page_size=letter, + bordered=True, + origin=(54.0, 720.0), + col_widths=[60.0] * 6, + row_heights=[22.0] * 8, + cells=_simple_cells( + [ + ["Year", "Q1", "Q2", "Q3", "Q4", "Total"], + ["2016", "100", "110", "120", "130", "460"], + ["2017", "105", "115", "125", "135", "480"], + ["2018", "110", "120", "130", "140", "500"], + ["2019", "115", "125", "135", "145", "520"], + ["2020", "120", "130", "140", "150", "540"], + ["2021", "125", "135", "145", "155", "560"], + ["2022", "130", "140", 
"150", "160", "580"], + ] + ), + notes="8x6 bordered with numeric data, tight spacing", + ), + TableSpec( + name="bordered_merged", + page_size=letter, + bordered=True, + origin=(72.0, 700.0), + col_widths=[80.0, 80.0, 80.0, 80.0], + row_heights=[24.0] * 5, + cells=[ + CellSpec(row=0, col=0, text="Year", row_span=2, col_span=1), + CellSpec(row=0, col=1, text="Revenue", row_span=1, col_span=2), + CellSpec(row=0, col=3, text="Notes", row_span=2, col_span=1), + CellSpec(row=1, col=1, text="Q1"), + CellSpec(row=1, col=2, text="Q2"), + CellSpec(row=2, col=0, text="2020"), + CellSpec(row=2, col=1, text="100"), + CellSpec(row=2, col=2, text="120"), + CellSpec(row=2, col=3, text="stable"), + CellSpec(row=3, col=0, text="2021"), + CellSpec(row=3, col=1, text="110"), + CellSpec(row=3, col=2, text="130"), + CellSpec(row=3, col=3, text="up"), + CellSpec(row=4, col=0, text="2022"), + CellSpec(row=4, col=1, text="120"), + CellSpec(row=4, col=2, text="140"), + CellSpec(row=4, col=3, text="up"), + ], + notes="5x4 with row-span merges (Year, Notes) and col-span merge (Revenue)", + ), + TableSpec( + name="borderless_financial", + page_size=letter, + bordered=False, + origin=(72.0, 700.0), + col_widths=[120.0, 80.0, 80.0, 80.0], + row_heights=[24.0] * 6, + cells=_simple_cells( + [ + ["Metric", "2021", "2022", "2023"], + ["Revenue", "1,000", "1,200", "1,500"], + ["COGS", "600", "720", "900"], + ["Gross", "400", "480", "600"], + ["OpEx", "200", "240", "300"], + ["Net", "200", "240", "300"], + ] + ), + notes="6x4 borderless financial statement", + ), + TableSpec( + name="borderless_invoice", + page_size=letter, + bordered=False, + origin=(72.0, 700.0), + col_widths=[160.0, 60.0, 80.0], + row_heights=[24.0] * 5, + cells=_simple_cells( + [ + ["Item", "Qty", "Price"], + ["Widget A", "2", "10.00"], + ["Widget B", "1", "25.00"], + ["Gadget", "3", "15.00"], + ["Gizmo", "5", "5.00"], + ] + ), + notes="5x3 borderless invoice with bold header row", + bold_header=True, + ), + TableSpec( + 
name="rotated_landscape", + page_size=landscape(letter), + bordered=True, + origin=(72.0, 500.0), + col_widths=[100.0] * 5, + row_heights=[24.0] * 4, + cells=_simple_cells( + [ + ["Col 1", "Col 2", "Col 3", "Col 4", "Col 5"], + ["a", "b", "c", "d", "e"], + ["f", "g", "h", "i", "j"], + ["k", "l", "m", "n", "o"], + ] + ), + notes="4x5 bordered on a landscape-oriented page", + ), + TableSpec( + name="sparse_cells", + page_size=letter, + bordered=False, + origin=(72.0, 700.0), + col_widths=[90.0, 60.0, 70.0, 70.0, 70.0], + row_heights=[24.0] * 7, + cells=_simple_cells( + [ + ["Item", "Qty", "Price", "Total", "Note"], + ["Apple", "2", "1.00", "2.00", ""], + ["Banana", "5", "0.50", "2.50", ""], + ["Subtotal", "", "", "4.50", ""], + ["Orange", "3", "1.50", "4.50", ""], + ["Grape", "1", "3.00", "3.00", ""], + ["Total", "", "", "12.00", ""], + ] + ), + notes="7x5 borderless with blank cells on subtotal/total rows", + ), + ] + return specs + + +def main() -> None: + OUT_DIR.mkdir(parents=True, exist_ok=True) + for spec in build_fixtures(): + emit_single_table_fixture(spec) + print(f" wrote {spec.name}.pdf + .gt.json") + emit_multipage_continuation() + print(" wrote multipage_continuation.pdf + .gt.json") + print(f"Done. Fixtures in {OUT_DIR.relative_to(REPO_ROOT)}") + + +if __name__ == "__main__": + main() From ee02e1fb9d2ec99d7efc20f0d250ed95d6a68c78 Mon Sep 17 00:00:00 2001 From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com> Date: Fri, 17 Apr 2026 00:53:39 +0530 Subject: [PATCH 04/12] feat(fixtures): commit generated table fixtures and ground-truth sidecars Output of scripts/generate_table_fixtures.py checked in so the harness and CI don't depend on regenerating at test time. 
8 fixtures totaling ~50 KB: - bordered_simple, bordered_dense, bordered_merged - borderless_financial, borderless_invoice - multipage_continuation (2 pages, repeated header) - rotated_landscape (landscape page) - sparse_cells (borderless with blank subtotal cells) Regeneration is verified for bit-identical output in CI. --- tests/fixtures/tables/bordered_dense.gt.json | 356 ++++++++++++++++++ tests/fixtures/tables/bordered_dense.pdf | Bin 0 -> 6201 bytes tests/fixtures/tables/bordered_merged.gt.json | 139 +++++++ tests/fixtures/tables/bordered_merged.pdf | Bin 0 -> 3055 bytes tests/fixtures/tables/bordered_simple.gt.json | 104 +++++ tests/fixtures/tables/bordered_simple.pdf | Bin 0 -> 2562 bytes .../tables/borderless_financial.gt.json | 188 +++++++++ .../fixtures/tables/borderless_financial.pdf | Bin 0 -> 2951 bytes .../tables/borderless_invoice.gt.json | 125 ++++++ tests/fixtures/tables/borderless_invoice.pdf | Bin 0 -> 2485 bytes .../tables/multipage_continuation.gt.json | 327 ++++++++++++++++ .../tables/multipage_continuation.pdf | Bin 0 -> 6044 bytes .../fixtures/tables/rotated_landscape.gt.json | 160 ++++++++ tests/fixtures/tables/rotated_landscape.pdf | Bin 0 -> 3352 bytes tests/fixtures/tables/sparse_cells.gt.json | 265 +++++++++++++ tests/fixtures/tables/sparse_cells.pdf | Bin 0 -> 3579 bytes 16 files changed, 1664 insertions(+) create mode 100644 tests/fixtures/tables/bordered_dense.gt.json create mode 100644 tests/fixtures/tables/bordered_dense.pdf create mode 100644 tests/fixtures/tables/bordered_merged.gt.json create mode 100644 tests/fixtures/tables/bordered_merged.pdf create mode 100644 tests/fixtures/tables/bordered_simple.gt.json create mode 100644 tests/fixtures/tables/bordered_simple.pdf create mode 100644 tests/fixtures/tables/borderless_financial.gt.json create mode 100644 tests/fixtures/tables/borderless_financial.pdf create mode 100644 tests/fixtures/tables/borderless_invoice.gt.json create mode 100644 
tests/fixtures/tables/borderless_invoice.pdf create mode 100644 tests/fixtures/tables/multipage_continuation.gt.json create mode 100644 tests/fixtures/tables/multipage_continuation.pdf create mode 100644 tests/fixtures/tables/rotated_landscape.gt.json create mode 100644 tests/fixtures/tables/rotated_landscape.pdf create mode 100644 tests/fixtures/tables/sparse_cells.gt.json create mode 100644 tests/fixtures/tables/sparse_cells.pdf diff --git a/tests/fixtures/tables/bordered_dense.gt.json b/tests/fixtures/tables/bordered_dense.gt.json new file mode 100644 index 0000000..4d7b1e8 --- /dev/null +++ b/tests/fixtures/tables/bordered_dense.gt.json @@ -0,0 +1,356 @@ +{ + "source": "synthetic", + "license": "generated", + "notes": "8x6 bordered with numeric data, tight spacing", + "tables": [ + { + "page": 1, + "bbox": [ + 54.0, + 544.0, + 414.0, + 720.0 + ], + "col_count": 6, + "row_count": 8, + "cells": [ + { + "row": 0, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Year" + }, + { + "row": 0, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Q1" + }, + { + "row": 0, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "Q2" + }, + { + "row": 0, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "Q3" + }, + { + "row": 0, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "Q4" + }, + { + "row": 0, + "col": 5, + "row_span": 1, + "col_span": 1, + "text": "Total" + }, + { + "row": 1, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2016" + }, + { + "row": 1, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "100" + }, + { + "row": 1, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "110" + }, + { + "row": 1, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "120" + }, + { + "row": 1, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "130" + }, + { + "row": 1, + "col": 5, + "row_span": 1, + "col_span": 1, + "text": "460" + }, + { + "row": 2, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2017" + }, + { + 
"row": 2, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "105" + }, + { + "row": 2, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "115" + }, + { + "row": 2, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "125" + }, + { + "row": 2, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "135" + }, + { + "row": 2, + "col": 5, + "row_span": 1, + "col_span": 1, + "text": "480" + }, + { + "row": 3, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2018" + }, + { + "row": 3, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "110" + }, + { + "row": 3, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "120" + }, + { + "row": 3, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "130" + }, + { + "row": 3, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "140" + }, + { + "row": 3, + "col": 5, + "row_span": 1, + "col_span": 1, + "text": "500" + }, + { + "row": 4, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2019" + }, + { + "row": 4, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "115" + }, + { + "row": 4, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "125" + }, + { + "row": 4, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "135" + }, + { + "row": 4, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "145" + }, + { + "row": 4, + "col": 5, + "row_span": 1, + "col_span": 1, + "text": "520" + }, + { + "row": 5, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2020" + }, + { + "row": 5, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "120" + }, + { + "row": 5, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "130" + }, + { + "row": 5, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "140" + }, + { + "row": 5, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "150" + }, + { + "row": 5, + "col": 5, + "row_span": 1, + "col_span": 1, + "text": "540" + }, + { + "row": 6, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2021" + }, + { + 
"row": 6, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "125" + }, + { + "row": 6, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "135" + }, + { + "row": 6, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "145" + }, + { + "row": 6, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "155" + }, + { + "row": 6, + "col": 5, + "row_span": 1, + "col_span": 1, + "text": "560" + }, + { + "row": 7, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2022" + }, + { + "row": 7, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "130" + }, + { + "row": 7, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "140" + }, + { + "row": 7, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "150" + }, + { + "row": 7, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "160" + }, + { + "row": 7, + "col": 5, + "row_span": 1, + "col_span": 1, + "text": "580" + } + ] + } + ] +} diff --git a/tests/fixtures/tables/bordered_dense.pdf b/tests/fixtures/tables/bordered_dense.pdf new file mode 100644 index 0000000000000000000000000000000000000000..1ad86c2177e820ef989fd5c44017c57454455ca2 GIT binary patch literal 6201 zcmbW5+iv4F5J2Dc74s6*X<<vGZuTMwu(@r6&2CdiMNtGnLCZ{{z?KY2Zj$~&`_}h< zYiDRN5wk80CKubHC>~ypN7U?Ho}3NYDD?LJ`TMVbVJcTu-Cl?pe3GTCMJwlkHDF%l z>!mDPxNqCl2j72ud>qxP?n2B)dA0Ny1W?T$yn_SJKV!i0aOxc%da|5TF&*pEN1~Bu z6*lJoC>MXow#Wter)6Hvi}DWqUyJf%*%Yq@U&T@aHR@ljXYKO}4>EkkV|v3g*j5h} z_E5)o3cKAvO8o)<i<}qYsCt5HJPsgX9MTDg8}KhhP2)>8^`^3^)^#o$P~)M_(g3lA z%eu<1WQz^>G%<cApKu`OOK~TUy7%Mm{TyzvTi{Qt7AF!f_p@4*s!1yMKo8&&FYXN> z!m4_%=+u{Lj-&jgkzcD4KTuJlJGz;HW6_F5b=PYn?zQo8-QHI<3`AL#&&z7v?1O(? zOVJipc|tuKoP6K`{xJMiKR*P)clf94(ZwsVlJ$dF4qMr@!`tGiUDuLY`YfLxt9stR z01vRMyIRBvoUe0<O}@%TGkH<WYEeJK5WbOXU(FRR{)5b07-V9$kbv`Ei=4&)BG1?O z`5C%yVMnr}#X5r-_Dt4t{`*{(jimBgtX3$se<l`9f6{5;^`swHYqjA1=K{xd4P?}m zB-zZiT7ceMlUw@I3t8T^_Yh6^glaS`&hzHW5h#NK$d>>|nerL!%Wi>1qX@DKIL$nA z5>cPLQ4Eh>32_9;WDH3FoI@?)%F|Z{>Poy`U5BNlCn3^s6h)8*@hC}>6tX1@ev_i! 
zhwK5e@3%o1bEmK{9fhHZ`I*@u={q2kEf8*j@X#H?<1LWT2AQ}+Lbe4GSs>v%Rz6Jm zXgrS1MaeMsEG!`6q=}nhgn?unz8b~@mRKOlFh>dt$zT(M(aA<3#at<qEfi*>D90Qr zJl;a#RtibxND0{%N@$}f%Umhxx0EQcQAjcRcL`z}#S&&Hmf9Adf|w&kE7r%bAhuCR zF;_~bm_cDSic-vxq7^eJ+)5$E94T5cgA&>(N-<YTr&ym7jdv*)^7kkfP7KAyw#6sK z94T6{K89K23nRr`DV<^l#U5YiPU%R|iWwAZeBq>+BSkA_P$JvnE5%$Xonn1TY`wdX z!wjDo+@l!Y`(L(Vc%z)`Qp}N}73*V|HNGgtTq&Jm2E`s<NHIr>R?MJS<BL+vk)jnd zDACR+=1S=l>r-Ouml73=X=mlESgaPO+wJqf3ZcE2E2UG+z%V-nqnI;gqnJst#}`!0 zmC`9@QmpZX6mz9?ikXynR}^!mY!ovnv2`q@811Uv6~pJsUKHcjXFXEPk)jprW0*C* zP!w~fbcz`idwe0q9LJ&+Gbq;hq7-waXvGXlyeo>ijwS3A>r>)g+x-|b5bsbdHn#iR z_N1?(m?K3i*2gewd{K(IQaZ&9iuF55+kMv=(~21sYkX0PIgUjuW>Bn)PkS-fv1rBm zl-Str17(rBR9cr$eyZgyrpw0OCSggh_%suTG2FsSS&E4}b*6kNV*zGY-;{BN$+I_Q z(Uy8qidpP8_0klxe{aghAtn=b873)<g{)Ox{QLyG>w}QXh^GOcvpCO#36tqeq)8M8 zoKJ)p=ZCv%+<1GvRQ=udRQ+%W^8!=Fm@)moto^>{`RC<r1(YtHdYGxMTA=hZe!eRH YlsLaCi7Dz<O?W(JiMO|R`qP>BADZ!6eE<Le literal 0 HcmV?d00001 diff --git a/tests/fixtures/tables/bordered_merged.gt.json b/tests/fixtures/tables/bordered_merged.gt.json new file mode 100644 index 0000000..57713dc --- /dev/null +++ b/tests/fixtures/tables/bordered_merged.gt.json @@ -0,0 +1,139 @@ +{ + "source": "synthetic", + "license": "generated", + "notes": "5x4 with row-span merges (Year, Notes) and col-span merge (Revenue)", + "tables": [ + { + "page": 1, + "bbox": [ + 72.0, + 580.0, + 392.0, + 700.0 + ], + "col_count": 4, + "row_count": 5, + "cells": [ + { + "row": 0, + "col": 0, + "row_span": 2, + "col_span": 1, + "text": "Year" + }, + { + "row": 0, + "col": 1, + "row_span": 1, + "col_span": 2, + "text": "Revenue" + }, + { + "row": 0, + "col": 3, + "row_span": 2, + "col_span": 1, + "text": "Notes" + }, + { + "row": 1, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Q1" + }, + { + "row": 1, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "Q2" + }, + { + "row": 2, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2020" + }, + { + "row": 2, + "col": 1, + "row_span": 1, + "col_span": 1, + 
"text": "100" + }, + { + "row": 2, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "120" + }, + { + "row": 2, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "stable" + }, + { + "row": 3, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2021" + }, + { + "row": 3, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "110" + }, + { + "row": 3, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "130" + }, + { + "row": 3, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "up" + }, + { + "row": 4, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2022" + }, + { + "row": 4, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "120" + }, + { + "row": 4, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "140" + }, + { + "row": 4, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "up" + } + ] + } + ] +} diff --git a/tests/fixtures/tables/bordered_merged.pdf b/tests/fixtures/tables/bordered_merged.pdf new file mode 100644 index 0000000000000000000000000000000000000000..8affa2256d44fdddb2b55fae39bddb13d240d835 GIT binary patch literal 3055 zcmbVO+iv4F5PjEI%!^T{g>^TjEP?<VC*BP<o2HJ6q6mV5mPU>OTQn$Y9rqX7x4!pV zJEZMcPFy8kC$P*RIcH|hp*WqJ^I^}k1FQ4T-+%oJBb*DVuJ{-}VTqC}%z#!vCenI_ zrGf{g<{unqu~^vBtex_)or;;|feT{%Xq}u`&d>v&h9m3r)WR~;W<B0apYaNZf}A;@ zarzBak#cY@%2Z@UIRWRlqWoA^#ny1mX9(J<b6bzqa!!MUzG<u<G()}`P(cG~KvNWU z1(A*ioUfP_{7gK-9gSUJo)6Kn5BK2QaH;cSt92u+L@iUSz~rNY)d`Y^8!6ITROG<X zg>e#mqC_rde1d1|=V1N3gnJ4LoRLseNZRfsl9$G%wp*}C;D$Cg5(sJOrdE3DtH>zJ zUn{&8CH>H*Y7^+H0t2r2R7@H^`VAi+YxN)`ba^St<xJGo5jX>hTos}`*O7J4Klm<v zJo+=ApIr9``roY6o9BFv@{!Ma3M<viizih})SkZJav@|^L6-&;>RTSEfLWa)x!j6P zSMsVDOD>ntgID4@GPNS|A2C(XP55|<Ky{Zq)j0r9)0%!SVI39)k`#*8bjK7KO3eP4 zp`2i*CnTTGN!8AfPpf9tHS?RQ4@7MkcfJ&q*d1s|jkHQu<u(j7imk=9*?NWLL_L5P z#lBuqDJt~pbht)BE~GPnv&0CR<d@{&g?0$Z6<j2i79~M_vg|%AtP&z0Sl@@(g)jz* zaBFRLx@ITsH~Yw{^{F9<L^}+DMZV1#iy)an_jly-2$Dxge%y!R5d@1Ec+3Sqd?$|Y zdG_%!usGb}jPM(lHMT$nl=7h&1wm{dzsTrXkPcgLOiHWDy*nc>?%P2CjJey4e(_pB 
z{NOp6cZ2wB4~X9aq9x$RS_1FJpi^weF|dGb6Y*W&Z9y^hIus@j$p0=3LARiof*pwB zvpp!SxfqQdh@v(2GM7r}7fnk#ni@C!E^55S=_G2}RA#6-6lP6L6UOu2q~<`B4K+I` zK^uyp=0KDnWafp|M5*WPwDjq?e8-9D$65vJmnPBDb}lt72g9sr*@5v|wX9Ffp(txw zc2UB&XgL&RP0J3-4lSU2zxDmM?gF1A=5)u!)~e0W-*g-FgFfW&Y>Vi&H8bV2%_GUY zw9)<Z$`<Zf7t$$xxi^a9SGG8yYkY%Ctwk|KX}a0VbFl7C`k1+6KF+f|&qLmiGGsoE zBbMfVkca4=zP;k!>NFbc$^Hhr*MqE}29Nrj|Le7nEX%npa{;=^8(Gv}3I)1pq~Ev2 YpGfsJod=i76x?T#YjrvoKM$?{0PDQoUjP6A literal 0 HcmV?d00001 diff --git a/tests/fixtures/tables/bordered_simple.gt.json b/tests/fixtures/tables/bordered_simple.gt.json new file mode 100644 index 0000000..e59a843 --- /dev/null +++ b/tests/fixtures/tables/bordered_simple.gt.json @@ -0,0 +1,104 @@ +{ + "source": "synthetic", + "license": "generated", + "notes": "3x4 fully bordered, single-line cells", + "tables": [ + { + "page": 1, + "bbox": [ + 72.0, + 628.0, + 472.0, + 700.0 + ], + "col_count": 4, + "row_count": 3, + "cells": [ + { + "row": 0, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Header A" + }, + { + "row": 0, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Header B" + }, + { + "row": 0, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "Header C" + }, + { + "row": 0, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "Header D" + }, + { + "row": 1, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "a1" + }, + { + "row": 1, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "b1" + }, + { + "row": 1, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "c1" + }, + { + "row": 1, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "d1" + }, + { + "row": 2, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "a2" + }, + { + "row": 2, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "b2" + }, + { + "row": 2, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "c2" + }, + { + "row": 2, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "d2" + } + ] + } + ] +} diff --git 
a/tests/fixtures/tables/bordered_simple.pdf b/tests/fixtures/tables/bordered_simple.pdf new file mode 100644 index 0000000000000000000000000000000000000000..0244bb3240e44037886ba50af52df5684a547014 GIT binary patch literal 2562 zcmbVOO>f&c5WVwP%*Cj)3tOL-)FKEF+i@E-Z39O|4?(b?q_LyG5)F!4$NLwyx8D0( zJEW~vNL>Yv6IkMKIB({?G0EQbbT)8?9_xMk`u}&B<61~{#S?goB}%R^1M+}Oq;-L% zf_tUbZ!BxG*$k!eUGZd?ih?;{gGe6O=_#{j4!AU&v$HdXWv0z~+)R&og)>3HtarHj z7ge5eurA6}WO=y+>q}mqmsS4D@R1h?I;eG1Cu+N<K}_E?)(@JYSS={81vQ~5O1pwk z=L6OU%yK>!k8n$48v@6L@U;tfU|n;m>kD>%bF4%yQ>?($qm$JIQiN+M(i>D1z|zE6 zF+Nfumjz$qc>kR2pO<h)X@NBtiio7;RxEjGLTWpMrhsc&+$z8)*M=)S^+9A*<>v}N zijrQmscI74F)-nZuf(zyW7LXqUaNZ{q0dWEZVORYCtyt^a+Qm6sx#|P-?%n?9Qre# zzis;!{WpGk@tUtuKJa3ouu_9X{-|n+I?{XGZiLJ#=+l5weJ&yqnAIs#$Y0H-nY_vq z$>kOX@LO}87_LbC2TT?8W1g%Kh<C|TT?6nmt?7LW`?M&LI9KG;Pbf2#nEg}b#d?K$ zLh|*Ry4srYRn=x)E8nm_5w%g=dY@Bax1d{UuDfJMw^5))wr;Lw=@phsbq|gkz1AL; zBBFP{4%@Vl4QT-|j!i>T`^5`z{Gku=6<owjHzh&6Wy29{SP7vEK^OqZ0<I4d;f6I! zZL^dDn1y834XQE7c^E|yME)=cf)L^Y`tOhv4{&}0@dLzv9fERRf<<lfC#-P?mirPc zkFWxJbg-~V2blK~FuwzA+5x7qM(&Y<Il<8PAqc3xMlkMlfEcNcMR5<HBwZ**q+?M$ zLeU(wYe~CMj5^1n_y=mqx=?gVZuBx)yr3nE`<yJtqBOF!QIakcBg?TUjVx`Hv<t<^ zax6+COB*F?qkz5+?6<`I#o$rmg0{Vg?M@#3O~0U#H-ZKHw1u<*o0;;b&9Uj2d}4DQ z+5?~1`~!YIwZl`tFpPe)MIP<!O*}en@)b&R!MU6QyFCpe*Nzf<F~TH@7FiIW?QwSz zda3V5neUuE?{UX^tvlE9@SSTgfGnpopN_siUB6Bkvo6bp0DbPy8C^Mq0)3C5_nZ7b WB!10d#HBKXyG|IgUhm@5jQs!<q;`Y= literal 0 HcmV?d00001 diff --git a/tests/fixtures/tables/borderless_financial.gt.json b/tests/fixtures/tables/borderless_financial.gt.json new file mode 100644 index 0000000..c23d338 --- /dev/null +++ b/tests/fixtures/tables/borderless_financial.gt.json @@ -0,0 +1,188 @@ +{ + "source": "synthetic", + "license": "generated", + "notes": "6x4 borderless financial statement", + "tables": [ + { + "page": 1, + "bbox": [ + 72.0, + 556.0, + 432.0, + 700.0 + ], + "col_count": 4, + "row_count": 6, + "cells": [ + { + "row": 0, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Metric" + }, + { + 
"row": 0, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "2021" + }, + { + "row": 0, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "2022" + }, + { + "row": 0, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "2023" + }, + { + "row": 1, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 1, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "1,000" + }, + { + "row": 1, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "1,200" + }, + { + "row": 1, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "1,500" + }, + { + "row": 2, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "COGS" + }, + { + "row": 2, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "600" + }, + { + "row": 2, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "720" + }, + { + "row": 2, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "900" + }, + { + "row": 3, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Gross" + }, + { + "row": 3, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "400" + }, + { + "row": 3, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "480" + }, + { + "row": 3, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "600" + }, + { + "row": 4, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "OpEx" + }, + { + "row": 4, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "200" + }, + { + "row": 4, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "240" + }, + { + "row": 4, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "300" + }, + { + "row": 5, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Net" + }, + { + "row": 5, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "200" + }, + { + "row": 5, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "240" + }, + { + "row": 5, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "300" + } + ] + } + ] +} diff --git a/tests/fixtures/tables/borderless_financial.pdf 
b/tests/fixtures/tables/borderless_financial.pdf new file mode 100644 index 0000000000000000000000000000000000000000..7423cc4061dd5654e5e91da6aad7af2db08aa379 GIT binary patch literal 2951 zcmbW3-*2ln6vyBBE6z)_bXx!$FyyMLbelF^b^A*x(;lj7GT@|C366}pP4+KrZ+qY0 zx?>V<NSo=7BvHV}#~+TrKE|i_a4{Yb+p~Iq|MllTFcC|s^bJqoN)$qIEiyn4$YfeA zM4{nD>*XiMd3$@aRqZ?HiJi)YMZg7_yjrKHmNO<mu{p8M&MZ-6#%$V6_c<@cSR%~% zMa;hnou?d}%OaIoUd+JxEiXP7Wxg}q@r3{bbsnokub0?}@gLjff<4G;J<4l6YJ^=F zb_t=02b^1x<@{W(;0fC<FhU_5Qg{aEfh#jVw(*;YQdTMzCDijV!I}vo;X%ptQD_7> zMi?g+E1byng3rYH_Bz^Lui+WP0%syM3W>v=Sn;Ap8oLF}1Rij3YXUyHHd>jkTbbc3 ze=WtGEbzvd$|lf_0wb>ZT+UjIpvCxH=@+S>&kI?s7qTi(z!@pQbuNnw6IuV_6Ls-J z@LPX=a@`;B*Z7&iOTHB9l`jTblzK4DSGrQdkiLlZn^ai|eQaQ;-y@=cS(OSz?vl+Y zd6OrKt2GSZLvo$eTH)fqic~{C=E+<D>aKWd<^ViRE4*LBHY^Mz&Nce<6O2r#xhTuu zr+LAPH0N{EtN3z>sqKvUylhKvmcNmHB&&Maoi91g?g>m@O-!n6G_RMTrMStkX6TJ5 zX8HvP3q9jeYE*l(Mcg`%Tu2uH=W(6Yn1k^Yh;RE4-@s*THFmeQV{c<MDQEzMM7B2s z7Wp<~EQELg{ac|`o}NJb3h|Hsi$bYuN6|iox|DQbc<w<4?Zk)<GQ2JfLWn&?MVPx& zWFo$bq7q#&lnibE9ftgjTcQ(WcnE?UtP8{o4?%)%5JEzmh5;~-?PM9<Up;n-gi;R& z?=x7pI6U_wBBb5)h(02Q-J&qNhA~uw(487rN|yK*>#}P$fwUohw?G;>_7Krga<ma0 z7lqWh9zYNoS+;MH`{m{82sQQ?T{C@?9Qz3W7&Z0~_-5)d5kh^NQV3Y<m3k+17sPg> z>>-ZSq8$Z2PK!2TUkflb+4e}<R%a_Erj`qlwK@LHKd>P10+_-#TZji>-BWzC5f^Kf zcQ#7!n0jaP5BT{Z)`ai;!sz2OBERK_M}9sRs;)AxFTi>_4Ox~9rz~P_M5j|vxYSRR z#N&AMW`5#%XW#dDwtB6?cXqh&9Sk7LvGBtB<G*U)6U%b0i>U-td`&E@`=ka_%;EiG Y{)a$)>tcW_T?<YF#;jiN@_uan3n6C8+5i9m literal 0 HcmV?d00001 diff --git a/tests/fixtures/tables/borderless_invoice.gt.json b/tests/fixtures/tables/borderless_invoice.gt.json new file mode 100644 index 0000000..0b6d809 --- /dev/null +++ b/tests/fixtures/tables/borderless_invoice.gt.json @@ -0,0 +1,125 @@ +{ + "source": "synthetic", + "license": "generated", + "notes": "5x3 borderless invoice with bold header row", + "tables": [ + { + "page": 1, + "bbox": [ + 72.0, + 580.0, + 372.0, + 700.0 + ], + "col_count": 3, + "row_count": 5, + "cells": [ + { + "row": 0, + "col": 0, + "row_span": 1, + "col_span": 
1, + "text": "Item" + }, + { + "row": 0, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Qty" + }, + { + "row": 0, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "Price" + }, + { + "row": 1, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Widget A" + }, + { + "row": 1, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "2" + }, + { + "row": 1, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "10.00" + }, + { + "row": 2, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Widget B" + }, + { + "row": 2, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "1" + }, + { + "row": 2, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "25.00" + }, + { + "row": 3, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Gadget" + }, + { + "row": 3, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "3" + }, + { + "row": 3, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "15.00" + }, + { + "row": 4, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Gizmo" + }, + { + "row": 4, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "5" + }, + { + "row": 4, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "5.00" + } + ] + } + ] +} diff --git a/tests/fixtures/tables/borderless_invoice.pdf b/tests/fixtures/tables/borderless_invoice.pdf new file mode 100644 index 0000000000000000000000000000000000000000..75ad82b224ac146bc74d3e7c1bbb18dcfd194c0c GIT binary patch literal 2485 zcmb7GO>f&c5WVYH%*9w|7ZyKcNmdaAh@H4=H0^pFEqVxo1ufBz0!uU~x{3Q2c5l7+ zx9(838rj~qk!(ofaOQE|BgJ87dNuAb*K;~w{{81G%;Z{Wb0g;PRu)nVBU3;RNL8}Q zWntjKnDrY%o}QjuUHPuW+)Y&OFrc93kIwnIL&gj^HfKP_96bBxTwFM^NUekIsU3%+ zlw*ak<afFHBu$nGAlF5r(yUkl`Ir@#MVY-AK8Rc*C<Ai4nVanz8!_IoZ6DZ!SdF3V z&_5WeH&m<iHRJJDj@NIDH^Nj<uo4EAKalq_&BRbW!yUFM1dKx%akvLE71}Br?)+x5 zR2!Yh5-KIFI96Ijm}-^WN`nBzP6>(SGYU7!#ZnIU&(Z!lfqRqz$V?fWLJTLd7DWYV z?GDs}O)<D3n2)ZtV7BYMN>SCnO8G$*_+m|Eo#;;SkuYMVmJN)cfpNJp4@yH<6sp+f zYEzy88EGj@riv?@S@-G<r+6{^Ro8En{(}G7&jzo>TIxrU_lzt}Z;?Hljg}Vao!mYt 
zotDtW2B!KmL=-UHBoZNCmbq2(CYx)ax6p%=Wqnp@g^TS`CI-5(n6D(D?ph?41BfKq z;QJQ#X<;I9X3(cQ$IPT&$+G-AD?X_#k+x5Z^%_4ZG8U_{DZHKhTKJLLRFh8LWytLg z?5djCg|bt;nuLblc74^MH?mlo2VgV^tw(84>)oeGt5rlH$pMCO^$p<)j2FQAZXe<s zxQ?C5&M~^OV{dJ>3(x|vK6673!JzL3K@dWmLw90ieg^R)#J|1@gZmx^`-dQZnr#~l zk40`69zslYCGlY*nDY0@cu0m}A_0A2^D#@8_=#N}XY(T<?nf?1Rhl@w4S=Q7j}8GC zbt!EF!fHBRa6)I;2B@af#9?g!YC25-?zg7nyMf&$U&y=_c5d2aQd4Q7cx?b`Dop^^ zo=WKYz7PIj&>)k2&Q+UK_FN7?{5Aj{=PnNqvpAvxwnpzajeWI#)^g!ch@9Q_Z0`{G zzURXNzS%-7v{g^>&BhpRTSqp|aU(ghdERNP&#{Ia`T5Z)UPMo0qX5qrbu4bv*-Gl_ zOftCw=kC1kbFmPEAmnM3Bq<fllL7Uq3=*C)AufLIaqn~*$E4-|ACr1LNHZ*!STnwN tVmfmiGAR}cY{8s4SgVx*TY&NXHv245^J)th+EkOk>3}+&&h;N-=RYb1cya&$ literal 0 HcmV?d00001 diff --git a/tests/fixtures/tables/multipage_continuation.gt.json b/tests/fixtures/tables/multipage_continuation.gt.json new file mode 100644 index 0000000..eecbda3 --- /dev/null +++ b/tests/fixtures/tables/multipage_continuation.gt.json @@ -0,0 +1,327 @@ +{ + "source": "synthetic", + "license": "generated", + "notes": "2-page table, header repeats on page 2; Phase 0 scores per-page tables", + "tables": [ + { + "page": 1, + "bbox": [ + 72.0, + 552.0, + 392.0, + 720.0 + ], + "col_count": 3, + "row_count": 7, + "cells": [ + { + "row": 0, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Year" + }, + { + "row": 0, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Metric" + }, + { + "row": 0, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "Value" + }, + { + "row": 1, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2010" + }, + { + "row": 1, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 1, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "100" + }, + { + "row": 2, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2011" + }, + { + "row": 2, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 2, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "200" + }, + { + "row": 3, + 
"col": 0, + "row_span": 1, + "col_span": 1, + "text": "2012" + }, + { + "row": 3, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 3, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "300" + }, + { + "row": 4, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2013" + }, + { + "row": 4, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 4, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "400" + }, + { + "row": 5, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2014" + }, + { + "row": 5, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 5, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "500" + }, + { + "row": 6, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2015" + }, + { + "row": 6, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 6, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "600" + } + ] + }, + { + "page": 2, + "bbox": [ + 72.0, + 552.0, + 392.0, + 720.0 + ], + "col_count": 3, + "row_count": 7, + "cells": [ + { + "row": 0, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Year" + }, + { + "row": 0, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Metric" + }, + { + "row": 0, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "Value" + }, + { + "row": 1, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2020" + }, + { + "row": 1, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 1, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "200" + }, + { + "row": 2, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2021" + }, + { + "row": 2, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 2, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "400" + }, + { + "row": 3, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2022" + }, + { + "row": 3, + "col": 1, + 
"row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 3, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "600" + }, + { + "row": 4, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2023" + }, + { + "row": 4, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 4, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "800" + }, + { + "row": 5, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2024" + }, + { + "row": 5, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 5, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "1000" + }, + { + "row": 6, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "2025" + }, + { + "row": 6, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Revenue" + }, + { + "row": 6, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "1200" + } + ] + } + ] +} diff --git a/tests/fixtures/tables/multipage_continuation.pdf b/tests/fixtures/tables/multipage_continuation.pdf new file mode 100644 index 0000000000000000000000000000000000000000..bb8997de980bec603bfc8ec6cb33c44046b31609 GIT binary patch literal 6044 zcmeI0%WmT~6o&VDinB55w6HGJ)q^0w<l<@2WEwbXiy{aLTB02Vwq#J$IO!|Q?7Hu> z_K>y`DsgSg=q5p(_#%qp&-0NKf1S?N>7eJBuF?7H+t<Hg$mRtX7jy)lSk5>VECs9q zX^~VjmJ7HOV*bIh7K??+Roy8anMpA-9I&An-5ZC8hBa`2;Nj3XIx<+E%3^t3pFXA~ z8x+`>^&^{pVIoT?SZ8@sq**=z>$fcbn3vge!Dl*SARD!=tC4t|<3Ws{cr3s047R07 zxti1op2BXI5Xt_4b;;6<9v2UAgU2=m4uP;w;1;Ya%4K}vvfhxDMa2_Vf{I6WRtAVI zT=62gW&#_qBp54Z4;aY#j853`>icB%eGa$SEwF}#K#+L36?2-aCS|z+bpTg*u@-=b zRclsH0rZ#hZz2F+3qZzpS)@2jzm)8Ak>d|pRMw=ILwG_3ofZ=fLZG$ru@ZL$hc3;F z{Bc%P<pEeHoKcY#`Kd(PJ^eszd^z~1UO(9Oclck|lNZnFobh`)>j_qh-Z*;@6=$-g zpV;G~;Ash6JixAQYY}HaS|tpdd|oYbDqLhEPWdDB;MM9lP=k%L_nsvJx-lJ18Q{d> zG?6g?O_B<KKf<b8*pWCBSf@LyrXrgous;%<gjrsxpyhc^=W|@0)__h+efG#1{5*Rq zoVf&J%rL$ikV|6-a()fvoL{n6!=aIvOQmk$g5?u&2ZXr3tWgT2zFh0JS`aoQGl1h* zEl*rb@faM>^dP=~v)GVJ45dC9rVk4vhmb(f4*<sqhzFd(wNYPct4px~bs?6@<t8;8 
z$1yz*LfbQgAP6CzLHBn?`2ocD5P!c;#;J8k3mxJ_rrU4oTr$D4WP=jHdq?YtT+%m* z<2R*#rPGRSP?90xyCpi?j3Njk`xza<6|zA_p_4l8&^czm-_#khFD%~}kz&_2B-eE@ zmZlUk*$|Ng=SBO0^MeLBXM>CaCw2A#r|awpu5Cy)I9&>FnHwUK;Bb}hHRD{;07o{+ zC~#6|A8@+Pj^NscRD;u{+^yj74!0LLJ8FP)w}F#7`+(DRb_CZpq#B$q<!uA!hpo2< zb&vBSZF|6boVNv>(%A!CLuWT|twUOYYe@N9!TI}b4t~(w9Q<wIq|QFzbe$c+wGF8T zr%MG5a3CK?RwickU?P3>Yqt~r?{;3>x3}na$fhmsExMiUn|51k;5bX0*t7=Zy}6yO zd(VI5cF22kJI!0@x@Wq!)}Wqox-}p+d2enfYV190z_qppD{u{KfZ*P?+j(u^&^m9r zZR3-P-_{${<aBR<<leU3S(4L3LTjdzTjO^gI2&VP#<7iMrc8c9;QOu*V|XeHF;k?@ z<WFUejd`ONWyHZW&5JT`O+CMlS*aKGf&kNmFUtBMCZC#RDEq1}E~ZeP>tPlxn=-C4 z&*!IL+#Gu2(I^N5o4MmC@$EE?0!Gmt5}zbv-z7)eYup+gEg3#ppA7HyAk8qBi7CB* w%ZncvhIO8g3y}HDp@F&JLV(PV;_vJ1PbLRKCDtexYH0kx35`zY?B{{;53FBdXaE2J literal 0 HcmV?d00001 diff --git a/tests/fixtures/tables/rotated_landscape.gt.json b/tests/fixtures/tables/rotated_landscape.gt.json new file mode 100644 index 0000000..2605fe0 --- /dev/null +++ b/tests/fixtures/tables/rotated_landscape.gt.json @@ -0,0 +1,160 @@ +{ + "source": "synthetic", + "license": "generated", + "notes": "4x5 bordered on a landscape-oriented page", + "tables": [ + { + "page": 1, + "bbox": [ + 72.0, + 404.0, + 572.0, + 500.0 + ], + "col_count": 5, + "row_count": 4, + "cells": [ + { + "row": 0, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Col 1" + }, + { + "row": 0, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Col 2" + }, + { + "row": 0, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "Col 3" + }, + { + "row": 0, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "Col 4" + }, + { + "row": 0, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "Col 5" + }, + { + "row": 1, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "a" + }, + { + "row": 1, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "b" + }, + { + "row": 1, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "c" + }, + { + "row": 1, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "d" + }, + { + 
"row": 1, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "e" + }, + { + "row": 2, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "f" + }, + { + "row": 2, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "g" + }, + { + "row": 2, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "h" + }, + { + "row": 2, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "i" + }, + { + "row": 2, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "j" + }, + { + "row": 3, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "k" + }, + { + "row": 3, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "l" + }, + { + "row": 3, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "m" + }, + { + "row": 3, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "n" + }, + { + "row": 3, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "o" + } + ] + } + ] +} diff --git a/tests/fixtures/tables/rotated_landscape.pdf b/tests/fixtures/tables/rotated_landscape.pdf new file mode 100644 index 0000000000000000000000000000000000000000..4dd923fbcd2233e37fe08e3db76cd57eed7e988f GIT binary patch literal 3352 zcmbW4-*4hL5XayBSIi4KdM8NY{35HW(w3IJYP)+C&FLPh>SP>4!$FK3bCmlRcHi#( z{?;2CvSeGxDy0=k#`b(X^Bto}=k99M^ZGvN{QLFGe=rs+sq`(Izz0zX#k9x(XFw+T zdMOGGPg<|uInHLY=_@;T!6tn!m&5}X<YZ3H&xtegfa2qrTwD-QWX5cstKmylijhQT z&aYzex6nCf;9M76W_d9K=a0O2UzGX5@R=<I7^lwtdZM>0d`R&dpUnkF&{qxR*HA+o z!flrjoBe?ENn|;@l+W;h&o0CXg~+4u2+kc-CcbDlZ!Ai=R$P=|<1w2x0iuUHCHcM3 z=)f_FaZ>S&f!r+FOkD1-!~OLJ9&uaXjHSjz;&LZdtgtSPoj?`99WJf~5a84*S7zvw z%rMH|OYvD2cw<auwb5M$hD@`CoYj1UH6QQS`bjG2vO*TyrCgV1;0%>uI+w+j*;)7M z9d+^J;h+8d;<`WKe>Kl6Ua^%>bGGbhQR?0_f7WXyjHi!cyOAm@p^Fc=)gO7p1ZHb4 z(B+}nOeSyhiDGICJ$O}IXEs+T{#<Yk-IPrh0x<82aT5bD&ewRqh5fd0BWbR2PFLB~ zXMZkOk(C@nFhh#1R#?~0h%L%`-i`KE-iLB+HFrMd7}^7vq8ghz*(Ka+P)oNduWIS7 zC}#Qzs2j&-MyWB?yYu1NB65K*0WMQp(pY@y6uh7xKza+;DKTY<s1Kwc!iE$OQwZV+ zP#B;AC;|7RTI$-R=)f*St0_^#@I1Hg`w$F*eiTJ9q)X@y<pR7jNav9Le2nA01w>DP 
zXak781>~Oq`3)fdEui28C};o$ZvllTKw$&Oa6;O)7Lq>2b&-3h1=hf@b<h?<z1NUQ z1H{%qTZoU4@cTe`1H;PSmJu8YBx`_J>DxlWBM|&yI(amW>~zplc8Yp?5cttS_Gtsd z%H9@I$zF%d8X#8owh$w`Dd0NfsR3ejZwm=9U4!I%NZtUky0?XdM<DYCh|%2*TI#+# zBpyA;{#OIS%HEbi_i@xAiw205y)DG|;{L$L^IoSc8z@%ywv_6;*C9m%#OmG_Ql0lY zMAjf+n&f?(uy2>2m6&2HFd(~r%zQ%>hM12je6z*al-QBto6W<$y|hse`+^tqOkKUQ zVZFVy#RKdpUaTGX$-#y_SiTU-w*5C(KpxJMDb2X=XFSaKAfb^v^|OTgJc)vEk}&t; z$1@&Dr}ic?JN_op>p_-dCyWi@sh5&7LY$joD#3KaV}gyi)L^=6yuZ)?5tv`wpfROw P!f6u4q|>?nZAAVDe()Fs literal 0 HcmV?d00001 diff --git a/tests/fixtures/tables/sparse_cells.gt.json b/tests/fixtures/tables/sparse_cells.gt.json new file mode 100644 index 0000000..eb7bd91 --- /dev/null +++ b/tests/fixtures/tables/sparse_cells.gt.json @@ -0,0 +1,265 @@ +{ + "source": "synthetic", + "license": "generated", + "notes": "7x5 borderless with blank cells on subtotal/total rows", + "tables": [ + { + "page": 1, + "bbox": [ + 72.0, + 532.0, + 432.0, + 700.0 + ], + "col_count": 5, + "row_count": 7, + "cells": [ + { + "row": 0, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Item" + }, + { + "row": 0, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "Qty" + }, + { + "row": 0, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "Price" + }, + { + "row": 0, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "Total" + }, + { + "row": 0, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "Note" + }, + { + "row": 1, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Apple" + }, + { + "row": 1, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "2" + }, + { + "row": 1, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "1.00" + }, + { + "row": 1, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "2.00" + }, + { + "row": 1, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "" + }, + { + "row": 2, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Banana" + }, + { + "row": 2, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "5" + }, + { 
+ "row": 2, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "0.50" + }, + { + "row": 2, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "2.50" + }, + { + "row": 2, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "" + }, + { + "row": 3, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Subtotal" + }, + { + "row": 3, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "" + }, + { + "row": 3, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "" + }, + { + "row": 3, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "4.50" + }, + { + "row": 3, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "" + }, + { + "row": 4, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Orange" + }, + { + "row": 4, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "3" + }, + { + "row": 4, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "1.50" + }, + { + "row": 4, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "4.50" + }, + { + "row": 4, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "" + }, + { + "row": 5, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Grape" + }, + { + "row": 5, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "1" + }, + { + "row": 5, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "3.00" + }, + { + "row": 5, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "3.00" + }, + { + "row": 5, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "" + }, + { + "row": 6, + "col": 0, + "row_span": 1, + "col_span": 1, + "text": "Total" + }, + { + "row": 6, + "col": 1, + "row_span": 1, + "col_span": 1, + "text": "" + }, + { + "row": 6, + "col": 2, + "row_span": 1, + "col_span": 1, + "text": "" + }, + { + "row": 6, + "col": 3, + "row_span": 1, + "col_span": 1, + "text": "12.00" + }, + { + "row": 6, + "col": 4, + "row_span": 1, + "col_span": 1, + "text": "" + } + ] + } + ] +} diff --git a/tests/fixtures/tables/sparse_cells.pdf 
b/tests/fixtures/tables/sparse_cells.pdf new file mode 100644 index 0000000000000000000000000000000000000000..93272f676a8c4ded0537319d952c9f957b269a4e GIT binary patch literal 3579 zcmbtXUvJ_@5P$clm={#i3ft@dK1GoN33upnIZ9mhA&RQSUWg;(wbn}$`W1R__ukLi zS%Z%yx#}<lguSz~zn%Gc#xvWSUX2IDbdBEEFMt07GgfgSZ|EG}vyuraSq7*98Bf<M zR!Vr3vU+D(&(F`MXmkrYH&ebc2-v{qPsaJVVT}nmIGh<57X~XcC065hdPr+F<_Kne zV8x$I<|zg1x=eYNmrJmI&&$iQ&i8^_x?-R}t-JMHZYmrk_>W_C!5PHTwsN(tMmU9L z*AOau!1~0poDTU5+~e4WfH)Aw4m^N0r9$}^Z1rZW=4+9%8XA9;vC2V2m<pcWF^K?{ zBF0MC3pz4c(Ip%1uA|*`0uN{wSTin>NL+3uf|d=Wk{i%EFvZ2110JfjT&bx~JVP&k ztJy6t@kWX2*3c~jBPwaZml}qzVO*}|BNx!8B`-HCzOK)}8VN>a&dV!hS^w&tW8;V5 zxB2{H+dtv2)l-WrS~2lNR|CmvIauT`axIub`p7oVTx2!$ae$`&2oVX))+s~CeXuDe zZ}PdIVgmzs6I^EvSNQOsSSq2P(0RcCc^5QQJ^)SAHQsMvXBHZg<PvrIbwx$Z{zzF- zC|i;Ls=|=A#<Zw)%GG0UDIf85^SG^#IeK*uDxPL4M7CTv^lMD3$ZD6~uyQFM!Ex<a zY19%~-lm4##E%W>3SgKt(TuT|EP!~X2gwawCq^r`8#&IlQWbg%fY_nwggyk3X9htK zLb8JXM6%TxBu|k1d>jZacAP_uU*)C?gyZ>UKn@|MB2QTt5SRF76uojL!Bi<ap+te{ zd55Ola@oleVk092EeY&Bt(R3*bVG^ahF2ZL>Eepxsi%8r1u<>A3x-SHVmMtf;zJ{j z4Xije!vI-ygFTBwTH?2h2V_U1j^cL#Xlc{|Y|}r9MjgZHilJ!K0FI_HG6NrJB+i~j zRrk7P2En?fgB-iR<D=c$=EtsULr+`U_fFwV12}eXEL*1Q_~1uKUi)JFjKy|&N(vvK zxF^Kk0RZXB!8U~tFizo72RNQb*CaOb7-$9NJr>QAXw*T-DKzQ;Zr8Wij>i9mfhnw0 zaXgLN(&l?wLFruDR08V=$L_8Y_>^_7YDMCMrG3oBjs-|jrn$W{8q_&p*ADGY1TTUu z3>zY2+j6RZz!#tE!vem^LTtX8nev+qtMwb1L$F19rRRM|&kOM!@=7nnhU&E}a<x6M z&CClXnv>1s3XJ=6H%LQ=(1pKX)b|%LWpgjg=BXXKZstYviyvz|7(MN<wfz2JYcPN; z$7UaU-v2sxof(ETDHj}6!#^|dtidIy0|egR<$p2cw`oMFkPYE3i9Ms&yZ$^j{sndE BP|pAW literal 0 HcmV?d00001 From 0bff34749cb12d8fca22d34ac158ce1f94c38d5e Mon Sep 17 00:00:00 2001 From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com> Date: Fri, 17 Apr 2026 00:54:36 +0530 Subject: [PATCH 05/12] feat(tests): add GriTS-Top and cell-level P/R scorer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pure-Python scorer — no external deps beyond stdlib. 
Implements: - Whitespace-normalized text comparison. - Greedy bbox-IoU 1:1 matching of predicted tables to ground truth, restricted to same page. - Cell-level multiset precision/recall on normalized text. - GriTS-Top: F1 over the set of (row_start, row_end, col_start, col_end) topological signatures of every cell, accounting for row/col spans. Robust to missing text; penalizes structural errors. 9 unit tests cover the identical, all-wrong-text, missing-column, extra-merge, page-mismatch, and low-IoU cases. --- tests/python/grits.py | 207 +++++++++++++++++++++++++++++++++++++ tests/python/test_grits.py | 86 +++++++++++++++ 2 files changed, 293 insertions(+) create mode 100644 tests/python/grits.py create mode 100644 tests/python/test_grits.py diff --git a/tests/python/grits.py b/tests/python/grits.py new file mode 100644 index 0000000..8c2cd19 --- /dev/null +++ b/tests/python/grits.py @@ -0,0 +1,207 @@ +"""Table extraction accuracy scorer. + +Implements: +- Whitespace-normalized text comparison. +- Bbox-IoU matching of predicted tables to ground-truth tables (greedy 1:1, highest IoU first, same page only). +- Cell-level precision/recall/F1 on normalized text. +- GriTS-Top: F1 over the set of topological signatures (row_start, row_end, col_start, col_end) + of every cell, accounting for row/col spans. Robust to missing text; penalizes structural errors. + +The internal representation for a table is a dict matching the .gt.json `tables[]` entry: + + { + "page": int, + "bbox": [x_min, y_min, x_max, y_max], + "col_count": int, + "row_count": int, + "cells": [{"row": int, "col": int, "row_span": int, "col_span": int, "text": str}, ...], + } + +Predicted tables from paperjam.Table are converted into this shape by ``predicted_tables_to_gt_shape``. + +Reference: Smock, Pesala, Abraham, "PubTables-1M" (CVPR 2022), §4.1.
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Iterable + + +def normalize_text(s: str) -> str: + return " ".join(s.split()) + + +def _bbox_iou(a: list[float] | tuple[float, ...], b: list[float] | tuple[float, ...]) -> float: + ax1, ay1, ax2, ay2 = a + bx1, by1, bx2, by2 = b + ix1, iy1 = max(ax1, bx1), max(ay1, by1) + ix2, iy2 = min(ax2, bx2), min(ay2, by2) + iw, ih = max(0.0, ix2 - ix1), max(0.0, iy2 - iy1) + inter = iw * ih + area_a = max(0.0, ax2 - ax1) * max(0.0, ay2 - ay1) + area_b = max(0.0, bx2 - bx1) * max(0.0, by2 - by1) + union = area_a + area_b - inter + return inter / union if union > 0 else 0.0 + + +def match_tables(pred: list[dict], gt: list[dict], iou_threshold: float = 0.5) -> dict: + """Greedy 1:1 matching on bbox IoU restricted to same page. + + Returns {"matches": [(p_idx, g_idx, iou), ...], "unmatched_pred": [...], "unmatched_gt": [...]}. + """ + pairs: list[tuple[float, int, int]] = [] + for pi, p in enumerate(pred): + for gi, g in enumerate(gt): + if p.get("page") != g.get("page"): + continue + iou = _bbox_iou(p["bbox"], g["bbox"]) + if iou >= iou_threshold: + pairs.append((iou, pi, gi)) + pairs.sort(reverse=True) + used_p: set[int] = set() + used_g: set[int] = set() + matches: list[tuple[int, int, float]] = [] + for iou, pi, gi in pairs: + if pi in used_p or gi in used_g: + continue + matches.append((pi, gi, iou)) + used_p.add(pi) + used_g.add(gi) + return { + "matches": matches, + "unmatched_pred": [i for i in range(len(pred)) if i not in used_p], + "unmatched_gt": [i for i in range(len(gt)) if i not in used_g], + } + + +def _cell_text_bag(cells: Iterable[dict]) -> list[str]: + return sorted(normalize_text(c["text"]) for c in cells if normalize_text(c["text"]) != "") + + +def cell_precision_recall(pred_table: dict, gt_table: dict) -> dict: + """Multiset-level precision/recall over cell texts. + + Treats identical strings as interchangeable; ignores position. 
A complementary metric + to GriTS-Top, which scores structure and ignores text. Together they catch both failure modes. + """ + pred_bag = _cell_text_bag(pred_table["cells"]) + gt_bag = _cell_text_bag(gt_table["cells"]) + if not pred_bag and not gt_bag: + return {"precision": 1.0, "recall": 1.0, "f1": 1.0, "matched": 0, "pred_total": 0, "gt_total": 0} + # Multiset intersection size. + from collections import Counter + + inter = sum((Counter(pred_bag) & Counter(gt_bag)).values()) + precision = inter / len(pred_bag) if pred_bag else 0.0 + recall = inter / len(gt_bag) if gt_bag else 0.0 + f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0 + return { + "precision": precision, + "recall": recall, + "f1": f1, + "matched": inter, + "pred_total": len(pred_bag), + "gt_total": len(gt_bag), + } + + +def _topology_signatures(cells: Iterable[dict]) -> set[tuple[int, int, int, int]]: + """Each cell → (row_start, row_end_exclusive, col_start, col_end_exclusive).""" + return {(c["row"], c["row"] + c.get("row_span", 1), c["col"], c["col"] + c.get("col_span", 1)) for c in cells} + + +def grits_top(pred_table: dict, gt_table: dict) -> dict: + """GriTS-Top (topology) F1 over cell signatures. + + Score = F1 over the set of `(row_start, row_end, col_start, col_end)` topological + signatures of every cell. A perfect structural match is 1.0; different row/col + count or missed merges drive it down even when text is correct. 
+ """ + pred_sigs = _topology_signatures(pred_table["cells"]) + gt_sigs = _topology_signatures(gt_table["cells"]) + if not pred_sigs and not gt_sigs: + return {"precision": 1.0, "recall": 1.0, "f1": 1.0} + inter = len(pred_sigs & gt_sigs) + precision = inter / len(pred_sigs) if pred_sigs else 0.0 + recall = inter / len(gt_sigs) if gt_sigs else 0.0 + f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0 + return {"precision": precision, "recall": recall, "f1": f1} + + +def score_document(pred: list[dict], gt: list[dict], iou_threshold: float = 0.5) -> dict: + """Aggregate scores across all tables in one document. + + Returns table-detection F1 (matched vs unmatched), plus average cell F1 and GriTS-Top F1 + over matched pairs. Unmatched predicted tables are not scored for cell/GriTS but do hurt + table-detection precision; unmatched GT tables hurt detection recall. + """ + m = match_tables(pred, gt, iou_threshold=iou_threshold) + n_matched = len(m["matches"]) + n_pred = len(pred) + n_gt = len(gt) + td_p = n_matched / n_pred if n_pred > 0 else (1.0 if n_gt == 0 else 0.0) + td_r = n_matched / n_gt if n_gt > 0 else (1.0 if n_pred == 0 else 0.0) + td_f1 = (2 * td_p * td_r / (td_p + td_r)) if (td_p + td_r) > 0 else 0.0 + + cell_f1s: list[float] = [] + grits_f1s: list[float] = [] + per_table: list[dict] = [] + for pi, gi, iou in m["matches"]: + cell = cell_precision_recall(pred[pi], gt[gi]) + top = grits_top(pred[pi], gt[gi]) + cell_f1s.append(cell["f1"]) + grits_f1s.append(top["f1"]) + per_table.append( + { + "pred_idx": pi, + "gt_idx": gi, + "bbox_iou": iou, + "cell_f1": cell["f1"], + "cell_precision": cell["precision"], + "cell_recall": cell["recall"], + "grits_top_f1": top["f1"], + } + ) + avg_cell_f1 = sum(cell_f1s) / len(cell_f1s) if cell_f1s else 0.0 + avg_grits = sum(grits_f1s) / len(grits_f1s) if grits_f1s else 0.0 + return { + "table_detection_precision": td_p, + "table_detection_recall": td_r, + "table_detection_f1": td_f1, + 
"avg_cell_f1": avg_cell_f1, + "avg_grits_top_f1": avg_grits, + "n_matched": n_matched, + "n_pred": n_pred, + "n_gt": n_gt, + "per_table": per_table, + } + + +def predicted_tables_to_gt_shape(paperjam_tables: list, *, page: int) -> list[dict]: + """Convert paperjam.Table dataclasses into the harness dict shape.""" + out: list[dict] = [] + for t in paperjam_tables: + cells = [ + { + "row": r_idx, + "col": c_idx, + "row_span": int(getattr(cell, "row_span", 1)), + "col_span": int(getattr(cell, "col_span", 1)), + "text": cell.text, + } + for r_idx, row in enumerate(t.rows) + for c_idx, cell in enumerate(row.cells) + ] + out.append( + { + "page": page, + "bbox": list(t.bbox), + "col_count": int(t.col_count), + "row_count": int(t.row_count), + "cells": cells, + } + ) + return out diff --git a/tests/python/test_grits.py b/tests/python/test_grits.py new file mode 100644 index 0000000..f5209c3 --- /dev/null +++ b/tests/python/test_grits.py @@ -0,0 +1,86 @@ +"""Unit tests for the GriTS scorer — independent of the extractor.""" + +from __future__ import annotations + +from grits import ( + cell_precision_recall, + grits_top, + match_tables, + normalize_text, + score_document, +) + + +def _mk_table(cells: list[tuple[int, int, int, int, str]], page: int = 1, bbox=(0, 0, 100, 100), rows=None, cols=None) -> dict: + return { + "page": page, + "bbox": list(bbox), + "row_count": rows or (max((c[0] + c[2] for c in cells), default=0)), + "col_count": cols or (max((c[1] + c[3] for c in cells), default=0)), + "cells": [{"row": r, "col": c, "row_span": rs, "col_span": cs, "text": t} for r, c, rs, cs, t in cells], + } + + +def test_normalize_text_collapses_whitespace(): + assert normalize_text(" hello world\n ") == "hello world" + assert normalize_text("") == "" + + +def test_identical_tables_score_perfect(): + t = _mk_table([(0, 0, 1, 1, "A"), (0, 1, 1, 1, "B"), (1, 0, 1, 1, "x"), (1, 1, 1, 1, "y")]) + assert cell_precision_recall(t, t)["f1"] == 1.0 + assert grits_top(t, t)["f1"] == 1.0 + 
+ +def test_all_wrong_text_grits_perfect_cells_zero(): + gt = _mk_table([(0, 0, 1, 1, "A"), (0, 1, 1, 1, "B")]) + pred = _mk_table([(0, 0, 1, 1, "X"), (0, 1, 1, 1, "Y")]) + assert grits_top(pred, gt)["f1"] == 1.0 # structure identical + assert cell_precision_recall(pred, gt)["f1"] == 0.0 # no shared text + + +def test_missing_column_penalizes_both(): + gt = _mk_table([(0, 0, 1, 1, "A"), (0, 1, 1, 1, "B"), (0, 2, 1, 1, "C")]) + pred = _mk_table([(0, 0, 1, 1, "A"), (0, 1, 1, 1, "B")]) + assert grits_top(pred, gt)["f1"] < 1.0 + cell = cell_precision_recall(pred, gt) + assert cell["precision"] == 1.0 # every pred cell is right + assert cell["recall"] < 1.0 # missed one + + +def test_extra_merge_penalizes_grits(): + gt = _mk_table([(0, 0, 1, 1, "H1"), (0, 1, 1, 1, "H2")], cols=2, rows=1) + pred = _mk_table([(0, 0, 1, 2, "H1 H2")], cols=2, rows=1) # merged header + # Topological signatures differ — pred has one wide cell, gt has two narrow cells + assert grits_top(pred, gt)["f1"] == 0.0 + + +def test_table_detection_f1_counts_extra_tables(): + t = _mk_table([(0, 0, 1, 1, "A")]) + out = score_document([t, t], [t]) # predicted 2, gt 1 + assert out["table_detection_precision"] == 0.5 + assert out["table_detection_recall"] == 1.0 + + +def test_table_detection_f1_missed_table(): + t = _mk_table([(0, 0, 1, 1, "A")]) + out = score_document([], [t]) + assert out["table_detection_recall"] == 0.0 + assert out["n_matched"] == 0 + + +def test_match_tables_respects_page(): + t_p1 = _mk_table([(0, 0, 1, 1, "A")], page=1) + t_p2 = _mk_table([(0, 0, 1, 1, "A")], page=2) + m = match_tables([t_p1], [t_p2]) + assert m["matches"] == [] + assert m["unmatched_pred"] == [0] + assert m["unmatched_gt"] == [0] + + +def test_match_tables_low_iou_unmatched(): + t_far = _mk_table([(0, 0, 1, 1, "A")], bbox=(0, 0, 10, 10)) + t_near = _mk_table([(0, 0, 1, 1, "A")], bbox=(0, 0, 100, 100)) + # The small bbox is fully inside the big one, but the IoU (100 / 10000 = 0.01) is well below threshold. 
+ m = match_tables([t_far], [t_near]) + assert m["matches"] == [] From 5eca9cbe463fa4b975d59b6f2e3d6a140d4ef224 Mon Sep 17 00:00:00 2001 From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com> Date: Fri, 17 Apr 2026 00:54:55 +0530 Subject: [PATCH 06/12] feat(tests): add table extraction accuracy harness with baseline gate New pytest suite under -m accuracy runs over every .gt.json fixture, converts paperjam.Table outputs into the harness shape, and scores against ground truth via GriTS-Top + cell-level P/R. - conftest.py session-scoped accuracy_report fixture accumulates per-fixture scores and writes tests/output/table_accuracy.json on session teardown. - --update-baseline CLI option rewrites tests/fixtures/tables/.accuracy_baseline.json after an intentional improvement. - test_baseline_regression fails if aggregate table-detection F1, cell F1, or GriTS-Top F1 drops more than 1% from the committed baseline. - tests/output/ is excluded from version control. Initial committed baseline (on current heuristics): table_detection_f1 = 1.0000 avg_cell_f1 = 0.9923 avg_grits_top_f1 = 0.9557 Phase 1 (merged-cell reconstruction) will drive the GriTS-Top number up by fixing bordered_merged (currently 0.757). --- .gitignore | 3 + tests/fixtures/tables/.accuracy_baseline.json | 8 ++ tests/python/conftest.py | 43 ++++++++ tests/python/test_table_accuracy.py | 100 ++++++++++++++++++ 4 files changed, 154 insertions(+) create mode 100644 tests/fixtures/tables/.accuracy_baseline.json create mode 100644 tests/python/test_table_accuracy.py diff --git a/.gitignore b/.gitignore index a18592b..3a1d21a 100644 --- a/.gitignore +++ b/.gitignore @@ -36,6 +36,9 @@ python/paperjam/libpdfium.so # Test fixtures (generated) tests/fixtures/large_*.pdf +# Per-session test artifacts (accuracy reports, etc.) 
+tests/output/ + # Sphinx _build diff --git a/tests/fixtures/tables/.accuracy_baseline.json b/tests/fixtures/tables/.accuracy_baseline.json new file mode 100644 index 0000000..b1223da --- /dev/null +++ b/tests/fixtures/tables/.accuracy_baseline.json @@ -0,0 +1,8 @@ +{ + "aggregate": { + "table_detection_f1": 1.0, + "avg_cell_f1": 0.9923469387755102, + "avg_grits_top_f1": 0.9557057057057057, + "n_fixtures": 8 + } +} diff --git a/tests/python/conftest.py b/tests/python/conftest.py index a8e87b4..526b779 100644 --- a/tests/python/conftest.py +++ b/tests/python/conftest.py @@ -1,8 +1,21 @@ +import json import pathlib import pytest FIXTURES = pathlib.Path(__file__).resolve().parent.parent / "fixtures" +TABLES_FIXTURES = FIXTURES / "tables" +OUTPUT_DIR = pathlib.Path(__file__).resolve().parent.parent / "output" +BASELINE_PATH = TABLES_FIXTURES / ".accuracy_baseline.json" + + +def pytest_addoption(parser): + parser.addoption( + "--update-baseline", + action="store_true", + default=False, + help="Write the current accuracy scores to the baseline file (skips the regression gate).", + ) @pytest.fixture @@ -28,3 +41,33 @@ def table_bordered_pdf(): @pytest.fixture def metadata_pdf(): return str(FIXTURES / "with_metadata.pdf") + + +@pytest.fixture(scope="session") +def accuracy_report(request): + """Session-scoped accumulator: per-fixture -> score dict; written to tests/output at teardown.""" + report: dict = {} + + def _finalize(): + if not report: + return + OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + aggregate = _aggregate_scores(report) + payload = {"fixtures": report, "aggregate": aggregate} + (OUTPUT_DIR / "table_accuracy.json").write_text(json.dumps(payload, indent=2) + "\n") + if request.config.getoption("--update-baseline"): + BASELINE_PATH.write_text(json.dumps({"aggregate": aggregate}, indent=2) + "\n") + + request.addfinalizer(_finalize) + return report + + +def _aggregate_scores(report: dict) -> dict: + fields = ("table_detection_f1", "avg_cell_f1", 
"avg_grits_top_f1") + totals = {f: 0.0 for f in fields} + n = 0 + for scores in report.values(): + for f in fields: + totals[f] += float(scores.get(f, 0.0)) + n += 1 + return {f: (totals[f] / n if n else 0.0) for f in fields} | {"n_fixtures": n} diff --git a/tests/python/test_table_accuracy.py b/tests/python/test_table_accuracy.py new file mode 100644 index 0000000..6298414 --- /dev/null +++ b/tests/python/test_table_accuracy.py @@ -0,0 +1,100 @@ +"""Table extraction accuracy harness. + +Parametrized over every ``.gt.json`` fixture under ``tests/fixtures/tables/``. For each +fixture, opens the paired PDF, runs ``extract_tables`` on every page, converts the result +into the harness shape, and scores it against the ground truth. + +A final ``test_baseline_regression`` compares the aggregate to the committed baseline +and fails on >1% drop in table-detection F1, cell F1, or GriTS-Top F1. Pass +``--update-baseline`` to refresh the committed numbers after an intentional improvement. + +Run: ``uv run pytest tests/python/ -m accuracy -v`` +""" + +from __future__ import annotations + +import json +import pathlib + +import paperjam +import pytest +from grits import predicted_tables_to_gt_shape, score_document + +FIXTURES_DIR = pathlib.Path(__file__).resolve().parent.parent / "fixtures" / "tables" +BASELINE_PATH = FIXTURES_DIR / ".accuracy_baseline.json" +REGRESSION_THRESHOLD = 0.01 # allow 1% noise; anything worse fails CI + +_GT_FILES = sorted(FIXTURES_DIR.glob("*.gt.json")) + + +def _load_pred(pdf_path: pathlib.Path) -> list[dict]: + doc = paperjam.open_pdf(str(pdf_path)) + out: list[dict] = [] + for page_idx, page in enumerate(doc.pages, start=1): + tables = page.extract_tables() + out.extend(predicted_tables_to_gt_shape(tables, page=page_idx)) + return out + + +@pytest.mark.accuracy +@pytest.mark.parametrize("gt_path", _GT_FILES, ids=lambda p: p.stem) +def test_extraction_accuracy(gt_path: pathlib.Path, accuracy_report): + pdf_path = 
gt_path.with_name(gt_path.name.removesuffix(".gt.json") + ".pdf") + assert pdf_path.exists(), f"fixture PDF missing for {gt_path.name}" + with open(gt_path) as f: + gt_doc = json.load(f) + gt_tables = gt_doc["tables"] + pred_tables = _load_pred(pdf_path) + scores = score_document(pred_tables, gt_tables) + # Slim the per_table dump before storing so the report stays readable. + accuracy_report[gt_path.stem] = { + "table_detection_precision": scores["table_detection_precision"], + "table_detection_recall": scores["table_detection_recall"], + "table_detection_f1": scores["table_detection_f1"], + "avg_cell_f1": scores["avg_cell_f1"], + "avg_grits_top_f1": scores["avg_grits_top_f1"], + "n_matched": scores["n_matched"], + "n_pred": scores["n_pred"], + "n_gt": scores["n_gt"], + } + + +@pytest.mark.accuracy +def test_baseline_regression(accuracy_report, request): + """Fails if aggregate scores have dropped more than REGRESSION_THRESHOLD vs the committed baseline. + + Skipped when --update-baseline is passed, since we're refreshing the numbers in that run. + Also skipped when no baseline exists yet (first ever run; rerun with --update-baseline and commit the file it writes). + """ + if request.config.getoption("--update-baseline"): + pytest.skip("--update-baseline: regression gate disabled this run") + if not BASELINE_PATH.exists(): + pytest.skip(f"no baseline at {BASELINE_PATH}; run once with --update-baseline and commit it") + + # The session-scoped accuracy_report may not have been populated yet when this test runs + # in isolation; run it last in the file so the parametrized tests above have fired. + if not accuracy_report: + pytest.skip("no accuracy data collected in this session") + + baseline = json.loads(BASELINE_PATH.read_text())["aggregate"] + # Recompute aggregate from the session-scoped report.
+ current = _aggregate(accuracy_report) + + failures: list[str] = [] + for field in ("table_detection_f1", "avg_cell_f1", "avg_grits_top_f1"): + b = float(baseline.get(field, 0.0)) + c = float(current.get(field, 0.0)) + if c + REGRESSION_THRESHOLD < b: + failures.append(f"{field}: {c:.4f} < baseline {b:.4f} (drop > {REGRESSION_THRESHOLD:.0%})") + assert not failures, "accuracy regression vs baseline:\n " + "\n ".join(failures) + + +def _aggregate(report: dict) -> dict: + fields = ("table_detection_f1", "avg_cell_f1", "avg_grits_top_f1") + totals = {f: 0.0 for f in fields} + n = 0 + for scores in report.values(): + for f in fields: + totals[f] += float(scores.get(f, 0.0)) + n += 1 + return {f: (totals[f] / n if n else 0.0) for f in fields} From 0e7de122c46513d3bbc017c817d95dd47dd71f9c Mon Sep 17 00:00:00 2001 From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com> Date: Fri, 17 Apr 2026 00:55:05 +0530 Subject: [PATCH 07/12] feat(bench): add criterion microbench for table extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Runs against the full fixture corpus under tests/fixtures/tables/. Document parsing and page loading happen outside the measured section so the bench only times table::extract_tables — that's the code Phases 1–4 will change. Local baseline on this machine: 13–107 µs per fixture. Not wired into CI yet since cross-runner wall-clock is too noisy; gets a dedicated perf-regression job once a stable single-runner baseline exists. 
Run with: cargo bench -p paperjam-core --bench table_extraction --- crates/paperjam-core/Cargo.toml | 7 ++ .../paperjam-core/benches/table_extraction.rs | 79 +++++++++++++++++++ 2 files changed, 86 insertions(+) create mode 100644 crates/paperjam-core/benches/table_extraction.rs diff --git a/crates/paperjam-core/Cargo.toml b/crates/paperjam-core/Cargo.toml index c986c85..a3e7ce4 100644 --- a/crates/paperjam-core/Cargo.toml +++ b/crates/paperjam-core/Cargo.toml @@ -45,3 +45,10 @@ render = ["dep:pdfium-render", "dep:image"] signatures = ["dep:x509-parser", "dep:cms", "dep:der", "dep:sha1", "dep:rsa", "dep:p256", "dep:pkcs8", "dep:spki"] ltv = ["signatures", "dep:ureq", "dep:rustls"] validation = ["dep:roxmltree"] + +[dev-dependencies] +criterion = { version = "0.5", features = ["html_reports"] } + +[[bench]] +name = "table_extraction" +harness = false diff --git a/crates/paperjam-core/benches/table_extraction.rs b/crates/paperjam-core/benches/table_extraction.rs new file mode 100644 index 0000000..c6e2f4f --- /dev/null +++ b/crates/paperjam-core/benches/table_extraction.rs @@ -0,0 +1,79 @@ +//! Criterion microbench for table extraction over the fixture corpus. +//! +//! Run with: `cargo bench -p paperjam-core --bench table_extraction` +//! +//! Each synthetic PDF fixture under `tests/fixtures/tables/` becomes one benchmark in the `table_extraction` group. +//! The document and page are parsed outside the measured section so the bench only +//! measures `table::extract_tables` itself — that's the code subsequent phases will +//! change.
+ +use std::path::PathBuf; + +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use paperjam_core::document::Document; +use paperjam_core::table::{extract_tables, TableExtractionOptions}; + +fn fixtures_dir() -> PathBuf { + let manifest = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + manifest.join("../../tests/fixtures/tables") +} + +fn collect_fixtures() -> Vec<PathBuf> { + let mut out = Vec::new(); + let dir = fixtures_dir(); + let Ok(rd) = std::fs::read_dir(&dir) else { + eprintln!( + "skipping bench: fixtures dir not found at {}", + dir.display() + ); + return out; + }; + for entry in rd.flatten() { + let path = entry.path(); + if path.extension().and_then(|s| s.to_str()) == Some("pdf") { + out.push(path); + } + } + out.sort(); + out +} + +fn bench_extract(c: &mut Criterion) { + let fixtures = collect_fixtures(); + let opts = TableExtractionOptions::default(); + + let mut group = c.benchmark_group("table_extraction"); + for path in &fixtures { + let name = path + .file_stem() + .and_then(|s| s.to_str()) + .unwrap_or("?") + .to_string(); + // Parse the document once and keep its pages alive for the whole bench run. 
+ let doc = match Document::open(path) { + Ok(d) => d, + Err(e) => { + eprintln!("skipping {name}: failed to open ({e:?})"); + continue; + } + }; + let mut pages = Vec::new(); + for n in 1..=doc.page_count() as u32 { + if let Ok(p) = doc.page(n) { + pages.push(p); + } + } + group.throughput(Throughput::Elements(pages.len() as u64)); + group.bench_with_input(BenchmarkId::from_parameter(&name), &pages, |b, pages| { + b.iter(|| { + for page in pages.iter() { + let _ = extract_tables(page, &opts); + } + }); + }); + } + group.finish(); +} + +criterion_group!(benches, bench_extract); +criterion_main!(benches); From ce098dd771c4e098601fb664ab2a59184e749a86 Mon Sep 17 00:00:00 2001 From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com> Date: Fri, 17 Apr 2026 00:55:21 +0530 Subject: [PATCH 08/12] ci: add table extraction accuracy job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New ubuntu-latest / Python 3.13 job runs on every PR, gated by the existing lint job: 1. Regenerates fixtures via scripts/generate_table_fixtures.py and diffs against the committed copies. Catches non-deterministic generator changes before they pollute the corpus. 2. Runs pytest -m accuracy — the parametrized fixture tests plus the baseline regression gate. 3. Uploads tests/output/table_accuracy.json as an artifact so per-fixture deltas are visible on every PR. The cross-platform pytest matrix already covers correctness on macOS and Windows; one ubuntu runner is enough for accuracy scoring. 
--- .github/workflows/ci.yml | 48 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4b9b76b..826a290 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -102,3 +102,51 @@ jobs: with: name: test-results-${{ matrix.os }}-py${{ matrix.python-version }} path: results.xml + + accuracy: + name: Table Extraction Accuracy + needs: lint + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - name: Set up Python 3.13 + uses: actions/setup-python@v6 + with: + python-version: "3.13" + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - uses: Swatinem/rust-cache@v2.9.1 + with: + save-if: false + + - uses: astral-sh/setup-uv@v7 + with: + cache-dependency-glob: pyproject.toml + + - name: Install dependencies + run: uv sync --extra dev + + - name: Build native extension + uses: PyO3/maturin-action@v1 + with: + command: develop + args: --release + + - name: Verify fixtures are up to date + run: | + cp -r tests/fixtures/tables /tmp/tables_snapshot + uv run python scripts/generate_table_fixtures.py + diff -r /tmp/tables_snapshot tests/fixtures/tables + + - name: Run accuracy harness + run: uv run pytest tests/python/ -m accuracy -v --tb=short + + - name: Upload accuracy report + if: always() + uses: actions/upload-artifact@v7 + with: + name: accuracy-report + path: tests/output/table_accuracy.json From a87609cadfc4b0b21f818bb176468f9b2c0ff34e Mon Sep 17 00:00:00 2001 From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com> Date: Fri, 17 Apr 2026 01:02:09 +0530 Subject: [PATCH 09/12] chore(gitignore): exclude CLAUDE.md project guide CLAUDE.md is a local developer-assistant guide, not a tracked project artifact. 
--- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 3a1d21a..249bb50 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ dist/ build/ *.so *.dylib +CLAUDE.md *.dll .venv/ venv/ From ac553a27e10a950da6d7c727e453a77de771aa70 Mon Sep 17 00:00:00 2001 From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com> Date: Fri, 17 Apr 2026 01:02:14 +0530 Subject: [PATCH 10/12] chore(pre-commit): add commit-msg hook to strip AI attribution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New local hook (stages: [commit-msg]) that removes any Co-Authored-By trailers from commit messages before the commit is created. The hook script lives at .claude/hooks/strip-ai-attribution.sh which is gitignored, so contributors who want the hook active must provide their own copy locally. In standard pre-commit runs (and CI) this hook is inert — it only fires on the commit-msg stage, which requires 'pre-commit install --hook-type commit-msg' to be enabled. --- .pre-commit-config.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 457a2df..d9c0aed 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -47,3 +47,11 @@ repos: language: system files: ^docs-site/src/ pass_filenames: false + + - repo: local + hooks: + - id: strip-ai-attribution + name: strip AI attribution from commit messages + entry: .claude/hooks/strip-ai-attribution.sh + language: script + stages: [commit-msg] From 4cf994e520e4899bc99b683ad94fdd2597d4b35f Mon Sep 17 00:00:00 2001 From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com> Date: Fri, 17 Apr 2026 01:25:08 +0530 Subject: [PATCH 11/12] fix(clippy): resolve rust 1.95 warnings on pre-existing code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rust 1.95 tightens collapsible_match and unnecessary_sort_by.
These errors were pre-existing on main but surface here because the CI toolchain just bumped to 1.95; leaving them unfixed would keep the accuracy CI job (and rust-test) red on every PR. - paperjam-epub/toc.rs, parser.rs: fold single-branch if inside match arms into if-guards on the arm itself. No behavior change — subsequent arms are distinct Event variants or a _ catch-all. - paperjam-pptx/parser.rs: same treatment for Event::Empty and Event::Text arms. - paperjam-core/forms/mod.rs: merge the None / Some((true, _)) / Some((false, _)) with inner if-else into a guarded arm plus a catch-all. All three original 'not found' paths now share one catch-all body; semantics are identical. - paperjam-core/manipulation/insert.rs: sort_by(|a, b| b.0.cmp(&a.0)) → sort_by_key(|p| Reverse(p.0)). --- crates/paperjam-core/src/forms/mod.rs | 18 ++++------- .../paperjam-core/src/manipulation/insert.rs | 2 +- crates/paperjam-epub/src/parser.rs | 30 +++++++++---------- crates/paperjam-epub/src/toc.rs | 6 ++-- crates/paperjam-pptx/src/parser.rs | 14 ++++----- 5 files changed, 27 insertions(+), 43 deletions(-) diff --git a/crates/paperjam-core/src/forms/mod.rs b/crates/paperjam-core/src/forms/mod.rs index f2b6c33..6875ac0 100644 --- a/crates/paperjam-core/src/forms/mod.rs +++ b/crates/paperjam-core/src/forms/mod.rs @@ -93,22 +93,14 @@ pub fn fill_form_fields( for (name, value) in values { match field_name_map.get(name) { - None => { - not_found.push(name.clone()); + Some((false, field_type)) if set_field_value(&mut inner, name, value, field_type)? => { + filled += 1; + filled_fields.push((name.clone(), value.clone(), field_type.clone())); } - Some((true, _)) => { - // Read-only field, skip + // Not in the map, read-only, or set_field_value returned false: record as not-found. + _ => { not_found.push(name.clone()); } - Some((false, field_type)) => { - // Find and update the field object - if set_field_value(&mut inner, name, value, field_type)? 
{ - filled += 1; - filled_fields.push((name.clone(), value.clone(), field_type.clone())); - } else { - not_found.push(name.clone()); - } - } } } diff --git a/crates/paperjam-core/src/manipulation/insert.rs b/crates/paperjam-core/src/manipulation/insert.rs index fbcd5da..6007705 100644 --- a/crates/paperjam-core/src/manipulation/insert.rs +++ b/crates/paperjam-core/src/manipulation/insert.rs @@ -40,7 +40,7 @@ pub fn insert_blank_pages(doc: &Document, positions: &[(u32, f64, f64)]) -> Resu // Sort positions in reverse order so insertions don't shift indices let mut sorted_positions = positions.to_vec(); - sorted_positions.sort_by(|a, b| b.0.cmp(&a.0)); + sorted_positions.sort_by_key(|p| std::cmp::Reverse(p.0)); for (after_page, width, height) in sorted_positions { // Create an empty content stream diff --git a/crates/paperjam-epub/src/parser.rs b/crates/paperjam-epub/src/parser.rs index a452b45..0c621de 100644 --- a/crates/paperjam-epub/src/parser.rs +++ b/crates/paperjam-epub/src/parser.rs @@ -218,22 +218,20 @@ fn parse_opf(xml: &str) -> Result<(OpfMetadata, HashMap<String, String>, Vec<Str } } } - Ok(Event::Text(ref e)) => { - if section == Section::Metadata { - let text = e.unescape().unwrap_or_default().trim().to_string(); - if !text.is_empty() { - match current_tag.as_str() { - "title" => metadata.title = Some(text), - "creator" => metadata.creator = Some(text), - "subject" => metadata.subject = Some(text), - "description" => metadata.description = Some(text), - "publisher" => metadata.publisher = Some(text), - "date" => metadata.date = Some(text), - "language" => metadata.language = Some(text), - "identifier" => metadata.identifier = Some(text), - "rights" => metadata.rights = Some(text), - _ => {} - } + Ok(Event::Text(ref e)) if section == Section::Metadata => { + let text = e.unescape().unwrap_or_default().trim().to_string(); + if !text.is_empty() { + match current_tag.as_str() { + "title" => metadata.title = Some(text), + "creator" => metadata.creator = 
Some(text), + "subject" => metadata.subject = Some(text), + "description" => metadata.description = Some(text), + "publisher" => metadata.publisher = Some(text), + "date" => metadata.date = Some(text), + "language" => metadata.language = Some(text), + "identifier" => metadata.identifier = Some(text), + "rights" => metadata.rights = Some(text), + _ => {} } } } diff --git a/crates/paperjam-epub/src/toc.rs b/crates/paperjam-epub/src/toc.rs index 3f4a5a7..3eb5d01 100644 --- a/crates/paperjam-epub/src/toc.rs +++ b/crates/paperjam-epub/src/toc.rs @@ -76,10 +76,8 @@ fn parse_nav_point(reader: &mut Reader<&[u8]>) -> Option<TocEntry> { } } } - Ok(Event::Text(ref e)) => { - if in_text { - title = e.unescape().unwrap_or_default().trim().to_string(); - } + Ok(Event::Text(ref e)) if in_text => { + title = e.unescape().unwrap_or_default().trim().to_string(); } Ok(Event::End(ref e)) => { let local = local_name(e.name().as_ref()); diff --git a/crates/paperjam-pptx/src/parser.rs b/crates/paperjam-pptx/src/parser.rs index 0f6e989..4f6d73f 100644 --- a/crates/paperjam-pptx/src/parser.rs +++ b/crates/paperjam-pptx/src/parser.rs @@ -254,11 +254,9 @@ fn parse_slide(xml: &str, index: usize, notes_xml: Option<&str>) -> Result<Slide _ => {} } } - Ok(Event::Empty(ref e)) => { - if parser.in_shape { - let local = local_name_owned(e.name().as_ref()); - handle_shape_empty(&mut parser, e, &local)?; - } + Ok(Event::Empty(ref e)) if parser.in_shape => { + let local = local_name_owned(e.name().as_ref()); + handle_shape_empty(&mut parser, e, &local)?; } Ok(Event::Text(ref e)) => { if parser.in_table_cell { @@ -436,10 +434,8 @@ fn parse_notes_text(xml: &str) -> Result<String> { text.push('\n'); } } - Ok(Event::Text(ref e)) => { - if in_text_body { - text.push_str(&e.unescape().unwrap_or_default()); - } + Ok(Event::Text(ref e)) if in_text_body => { + text.push_str(&e.unescape().unwrap_or_default()); } Ok(Event::Eof) => break, Err(e) => return Err(PptxError::Xml(format!("notes XML error: {e}"))), From 
8251eb0cf648f393361464d5879842f9b7153aaf Mon Sep 17 00:00:00 2001 From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com> Date: Fri, 17 Apr 2026 01:41:49 +0530 Subject: [PATCH 12/12] fix(ci): declare binary fixture types in .gitattributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Windows CI (test (windows-latest, 3.x)) fails with 'Invalid file trailer' on every PDF under tests/fixtures/tables/ because those fixtures are generated with pageCompression=0 (required for byte-deterministic output) — the uncompressed ASCII streams fool git's binary-vs-text heuristic, so core.autocrlf=true rewrites LF to CRLF on Windows checkout and shifts xref/trailer offsets into parser-breaking garbage. Add an explicit .gitattributes declaring PDFs and other universally binary formats (fonts, images, wasm) as binary so no content-based heuristic applies. Ubuntu/macOS CI and local runs are unaffected. --- .gitattributes | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..2700b07 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,14 @@ +# Enforce byte-exact checkout for binary fixtures. +# Without this, git's autodetection heuristic can misclassify uncompressed +# PDFs (pageCompression=0) as text on Windows (core.autocrlf=true), which +# rewrites LF to CRLF inside the PDF and corrupts xref/trailer offsets. +*.pdf binary +*.wasm binary +*.ttf binary +*.otf binary +*.woff binary +*.woff2 binary +*.ico binary +*.png binary +*.jpg binary +*.jpeg binary