From cff2538f1aad6c89a5f646aef6fca1e9d5ddcd6d Mon Sep 17 00:00:00 2001
From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com>
Date: Fri, 17 Apr 2026 00:53:06 +0530
Subject: [PATCH 01/12] fix(docs-site): remove redundant 'paperjam' heading
 from hero
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The landing page rendered an h1 with the text 'paperjam' directly below
the logo image, which itself contains the word 'paperjam'. Drop the
duplicated heading — the logo's alt text keeps the product name
accessible to screen readers, and Docusaurus <Layout title='Home'>
already sets the HTML <title>.
---
 docs-site/src/pages/index.tsx | 9 ---------
 1 file changed, 9 deletions(-)
diff --git a/docs-site/src/pages/index.tsx b/docs-site/src/pages/index.tsx
index 7dd2aaa..c4f3b6b 100644
--- a/docs-site/src/pages/index.tsx
+++ b/docs-site/src/pages/index.tsx
@@ -137,15 +137,6 @@ export default function Home(): React.JSX.Element {
             borderRadius: '16px',
           }}
         />
-        <h1
-          style={{
-            fontSize: '2.5rem',
-            fontWeight: 800,
-            marginBottom: '0.5rem',
-          }}
-        >
-          paperjam
-        </h1>
         <p
           style={{
             fontSize: '1.25rem',

From f3b7fe586ef30d31cf125404f50addf63c500ec1 Mon Sep 17 00:00:00 2001
From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com>
Date: Fri, 17 Apr 2026 00:53:18 +0530
Subject: [PATCH 02/12] chore(deps): add reportlab + pandas dev deps and pytest
 'accuracy' marker

Sets up the foundation for the table extraction accuracy harness:
- reportlab for deterministic synthetic PDF fixture generation.
- pandas promoted from optional to dev so tests can exercise
  Table.to_dataframe() in accuracy scoring.
- Registers the 'accuracy' pytest marker used by the harness tests.
---
 pyproject.toml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 5884a34..9553cf2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,6 +42,8 @@ dev = [
     "pdfplumber>=0.10",
     "mypy>=1.8",
     "ruff>=0.3",
+    "reportlab>=4.0,<4.2",
+    "pandas>=2.0",
 ]
 
 [tool.maturin]
@@ -56,6 +58,9 @@ include = [
 
 [tool.pytest.ini_options]
 testpaths = ["tests/python"]
+markers = [
+    "accuracy: table extraction accuracy harness (run with -m accuracy)",
+]
 
 [tool.ruff]
 target-version = "py312"

From caf3fe8084e9ad940c8299c10769bc04010e0e62 Mon Sep 17 00:00:00 2001
From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com>
Date: Fri, 17 Apr 2026 00:53:27 +0530
Subject: [PATCH 03/12] feat(fixtures): add deterministic synthetic table
 fixture generator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Each fixture is described by a single TableSpec that is the shared
source of truth for both the rendered PDF and the ground-truth JSON
sidecar — by construction they cannot drift apart.

- reportlab.rl_config.invariant = 1 together with pageCompression=0
  for bit-identical output across runs (verified via SHA256).
- 8 fixture types covering bordered/borderless, merged cells,
  borderless financial/invoice layouts, a landscape page, sparse
  subtotal rows, and a 2-page continuation with repeated header.
- Output lands under tests/fixtures/tables/, paired <name>.pdf +
  <name>.gt.json.
---
 scripts/generate_table_fixtures.py | 345 +++++++++++++++++++++++++++++
 1 file changed, 345 insertions(+)
 create mode 100644 scripts/generate_table_fixtures.py

diff --git a/scripts/generate_table_fixtures.py b/scripts/generate_table_fixtures.py
new file mode 100644
index 0000000..5b2feea
--- /dev/null
+++ b/scripts/generate_table_fixtures.py
@@ -0,0 +1,345 @@
+#!/usr/bin/env python3
+"""Generate deterministic synthetic PDF fixtures for the table-extraction accuracy harness.
+
+Each fixture is described by a ``TableSpec`` that is the single source of truth for BOTH
+the rendered PDF and the ground-truth JSON sidecar — by construction they cannot drift.
+
+Usage:
+    uv run python scripts/generate_table_fixtures.py
+"""
+
+from __future__ import annotations
+
+import json
+import pathlib
+from dataclasses import dataclass
+
+import reportlab.rl_config
+
+reportlab.rl_config.invariant = 1  # deterministic PDF metadata (fixed /CreationDate, /ID, etc.)
+
+from reportlab.lib.pagesizes import landscape, letter  # noqa: E402
+from reportlab.pdfgen import canvas  # noqa: E402
+
+REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent  # scripts/ sits directly under the repo root
+OUT_DIR = REPO_ROOT / "tests" / "fixtures" / "tables"  # where <name>.pdf and <name>.gt.json are written
+
+FONT_REGULAR = "Helvetica"  # font for every cell that is not a bold header cell
+FONT_BOLD = "Helvetica-Bold"  # font for row-0 cells when TableSpec.bold_header is True
+FONT_SIZE = 10  # size passed to setFont/stringWidth for all cell text
+
+
+@dataclass(frozen=True)
+class CellSpec:  # one logical table cell: grid position, text, and optional merge extents
+    row: int  # 0-based row index of the cell's top-left grid slot
+    col: int  # 0-based column index of the cell's top-left grid slot
+    text: str  # string drawn in the PDF and recorded verbatim in the ground truth
+    row_span: int = 1  # number of grid rows covered (1 = not merged)
+    col_span: int = 1  # number of grid columns covered (1 = not merged)
+
+
+@dataclass
+class TableSpec:
+    """One source of truth for a single-table single-page fixture."""
+
+    name: str  # fixture base name; outputs are <name>.pdf and <name>.gt.json
+    page_size: tuple[float, float]  # handed to the reportlab canvas as its pagesize
+    bordered: bool  # when True, a rectangle is stroked around every cell
+    origin: tuple[float, float]  # (x_left, y_top) in PDF coords (bottom-left origin)
+    col_widths: list[float]  # left-to-right column widths; the length is the ground-truth col_count
+    row_heights: list[float]  # top-to-bottom row heights; the length is the ground-truth row_count
+    cells: list[CellSpec]  # cells to draw and to list in the ground truth
+    notes: str  # description that emit_single_table_fixture writes to the sidecar's "notes" field
+    bold_header: bool = False  # when True, row-0 cells are drawn with FONT_BOLD
+
+
+def _simple_cells(data: list[list[str]]) -> list[CellSpec]:  # row-major grid of strings -> one unmerged CellSpec each
+    return [CellSpec(row=r, col=c, text=t) for r, row in enumerate(data) for c, t in enumerate(row)]
+
+
+def _render_table(c: canvas.Canvas, spec: TableSpec, *, page_y_top: float | None = None) -> None:
+    """Render one table onto the current canvas page, using spec.origin (or override y)."""
+    x_left, y_top = spec.origin
+    if page_y_top is not None:
+        y_top = page_y_top  # caller-supplied top edge replaces the y from spec.origin
+    xs = [x_left]  # column boundary x-coordinates, left to right (len(col_widths) + 1 entries)
+    for w in spec.col_widths:
+        xs.append(xs[-1] + w)
+    ys = [y_top]  # row boundary y-coordinates, top to bottom (values decrease: PDF origin is bottom-left)
+    for h in spec.row_heights:
+        ys.append(ys[-1] - h)
+
+    for cell in spec.cells:
+        x1 = xs[cell.col]  # left edge
+        x2 = xs[cell.col + cell.col_span]  # right edge, after any column span
+        y1 = ys[cell.row + cell.row_span]  # bottom edge, after any row span
+        y2 = ys[cell.row]  # top edge
+        if spec.bordered:
+            c.setStrokeColorRGB(0, 0, 0)
+            c.setLineWidth(0.5)
+            c.rect(x1, y1, x2 - x1, y2 - y1)  # arguments are x, y, width, height
+        font = FONT_BOLD if (spec.bold_header and cell.row == 0) else FONT_REGULAR  # only row 0 can be bold
+        c.setFont(font, FONT_SIZE)
+        text_w = c.stringWidth(cell.text, font, FONT_SIZE)
+        tx = (x1 + x2) / 2 - text_w / 2  # center the text horizontally within the (possibly merged) cell
+        ty = (y1 + y2) / 2 - FONT_SIZE / 3  # rough vertical centering
+        c.drawString(tx, ty, cell.text)
+
+
+def _table_bbox(spec: TableSpec) -> list[float]:  # -> [x_min, y_min, x_max, y_max] derived from spec.origin
+    x_left, y_top = spec.origin
+    return [
+        x_left,  # left edge
+        y_top - sum(spec.row_heights),  # bottom edge
+        x_left + sum(spec.col_widths),  # right edge
+        y_top,  # top edge
+    ]
+
+
+def _gt_for_spec(spec: TableSpec, *, page: int = 1) -> dict:  # ground-truth entry for one table on `page`
+    return {
+        "page": page,
+        "bbox": _table_bbox(spec),
+        "col_count": len(spec.col_widths),  # grid columns, not the number of cells
+        "row_count": len(spec.row_heights),  # grid rows, not the number of cells
+        "cells": [
+            {
+                "row": c.row,
+                "col": c.col,
+                "row_span": c.row_span,
+                "col_span": c.col_span,
+                "text": c.text,
+            }
+            for c in spec.cells  # same order as spec.cells
+        ],
+    }
+
+
+def _canvas_for(name: str, page_size: tuple[float, float]) -> canvas.Canvas:  # canvas targeting OUT_DIR/<name>.pdf
+    pdf_path = OUT_DIR / f"{name}.pdf"
+    c = canvas.Canvas(str(pdf_path), pagesize=page_size, pageCompression=0)  # page compression switched off
+    c.setCreator("paperjam-test-fixtures")
+    c.setTitle(name)  # document title is the fixture name
+    c.setSubject("Table extraction accuracy fixture")
+    return c  # caller is responsible for showPage() and save()
+
+
+def _write_gt(name: str, notes: str, tables: list[dict]) -> None:  # write the OUT_DIR/<name>.gt.json sidecar
+    data = {
+        "source": "synthetic",
+        "license": "generated",
+        "notes": notes,
+        "tables": tables,  # one entry per table, as built by the callers
+    }  # NOTE(review): write_text below uses platform-default encoding/newline handling — confirm acceptable
+    (OUT_DIR / f"{name}.gt.json").write_text(json.dumps(data, indent=2) + "\n")  # 2-space indent + final newline
+
+
+def emit_single_table_fixture(spec: TableSpec) -> None:  # render spec to a one-page PDF, then write its sidecar
+    c = _canvas_for(spec.name, spec.page_size)
+    _render_table(c, spec)
+    c.showPage()  # close the only page
+    c.save()
+    _write_gt(spec.name, spec.notes, [_gt_for_spec(spec)])  # single-table ground truth, default page=1
+
+
+def emit_multipage_continuation() -> None:  # two-page fixture: same header on each page, different data rows
+    name = "multipage_continuation"
+    header = ["Year", "Metric", "Value"]
+    page1_rows = [[f"201{i}", "Revenue", str(100 * (i + 1))] for i in range(6)]  # years 2010-2015
+    page2_rows = [[f"202{i}", "Revenue", str(200 * (i + 1))] for i in range(6)]  # years 2020-2025
+
+    col_widths = [100.0, 120.0, 100.0]
+    row_heights = [24.0] * 7  # header + 6 data rows
+    origin = (72.0, 720.0)  # shared by both pages, so both tables have the same bbox
+
+    c = _canvas_for(name, letter)
+    for page_data in (page1_rows, page2_rows):
+        spec = TableSpec(  # per-page spec: identical geometry, header prepended to that page's rows
+            name=name,
+            page_size=letter,
+            bordered=True,
+            origin=origin,
+            col_widths=col_widths,
+            row_heights=row_heights,
+            cells=_simple_cells([header, *page_data]),
+            notes="",  # not read here; the sidecar notes are passed to _write_gt below
+        )
+        _render_table(c, spec)
+        c.showPage()  # end this page before the next table is drawn
+    c.save()
+
+    def _page_gt(page_idx: int, rows: list[list[str]]) -> dict:  # NOTE(review): hand-built twin of _gt_for_spec's dict
+        cells = [{"row": r, "col": cc, "row_span": 1, "col_span": 1, "text": t} for r, row in enumerate([header, *rows]) for cc, t in enumerate(row)]
+        return {
+            "page": page_idx,
+            "bbox": [origin[0], origin[1] - sum(row_heights), origin[0] + sum(col_widths), origin[1]],  # x_min, y_min, x_max, y_max
+            "col_count": len(col_widths),
+            "row_count": len(row_heights),
+            "cells": cells,
+        }
+
+    _write_gt(
+        name,
+        "2-page table, header repeats on page 2; Phase 0 scores per-page tables",
+        [_page_gt(1, page1_rows), _page_gt(2, page2_rows)],  # one ground-truth table per page
+    )
+
+
+def build_fixtures() -> list[TableSpec]:  # the seven single-page fixture specs, in the order main() emits them
+    specs: list[TableSpec] = [
+        TableSpec(
+            name="bordered_simple",
+            page_size=letter,
+            bordered=True,
+            origin=(72.0, 700.0),
+            col_widths=[100.0, 100.0, 100.0, 100.0],
+            row_heights=[24.0, 24.0, 24.0],
+            cells=_simple_cells(
+                [
+                    ["Header A", "Header B", "Header C", "Header D"],
+                    ["a1", "b1", "c1", "d1"],
+                    ["a2", "b2", "c2", "d2"],
+                ]
+            ),
+            notes="3x4 fully bordered, single-line cells",
+        ),
+        TableSpec(
+            name="bordered_dense",
+            page_size=letter,
+            bordered=True,
+            origin=(54.0, 720.0),
+            col_widths=[60.0] * 6,
+            row_heights=[22.0] * 8,
+            cells=_simple_cells(
+                [
+                    ["Year", "Q1", "Q2", "Q3", "Q4", "Total"],
+                    ["2016", "100", "110", "120", "130", "460"],
+                    ["2017", "105", "115", "125", "135", "480"],
+                    ["2018", "110", "120", "130", "140", "500"],
+                    ["2019", "115", "125", "135", "145", "520"],
+                    ["2020", "120", "130", "140", "150", "540"],
+                    ["2021", "125", "135", "145", "155", "560"],
+                    ["2022", "130", "140", "150", "160", "580"],
+                ]
+            ),
+            notes="8x6 bordered with numeric data, tight spacing",
+        ),
+        TableSpec(
+            name="bordered_merged",
+            page_size=letter,
+            bordered=True,
+            origin=(72.0, 700.0),
+            col_widths=[80.0, 80.0, 80.0, 80.0],
+            row_heights=[24.0] * 5,
+            cells=[  # listed explicitly because merged cells leave some grid slots without their own CellSpec
+                CellSpec(row=0, col=0, text="Year", row_span=2, col_span=1),  # covers rows 0-1 of column 0
+                CellSpec(row=0, col=1, text="Revenue", row_span=1, col_span=2),  # covers columns 1-2 of row 0
+                CellSpec(row=0, col=3, text="Notes", row_span=2, col_span=1),  # covers rows 0-1 of column 3
+                CellSpec(row=1, col=1, text="Q1"),
+                CellSpec(row=1, col=2, text="Q2"),
+                CellSpec(row=2, col=0, text="2020"),
+                CellSpec(row=2, col=1, text="100"),
+                CellSpec(row=2, col=2, text="120"),
+                CellSpec(row=2, col=3, text="stable"),
+                CellSpec(row=3, col=0, text="2021"),
+                CellSpec(row=3, col=1, text="110"),
+                CellSpec(row=3, col=2, text="130"),
+                CellSpec(row=3, col=3, text="up"),
+                CellSpec(row=4, col=0, text="2022"),
+                CellSpec(row=4, col=1, text="120"),
+                CellSpec(row=4, col=2, text="140"),
+                CellSpec(row=4, col=3, text="up"),
+            ],
+            notes="5x4 with row-span merges (Year, Notes) and col-span merge (Revenue)",
+        ),
+        TableSpec(
+            name="borderless_financial",
+            page_size=letter,
+            bordered=False,
+            origin=(72.0, 700.0),
+            col_widths=[120.0, 80.0, 80.0, 80.0],
+            row_heights=[24.0] * 6,
+            cells=_simple_cells(
+                [
+                    ["Metric", "2021", "2022", "2023"],
+                    ["Revenue", "1,000", "1,200", "1,500"],
+                    ["COGS", "600", "720", "900"],
+                    ["Gross", "400", "480", "600"],
+                    ["OpEx", "200", "240", "300"],
+                    ["Net", "200", "240", "300"],
+                ]
+            ),
+            notes="6x4 borderless financial statement",
+        ),
+        TableSpec(
+            name="borderless_invoice",
+            page_size=letter,
+            bordered=False,
+            origin=(72.0, 700.0),
+            col_widths=[160.0, 60.0, 80.0],
+            row_heights=[24.0] * 5,
+            cells=_simple_cells(
+                [
+                    ["Item", "Qty", "Price"],
+                    ["Widget A", "2", "10.00"],
+                    ["Widget B", "1", "25.00"],
+                    ["Gadget", "3", "15.00"],
+                    ["Gizmo", "5", "5.00"],
+                ]
+            ),
+            notes="5x3 borderless invoice with bold header row",
+            bold_header=True,  # the only fixture here that draws row 0 in FONT_BOLD
+        ),
+        TableSpec(
+            name="rotated_landscape",
+            page_size=landscape(letter),  # landscape page; the rendering code does not rotate the table itself
+            bordered=True,
+            origin=(72.0, 500.0),
+            col_widths=[100.0] * 5,
+            row_heights=[24.0] * 4,
+            cells=_simple_cells(
+                [
+                    ["Col 1", "Col 2", "Col 3", "Col 4", "Col 5"],
+                    ["a", "b", "c", "d", "e"],
+                    ["f", "g", "h", "i", "j"],
+                    ["k", "l", "m", "n", "o"],
+                ]
+            ),
+            notes="4x5 bordered on a landscape-oriented page",
+        ),
+        TableSpec(
+            name="sparse_cells",
+            page_size=letter,
+            bordered=False,
+            origin=(72.0, 700.0),
+            col_widths=[90.0, 60.0, 70.0, 70.0, 70.0],
+            row_heights=[24.0] * 7,
+            cells=_simple_cells(  # blank slots are still emitted as cells with text ""
+                [
+                    ["Item", "Qty", "Price", "Total", "Note"],
+                    ["Apple", "2", "1.00", "2.00", ""],
+                    ["Banana", "5", "0.50", "2.50", ""],
+                    ["Subtotal", "", "", "4.50", ""],
+                    ["Orange", "3", "1.50", "4.50", ""],
+                    ["Grape", "1", "3.00", "3.00", ""],
+                    ["Total", "", "", "12.00", ""],
+                ]
+            ),
+            notes="7x5 borderless with blank cells on subtotal/total rows",
+        ),
+    ]
+    return specs
+
+
+def main() -> None:  # write every fixture into OUT_DIR and print one progress line per fixture
+    OUT_DIR.mkdir(parents=True, exist_ok=True)  # create the output directory tree if it is missing
+    for spec in build_fixtures():
+        emit_single_table_fixture(spec)
+        print(f"  wrote {spec.name}.pdf + .gt.json")
+    emit_multipage_continuation()  # the two-page fixture is built separately from the TableSpec list
+    print("  wrote multipage_continuation.pdf + .gt.json")
+    print(f"Done. Fixtures in {OUT_DIR.relative_to(REPO_ROOT)}")  # path shown relative to the repo root
+
+
+if __name__ == "__main__":  # generate fixtures only when run as a script, not on import
+    main()

From ee02e1fb9d2ec99d7efc20f0d250ed95d6a68c78 Mon Sep 17 00:00:00 2001
From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com>
Date: Fri, 17 Apr 2026 00:53:39 +0530
Subject: [PATCH 04/12] feat(fixtures): commit generated table fixtures and
 ground-truth sidecars

Output of scripts/generate_table_fixtures.py checked in so the
harness and CI don't depend on regenerating at test time. 8 fixtures
totaling ~50 KB:

- bordered_simple, bordered_dense, bordered_merged
- borderless_financial, borderless_invoice
- multipage_continuation (2 pages, repeated header)
- rotated_landscape (landscape page)
- sparse_cells (borderless with blank subtotal cells)

Regeneration is verified for bit-identical output in CI.
---
 tests/fixtures/tables/bordered_dense.gt.json  | 356 ++++++++++++++++++
 tests/fixtures/tables/bordered_dense.pdf      | Bin 0 -> 6201 bytes
 tests/fixtures/tables/bordered_merged.gt.json | 139 +++++++
 tests/fixtures/tables/bordered_merged.pdf     | Bin 0 -> 3055 bytes
 tests/fixtures/tables/bordered_simple.gt.json | 104 +++++
 tests/fixtures/tables/bordered_simple.pdf     | Bin 0 -> 2562 bytes
 .../tables/borderless_financial.gt.json       | 188 +++++++++
 .../fixtures/tables/borderless_financial.pdf  | Bin 0 -> 2951 bytes
 .../tables/borderless_invoice.gt.json         | 125 ++++++
 tests/fixtures/tables/borderless_invoice.pdf  | Bin 0 -> 2485 bytes
 .../tables/multipage_continuation.gt.json     | 327 ++++++++++++++++
 .../tables/multipage_continuation.pdf         | Bin 0 -> 6044 bytes
 .../fixtures/tables/rotated_landscape.gt.json | 160 ++++++++
 tests/fixtures/tables/rotated_landscape.pdf   | Bin 0 -> 3352 bytes
 tests/fixtures/tables/sparse_cells.gt.json    | 265 +++++++++++++
 tests/fixtures/tables/sparse_cells.pdf        | Bin 0 -> 3579 bytes
 16 files changed, 1664 insertions(+)
 create mode 100644 tests/fixtures/tables/bordered_dense.gt.json
 create mode 100644 tests/fixtures/tables/bordered_dense.pdf
 create mode 100644 tests/fixtures/tables/bordered_merged.gt.json
 create mode 100644 tests/fixtures/tables/bordered_merged.pdf
 create mode 100644 tests/fixtures/tables/bordered_simple.gt.json
 create mode 100644 tests/fixtures/tables/bordered_simple.pdf
 create mode 100644 tests/fixtures/tables/borderless_financial.gt.json
 create mode 100644 tests/fixtures/tables/borderless_financial.pdf
 create mode 100644 tests/fixtures/tables/borderless_invoice.gt.json
 create mode 100644 tests/fixtures/tables/borderless_invoice.pdf
 create mode 100644 tests/fixtures/tables/multipage_continuation.gt.json
 create mode 100644 tests/fixtures/tables/multipage_continuation.pdf
 create mode 100644 tests/fixtures/tables/rotated_landscape.gt.json
 create mode 100644 tests/fixtures/tables/rotated_landscape.pdf
 create mode 100644 tests/fixtures/tables/sparse_cells.gt.json
 create mode 100644 tests/fixtures/tables/sparse_cells.pdf

diff --git a/tests/fixtures/tables/bordered_dense.gt.json b/tests/fixtures/tables/bordered_dense.gt.json
new file mode 100644
index 0000000..4d7b1e8
--- /dev/null
+++ b/tests/fixtures/tables/bordered_dense.gt.json
@@ -0,0 +1,356 @@
+{
+  "source": "synthetic",
+  "license": "generated",
+  "notes": "8x6 bordered with numeric data, tight spacing",
+  "tables": [
+    {
+      "page": 1,
+      "bbox": [
+        54.0,
+        544.0,
+        414.0,
+        720.0
+      ],
+      "col_count": 6,
+      "row_count": 8,
+      "cells": [
+        {
+          "row": 0,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Year"
+        },
+        {
+          "row": 0,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Q1"
+        },
+        {
+          "row": 0,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Q2"
+        },
+        {
+          "row": 0,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Q3"
+        },
+        {
+          "row": 0,
+          "col": 4,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Q4"
+        },
+        {
+          "row": 0,
+          "col": 5,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Total"
+        },
+        {
+          "row": 1,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2016"
+        },
+        {
+          "row": 1,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "100"
+        },
+        {
+          "row": 1,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "110"
+        },
+        {
+          "row": 1,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "120"
+        },
+        {
+          "row": 1,
+          "col": 4,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "130"
+        },
+        {
+          "row": 1,
+          "col": 5,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "460"
+        },
+        {
+          "row": 2,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2017"
+        },
+        {
+          "row": 2,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "105"
+        },
+        {
+          "row": 2,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "115"
+        },
+        {
+          "row": 2,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "125"
+        },
+        {
+          "row": 2,
+          "col": 4,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "135"
+        },
+        {
+          "row": 2,
+          "col": 5,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "480"
+        },
+        {
+          "row": 3,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2018"
+        },
+        {
+          "row": 3,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "110"
+        },
+        {
+          "row": 3,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "120"
+        },
+        {
+          "row": 3,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "130"
+        },
+        {
+          "row": 3,
+          "col": 4,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "140"
+        },
+        {
+          "row": 3,
+          "col": 5,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "500"
+        },
+        {
+          "row": 4,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2019"
+        },
+        {
+          "row": 4,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "115"
+        },
+        {
+          "row": 4,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "125"
+        },
+        {
+          "row": 4,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "135"
+        },
+        {
+          "row": 4,
+          "col": 4,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "145"
+        },
+        {
+          "row": 4,
+          "col": 5,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "520"
+        },
+        {
+          "row": 5,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2020"
+        },
+        {
+          "row": 5,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "120"
+        },
+        {
+          "row": 5,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "130"
+        },
+        {
+          "row": 5,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "140"
+        },
+        {
+          "row": 5,
+          "col": 4,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "150"
+        },
+        {
+          "row": 5,
+          "col": 5,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "540"
+        },
+        {
+          "row": 6,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2021"
+        },
+        {
+          "row": 6,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "125"
+        },
+        {
+          "row": 6,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "135"
+        },
+        {
+          "row": 6,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "145"
+        },
+        {
+          "row": 6,
+          "col": 4,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "155"
+        },
+        {
+          "row": 6,
+          "col": 5,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "560"
+        },
+        {
+          "row": 7,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2022"
+        },
+        {
+          "row": 7,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "130"
+        },
+        {
+          "row": 7,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "140"
+        },
+        {
+          "row": 7,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "150"
+        },
+        {
+          "row": 7,
+          "col": 4,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "160"
+        },
+        {
+          "row": 7,
+          "col": 5,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "580"
+        }
+      ]
+    }
+  ]
+}
diff --git a/tests/fixtures/tables/bordered_dense.pdf b/tests/fixtures/tables/bordered_dense.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..1ad86c2177e820ef989fd5c44017c57454455ca2
GIT binary patch
literal 6201
zcmbW5+iv4F5J2Dc74s6*X<<vGZuTMwu(@r6&2CdiMNtGnLCZ{{z?KY2Zj$~&`_}h<
zYiDRN5wk80CKubHC>~ypN7U?Ho}3NYDD?LJ`TMVbVJcTu-Cl?pe3GTCMJwlkHDF%l
z>!mDPxNqCl2j72ud>qxP?n2B)dA0Ny1W?T$yn_SJKV!i0aOxc%da|5TF&*pEN1~Bu
z6*lJoC>MXow#Wter)6Hvi}DWqUyJf%*%Yq@U&T@aHR@ljXYKO}4>EkkV|v3g*j5h}
z_E5)o3cKAvO8o)<i<}qYsCt5HJPsgX9MTDg8}KhhP2)>8^`^3^)^#o$P~)M_(g3lA
z%eu<1WQz^>G%<cApKu`OOK~TUy7%Mm{TyzvTi{Qt7AF!f_p@4*s!1yMKo8&&FYXN>
z!m4_%=+u{Lj-&jgkzcD4KTuJlJGz;HW6_F5b=PYn?zQo8-QHI<3`AL#&&z7v?1O(?
zOVJipc|tuKoP6K`{xJMiKR*P)clf94(ZwsVlJ$dF4qMr@!`tGiUDuLY`YfLxt9stR
z01vRMyIRBvoUe0<O}@%TGkH<WYEeJK5WbOXU(FRR{)5b07-V9$kbv`Ei=4&)BG1?O
z`5C%yVMnr}#X5r-_Dt4t{`*{(jimBgtX3$se<l`9f6{5;^`swHYqjA1=K{xd4P?}m
zB-zZiT7ceMlUw@I3t8T^_Yh6^glaS`&hzHW5h#NK$d>>|nerL!%Wi>1qX@DKIL$nA
z5>cPLQ4Eh>32_9;WDH3FoI@?)%F|Z{>Poy`U5BNlCn3^s6h)8*@hC}>6tX1@ev_i!
zhwK5e@3%o1bEmK{9fhHZ`I*@u={q2kEf8*j@X#H?<1LWT2AQ}+Lbe4GSs>v%Rz6Jm
zXgrS1MaeMsEG!`6q=}nhgn?unz8b~@mRKOlFh>dt$zT(M(aA<3#at<qEfi*>D90Qr
zJl;a#RtibxND0{%N@$}f%Umhxx0EQcQAjcRcL`z}#S&&Hmf9Adf|w&kE7r%bAhuCR
zF;_~bm_cDSic-vxq7^eJ+)5$E94T5cgA&>(N-<YTr&ym7jdv*)^7kkfP7KAyw#6sK
z94T6{K89K23nRr`DV<^l#U5YiPU%R|iWwAZeBq>+BSkA_P$JvnE5%$Xonn1TY`wdX
z!wjDo+@l!Y`(L(Vc%z)`Qp}N}73*V|HNGgtTq&Jm2E`s<NHIr>R?MJS<BL+vk)jnd
zDACR+=1S=l>r-Ouml73=X=mlESgaPO+wJqf3ZcE2E2UG+z%V-nqnI;gqnJst#}`!0
zmC`9@QmpZX6mz9?ikXynR}^!mY!ovnv2`q@811Uv6~pJsUKHcjXFXEPk)jprW0*C*
zP!w~fbcz`idwe0q9LJ&+Gbq;hq7-waXvGXlyeo>ijwS3A>r>)g+x-|b5bsbdHn#iR
z_N1?(m?K3i*2gewd{K(IQaZ&9iuF55+kMv=(~21sYkX0PIgUjuW>Bn)PkS-fv1rBm
zl-Str17(rBR9cr$eyZgyrpw0OCSggh_%suTG2FsSS&E4}b*6kNV*zGY-;{BN$+I_Q
z(Uy8qidpP8_0klxe{aghAtn=b873)<g{)Ox{QLyG>w}QXh^GOcvpCO#36tqeq)8M8
zoKJ)p=ZCv%+<1GvRQ=udRQ+%W^8!=Fm@)moto^>{`RC<r1(YtHdYGxMTA=hZe!eRH
YlsLaCi7Dz<O?W(JiMO|R`qP>BADZ!6eE<Le

literal 0
HcmV?d00001

diff --git a/tests/fixtures/tables/bordered_merged.gt.json b/tests/fixtures/tables/bordered_merged.gt.json
new file mode 100644
index 0000000..57713dc
--- /dev/null
+++ b/tests/fixtures/tables/bordered_merged.gt.json
@@ -0,0 +1,139 @@
+{
+  "source": "synthetic",
+  "license": "generated",
+  "notes": "5x4 with row-span merges (Year, Notes) and col-span merge (Revenue)",
+  "tables": [
+    {
+      "page": 1,
+      "bbox": [
+        72.0,
+        580.0,
+        392.0,
+        700.0
+      ],
+      "col_count": 4,
+      "row_count": 5,
+      "cells": [
+        {
+          "row": 0,
+          "col": 0,
+          "row_span": 2,
+          "col_span": 1,
+          "text": "Year"
+        },
+        {
+          "row": 0,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 2,
+          "text": "Revenue"
+        },
+        {
+          "row": 0,
+          "col": 3,
+          "row_span": 2,
+          "col_span": 1,
+          "text": "Notes"
+        },
+        {
+          "row": 1,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Q1"
+        },
+        {
+          "row": 1,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Q2"
+        },
+        {
+          "row": 2,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2020"
+        },
+        {
+          "row": 2,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "100"
+        },
+        {
+          "row": 2,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "120"
+        },
+        {
+          "row": 2,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "stable"
+        },
+        {
+          "row": 3,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2021"
+        },
+        {
+          "row": 3,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "110"
+        },
+        {
+          "row": 3,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "130"
+        },
+        {
+          "row": 3,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "up"
+        },
+        {
+          "row": 4,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2022"
+        },
+        {
+          "row": 4,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "120"
+        },
+        {
+          "row": 4,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "140"
+        },
+        {
+          "row": 4,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "up"
+        }
+      ]
+    }
+  ]
+}
diff --git a/tests/fixtures/tables/bordered_merged.pdf b/tests/fixtures/tables/bordered_merged.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..8affa2256d44fdddb2b55fae39bddb13d240d835
GIT binary patch
literal 3055
zcmbVO+iv4F5PjEI%!^T{g>^TjEP?<VC*BP<o2HJ6q6mV5mPU>OTQn$Y9rqX7x4!pV
zJEZMcPFy8kC$P*RIcH|hp*WqJ^I^}k1FQ4T-+%oJBb*DVuJ{-}VTqC}%z#!vCenI_
zrGf{g<{unqu~^vBtex_)or;;|feT{%Xq}u`&d>v&h9m3r)WR~;W<B0apYaNZf}A;@
zarzBak#cY@%2Z@UIRWRlqWoA^#ny1mX9(J<b6bzqa!!MUzG<u<G()}`P(cG~KvNWU
z1(A*ioUfP_{7gK-9gSUJo)6Kn5BK2QaH;cSt92u+L@iUSz~rNY)d`Y^8!6ITROG<X
zg>e#mqC_rde1d1|=V1N3gnJ4LoRLseNZRfsl9$G%wp*}C;D$Cg5(sJOrdE3DtH>zJ
zUn{&8CH>H*Y7^+H0t2r2R7@H^`VAi+YxN)`ba^St<xJGo5jX>hTos}`*O7J4Klm<v
zJo+=ApIr9``roY6o9BFv@{!Ma3M<viizih})SkZJav@|^L6-&;>RTSEfLWa)x!j6P
zSMsVDOD>ntgID4@GPNS|A2C(XP55|<Ky{Zq)j0r9)0%!SVI39)k`#*8bjK7KO3eP4
zp`2i*CnTTGN!8AfPpf9tHS?RQ4@7MkcfJ&q*d1s|jkHQu<u(j7imk=9*?NWLL_L5P
z#lBuqDJt~pbht)BE~GPnv&0CR<d@{&g?0$Z6<j2i79~M_vg|%AtP&z0Sl@@(g)jz*
zaBFRLx@ITsH~Yw{^{F9<L^}+DMZV1#iy)an_jly-2$Dxge%y!R5d@1Ec+3Sqd?$|Y
zdG_%!usGb}jPM(lHMT$nl=7h&1wm{dzsTrXkPcgLOiHWDy*nc>?%P2CjJey4e(_pB
z{NOp6cZ2wB4~X9aq9x$RS_1FJpi^weF|dGb6Y*W&Z9y^hIus@j$p0=3LARiof*pwB
zvpp!SxfqQdh@v(2GM7r}7fnk#ni@C!E^55S=_G2}RA#6-6lP6L6UOu2q~<`B4K+I`
zK^uyp=0KDnWafp|M5*WPwDjq?e8-9D$65vJmnPBDb}lt72g9sr*@5v|wX9Ffp(txw
zc2UB&XgL&RP0J3-4lSU2zxDmM?gF1A=5)u!)~e0W-*g-FgFfW&Y>Vi&H8bV2%_GUY
zw9)<Z$`<Zf7t$$xxi^a9SGG8yYkY%Ctwk|KX}a0VbFl7C`k1+6KF+f|&qLmiGGsoE
zBbMfVkca4=zP;k!>NFbc$^Hhr*MqE}29Nrj|Le7nEX%npa{;=^8(Gv}3I)1pq~Ev2
YpGfsJod=i76x?T#YjrvoKM$?{0PDQoUjP6A

literal 0
HcmV?d00001

diff --git a/tests/fixtures/tables/bordered_simple.gt.json b/tests/fixtures/tables/bordered_simple.gt.json
new file mode 100644
index 0000000..e59a843
--- /dev/null
+++ b/tests/fixtures/tables/bordered_simple.gt.json
@@ -0,0 +1,104 @@
+{
+  "source": "synthetic",
+  "license": "generated",
+  "notes": "3x4 fully bordered, single-line cells",
+  "tables": [
+    {
+      "page": 1,
+      "bbox": [
+        72.0,
+        628.0,
+        472.0,
+        700.0
+      ],
+      "col_count": 4,
+      "row_count": 3,
+      "cells": [
+        {
+          "row": 0,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Header A"
+        },
+        {
+          "row": 0,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Header B"
+        },
+        {
+          "row": 0,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Header C"
+        },
+        {
+          "row": 0,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Header D"
+        },
+        {
+          "row": 1,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "a1"
+        },
+        {
+          "row": 1,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "b1"
+        },
+        {
+          "row": 1,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "c1"
+        },
+        {
+          "row": 1,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "d1"
+        },
+        {
+          "row": 2,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "a2"
+        },
+        {
+          "row": 2,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "b2"
+        },
+        {
+          "row": 2,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "c2"
+        },
+        {
+          "row": 2,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "d2"
+        }
+      ]
+    }
+  ]
+}
diff --git a/tests/fixtures/tables/bordered_simple.pdf b/tests/fixtures/tables/bordered_simple.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..0244bb3240e44037886ba50af52df5684a547014
GIT binary patch
literal 2562
zcmbVOO>f&c5WVwP%*Cj)3tOL-)FKEF+i@E-Z39O|4?(b?q_LyG5)F!4$NLwyx8D0(
zJEW~vNL>Yv6IkMKIB({?G0EQbbT)8?9_xMk`u}&B<61~{#S?goB}%R^1M+}Oq;-L%
zf_tUbZ!BxG*$k!eUGZd?ih?;{gGe6O=_#{j4!AU&v$HdXWv0z~+)R&og)>3HtarHj
z7ge5eurA6}WO=y+>q}mqmsS4D@R1h?I;eG1Cu+N<K}_E?)(@JYSS={81vQ~5O1pwk
z=L6OU%yK>!k8n$48v@6L@U;tfU|n;m>kD>%bF4%yQ>?($qm$JIQiN+M(i>D1z|zE6
zF+Nfumjz$qc>kR2pO<h)X@NBtiio7;RxEjGLTWpMrhsc&+$z8)*M=)S^+9A*<>v}N
zijrQmscI74F)-nZuf(zyW7LXqUaNZ{q0dWEZVORYCtyt^a+Qm6sx#|P-?%n?9Qre#
zzis;!{WpGk@tUtuKJa3ouu_9X{-|n+I?{XGZiLJ#=+l5weJ&yqnAIs#$Y0H-nY_vq
z$>kOX@LO}87_LbC2TT?8W1g%Kh<C|TT?6nmt?7LW`?M&LI9KG;Pbf2#nEg}b#d?K$
zLh|*Ry4srYRn=x)E8nm_5w%g=dY@Bax1d{UuDfJMw^5))wr;Lw=@phsbq|gkz1AL;
zBBFP{4%@Vl4QT-|j!i>T`^5`z{Gku=6<owjHzh&6Wy29{SP7vEK^OqZ0<I4d;f6I!
zZL^dDn1y834XQE7c^E|yME)=cf)L^Y`tOhv4{&}0@dLzv9fERRf<<lfC#-P?mirPc
zkFWxJbg-~V2blK~FuwzA+5x7qM(&Y<Il<8PAqc3xMlkMlfEcNcMR5<HBwZ**q+?M$
zLeU(wYe~CMj5^1n_y=mqx=?gVZuBx)yr3nE`<yJtqBOF!QIakcBg?TUjVx`Hv<t<^
zax6+COB*F?qkz5+?6<`I#o$rmg0{Vg?M@#3O~0U#H-ZKHw1u<*o0;;b&9Uj2d}4DQ
z+5?~1`~!YIwZl`tFpPe)MIP<!O*}en@)b&R!MU6QyFCpe*Nzf<F~TH@7FiIW?QwSz
zda3V5neUuE?{UX^tvlE9@SSTgfGnpopN_siUB6Bkvo6bp0DbPy8C^Mq0)3C5_nZ7b
WB!10d#HBKXyG|IgUhm@5jQs!<q;`Y=

literal 0
HcmV?d00001

diff --git a/tests/fixtures/tables/borderless_financial.gt.json b/tests/fixtures/tables/borderless_financial.gt.json
new file mode 100644
index 0000000..c23d338
--- /dev/null
+++ b/tests/fixtures/tables/borderless_financial.gt.json
@@ -0,0 +1,188 @@
+{
+  "source": "synthetic",
+  "license": "generated",
+  "notes": "6x4 borderless financial statement",
+  "tables": [
+    {
+      "page": 1,
+      "bbox": [
+        72.0,
+        556.0,
+        432.0,
+        700.0
+      ],
+      "col_count": 4,
+      "row_count": 6,
+      "cells": [
+        {
+          "row": 0,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Metric"
+        },
+        {
+          "row": 0,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2021"
+        },
+        {
+          "row": 0,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2022"
+        },
+        {
+          "row": 0,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2023"
+        },
+        {
+          "row": 1,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Revenue"
+        },
+        {
+          "row": 1,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "1,000"
+        },
+        {
+          "row": 1,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "1,200"
+        },
+        {
+          "row": 1,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "1,500"
+        },
+        {
+          "row": 2,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "COGS"
+        },
+        {
+          "row": 2,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "600"
+        },
+        {
+          "row": 2,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "720"
+        },
+        {
+          "row": 2,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "900"
+        },
+        {
+          "row": 3,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Gross"
+        },
+        {
+          "row": 3,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "400"
+        },
+        {
+          "row": 3,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "480"
+        },
+        {
+          "row": 3,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "600"
+        },
+        {
+          "row": 4,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "OpEx"
+        },
+        {
+          "row": 4,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "200"
+        },
+        {
+          "row": 4,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "240"
+        },
+        {
+          "row": 4,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "300"
+        },
+        {
+          "row": 5,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Net"
+        },
+        {
+          "row": 5,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "200"
+        },
+        {
+          "row": 5,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "240"
+        },
+        {
+          "row": 5,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "300"
+        }
+      ]
+    }
+  ]
+}
diff --git a/tests/fixtures/tables/borderless_financial.pdf b/tests/fixtures/tables/borderless_financial.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..7423cc4061dd5654e5e91da6aad7af2db08aa379
GIT binary patch
literal 2951
zcmbW3-*2ln6vyBBE6z)_bXx!$FyyMLbelF^b^A*x(;lj7GT@|C366}pP4+KrZ+qY0
zx?>V<NSo=7BvHV}#~+TrKE|i_a4{Yb+p~Iq|MllTFcC|s^bJqoN)$qIEiyn4$YfeA
zM4{nD>*XiMd3$@aRqZ?HiJi)YMZg7_yjrKHmNO<mu{p8M&MZ-6#%$V6_c<@cSR%~%
zMa;hnou?d}%OaIoUd+JxEiXP7Wxg}q@r3{bbsnokub0?}@gLjff<4G;J<4l6YJ^=F
zb_t=02b^1x<@{W(;0fC<FhU_5Qg{aEfh#jVw(*;YQdTMzCDijV!I}vo;X%ptQD_7>
zMi?g+E1byng3rYH_Bz^Lui+WP0%syM3W>v=Sn;Ap8oLF}1Rij3YXUyHHd>jkTbbc3
ze=WtGEbzvd$|lf_0wb>ZT+UjIpvCxH=@+S>&kI?s7qTi(z!@pQbuNnw6IuV_6Ls-J
z@LPX=a@`;B*Z7&iOTHB9l`jTblzK4DSGrQdkiLlZn^ai|eQaQ;-y@=cS(OSz?vl+Y
zd6OrKt2GSZLvo$eTH)fqic~{C=E+<D>aKWd<^ViRE4*LBHY^Mz&Nce<6O2r#xhTuu
zr+LAPH0N{EtN3z>sqKvUylhKvmcNmHB&&Maoi91g?g>m@O-!n6G_RMTrMStkX6TJ5
zX8HvP3q9jeYE*l(Mcg`%Tu2uH=W(6Yn1k^Yh;RE4-@s*THFmeQV{c<MDQEzMM7B2s
z7Wp<~EQELg{ac|`o}NJb3h|Hsi$bYuN6|iox|DQbc<w<4?Zk)<GQ2JfLWn&?MVPx&
zWFo$bq7q#&lnibE9ftgjTcQ(WcnE?UtP8{o4?%)%5JEzmh5;~-?PM9<Up;n-gi;R&
z?=x7pI6U_wBBb5)h(02Q-J&qNhA~uw(487rN|yK*>#}P$fwUohw?G;>_7Krga<ma0
z7lqWh9zYNoS+;MH`{m{82sQQ?T{C@?9Qz3W7&Z0~_-5)d5kh^NQV3Y<m3k+17sPg>
z>>-ZSq8$Z2PK!2TUkflb+4e}<R%a_Erj`qlwK@LHKd>P10+_-#TZji>-BWzC5f^Kf
zcQ#7!n0jaP5BT{Z)`ai;!sz2OBERK_M}9sRs;)AxFTi>_4Ox~9rz~P_M5j|vxYSRR
z#N&AMW`5#%XW#dDwtB6?cXqh&9Sk7LvGBtB<G*U)6U%b0i>U-td`&E@`=ka_%;EiG
Y{)a$)>tcW_T?<YF#;jiN@_uan3n6C8+5i9m

literal 0
HcmV?d00001

diff --git a/tests/fixtures/tables/borderless_invoice.gt.json b/tests/fixtures/tables/borderless_invoice.gt.json
new file mode 100644
index 0000000..0b6d809
--- /dev/null
+++ b/tests/fixtures/tables/borderless_invoice.gt.json
@@ -0,0 +1,125 @@
+{
+  "source": "synthetic",
+  "license": "generated",
+  "notes": "5x3 borderless invoice with bold header row",
+  "tables": [
+    {
+      "page": 1,
+      "bbox": [
+        72.0,
+        580.0,
+        372.0,
+        700.0
+      ],
+      "col_count": 3,
+      "row_count": 5,
+      "cells": [
+        {
+          "row": 0,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Item"
+        },
+        {
+          "row": 0,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Qty"
+        },
+        {
+          "row": 0,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Price"
+        },
+        {
+          "row": 1,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Widget A"
+        },
+        {
+          "row": 1,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2"
+        },
+        {
+          "row": 1,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "10.00"
+        },
+        {
+          "row": 2,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Widget B"
+        },
+        {
+          "row": 2,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "1"
+        },
+        {
+          "row": 2,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "25.00"
+        },
+        {
+          "row": 3,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Gadget"
+        },
+        {
+          "row": 3,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "3"
+        },
+        {
+          "row": 3,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "15.00"
+        },
+        {
+          "row": 4,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Gizmo"
+        },
+        {
+          "row": 4,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "5"
+        },
+        {
+          "row": 4,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "5.00"
+        }
+      ]
+    }
+  ]
+}
diff --git a/tests/fixtures/tables/borderless_invoice.pdf b/tests/fixtures/tables/borderless_invoice.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..75ad82b224ac146bc74d3e7c1bbb18dcfd194c0c
GIT binary patch
literal 2485
zcmb7GO>f&c5WVYH%*9w|7ZyKcNmdaAh@H4=H0^pFEqVxo1ufBz0!uU~x{3Q2c5l7+
zx9(838rj~qk!(ofaOQE|BgJ87dNuAb*K;~w{{81G%;Z{Wb0g;PRu)nVBU3;RNL8}Q
zWntjKnDrY%o}QjuUHPuW+)Y&OFrc93kIwnIL&gj^HfKP_96bBxTwFM^NUekIsU3%+
zlw*ak<afFHBu$nGAlF5r(yUkl`Ir@#MVY-AK8Rc*C<Ai4nVanz8!_IoZ6DZ!SdF3V
z&_5WeH&m<iHRJJDj@NIDH^Nj<uo4EAKalq_&BRbW!yUFM1dKx%akvLE71}Br?)+x5
zR2!Yh5-KIFI96Ijm}-^WN`nBzP6>(SGYU7!#ZnIU&(Z!lfqRqz$V?fWLJTLd7DWYV
z?GDs}O)<D3n2)ZtV7BYMN>SCnO8G$*_+m|Eo#;;SkuYMVmJN)cfpNJp4@yH<6sp+f
zYEzy88EGj@riv?@S@-G<r+6{^Ro8En{(}G7&jzo>TIxrU_lzt}Z;?Hljg}Vao!mYt
zotDtW2B!KmL=-UHBoZNCmbq2(CYx)ax6p%=Wqnp@g^TS`CI-5(n6D(D?ph?41BfKq
z;QJQ#X<;I9X3(cQ$IPT&$+G-AD?X_#k+x5Z^%_4ZG8U_{DZHKhTKJLLRFh8LWytLg
z?5djCg|bt;nuLblc74^MH?mlo2VgV^tw(84>)oeGt5rlH$pMCO^$p<)j2FQAZXe<s
zxQ?C5&M~^OV{dJ>3(x|vK6673!JzL3K@dWmLw90ieg^R)#J|1@gZmx^`-dQZnr#~l
zk40`69zslYCGlY*nDY0@cu0m}A_0A2^D#@8_=#N}XY(T<?nf?1Rhl@w4S=Q7j}8GC
zbt!EF!fHBRa6)I;2B@af#9?g!YC25-?zg7nyMf&$U&y=_c5d2aQd4Q7cx?b`Dop^^
zo=WKYz7PIj&>)k2&Q+UK_FN7?{5Aj{=PnNqvpAvxwnpzajeWI#)^g!ch@9Q_Z0`{G
zzURXNzS%-7v{g^>&BhpRTSqp|aU(ghdERNP&#{Ia`T5Z)UPMo0qX5qrbu4bv*-Gl_
zOftCw=kC1kbFmPEAmnM3Bq<fllL7Uq3=*C)AufLIaqn~*$E4-|ACr1LNHZ*!STnwN
tVmfmiGAR}cY{8s4SgVx*TY&NXHv245^J)th+EkOk>3}+&&h;N-=RYb1cya&$

literal 0
HcmV?d00001

diff --git a/tests/fixtures/tables/multipage_continuation.gt.json b/tests/fixtures/tables/multipage_continuation.gt.json
new file mode 100644
index 0000000..eecbda3
--- /dev/null
+++ b/tests/fixtures/tables/multipage_continuation.gt.json
@@ -0,0 +1,327 @@
+{
+  "source": "synthetic",
+  "license": "generated",
+  "notes": "2-page table, header repeats on page 2; Phase 0 scores per-page tables",
+  "tables": [
+    {
+      "page": 1,
+      "bbox": [
+        72.0,
+        552.0,
+        392.0,
+        720.0
+      ],
+      "col_count": 3,
+      "row_count": 7,
+      "cells": [
+        {
+          "row": 0,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Year"
+        },
+        {
+          "row": 0,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Metric"
+        },
+        {
+          "row": 0,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Value"
+        },
+        {
+          "row": 1,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2010"
+        },
+        {
+          "row": 1,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Revenue"
+        },
+        {
+          "row": 1,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "100"
+        },
+        {
+          "row": 2,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2011"
+        },
+        {
+          "row": 2,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Revenue"
+        },
+        {
+          "row": 2,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "200"
+        },
+        {
+          "row": 3,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2012"
+        },
+        {
+          "row": 3,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Revenue"
+        },
+        {
+          "row": 3,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "300"
+        },
+        {
+          "row": 4,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2013"
+        },
+        {
+          "row": 4,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Revenue"
+        },
+        {
+          "row": 4,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "400"
+        },
+        {
+          "row": 5,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2014"
+        },
+        {
+          "row": 5,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Revenue"
+        },
+        {
+          "row": 5,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "500"
+        },
+        {
+          "row": 6,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2015"
+        },
+        {
+          "row": 6,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Revenue"
+        },
+        {
+          "row": 6,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "600"
+        }
+      ]
+    },
+    {
+      "page": 2,
+      "bbox": [
+        72.0,
+        552.0,
+        392.0,
+        720.0
+      ],
+      "col_count": 3,
+      "row_count": 7,
+      "cells": [
+        {
+          "row": 0,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Year"
+        },
+        {
+          "row": 0,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Metric"
+        },
+        {
+          "row": 0,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Value"
+        },
+        {
+          "row": 1,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2020"
+        },
+        {
+          "row": 1,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Revenue"
+        },
+        {
+          "row": 1,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "200"
+        },
+        {
+          "row": 2,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2021"
+        },
+        {
+          "row": 2,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Revenue"
+        },
+        {
+          "row": 2,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "400"
+        },
+        {
+          "row": 3,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2022"
+        },
+        {
+          "row": 3,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Revenue"
+        },
+        {
+          "row": 3,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "600"
+        },
+        {
+          "row": 4,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2023"
+        },
+        {
+          "row": 4,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Revenue"
+        },
+        {
+          "row": 4,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "800"
+        },
+        {
+          "row": 5,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2024"
+        },
+        {
+          "row": 5,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Revenue"
+        },
+        {
+          "row": 5,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "1000"
+        },
+        {
+          "row": 6,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2025"
+        },
+        {
+          "row": 6,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Revenue"
+        },
+        {
+          "row": 6,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "1200"
+        }
+      ]
+    }
+  ]
+}
diff --git a/tests/fixtures/tables/multipage_continuation.pdf b/tests/fixtures/tables/multipage_continuation.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..bb8997de980bec603bfc8ec6cb33c44046b31609
GIT binary patch
literal 6044
zcmeI0%WmT~6o&VDinB55w6HGJ)q^0w<l<@2WEwbXiy{aLTB02Vwq#J$IO!|Q?7Hu>
z_K>y`DsgSg=q5p(_#%qp&-0NKf1S?N>7eJBuF?7H+t<Hg$mRtX7jy)lSk5>VECs9q
zX^~VjmJ7HOV*bIh7K??+Roy8anMpA-9I&An-5ZC8hBa`2;Nj3XIx<+E%3^t3pFXA~
z8x+`>^&^{pVIoT?SZ8@sq**=z>$fcbn3vge!Dl*SARD!=tC4t|<3Ws{cr3s047R07
zxti1op2BXI5Xt_4b;;6<9v2UAgU2=m4uP;w;1;Ya%4K}vvfhxDMa2_Vf{I6WRtAVI
zT=62gW&#_qBp54Z4;aY#j853`>icB%eGa$SEwF}#K#+L36?2-aCS|z+bpTg*u@-=b
zRclsH0rZ#hZz2F+3qZzpS)@2jzm)8Ak>d|pRMw=ILwG_3ofZ=fLZG$ru@ZL$hc3;F
z{Bc%P<pEeHoKcY#`Kd(PJ^eszd^z~1UO(9Oclck|lNZnFobh`)>j_qh-Z*;@6=$-g
zpV;G~;Ash6JixAQYY}HaS|tpdd|oYbDqLhEPWdDB;MM9lP=k%L_nsvJx-lJ18Q{d>
zG?6g?O_B<KKf<b8*pWCBSf@LyrXrgous;%<gjrsxpyhc^=W|@0)__h+efG#1{5*Rq
zoVf&J%rL$ikV|6-a()fvoL{n6!=aIvOQmk$g5?u&2ZXr3tWgT2zFh0JS`aoQGl1h*
zEl*rb@faM>^dP=~v)GVJ45dC9rVk4vhmb(f4*<sqhzFd(wNYPct4px~bs?6@<t8;8
z$1yz*LfbQgAP6CzLHBn?`2ocD5P!c;#;J8k3mxJ_rrU4oTr$D4WP=jHdq?YtT+%m*
z<2R*#rPGRSP?90xyCpi?j3Njk`xza<6|zA_p_4l8&^czm-_#khFD%~}kz&_2B-eE@
zmZlUk*$|Ng=SBO0^MeLBXM>CaCw2A#r|awpu5Cy)I9&>FnHwUK;Bb}hHRD{;07o{+
zC~#6|A8@+Pj^NscRD;u{+^yj74!0LLJ8FP)w}F#7`+(DRb_CZpq#B$q<!uA!hpo2<
zb&vBSZF|6boVNv>(%A!CLuWT|twUOYYe@N9!TI}b4t~(w9Q<wIq|QFzbe$c+wGF8T
zr%MG5a3CK?RwickU?P3>Yqt~r?{;3>x3}na$fhmsExMiUn|51k;5bX0*t7=Zy}6yO
zd(VI5cF22kJI!0@x@Wq!)}Wqox-}p+d2enfYV190z_qppD{u{KfZ*P?+j(u^&^m9r
zZR3-P-_{${<aBR<<leU3S(4L3LTjdzTjO^gI2&VP#<7iMrc8c9;QOu*V|XeHF;k?@
z<WFUejd`ONWyHZW&5JT`O+CMlS*aKGf&kNmFUtBMCZC#RDEq1}E~ZeP>tPlxn=-C4
z&*!IL+#Gu2(I^N5o4MmC@$EE?0!Gmt5}zbv-z7)eYup+gEg3#ppA7HyAk8qBi7CB*
w%ZncvhIO8g3y}HDp@F&JLV(PV;_vJ1PbLRKCDtexYH0kx35`zY?B{{;53FBdXaE2J

literal 0
HcmV?d00001

diff --git a/tests/fixtures/tables/rotated_landscape.gt.json b/tests/fixtures/tables/rotated_landscape.gt.json
new file mode 100644
index 0000000..2605fe0
--- /dev/null
+++ b/tests/fixtures/tables/rotated_landscape.gt.json
@@ -0,0 +1,160 @@
+{
+  "source": "synthetic",
+  "license": "generated",
+  "notes": "4x5 bordered on a landscape-oriented page",
+  "tables": [
+    {
+      "page": 1,
+      "bbox": [
+        72.0,
+        404.0,
+        572.0,
+        500.0
+      ],
+      "col_count": 5,
+      "row_count": 4,
+      "cells": [
+        {
+          "row": 0,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Col 1"
+        },
+        {
+          "row": 0,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Col 2"
+        },
+        {
+          "row": 0,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Col 3"
+        },
+        {
+          "row": 0,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Col 4"
+        },
+        {
+          "row": 0,
+          "col": 4,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Col 5"
+        },
+        {
+          "row": 1,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "a"
+        },
+        {
+          "row": 1,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "b"
+        },
+        {
+          "row": 1,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "c"
+        },
+        {
+          "row": 1,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "d"
+        },
+        {
+          "row": 1,
+          "col": 4,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "e"
+        },
+        {
+          "row": 2,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "f"
+        },
+        {
+          "row": 2,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "g"
+        },
+        {
+          "row": 2,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "h"
+        },
+        {
+          "row": 2,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "i"
+        },
+        {
+          "row": 2,
+          "col": 4,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "j"
+        },
+        {
+          "row": 3,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "k"
+        },
+        {
+          "row": 3,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "l"
+        },
+        {
+          "row": 3,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "m"
+        },
+        {
+          "row": 3,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "n"
+        },
+        {
+          "row": 3,
+          "col": 4,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "o"
+        }
+      ]
+    }
+  ]
+}
diff --git a/tests/fixtures/tables/rotated_landscape.pdf b/tests/fixtures/tables/rotated_landscape.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..4dd923fbcd2233e37fe08e3db76cd57eed7e988f
GIT binary patch
literal 3352
zcmbW4-*4hL5XayBSIi4KdM8NY{35HW(w3IJYP)+C&FLPh>SP>4!$FK3bCmlRcHi#(
z{?;2CvSeGxDy0=k#`b(X^Bto}=k99M^ZGvN{QLFGe=rs+sq`(Izz0zX#k9x(XFw+T
zdMOGGPg<|uInHLY=_@;T!6tn!m&5}X<YZ3H&xtegfa2qrTwD-QWX5cstKmylijhQT
z&aYzex6nCf;9M76W_d9K=a0O2UzGX5@R=<I7^lwtdZM>0d`R&dpUnkF&{qxR*HA+o
z!flrjoBe?ENn|;@l+W;h&o0CXg~+4u2+kc-CcbDlZ!Ai=R$P=|<1w2x0iuUHCHcM3
z=)f_FaZ>S&f!r+FOkD1-!~OLJ9&uaXjHSjz;&LZdtgtSPoj?`99WJf~5a84*S7zvw
z%rMH|OYvD2cw<auwb5M$hD@`CoYj1UH6QQS`bjG2vO*TyrCgV1;0%>uI+w+j*;)7M
z9d+^J;h+8d;<`WKe>Kl6Ua^%>bGGbhQR?0_f7WXyjHi!cyOAm@p^Fc=)gO7p1ZHb4
z(B+}nOeSyhiDGICJ$O}IXEs+T{#<Yk-IPrh0x<82aT5bD&ewRqh5fd0BWbR2PFLB~
zXMZkOk(C@nFhh#1R#?~0h%L%`-i`KE-iLB+HFrMd7}^7vq8ghz*(Ka+P)oNduWIS7
zC}#Qzs2j&-MyWB?yYu1NB65K*0WMQp(pY@y6uh7xKza+;DKTY<s1Kwc!iE$OQwZV+
zP#B;AC;|7RTI$-R=)f*St0_^#@I1Hg`w$F*eiTJ9q)X@y<pR7jNav9Le2nA01w>DP
zXak781>~Oq`3)fdEui28C};o$ZvllTKw$&Oa6;O)7Lq>2b&-3h1=hf@b<h?<z1NUQ
z1H{%qTZoU4@cTe`1H;PSmJu8YBx`_J>DxlWBM|&yI(amW>~zplc8Yp?5cttS_Gtsd
z%H9@I$zF%d8X#8owh$w`Dd0NfsR3ejZwm=9U4!I%NZtUky0?XdM<DYCh|%2*TI#+#
zBpyA;{#OIS%HEbi_i@xAiw205y)DG|;{L$L^IoSc8z@%ywv_6;*C9m%#OmG_Ql0lY
zMAjf+n&f?(uy2>2m6&2HFd(~r%zQ%>hM12je6z*al-QBto6W<$y|hse`+^tqOkKUQ
zVZFVy#RKdpUaTGX$-#y_SiTU-w*5C(KpxJMDb2X=XFSaKAfb^v^|OTgJc)vEk}&t;
z$1@&Dr}ic?JN_op>p_-dCyWi@sh5&7LY$joD#3KaV}gyi)L^=6yuZ)?5tv`wpfROw
P!f6u4q|>?nZAAVDe()Fs

literal 0
HcmV?d00001

diff --git a/tests/fixtures/tables/sparse_cells.gt.json b/tests/fixtures/tables/sparse_cells.gt.json
new file mode 100644
index 0000000..eb7bd91
--- /dev/null
+++ b/tests/fixtures/tables/sparse_cells.gt.json
@@ -0,0 +1,265 @@
+{
+  "source": "synthetic",
+  "license": "generated",
+  "notes": "7x5 borderless with blank cells on subtotal/total rows",
+  "tables": [
+    {
+      "page": 1,
+      "bbox": [
+        72.0,
+        532.0,
+        432.0,
+        700.0
+      ],
+      "col_count": 5,
+      "row_count": 7,
+      "cells": [
+        {
+          "row": 0,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Item"
+        },
+        {
+          "row": 0,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Qty"
+        },
+        {
+          "row": 0,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Price"
+        },
+        {
+          "row": 0,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Total"
+        },
+        {
+          "row": 0,
+          "col": 4,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Note"
+        },
+        {
+          "row": 1,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Apple"
+        },
+        {
+          "row": 1,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2"
+        },
+        {
+          "row": 1,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "1.00"
+        },
+        {
+          "row": 1,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2.00"
+        },
+        {
+          "row": 1,
+          "col": 4,
+          "row_span": 1,
+          "col_span": 1,
+          "text": ""
+        },
+        {
+          "row": 2,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Banana"
+        },
+        {
+          "row": 2,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "5"
+        },
+        {
+          "row": 2,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "0.50"
+        },
+        {
+          "row": 2,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "2.50"
+        },
+        {
+          "row": 2,
+          "col": 4,
+          "row_span": 1,
+          "col_span": 1,
+          "text": ""
+        },
+        {
+          "row": 3,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Subtotal"
+        },
+        {
+          "row": 3,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": ""
+        },
+        {
+          "row": 3,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": ""
+        },
+        {
+          "row": 3,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "4.50"
+        },
+        {
+          "row": 3,
+          "col": 4,
+          "row_span": 1,
+          "col_span": 1,
+          "text": ""
+        },
+        {
+          "row": 4,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Orange"
+        },
+        {
+          "row": 4,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "3"
+        },
+        {
+          "row": 4,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "1.50"
+        },
+        {
+          "row": 4,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "4.50"
+        },
+        {
+          "row": 4,
+          "col": 4,
+          "row_span": 1,
+          "col_span": 1,
+          "text": ""
+        },
+        {
+          "row": 5,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Grape"
+        },
+        {
+          "row": 5,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "1"
+        },
+        {
+          "row": 5,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "3.00"
+        },
+        {
+          "row": 5,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "3.00"
+        },
+        {
+          "row": 5,
+          "col": 4,
+          "row_span": 1,
+          "col_span": 1,
+          "text": ""
+        },
+        {
+          "row": 6,
+          "col": 0,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "Total"
+        },
+        {
+          "row": 6,
+          "col": 1,
+          "row_span": 1,
+          "col_span": 1,
+          "text": ""
+        },
+        {
+          "row": 6,
+          "col": 2,
+          "row_span": 1,
+          "col_span": 1,
+          "text": ""
+        },
+        {
+          "row": 6,
+          "col": 3,
+          "row_span": 1,
+          "col_span": 1,
+          "text": "12.00"
+        },
+        {
+          "row": 6,
+          "col": 4,
+          "row_span": 1,
+          "col_span": 1,
+          "text": ""
+        }
+      ]
+    }
+  ]
+}
diff --git a/tests/fixtures/tables/sparse_cells.pdf b/tests/fixtures/tables/sparse_cells.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..93272f676a8c4ded0537319d952c9f957b269a4e
GIT binary patch
literal 3579
zcmbtXUvJ_@5P$clm={#i3ft@dK1GoN33upnIZ9mhA&RQSUWg;(wbn}$`W1R__ukLi
zS%Z%yx#}<lguSz~zn%Gc#xvWSUX2IDbdBEEFMt07GgfgSZ|EG}vyuraSq7*98Bf<M
zR!Vr3vU+D(&(F`MXmkrYH&ebc2-v{qPsaJVVT}nmIGh<57X~XcC065hdPr+F<_Kne
zV8x$I<|zg1x=eYNmrJmI&&$iQ&i8^_x?-R}t-JMHZYmrk_>W_C!5PHTwsN(tMmU9L
z*AOau!1~0poDTU5+~e4WfH)Aw4m^N0r9$}^Z1rZW=4+9%8XA9;vC2V2m<pcWF^K?{
zBF0MC3pz4c(Ip%1uA|*`0uN{wSTin>NL+3uf|d=Wk{i%EFvZ2110JfjT&bx~JVP&k
ztJy6t@kWX2*3c~jBPwaZml}qzVO*}|BNx!8B`-HCzOK)}8VN>a&dV!hS^w&tW8;V5
zxB2{H+dtv2)l-WrS~2lNR|CmvIauT`axIub`p7oVTx2!$ae$`&2oVX))+s~CeXuDe
zZ}PdIVgmzs6I^EvSNQOsSSq2P(0RcCc^5QQJ^)SAHQsMvXBHZg<PvrIbwx$Z{zzF-
zC|i;Ls=|=A#<Zw)%GG0UDIf85^SG^#IeK*uDxPL4M7CTv^lMD3$ZD6~uyQFM!Ex<a
zY19%~-lm4##E%W>3SgKt(TuT|EP!~X2gwawCq^r`8#&IlQWbg%fY_nwggyk3X9htK
zLb8JXM6%TxBu|k1d>jZacAP_uU*)C?gyZ>UKn@|MB2QTt5SRF76uojL!Bi<ap+te{
zd55Ola@oleVk092EeY&Bt(R3*bVG^ahF2ZL>Eepxsi%8r1u<>A3x-SHVmMtf;zJ{j
z4Xije!vI-ygFTBwTH?2h2V_U1j^cL#Xlc{|Y|}r9MjgZHilJ!K0FI_HG6NrJB+i~j
zRrk7P2En?fgB-iR<D=c$=EtsULr+`U_fFwV12}eXEL*1Q_~1uKUi)JFjKy|&N(vvK
zxF^Kk0RZXB!8U~tFizo72RNQb*CaOb7-$9NJr>QAXw*T-DKzQ;Zr8Wij>i9mfhnw0
zaXgLN(&l?wLFruDR08V=$L_8Y_>^_7YDMCMrG3oBjs-|jrn$W{8q_&p*ADGY1TTUu
z3>zY2+j6RZz!#tE!vem^LTtX8nev+qtMwb1L$F19rRRM|&kOM!@=7nnhU&E}a<x6M
z&CClXnv>1s3XJ=6H%LQ=(1pKX)b|%LWpgjg=BXXKZstYviyvz|7(MN<wfz2JYcPN;
z$7UaU-v2sxof(ETDHj}6!#^|dtidIy0|egR<$p2cw`oMFkPYE3i9Ms&yZ$^j{sndE
BP|pAW

literal 0
HcmV?d00001


From 0bff34749cb12d8fca22d34ac158ce1f94c38d5e Mon Sep 17 00:00:00 2001
From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com>
Date: Fri, 17 Apr 2026 00:54:36 +0530
Subject: [PATCH 05/12] feat(tests): add GriTS-Top and cell-level P/R scorer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pure-Python scorer — no external deps beyond stdlib. Implements:

- Whitespace-normalized text comparison.
- Greedy bbox-IoU 1:1 matching of predicted tables to ground truth,
  restricted to same page.
- Cell-level multiset precision/recall on normalized text.
- GriTS-Top: F1 over the set of (row_start, row_end, col_start,
  col_end) topological signatures of every cell, accounting for
  row/col spans. Robust to missing text; penalizes structural errors.

9 unit tests cover the identical, all-wrong-text, missing-column,
extra-merge, page-mismatch, and low-IoU cases.
---
 tests/python/grits.py      | 207 +++++++++++++++++++++++++++++++++++++
 tests/python/test_grits.py |  86 +++++++++++++++
 2 files changed, 293 insertions(+)
 create mode 100644 tests/python/grits.py
 create mode 100644 tests/python/test_grits.py

diff --git a/tests/python/grits.py b/tests/python/grits.py
new file mode 100644
index 0000000..8c2cd19
--- /dev/null
+++ b/tests/python/grits.py
@@ -0,0 +1,207 @@
+"""Table extraction accuracy scorer.
+
+Implements:
+- Whitespace-normalized text comparison.
+- Bbox-IoU matching of predicted tables to ground-truth tables (greedy 1:1, same page only).
+- Cell-level precision/recall/F1 on normalized text.
+- GriTS-Top: F1 over the set of topological signatures (row_start, row_end, col_start, col_end)
+  of every cell, accounting for row/col spans. Robust to missing text; penalizes structural errors.
+
+The internal representation for a table is a dict matching the .gt.json `tables[]` entry:
+
+    {
+        "page": int,
+        "bbox": [x_min, y_min, x_max, y_max],
+        "col_count": int,
+        "row_count": int,
+        "cells": [{"row": int, "col": int, "row_span": int, "col_span": int, "text": str}, ...],
+    }
+
+Predicted tables from paperjam.Table are converted into this shape by ``predicted_tables_to_gt_shape``.
+
+Reference: Smock, Pesala, Abraham, "PubTables-1M" (CVPR 2022), §4.1.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+
+
+def normalize_text(s: str) -> str:
+    return " ".join(s.split())
+
+
+def _bbox_iou(a: list[float] | tuple[float, ...], b: list[float] | tuple[float, ...]) -> float:
+    ax1, ay1, ax2, ay2 = a
+    bx1, by1, bx2, by2 = b
+    ix1, iy1 = max(ax1, bx1), max(ay1, by1)
+    ix2, iy2 = min(ax2, bx2), min(ay2, by2)
+    iw, ih = max(0.0, ix2 - ix1), max(0.0, iy2 - iy1)
+    inter = iw * ih
+    area_a = max(0.0, ax2 - ax1) * max(0.0, ay2 - ay1)
+    area_b = max(0.0, bx2 - bx1) * max(0.0, by2 - by1)
+    union = area_a + area_b - inter
+    return inter / union if union > 0 else 0.0
+
+
+def match_tables(pred: list[dict], gt: list[dict], iou_threshold: float = 0.5) -> dict:
+    """Greedy 1:1 matching on bbox IoU restricted to same page.
+
+    Returns {"matches": [(p_idx, g_idx, iou), ...], "unmatched_pred": [...], "unmatched_gt": [...]}.
+    """
+    pairs: list[tuple[float, int, int]] = []
+    for pi, p in enumerate(pred):
+        for gi, g in enumerate(gt):
+            if p.get("page") != g.get("page"):
+                continue
+            iou = _bbox_iou(p["bbox"], g["bbox"])
+            if iou >= iou_threshold:
+                pairs.append((iou, pi, gi))
+    pairs.sort(reverse=True)
+    used_p: set[int] = set()
+    used_g: set[int] = set()
+    matches: list[tuple[int, int, float]] = []
+    for iou, pi, gi in pairs:
+        if pi in used_p or gi in used_g:
+            continue
+        matches.append((pi, gi, iou))
+        used_p.add(pi)
+        used_g.add(gi)
+    return {
+        "matches": matches,
+        "unmatched_pred": [i for i in range(len(pred)) if i not in used_p],
+        "unmatched_gt": [i for i in range(len(gt)) if i not in used_g],
+    }
+
+
+def _cell_text_bag(cells: Iterable[dict]) -> list[str]:
+    return sorted(normalize_text(c["text"]) for c in cells if normalize_text(c["text"]) != "")
+
+
+def cell_precision_recall(pred_table: dict, gt_table: dict) -> dict:
+    """Multiset-level precision/recall over cell texts.
+
+    Treats identical strings as interchangeable; ignores position. A complementary metric
+    to GriTS-Top, which scores structure and ignores text. Together they catch both failure modes.
+    """
+    pred_bag = _cell_text_bag(pred_table["cells"])
+    gt_bag = _cell_text_bag(gt_table["cells"])
+    if not pred_bag and not gt_bag:
+        return {"precision": 1.0, "recall": 1.0, "f1": 1.0, "matched": 0, "pred_total": 0, "gt_total": 0}
+    # Multiset intersection size.
+    from collections import Counter
+
+    inter = sum((Counter(pred_bag) & Counter(gt_bag)).values())
+    precision = inter / len(pred_bag) if pred_bag else 0.0
+    recall = inter / len(gt_bag) if gt_bag else 0.0
+    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0
+    return {
+        "precision": precision,
+        "recall": recall,
+        "f1": f1,
+        "matched": inter,
+        "pred_total": len(pred_bag),
+        "gt_total": len(gt_bag),
+    }
+
+
+def _topology_signatures(cells: Iterable[dict]) -> set[tuple[int, int, int, int]]:
+    """Each cell → (row_start, row_end_exclusive, col_start, col_end_exclusive)."""
+    return {(c["row"], c["row"] + c.get("row_span", 1), c["col"], c["col"] + c.get("col_span", 1)) for c in cells}
+
+
+def grits_top(pred_table: dict, gt_table: dict) -> dict:
+    """GriTS-Top (topology) F1 over cell signatures.
+
+    Score = F1 over the set of `(row_start, row_end, col_start, col_end)` topological
+    signatures of every cell. A perfect structural match is 1.0; different row/col
+    count or missed merges drive it down even when text is correct.
+    """
+    pred_sigs = _topology_signatures(pred_table["cells"])
+    gt_sigs = _topology_signatures(gt_table["cells"])
+    if not pred_sigs and not gt_sigs:
+        return {"precision": 1.0, "recall": 1.0, "f1": 1.0}
+    inter = len(pred_sigs & gt_sigs)
+    precision = inter / len(pred_sigs) if pred_sigs else 0.0
+    recall = inter / len(gt_sigs) if gt_sigs else 0.0
+    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0
+    return {"precision": precision, "recall": recall, "f1": f1}
+
+
+def score_document(pred: list[dict], gt: list[dict], iou_threshold: float = 0.5) -> dict:
+    """Aggregate scores across all tables in one document.
+
+    Returns table-detection F1 (matched vs unmatched), plus average cell F1 and GriTS-Top F1
+    over matched pairs. Unmatched predicted tables are not scored for cell/GriTS but do hurt
+    table-detection precision; unmatched GT tables hurt detection recall.
+    """
+    m = match_tables(pred, gt, iou_threshold=iou_threshold)
+    n_matched = len(m["matches"])
+    n_pred = len(pred)
+    n_gt = len(gt)
+    td_p = n_matched / n_pred if n_pred > 0 else (1.0 if n_gt == 0 else 0.0)
+    td_r = n_matched / n_gt if n_gt > 0 else (1.0 if n_pred == 0 else 0.0)
+    td_f1 = (2 * td_p * td_r / (td_p + td_r)) if (td_p + td_r) > 0 else 0.0
+
+    cell_f1s: list[float] = []
+    grits_f1s: list[float] = []
+    per_table: list[dict] = []
+    for pi, gi, iou in m["matches"]:
+        cell = cell_precision_recall(pred[pi], gt[gi])
+        top = grits_top(pred[pi], gt[gi])
+        cell_f1s.append(cell["f1"])
+        grits_f1s.append(top["f1"])
+        per_table.append(
+            {
+                "pred_idx": pi,
+                "gt_idx": gi,
+                "bbox_iou": iou,
+                "cell_f1": cell["f1"],
+                "cell_precision": cell["precision"],
+                "cell_recall": cell["recall"],
+                "grits_top_f1": top["f1"],
+            }
+        )
+    avg_cell_f1 = sum(cell_f1s) / len(cell_f1s) if cell_f1s else 0.0
+    avg_grits = sum(grits_f1s) / len(grits_f1s) if grits_f1s else 0.0
+    return {
+        "table_detection_precision": td_p,
+        "table_detection_recall": td_r,
+        "table_detection_f1": td_f1,
+        "avg_cell_f1": avg_cell_f1,
+        "avg_grits_top_f1": avg_grits,
+        "n_matched": n_matched,
+        "n_pred": n_pred,
+        "n_gt": n_gt,
+        "per_table": per_table,
+    }
+
+
+def predicted_tables_to_gt_shape(paperjam_tables: list, *, page: int) -> list[dict]:
+    """Convert paperjam.Table dataclasses into the harness dict shape."""
+    out: list[dict] = []
+    for t in paperjam_tables:
+        cells = [
+            {
+                "row": r_idx,
+                "col": c_idx,
+                "row_span": int(getattr(cell, "row_span", 1)),
+                "col_span": int(getattr(cell, "col_span", 1)),
+                "text": cell.text,
+            }
+            for r_idx, row in enumerate(t.rows)
+            for c_idx, cell in enumerate(row.cells)
+        ]
+        out.append(
+            {
+                "page": page,
+                "bbox": list(t.bbox),
+                "col_count": int(t.col_count),
+                "row_count": int(t.row_count),
+                "cells": cells,
+            }
+        )
+    return out
diff --git a/tests/python/test_grits.py b/tests/python/test_grits.py
new file mode 100644
index 0000000..f5209c3
--- /dev/null
+++ b/tests/python/test_grits.py
@@ -0,0 +1,86 @@
+"""Unit tests for the GriTS scorer — independent of the extractor."""
+
+from __future__ import annotations
+
+from grits import (
+    cell_precision_recall,
+    grits_top,
+    match_tables,
+    normalize_text,
+    score_document,
+)
+
+
+def _mk_table(cells: list[tuple[int, int, int, int, str]], page: int = 1, bbox=(0, 0, 100, 100), rows=None, cols=None) -> dict:
+    return {
+        "page": page,
+        "bbox": list(bbox),
+        "row_count": rows or (max((c[0] + c[2] for c in cells), default=0)),
+        "col_count": cols or (max((c[1] + c[3] for c in cells), default=0)),
+        "cells": [{"row": r, "col": c, "row_span": rs, "col_span": cs, "text": t} for r, c, rs, cs, t in cells],
+    }
+
+
+def test_normalize_text_collapses_whitespace():
+    assert normalize_text("  hello   world\n ") == "hello world"
+    assert normalize_text("") == ""
+
+
+def test_identical_tables_score_perfect():
+    t = _mk_table([(0, 0, 1, 1, "A"), (0, 1, 1, 1, "B"), (1, 0, 1, 1, "x"), (1, 1, 1, 1, "y")])
+    assert cell_precision_recall(t, t)["f1"] == 1.0
+    assert grits_top(t, t)["f1"] == 1.0
+
+
+def test_all_wrong_text_grits_perfect_cells_zero():
+    gt = _mk_table([(0, 0, 1, 1, "A"), (0, 1, 1, 1, "B")])
+    pred = _mk_table([(0, 0, 1, 1, "X"), (0, 1, 1, 1, "Y")])
+    assert grits_top(pred, gt)["f1"] == 1.0  # structure identical
+    assert cell_precision_recall(pred, gt)["f1"] == 0.0  # no shared text
+
+
+def test_missing_column_penalizes_both():
+    gt = _mk_table([(0, 0, 1, 1, "A"), (0, 1, 1, 1, "B"), (0, 2, 1, 1, "C")])
+    pred = _mk_table([(0, 0, 1, 1, "A"), (0, 1, 1, 1, "B")])
+    assert grits_top(pred, gt)["f1"] < 1.0
+    cell = cell_precision_recall(pred, gt)
+    assert cell["precision"] == 1.0  # every pred cell is right
+    assert cell["recall"] < 1.0  # missed one
+
+
+def test_extra_merge_penalizes_grits():
+    gt = _mk_table([(0, 0, 1, 1, "H1"), (0, 1, 1, 1, "H2")], cols=2, rows=1)
+    pred = _mk_table([(0, 0, 1, 2, "H1 H2")], cols=2, rows=1)  # merged header
+    # Topological signatures differ — pred has one wide cell, gt has two narrow cells
+    assert grits_top(pred, gt)["f1"] == 0.0
+
+
+def test_table_detection_f1_counts_extra_tables():
+    t = _mk_table([(0, 0, 1, 1, "A")])
+    out = score_document([t, t], [t])  # predicted 2, gt 1
+    assert out["table_detection_precision"] == 0.5
+    assert out["table_detection_recall"] == 1.0
+
+
+def test_table_detection_f1_missed_table():
+    t = _mk_table([(0, 0, 1, 1, "A")])
+    out = score_document([], [t])
+    assert out["table_detection_recall"] == 0.0
+    assert out["n_matched"] == 0
+
+
+def test_match_tables_respects_page():
+    t_p1 = _mk_table([(0, 0, 1, 1, "A")], page=1)
+    t_p2 = _mk_table([(0, 0, 1, 1, "A")], page=2)
+    m = match_tables([t_p1], [t_p2])
+    assert m["matches"] == []
+    assert m["unmatched_pred"] == [0]
+    assert m["unmatched_gt"] == [0]
+
+
+def test_match_tables_low_iou_unmatched():
+    t_far = _mk_table([(0, 0, 1, 1, "A")], bbox=(0, 0, 10, 10))
+    t_near = _mk_table([(0, 0, 1, 1, "A")], bbox=(0, 0, 100, 100))
+    # The small bbox is fully inside the big one, but the IoU (100 / 10000 = 0.01) is well below threshold.
+    m = match_tables([t_far], [t_near])
+    assert m["matches"] == []

From 5eca9cbe463fa4b975d59b6f2e3d6a140d4ef224 Mon Sep 17 00:00:00 2001
From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com>
Date: Fri, 17 Apr 2026 00:54:55 +0530
Subject: [PATCH 06/12] feat(tests): add table extraction accuracy harness with
 baseline gate

New pytest suite under -m accuracy runs over every .gt.json fixture,
converts paperjam.Table outputs into the harness shape, and scores
against ground truth via GriTS-Top + cell-level P/R.

- conftest.py session-scoped accuracy_report fixture accumulates
  per-fixture scores and writes tests/output/table_accuracy.json on
  session teardown.
- --update-baseline CLI option rewrites tests/fixtures/tables/.accuracy_baseline.json
  after an intentional improvement.
- test_baseline_regression fails if aggregate table-detection F1,
  cell F1, or GriTS-Top F1 drops by more than 0.01 (absolute) from
  the committed baseline.
- tests/output/ is excluded from version control.

Initial committed baseline (on current heuristics):
  table_detection_f1   = 1.0000
  avg_cell_f1          = 0.9923
  avg_grits_top_f1     = 0.9557
Phase 1 (merged-cell reconstruction) will drive the GriTS-Top number
up by fixing bordered_merged (currently 0.757).
---
 .gitignore                                    |   3 +
 tests/fixtures/tables/.accuracy_baseline.json |   8 ++
 tests/python/conftest.py                      |  43 ++++++++
 tests/python/test_table_accuracy.py           | 100 ++++++++++++++++++
 4 files changed, 154 insertions(+)
 create mode 100644 tests/fixtures/tables/.accuracy_baseline.json
 create mode 100644 tests/python/test_table_accuracy.py

diff --git a/.gitignore b/.gitignore
index a18592b..3a1d21a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,6 +36,9 @@ python/paperjam/libpdfium.so
 # Test fixtures (generated)
 tests/fixtures/large_*.pdf
 
+# Per-session test artifacts (accuracy reports, etc.)
+tests/output/
+
 # Sphinx
 _build
 
diff --git a/tests/fixtures/tables/.accuracy_baseline.json b/tests/fixtures/tables/.accuracy_baseline.json
new file mode 100644
index 0000000..b1223da
--- /dev/null
+++ b/tests/fixtures/tables/.accuracy_baseline.json
@@ -0,0 +1,8 @@
+{
+  "aggregate": {
+    "table_detection_f1": 1.0,
+    "avg_cell_f1": 0.9923469387755102,
+    "avg_grits_top_f1": 0.9557057057057057,
+    "n_fixtures": 8
+  }
+}
diff --git a/tests/python/conftest.py b/tests/python/conftest.py
index a8e87b4..526b779 100644
--- a/tests/python/conftest.py
+++ b/tests/python/conftest.py
@@ -1,8 +1,21 @@
+import json
 import pathlib
 
 import pytest
 
 FIXTURES = pathlib.Path(__file__).resolve().parent.parent / "fixtures"
+TABLES_FIXTURES = FIXTURES / "tables"
+OUTPUT_DIR = pathlib.Path(__file__).resolve().parent.parent / "output"
+BASELINE_PATH = TABLES_FIXTURES / ".accuracy_baseline.json"
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--update-baseline",
+        action="store_true",
+        default=False,
+        help="Write the current accuracy scores to the baseline file (skips the regression gate).",
+    )
 
 
 @pytest.fixture
@@ -28,3 +41,33 @@ def table_bordered_pdf():
 @pytest.fixture
 def metadata_pdf():
     return str(FIXTURES / "with_metadata.pdf")
+
+
+@pytest.fixture(scope="session")
+def accuracy_report(request):
+    """Session-scoped accumulator: per-fixture -> score dict; written to tests/output at teardown."""
+    report: dict = {}
+
+    def _finalize():
+        if not report:
+            return
+        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+        aggregate = _aggregate_scores(report)
+        payload = {"fixtures": report, "aggregate": aggregate}
+        (OUTPUT_DIR / "table_accuracy.json").write_text(json.dumps(payload, indent=2) + "\n")
+        if request.config.getoption("--update-baseline"):
+            BASELINE_PATH.write_text(json.dumps({"aggregate": aggregate}, indent=2) + "\n")
+
+    request.addfinalizer(_finalize)
+    return report
+
+
+def _aggregate_scores(report: dict) -> dict:
+    fields = ("table_detection_f1", "avg_cell_f1", "avg_grits_top_f1")
+    totals = {f: 0.0 for f in fields}
+    n = 0
+    for scores in report.values():
+        for f in fields:
+            totals[f] += float(scores.get(f, 0.0))
+        n += 1
+    return {f: (totals[f] / n if n else 0.0) for f in fields} | {"n_fixtures": n}
diff --git a/tests/python/test_table_accuracy.py b/tests/python/test_table_accuracy.py
new file mode 100644
index 0000000..6298414
--- /dev/null
+++ b/tests/python/test_table_accuracy.py
@@ -0,0 +1,100 @@
+"""Table extraction accuracy harness.
+
+Parametrized over every ``.gt.json`` fixture under ``tests/fixtures/tables/``. For each
+fixture, opens the paired PDF, runs ``extract_tables`` on every page, converts the result
+into the harness shape, and scores it against the ground truth.
+
+A final ``test_baseline_regression`` compares the aggregate to the committed baseline
+and fails on an absolute drop of more than 0.01 in table-detection F1, cell F1, or GriTS-Top F1. Pass
+``--update-baseline`` to refresh the committed numbers after an intentional improvement.
+
+Run: ``uv run pytest tests/python/ -m accuracy -v``
+"""
+
+from __future__ import annotations
+
+import json
+import pathlib
+
+import paperjam
+import pytest
+from grits import predicted_tables_to_gt_shape, score_document
+
+FIXTURES_DIR = pathlib.Path(__file__).resolve().parent.parent / "fixtures" / "tables"
+BASELINE_PATH = FIXTURES_DIR / ".accuracy_baseline.json"
+REGRESSION_THRESHOLD = 0.01  # allow 1% noise; anything worse fails CI
+
+_GT_FILES = sorted(FIXTURES_DIR.glob("*.gt.json"))
+
+
+def _load_pred(pdf_path: pathlib.Path) -> list[dict]:
+    doc = paperjam.open_pdf(str(pdf_path))
+    out: list[dict] = []
+    for page_idx, page in enumerate(doc.pages, start=1):
+        tables = page.extract_tables()
+        out.extend(predicted_tables_to_gt_shape(tables, page=page_idx))
+    return out
+
+
+@pytest.mark.accuracy
+@pytest.mark.parametrize("gt_path", _GT_FILES, ids=lambda p: p.stem)
+def test_extraction_accuracy(gt_path: pathlib.Path, accuracy_report):
+    pdf_path = gt_path.with_name(gt_path.name.removesuffix(".gt.json") + ".pdf")
+    assert pdf_path.exists(), f"fixture PDF missing for {gt_path.name}"
+    with open(gt_path) as f:
+        gt_doc = json.load(f)
+    gt_tables = gt_doc["tables"]
+    pred_tables = _load_pred(pdf_path)
+    scores = score_document(pred_tables, gt_tables)
+    # Slim the per_table dump before storing so the report stays readable.
+    accuracy_report[gt_path.stem] = {
+        "table_detection_precision": scores["table_detection_precision"],
+        "table_detection_recall": scores["table_detection_recall"],
+        "table_detection_f1": scores["table_detection_f1"],
+        "avg_cell_f1": scores["avg_cell_f1"],
+        "avg_grits_top_f1": scores["avg_grits_top_f1"],
+        "n_matched": scores["n_matched"],
+        "n_pred": scores["n_pred"],
+        "n_gt": scores["n_gt"],
+    }
+
+
+@pytest.mark.accuracy
+def test_baseline_regression(accuracy_report, request):
+    """Fails if aggregate scores have dropped more than REGRESSION_THRESHOLD vs the committed baseline.
+
+    Skipped when --update-baseline is passed, since we're refreshing the numbers in that run.
+    Also skipped when no baseline exists yet (first ever run; commit the baseline this run produces).
+    """
+    if request.config.getoption("--update-baseline"):
+        pytest.skip("--update-baseline: regression gate disabled this run")
+    if not BASELINE_PATH.exists():
+        pytest.skip(f"no baseline at {BASELINE_PATH}; run once with --update-baseline and commit it")
+
+    # The session-scoped accuracy_report may not have been populated yet when this test runs
+    # in isolation; run it last in the file so the parametrized tests above have fired.
+    if not accuracy_report:
+        pytest.skip("no accuracy data collected in this session")
+
+    baseline = json.loads(BASELINE_PATH.read_text())["aggregate"]
+    # Recompute aggregate from the session-scoped report.
+    current = _aggregate(accuracy_report)
+
+    failures: list[str] = []
+    for field in ("table_detection_f1", "avg_cell_f1", "avg_grits_top_f1"):
+        b = float(baseline.get(field, 0.0))
+        c = float(current.get(field, 0.0))
+        if c + REGRESSION_THRESHOLD < b:
+            failures.append(f"{field}: {c:.4f} < baseline {b:.4f} (drop > {REGRESSION_THRESHOLD:.0%})")
+    assert not failures, "accuracy regression vs baseline:\n  " + "\n  ".join(failures)
+
+
+def _aggregate(report: dict) -> dict:
+    fields = ("table_detection_f1", "avg_cell_f1", "avg_grits_top_f1")
+    totals = {f: 0.0 for f in fields}
+    n = 0
+    for scores in report.values():
+        for f in fields:
+            totals[f] += float(scores.get(f, 0.0))
+        n += 1
+    return {f: (totals[f] / n if n else 0.0) for f in fields}

From 0e7de122c46513d3bbc017c817d95dd47dd71f9c Mon Sep 17 00:00:00 2001
From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com>
Date: Fri, 17 Apr 2026 00:55:05 +0530
Subject: [PATCH 07/12] feat(bench): add criterion microbench for table
 extraction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Runs against the full fixture corpus under tests/fixtures/tables/.
Document parsing and page loading happen outside the measured section
so the bench only times table::extract_tables — that's the code
Phases 1–4 will change.

Local baseline on this machine: 13–107 µs per fixture. Not wired into
CI yet since cross-runner wall-clock is too noisy; gets a dedicated
perf-regression job once a stable single-runner baseline exists.

Run with: cargo bench -p paperjam-core --bench table_extraction
---
 crates/paperjam-core/Cargo.toml               |  7 ++
 .../paperjam-core/benches/table_extraction.rs | 79 +++++++++++++++++++
 2 files changed, 86 insertions(+)
 create mode 100644 crates/paperjam-core/benches/table_extraction.rs

diff --git a/crates/paperjam-core/Cargo.toml b/crates/paperjam-core/Cargo.toml
index c986c85..a3e7ce4 100644
--- a/crates/paperjam-core/Cargo.toml
+++ b/crates/paperjam-core/Cargo.toml
@@ -45,3 +45,10 @@ render = ["dep:pdfium-render", "dep:image"]
 signatures = ["dep:x509-parser", "dep:cms", "dep:der", "dep:sha1", "dep:rsa", "dep:p256", "dep:pkcs8", "dep:spki"]
 ltv = ["signatures", "dep:ureq", "dep:rustls"]
 validation = ["dep:roxmltree"]
+
+[dev-dependencies]
+criterion = { version = "0.5", features = ["html_reports"] }
+
+[[bench]]
+name = "table_extraction"
+harness = false
diff --git a/crates/paperjam-core/benches/table_extraction.rs b/crates/paperjam-core/benches/table_extraction.rs
new file mode 100644
index 0000000..c6e2f4f
--- /dev/null
+++ b/crates/paperjam-core/benches/table_extraction.rs
@@ -0,0 +1,79 @@
+//! Criterion microbench for table extraction over the fixture corpus.
+//!
+//! Run with: `cargo bench -p paperjam-core --bench table_extraction`
+//!
+//! Each synthetic PDF fixture under `tests/fixtures/tables/` becomes one benchmark
+//! in the `table_extraction` group. The document and its pages are parsed outside
+//! the measured section, so the bench only measures `table::extract_tables` itself —
+//! that's the code subsequent phases will change.
+
+use std::path::PathBuf;
+
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
+use paperjam_core::document::Document;
+use paperjam_core::table::{extract_tables, TableExtractionOptions};
+
+fn fixtures_dir() -> PathBuf {
+    let manifest = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    manifest.join("../../tests/fixtures/tables")
+}
+
+fn collect_fixtures() -> Vec<PathBuf> {
+    let mut out = Vec::new();
+    let dir = fixtures_dir();
+    let Ok(rd) = std::fs::read_dir(&dir) else {
+        eprintln!(
+            "skipping bench: fixtures dir not found at {}",
+            dir.display()
+        );
+        return out;
+    };
+    for entry in rd.flatten() {
+        let path = entry.path();
+        if path.extension().and_then(|s| s.to_str()) == Some("pdf") {
+            out.push(path);
+        }
+    }
+    out.sort();
+    out
+}
+
+fn bench_extract(c: &mut Criterion) {
+    let fixtures = collect_fixtures();
+    let opts = TableExtractionOptions::default();
+
+    let mut group = c.benchmark_group("table_extraction");
+    for path in &fixtures {
+        let name = path
+            .file_stem()
+            .and_then(|s| s.to_str())
+            .unwrap_or("?")
+            .to_string();
+        // Parse the document once and keep its pages alive for the whole bench run.
+        let doc = match Document::open(path) {
+            Ok(d) => d,
+            Err(e) => {
+                eprintln!("skipping {name}: failed to open ({e:?})");
+                continue;
+            }
+        };
+        let mut pages = Vec::new();
+        for n in 1..=doc.page_count() as u32 {
+            if let Ok(p) = doc.page(n) {
+                pages.push(p);
+            }
+        }
+        group.throughput(Throughput::Elements(pages.len() as u64));
+        group.bench_with_input(BenchmarkId::from_parameter(&name), &pages, |b, pages| {
+            b.iter(|| {
+                for page in pages.iter() {
+                    let _ = extract_tables(page, &opts);
+                }
+            });
+        });
+    }
+    group.finish();
+}
+
+criterion_group!(benches, bench_extract);
+criterion_main!(benches);

From ce098dd771c4e098601fb664ab2a59184e749a86 Mon Sep 17 00:00:00 2001
From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com>
Date: Fri, 17 Apr 2026 00:55:21 +0530
Subject: [PATCH 08/12] ci: add table extraction accuracy job
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New ubuntu-latest / Python 3.13 job runs on every PR, gated by the
existing lint job:

1. Regenerates fixtures via scripts/generate_table_fixtures.py and
   diffs against the committed copies. Catches non-deterministic
   generator changes before they pollute the corpus.
2. Runs pytest -m accuracy — the parametrized fixture tests plus the
   baseline regression gate.
3. Uploads tests/output/table_accuracy.json as an artifact so
   per-fixture deltas are visible on every PR.

The cross-platform pytest matrix already covers correctness on macOS
and Windows; one ubuntu runner is enough for accuracy scoring.
---
 .github/workflows/ci.yml | 48 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4b9b76b..826a290 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -102,3 +102,51 @@ jobs:
         with:
           name: test-results-${{ matrix.os }}-py${{ matrix.python-version }}
           path: results.xml
+
+  accuracy:
+    name: Table Extraction Accuracy
+    needs: lint
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Set up Python 3.13
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.13"
+
+      - name: Install Rust toolchain
+        uses: dtolnay/rust-toolchain@stable
+
+      - uses: Swatinem/rust-cache@v2.9.1
+        with:
+          save-if: false
+
+      - uses: astral-sh/setup-uv@v7
+        with:
+          cache-dependency-glob: pyproject.toml
+
+      - name: Install dependencies
+        run: uv sync --extra dev
+
+      - name: Build native extension
+        uses: PyO3/maturin-action@v1
+        with:
+          command: develop
+          args: --release
+
+      - name: Verify fixtures are up to date
+        run: |
+          cp -r tests/fixtures/tables /tmp/tables_snapshot
+          uv run python scripts/generate_table_fixtures.py
+          diff -r /tmp/tables_snapshot tests/fixtures/tables
+
+      - name: Run accuracy harness
+        run: uv run pytest tests/python/ -m accuracy -v --tb=short
+
+      - name: Upload accuracy report
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: accuracy-report
+          path: tests/output/table_accuracy.json

From a87609cadfc4b0b21f818bb176468f9b2c0ff34e Mon Sep 17 00:00:00 2001
From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com>
Date: Fri, 17 Apr 2026 01:02:09 +0530
Subject: [PATCH 09/12] chore(gitignore): exclude CLAUDE.md project guide

CLAUDE.md is a local developer-assistant guide, not a tracked project
artifact.
---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 3a1d21a..249bb50 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,7 @@ dist/
 build/
 *.so
 *.dylib
+CLAUDE.md
 *.dll
 .venv/
 venv/

From ac553a27e10a950da6d7c727e453a77de771aa70 Mon Sep 17 00:00:00 2001
From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com>
Date: Fri, 17 Apr 2026 01:02:14 +0530
Subject: [PATCH 10/12] chore(pre-commit): add commit-msg hook to strip AI
 attribution
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New local hook (stages: [commit-msg]) that removes any Co-Authored-By
lines from commit messages before the commit is created.

The hook script lives at .claude/hooks/strip-ai-attribution.sh which
is gitignored, so contributors who want the hook active must provide
their own copy locally. In standard pre-commit runs (and CI) this
hook is inert — it only fires on the commit-msg stage, which requires
'pre-commit install --hook-type commit-msg' to be enabled.
---
 .pre-commit-config.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 457a2df..d9c0aed 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -47,3 +47,11 @@ repos:
         language: system
         files: ^docs-site/src/
         pass_filenames: false
+
+  - repo: local
+    hooks:
+      - id: strip-ai-attribution
+        name: strip AI attribution from commit messages
+        entry: .claude/hooks/strip-ai-attribution.sh
+        language: script
+        stages: [commit-msg]

From 4cf994e520e4899bc99b683ad94fdd2597d4b35f Mon Sep 17 00:00:00 2001
From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com>
Date: Fri, 17 Apr 2026 01:25:08 +0530
Subject: [PATCH 11/12] fix(clippy): resolve rust 1.95 warnings on pre-existing
 code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rust 1.95 tightens collapsible_match and unnecessary_sort_by. These
errors were pre-existing on main but surface here because the CI
toolchain just bumped to 1.95; leaving them unfixed would keep the
accuracy CI job (and rust-test) red on every PR.

- paperjam-epub/toc.rs, parser.rs: fold single-branch if inside
  match arms into if-guards on the arm itself. No behavior change —
  subsequent arms are distinct Event variants or a _ catch-all.
- paperjam-pptx/parser.rs: same treatment for Event::Empty and
  Event::Text arms.
- paperjam-core/forms/mod.rs: merge the None / Some((true, _)) /
  Some((false, _)) with inner if-else into a guarded arm plus a
  catch-all. All three original 'not found' paths now share one
  catch-all body; semantics are identical.
- paperjam-core/manipulation/insert.rs: sort_by(|a, b| b.0.cmp(&a.0))
  → sort_by_key(|p| Reverse(p.0)).
---
 crates/paperjam-core/src/forms/mod.rs         | 18 ++++-------
 .../paperjam-core/src/manipulation/insert.rs  |  2 +-
 crates/paperjam-epub/src/parser.rs            | 30 +++++++++----------
 crates/paperjam-epub/src/toc.rs               |  6 ++--
 crates/paperjam-pptx/src/parser.rs            | 14 ++++-----
 5 files changed, 27 insertions(+), 43 deletions(-)

diff --git a/crates/paperjam-core/src/forms/mod.rs b/crates/paperjam-core/src/forms/mod.rs
index f2b6c33..6875ac0 100644
--- a/crates/paperjam-core/src/forms/mod.rs
+++ b/crates/paperjam-core/src/forms/mod.rs
@@ -93,22 +93,14 @@ pub fn fill_form_fields(
 
     for (name, value) in values {
         match field_name_map.get(name) {
-            None => {
-                not_found.push(name.clone());
+            Some((false, field_type)) if set_field_value(&mut inner, name, value, field_type)? => {
+                filled += 1;
+                filled_fields.push((name.clone(), value.clone(), field_type.clone()));
             }
-            Some((true, _)) => {
-                // Read-only field, skip
+            // Not in the map, read-only, or set_field_value returned false: record as not-found.
+            _ => {
                 not_found.push(name.clone());
             }
-            Some((false, field_type)) => {
-                // Find and update the field object
-                if set_field_value(&mut inner, name, value, field_type)? {
-                    filled += 1;
-                    filled_fields.push((name.clone(), value.clone(), field_type.clone()));
-                } else {
-                    not_found.push(name.clone());
-                }
-            }
         }
     }
 
diff --git a/crates/paperjam-core/src/manipulation/insert.rs b/crates/paperjam-core/src/manipulation/insert.rs
index fbcd5da..6007705 100644
--- a/crates/paperjam-core/src/manipulation/insert.rs
+++ b/crates/paperjam-core/src/manipulation/insert.rs
@@ -40,7 +40,7 @@ pub fn insert_blank_pages(doc: &Document, positions: &[(u32, f64, f64)]) -> Resu
 
     // Sort positions in reverse order so insertions don't shift indices
     let mut sorted_positions = positions.to_vec();
-    sorted_positions.sort_by(|a, b| b.0.cmp(&a.0));
+    sorted_positions.sort_by_key(|p| std::cmp::Reverse(p.0));
 
     for (after_page, width, height) in sorted_positions {
         // Create an empty content stream
diff --git a/crates/paperjam-epub/src/parser.rs b/crates/paperjam-epub/src/parser.rs
index a452b45..0c621de 100644
--- a/crates/paperjam-epub/src/parser.rs
+++ b/crates/paperjam-epub/src/parser.rs
@@ -218,22 +218,20 @@ fn parse_opf(xml: &str) -> Result<(OpfMetadata, HashMap<String, String>, Vec<Str
                     }
                 }
             }
-            Ok(Event::Text(ref e)) => {
-                if section == Section::Metadata {
-                    let text = e.unescape().unwrap_or_default().trim().to_string();
-                    if !text.is_empty() {
-                        match current_tag.as_str() {
-                            "title" => metadata.title = Some(text),
-                            "creator" => metadata.creator = Some(text),
-                            "subject" => metadata.subject = Some(text),
-                            "description" => metadata.description = Some(text),
-                            "publisher" => metadata.publisher = Some(text),
-                            "date" => metadata.date = Some(text),
-                            "language" => metadata.language = Some(text),
-                            "identifier" => metadata.identifier = Some(text),
-                            "rights" => metadata.rights = Some(text),
-                            _ => {}
-                        }
+            Ok(Event::Text(ref e)) if section == Section::Metadata => {
+                let text = e.unescape().unwrap_or_default().trim().to_string();
+                if !text.is_empty() {
+                    match current_tag.as_str() {
+                        "title" => metadata.title = Some(text),
+                        "creator" => metadata.creator = Some(text),
+                        "subject" => metadata.subject = Some(text),
+                        "description" => metadata.description = Some(text),
+                        "publisher" => metadata.publisher = Some(text),
+                        "date" => metadata.date = Some(text),
+                        "language" => metadata.language = Some(text),
+                        "identifier" => metadata.identifier = Some(text),
+                        "rights" => metadata.rights = Some(text),
+                        _ => {}
                     }
                 }
             }
diff --git a/crates/paperjam-epub/src/toc.rs b/crates/paperjam-epub/src/toc.rs
index 3f4a5a7..3eb5d01 100644
--- a/crates/paperjam-epub/src/toc.rs
+++ b/crates/paperjam-epub/src/toc.rs
@@ -76,10 +76,8 @@ fn parse_nav_point(reader: &mut Reader<&[u8]>) -> Option<TocEntry> {
                     }
                 }
             }
-            Ok(Event::Text(ref e)) => {
-                if in_text {
-                    title = e.unescape().unwrap_or_default().trim().to_string();
-                }
+            Ok(Event::Text(ref e)) if in_text => {
+                title = e.unescape().unwrap_or_default().trim().to_string();
             }
             Ok(Event::End(ref e)) => {
                 let local = local_name(e.name().as_ref());
diff --git a/crates/paperjam-pptx/src/parser.rs b/crates/paperjam-pptx/src/parser.rs
index 0f6e989..4f6d73f 100644
--- a/crates/paperjam-pptx/src/parser.rs
+++ b/crates/paperjam-pptx/src/parser.rs
@@ -254,11 +254,9 @@ fn parse_slide(xml: &str, index: usize, notes_xml: Option<&str>) -> Result<Slide
                     _ => {}
                 }
             }
-            Ok(Event::Empty(ref e)) => {
-                if parser.in_shape {
-                    let local = local_name_owned(e.name().as_ref());
-                    handle_shape_empty(&mut parser, e, &local)?;
-                }
+            Ok(Event::Empty(ref e)) if parser.in_shape => {
+                let local = local_name_owned(e.name().as_ref());
+                handle_shape_empty(&mut parser, e, &local)?;
             }
             Ok(Event::Text(ref e)) => {
                 if parser.in_table_cell {
@@ -436,10 +434,8 @@ fn parse_notes_text(xml: &str) -> Result<String> {
                     text.push('\n');
                 }
             }
-            Ok(Event::Text(ref e)) => {
-                if in_text_body {
-                    text.push_str(&e.unescape().unwrap_or_default());
-                }
+            Ok(Event::Text(ref e)) if in_text_body => {
+                text.push_str(&e.unescape().unwrap_or_default());
             }
             Ok(Event::Eof) => break,
             Err(e) => return Err(PptxError::Xml(format!("notes XML error: {e}"))),

From 8251eb0cf648f393361464d5879842f9b7153aaf Mon Sep 17 00:00:00 2001
From: Pratyush Sharma <56130065+pratyush618@users.noreply.github.com>
Date: Fri, 17 Apr 2026 01:41:49 +0530
Subject: [PATCH 12/12] fix(ci): declare binary fixture types in .gitattributes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Windows CI (test (windows-latest, 3.x)) fails with 'Invalid file
trailer' on every PDF under tests/fixtures/tables/. Those fixtures are
generated with pageCompression=0 (required for byte-deterministic
output), and the uncompressed ASCII streams fool git's binary-vs-text
heuristic. With core.autocrlf=true, git therefore rewrites LF to CRLF
on Windows checkout, which invalidates the byte offsets recorded in the
xref table and trailer and breaks PDF parsing.

Add an explicit .gitattributes declaring PDFs and other universally
binary formats (fonts, images, wasm) as binary so no content-based
heuristic applies. Ubuntu/macOS CI and local runs are unaffected.
---
 .gitattributes | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 .gitattributes

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..2700b07
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,14 @@
+# Enforce byte-exact checkout for binary fixtures.
+# Without this, git's autodetection heuristic can misclassify uncompressed
+# PDFs (pageCompression=0) as text on Windows (core.autocrlf=true), which
+# rewrites LF to CRLF inside the PDF and corrupts xref/trailer offsets.
+*.pdf      binary
+*.wasm     binary
+*.ttf      binary
+*.otf      binary
+*.woff     binary
+*.woff2    binary
+*.ico      binary
+*.png      binary
+*.jpg      binary
+*.jpeg     binary