Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Enforce byte-exact checkout for binary fixtures.
# Without this, git's autodetection heuristic can misclassify uncompressed
# PDFs (pageCompression=0) as text on Windows (core.autocrlf=true), which
# rewrites LF to CRLF inside the PDF and corrupts xref/trailer offsets.
*.pdf binary
*.wasm binary
*.ttf binary
*.otf binary
*.woff binary
*.woff2 binary
*.ico binary
*.png binary
*.jpg binary
*.jpeg binary
48 changes: 48 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,51 @@ jobs:
with:
name: test-results-${{ matrix.os }}-py${{ matrix.python-version }}
path: results.xml

accuracy:
name: Table Extraction Accuracy
needs: lint
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6

- name: Set up Python 3.13
uses: actions/setup-python@v6
with:
python-version: "3.13"

- name: Install Rust toolchain
uses: dtolnay/rust-toolchain@stable

- uses: Swatinem/rust-cache@v2.9.1
with:
save-if: false

- uses: astral-sh/setup-uv@v7
with:
cache-dependency-glob: pyproject.toml

- name: Install dependencies
run: uv sync --extra dev

- name: Build native extension
uses: PyO3/maturin-action@v1
with:
command: develop
args: --release

- name: Verify fixtures are up to date
run: |
cp -r tests/fixtures/tables /tmp/tables_snapshot
uv run python scripts/generate_table_fixtures.py
diff -r /tmp/tables_snapshot tests/fixtures/tables

- name: Run accuracy harness
run: uv run pytest tests/python/ -m accuracy -v --tb=short

- name: Upload accuracy report
if: always()
uses: actions/upload-artifact@v7
with:
name: accuracy-report
path: tests/output/table_accuracy.json
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ dist/
build/
*.so
*.dylib
CLAUDE.md
*.dll
.venv/
venv/
Expand All @@ -36,6 +37,9 @@ python/paperjam/libpdfium.so
# Test fixtures (generated)
tests/fixtures/large_*.pdf

# Per-session test artifacts (accuracy reports, etc.)
tests/output/

# Sphinx
_build

Expand Down
8 changes: 8 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,11 @@ repos:
language: system
files: ^docs-site/src/
pass_filenames: false

- repo: local
hooks:
- id: strip-ai-attribution
name: strip AI attribution from commit messages
entry: .claude/hooks/strip-ai-attribution.sh
language: script
stages: [commit-msg]
7 changes: 7 additions & 0 deletions crates/paperjam-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,10 @@ render = ["dep:pdfium-render", "dep:image"]
signatures = ["dep:x509-parser", "dep:cms", "dep:der", "dep:sha1", "dep:rsa", "dep:p256", "dep:pkcs8", "dep:spki"]
ltv = ["signatures", "dep:ureq", "dep:rustls"]
validation = ["dep:roxmltree"]

[dev-dependencies]
criterion = { version = "0.5", features = ["html_reports"] }

[[bench]]
name = "table_extraction"
harness = false
79 changes: 79 additions & 0 deletions crates/paperjam-core/benches/table_extraction.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
//! Criterion microbench for table extraction over the fixture corpus.
//!
//! Run with: `cargo bench -p paperjam-core --bench table_extraction`
//!
//! Each synthetic PDF fixture under `tests/fixtures/tables/` becomes one benchmark
//! inside the single `table_extraction` group.
//! The document and page are parsed outside the measured section so the bench only
//! measures `table::extract_tables` itself — that's the code subsequent phases will
//! change.

use std::path::PathBuf;

use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use paperjam_core::document::Document;
use paperjam_core::table::{extract_tables, TableExtractionOptions};

/// Returns the path of the table fixture directory used by this bench.
///
/// The path is built from the compile-time `CARGO_MANIFEST_DIR` value with
/// `../../tests/fixtures/tables` appended. It is neither canonicalized nor
/// checked for existence here.
fn fixtures_dir() -> PathBuf {
    let mut dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    dir.push("../../tests/fixtures/tables");
    dir
}

/// Lists every `*.pdf` file directly inside the fixtures directory, sorted by path.
///
/// If the directory cannot be read (for any reason), a notice is printed to
/// stderr and an empty list is returned, so the bench run has nothing to
/// measure instead of failing. Directory entries that error while being read
/// are ignored. The extension comparison is an exact match against `"pdf"`.
fn collect_fixtures() -> Vec<PathBuf> {
    let dir = fixtures_dir();
    let entries = match std::fs::read_dir(&dir) {
        Ok(entries) => entries,
        Err(_) => {
            eprintln!(
                "skipping bench: fixtures dir not found at {}",
                dir.display()
            );
            return Vec::new();
        }
    };

    let mut pdfs: Vec<PathBuf> = entries
        .flatten()
        .map(|entry| entry.path())
        .filter(|candidate| candidate.extension().and_then(|ext| ext.to_str()) == Some("pdf"))
        .collect();
    // Sort so benchmarks are registered in a stable order across runs.
    pdfs.sort();
    pdfs
}

/// Registers one `extract_tables` benchmark per fixture PDF in the
/// `table_extraction` group.
///
/// For each fixture the document is opened and its pages are loaded *before*
/// the measured closure, so only the `extract_tables` calls are timed.
/// Throughput is reported in elements, where one element is one loaded page.
///
/// Fixtures that fail to open are reported on stderr and skipped. Individual
/// pages that fail to load are skipped without a message.
fn bench_extract(c: &mut Criterion) {
    let fixtures = collect_fixtures();
    let opts = TableExtractionOptions::default();

    let mut group = c.benchmark_group("table_extraction");
    for path in &fixtures {
        // The file stem doubles as the benchmark parameter name; "?" is the
        // fallback when the stem is missing or not valid UTF-8.
        let name = path
            .file_stem()
            .and_then(|s| s.to_str())
            .unwrap_or("?")
            .to_string();
        // Parse the document once and keep its pages alive for the whole bench run.
        let doc = match Document::open(path) {
            Ok(d) => d,
            Err(e) => {
                eprintln!("skipping {name}: failed to open ({e:?})");
                continue;
            }
        };
        // Pages are requested by 1-based number.
        let mut pages = Vec::new();
        for n in 1..=doc.page_count() as u32 {
            if let Ok(p) = doc.page(n) {
                pages.push(p);
            }
        }
        group.throughput(Throughput::Elements(pages.len() as u64));
        group.bench_with_input(BenchmarkId::from_parameter(&name), &pages, |b, pages| {
            b.iter(|| {
                for page in pages.iter() {
                    // Route the result through `black_box` so the optimizer must
                    // treat it as used and cannot discard the extraction work
                    // this benchmark is supposed to measure.
                    let _ = std::hint::black_box(extract_tables(page, &opts));
                }
            });
        });
    }
    group.finish();
}

criterion_group!(benches, bench_extract);
criterion_main!(benches);
18 changes: 5 additions & 13 deletions crates/paperjam-core/src/forms/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,22 +93,14 @@ pub fn fill_form_fields(

for (name, value) in values {
match field_name_map.get(name) {
None => {
not_found.push(name.clone());
Some((false, field_type)) if set_field_value(&mut inner, name, value, field_type)? => {
filled += 1;
filled_fields.push((name.clone(), value.clone(), field_type.clone()));
}
Some((true, _)) => {
// Read-only field, skip
// Not in the map, read-only, or set_field_value returned false: record as not-found.
_ => {
not_found.push(name.clone());
}
Some((false, field_type)) => {
// Find and update the field object
if set_field_value(&mut inner, name, value, field_type)? {
filled += 1;
filled_fields.push((name.clone(), value.clone(), field_type.clone()));
} else {
not_found.push(name.clone());
}
}
}
}

Expand Down
2 changes: 1 addition & 1 deletion crates/paperjam-core/src/manipulation/insert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ pub fn insert_blank_pages(doc: &Document, positions: &[(u32, f64, f64)]) -> Resu

// Sort positions in reverse order so insertions don't shift indices
let mut sorted_positions = positions.to_vec();
sorted_positions.sort_by(|a, b| b.0.cmp(&a.0));
sorted_positions.sort_by_key(|p| std::cmp::Reverse(p.0));

for (after_page, width, height) in sorted_positions {
// Create an empty content stream
Expand Down
30 changes: 14 additions & 16 deletions crates/paperjam-epub/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -218,22 +218,20 @@ fn parse_opf(xml: &str) -> Result<(OpfMetadata, HashMap<String, String>, Vec<Str
}
}
}
Ok(Event::Text(ref e)) => {
if section == Section::Metadata {
let text = e.unescape().unwrap_or_default().trim().to_string();
if !text.is_empty() {
match current_tag.as_str() {
"title" => metadata.title = Some(text),
"creator" => metadata.creator = Some(text),
"subject" => metadata.subject = Some(text),
"description" => metadata.description = Some(text),
"publisher" => metadata.publisher = Some(text),
"date" => metadata.date = Some(text),
"language" => metadata.language = Some(text),
"identifier" => metadata.identifier = Some(text),
"rights" => metadata.rights = Some(text),
_ => {}
}
Ok(Event::Text(ref e)) if section == Section::Metadata => {
let text = e.unescape().unwrap_or_default().trim().to_string();
if !text.is_empty() {
match current_tag.as_str() {
"title" => metadata.title = Some(text),
"creator" => metadata.creator = Some(text),
"subject" => metadata.subject = Some(text),
"description" => metadata.description = Some(text),
"publisher" => metadata.publisher = Some(text),
"date" => metadata.date = Some(text),
"language" => metadata.language = Some(text),
"identifier" => metadata.identifier = Some(text),
"rights" => metadata.rights = Some(text),
_ => {}
}
}
}
Expand Down
6 changes: 2 additions & 4 deletions crates/paperjam-epub/src/toc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,8 @@ fn parse_nav_point(reader: &mut Reader<&[u8]>) -> Option<TocEntry> {
}
}
}
Ok(Event::Text(ref e)) => {
if in_text {
title = e.unescape().unwrap_or_default().trim().to_string();
}
Ok(Event::Text(ref e)) if in_text => {
title = e.unescape().unwrap_or_default().trim().to_string();
}
Ok(Event::End(ref e)) => {
let local = local_name(e.name().as_ref());
Expand Down
14 changes: 5 additions & 9 deletions crates/paperjam-pptx/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -254,11 +254,9 @@ fn parse_slide(xml: &str, index: usize, notes_xml: Option<&str>) -> Result<Slide
_ => {}
}
}
Ok(Event::Empty(ref e)) => {
if parser.in_shape {
let local = local_name_owned(e.name().as_ref());
handle_shape_empty(&mut parser, e, &local)?;
}
Ok(Event::Empty(ref e)) if parser.in_shape => {
let local = local_name_owned(e.name().as_ref());
handle_shape_empty(&mut parser, e, &local)?;
}
Ok(Event::Text(ref e)) => {
if parser.in_table_cell {
Expand Down Expand Up @@ -436,10 +434,8 @@ fn parse_notes_text(xml: &str) -> Result<String> {
text.push('\n');
}
}
Ok(Event::Text(ref e)) => {
if in_text_body {
text.push_str(&e.unescape().unwrap_or_default());
}
Ok(Event::Text(ref e)) if in_text_body => {
text.push_str(&e.unescape().unwrap_or_default());
}
Ok(Event::Eof) => break,
Err(e) => return Err(PptxError::Xml(format!("notes XML error: {e}"))),
Expand Down
9 changes: 0 additions & 9 deletions docs-site/src/pages/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -137,15 +137,6 @@ export default function Home(): React.JSX.Element {
borderRadius: '16px',
}}
/>
<h1
style={{
fontSize: '2.5rem',
fontWeight: 800,
marginBottom: '0.5rem',
}}
>
paperjam
</h1>
<p
style={{
fontSize: '1.25rem',
Expand Down
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ dev = [
"pdfplumber>=0.10",
"mypy>=1.8",
"ruff>=0.3",
"reportlab>=4.0,<4.2",
"pandas>=2.0",
]

[tool.maturin]
Expand All @@ -56,6 +58,9 @@ include = [

[tool.pytest.ini_options]
testpaths = ["tests/python"]
markers = [
"accuracy: table extraction accuracy harness (run with -m accuracy)",
]

[tool.ruff]
target-version = "py312"
Expand Down
Loading
Loading