diff --git a/README.md b/README.md index fb9b3aa..784ee98 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ If you work with academic papers, you know that the OCR process itself is not th I used to rely on [`marker`](https://github.com/datalab-to/marker) for PDF parsing and thought it was great. However, after converting the [Batista et al. (2022)](https://hess.copernicus.org/articles/26/3753/2022/) article one day, I discovered that Table 4 was missing, regardless of the settings or LLMs I used (via the `--use-llm` flag). I then switched to [`docling`](https://github.com/docling-project/docling), and Table 4 reappeared, but all the formulas were gone. Furthermore, both tools require a GPU, and even on a Google Colab T4 instance, processing one article takes 4 to 5 minutes. -Therefore, this project was created because, while [`docling`](https://github.com/docling-project/docling) and [`marker`](https://github.com/datalab-to/marker) are both good tools, they can sometimes miss tables or mix up table structures in ways that require manual correction. I wanted a simple, reliable process that produces a single Markdown file I can trust, a local `figures/` folder, and the ability to process my entire library quickly on my laptop. +Therefore, this project was created because, while [`docling`](https://github.com/docling-project/docling) and [`marker`](https://github.com/datalab-to/marker) are both good tools, they can sometimes miss tables or mix up table structures in ways that require manual correction. I wanted a simple, reliable process that produces a Markdown index file I can trust, local `figures/` and optional `tables/` folders, and the ability to process my entire library quickly on my laptop. ## Features @@ -71,6 +71,7 @@ paperdown converts one PDF or a directory of PDFs into markdown output folders. For each PDF, it creates: - //index.md - //figures/ +- //tables/ (when `--normalize-tables` is enabled) - //log.jsonl API key lookup order: @@ -87,7 +88,8 @@ Options: --max-download-bytes Maximum allowed size (bytes) for each downloaded figure file. [default: 20971520] --workers Maximum number of PDFs processed concurrently in batch mode. [default: 32] -v, --verbose Enable verbose progress messages on stderr. - --overwrite Replace existing managed output artifacts (index.md and figures/). + --overwrite Replace existing managed output artifacts (index.md, figures/, and tables/ when enabled). + --normalize-tables Normalize OCR HTML tables into Markdown and store raw HTML under tables/. -h, --help Print help (see a summary with '-h') -V, --version Print version ``` diff --git a/src/cli.rs b/src/cli.rs index 47e0f77..9872e53 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -10,6 +10,7 @@ use std::path::PathBuf; For each PDF, it creates:\n\ - //index.md\n\ - //figures/\n\ +- //tables/ (when --normalize-tables is enabled)\n\ - //log.jsonl\n\n\ API key lookup order:\n\ 1) ZAI_API_KEY from --env-file\n\ @@ -17,9 +18,11 @@ API key lookup order:\n\ after_help = "Examples:\n \ paperdown --input pdf/paper.pdf\n \ paperdown --input pdf/ --output md/ --workers 4\n \ -paperdown --input pdf/ --output md/ --overwrite\n\n\ +paperdown --input pdf/ --output md/ --overwrite\n \ +paperdown --input pdf/ --output md/ --normalize-tables\n\n\ Notes:\n \ Without --overwrite, existing index.md or figures/ causes a failure.\n \ +When --normalize-tables is enabled, existing tables/ also causes a failure.\n \ Progress bars are shown on stderr only when running in a TTY." )] pub struct Cli { @@ -83,6 +86,13 @@ pub struct Cli { help = "Replace existing managed output artifacts (index.md and figures/)." )] pub overwrite: bool, + + #[arg( + long = "normalize-tables", + action = ArgAction::SetTrue, + help = "Normalize OCR HTML tables into Markdown and store raw HTML under tables/." + )] + pub normalize_tables: bool, } pub fn default_workers() -> usize { @@ -124,6 +134,7 @@ mod tests { assert_eq!(cli.workers, default_workers()); assert!(!cli.verbose); assert!(!cli.overwrite); + assert!(!cli.normalize_tables); } #[test] @@ -148,6 +159,7 @@ mod tests { let help = cmd.render_long_help().to_string(); assert!(help.contains("Examples:")); assert!(help.contains("--overwrite")); + assert!(help.contains("--normalize-tables")); let file_first = help.find("1) ZAI_API_KEY from --env-file"); let env_second = help.find("2) ZAI_API_KEY from environment"); assert!(file_first.is_some()); diff --git a/src/core.rs b/src/core.rs index be90d5d..4ca44b9 100644 --- a/src/core.rs +++ b/src/core.rs @@ -12,6 +12,7 @@ mod input; mod markdown; mod ocr; mod output; +mod table_normalization; pub fn collect_pdfs(input_path: &Path) -> Result> { input::collect_pdfs(input_path) @@ -29,6 +30,15 @@ pub enum ProgressEvent { pub type ProgressCallback = Arc; +#[derive(Clone)] +pub struct ProcessPdfOptions { + pub timeout: Duration, + pub max_download_bytes: u64, + pub overwrite: bool, + pub normalize_tables: bool, + pub progress: Option, +} + #[derive(Debug, Serialize, Clone)] pub struct PdfSummary { pub pdf: String, @@ -45,10 +55,7 @@ pub async fn process_pdf( pdf_path: &Path, output_root: &Path, env_file: &Path, - timeout: Duration, - max_download_bytes: u64, - overwrite: bool, - progress: Option, + options: ProcessPdfOptions, ) -> Result { let run_started = Instant::now(); let pdf_path = pdf_path @@ -57,16 +64,23 @@ pub async fn process_pdf( if !pdf_path.is_file() || !input::is_pdf_path(&pdf_path) { return Err(anyhow!("Input must be a PDF: {}", pdf_path.display())); } - let prepared = output::prepare_output_paths(output_root, &pdf_path, overwrite)?; - let client = reqwest::Client::builder().timeout(timeout).build()?; + let prepared = output::prepare_output_paths( + output_root, + &pdf_path, + options.overwrite, + options.normalize_tables, + )?; + let client = reqwest::Client::builder() + .timeout(options.timeout) + .build()?; let api_key = input::load_api_key(env_file)?; let payload = ocr::build_payload(&pdf_path).await?; - fire(&progress, ProgressEvent::OcrStarted); + fire(&options.progress, ProgressEvent::OcrStarted); let ocr_started = Instant::now(); let response = ocr::call_layout_parsing(&client, &api_key, payload).await?; let ocr_seconds = ocr_started.elapsed(); - fire(&progress, ProgressEvent::OcrFinished); + fire(&options.progress, ProgressEvent::OcrFinished); let (markdown, layout_details, usage) = ocr::validate_layout_response(response)?; @@ -77,22 +91,31 @@ pub async fn process_pdf( &layout_details, &client, &prepared.figures_dir, - max_download_bytes, - progress.clone(), + options.max_download_bytes, + options.progress.clone(), ) .await?; let figure_seconds = figure_started.elapsed(); let markdown = markdown::strip_html_img_alt_attributes(&markdown); + let (markdown, table_stats) = if options.normalize_tables { + let tables_dir = prepared + .tables_dir + .as_ref() + .expect("tables_dir must exist when normalize_tables is enabled"); + table_normalization::normalize_tables(&markdown, tables_dir).await? + } else { + (markdown, table_normalization::TableStats::default()) + }; fire( - &progress, + &options.progress, ProgressEvent::MarkdownWriteStarted { bytes: markdown.len(), }, ); let write_started = Instant::now(); output::atomic_write_text(&prepared.markdown_path, &markdown).await?; - fire(&progress, ProgressEvent::MarkdownWriteFinished); + fire(&options.progress, ProgressEvent::MarkdownWriteFinished); output::append_log( &prepared.log_path, @@ -104,6 +127,14 @@ pub async fn process_pdf( "downloaded_figures": downloaded_figures, "remote_figure_links": remote_figure_links, "image_blocks": image_blocks, + "tables_found": table_stats.tables_found, + "tables_raw_written": table_stats.tables_raw_written, + "tables_normalized": table_stats.tables_normalized, + "tables_skipped_in_code": table_stats.tables_skipped_in_code, + "tables_skipped_nested": table_stats.tables_skipped_nested, + "tables_skipped_too_large": table_stats.tables_skipped_too_large, + "tables_failed_extract": table_stats.tables_failed_extract, + "tables_failed_parse": table_stats.tables_failed_parse, "usage": usage, "timing": { "ocr_call_s": round3(ocr_seconds), @@ -122,6 +153,7 @@ pub async fn process_pdf( downloaded_figures, remote_figure_links, image_blocks, + // Table stats are logged but not surfaced in the summary. usage, log_path: prepared.log_path.display().to_string(), }) @@ -140,9 +172,11 @@ fn round3(duration: Duration) -> f64 { #[cfg(feature = "internal-testing")] #[doc(hidden)] pub mod testing { + pub use super::ProcessPdfOptions; pub use super::ProgressCallback; pub use super::ProgressEvent; pub use super::process_pdf; + pub use super::table_normalization::TableStats; use anyhow::Result; use serde_json::Value; use std::collections::HashMap; @@ -153,6 +187,7 @@ pub mod testing { pub struct PreparedOutputPaths { pub output_dir: std::path::PathBuf, pub figures_dir: std::path::PathBuf, + pub tables_dir: Option, pub markdown_path: std::path::PathBuf, pub log_path: std::path::PathBuf, } @@ -228,16 +263,30 @@ pub mod testing { output_root: &Path, pdf_path: &Path, overwrite: bool, + normalize_tables: bool, ) -> Result { - let prepared = super::output::prepare_output_paths(output_root, pdf_path, overwrite)?; + let prepared = super::output::prepare_output_paths( + output_root, + pdf_path, + overwrite, + normalize_tables, + )?; Ok(PreparedOutputPaths { output_dir: prepared.output_dir, figures_dir: prepared.figures_dir, + tables_dir: prepared.tables_dir, markdown_path: prepared.markdown_path, log_path: prepared.log_path, }) } + pub async fn normalize_tables( + markdown: &str, + tables_dir: &Path, + ) -> Result<(String, TableStats)> { + super::table_normalization::normalize_tables(markdown, tables_dir).await + } + pub async fn append_log(log_path: &Path, entry: Value) -> Result<()> { super::output::append_log(log_path, entry).await } diff --git a/src/core/output.rs b/src/core/output.rs index f66d3c4..902eb4a 100644 --- a/src/core/output.rs +++ b/src/core/output.rs @@ -9,6 +9,7 @@ use tokio::io::AsyncWriteExt; pub(crate) struct PreparedOutput { pub(crate) output_dir: PathBuf, pub(crate) figures_dir: PathBuf, + pub(crate) tables_dir: Option, pub(crate) markdown_path: PathBuf, pub(crate) log_path: PathBuf, } @@ -17,6 +18,7 @@ pub(crate) fn prepare_output_paths( output_root: &Path, pdf_path: &Path, overwrite: bool, + normalize_tables: bool, ) -> Result { let stem = pdf_path .file_stem() @@ -28,6 +30,7 @@ pub(crate) fn prepare_output_paths( let markdown_path = output_dir.join("index.md"); let figures_dir = output_dir.join("figures"); + let tables_dir = output_dir.join("tables"); let log_path = output_dir.join("log.jsonl"); if !overwrite { @@ -43,6 +46,12 @@ pub(crate) fn prepare_output_paths( figures_dir.display() )); } + if normalize_tables && tables_dir.exists() { + return Err(anyhow::anyhow!( + "Output already exists: {}. Re-run with --overwrite", + tables_dir.display() + )); + } } else { if markdown_path.exists() { std::fs::remove_file(&markdown_path)?; @@ -54,13 +63,27 @@ pub(crate) fn prepare_output_paths( std::fs::remove_file(&figures_dir)?; } } + if normalize_tables && tables_dir.exists() { + if tables_dir.is_dir() { + std::fs::remove_dir_all(&tables_dir)?; + } else { + std::fs::remove_file(&tables_dir)?; + } + } } std::fs::create_dir_all(&figures_dir)?; + let tables_dir = if normalize_tables { + std::fs::create_dir_all(&tables_dir)?; + Some(tables_dir) + } else { + None + }; Ok(PreparedOutput { output_dir, figures_dir, + tables_dir, markdown_path, log_path, }) diff --git a/src/core/table_normalization.rs b/src/core/table_normalization.rs new file mode 100644 index 0000000..985a160 --- /dev/null +++ b/src/core/table_normalization.rs @@ -0,0 +1,796 @@ +use anyhow::{Result, anyhow}; +use regex::Regex; +use serde::Serialize; +use std::collections::HashMap; +use std::fmt::Write as _; +use std::path::Path; + +use super::output; + +const RAW_HTML_LIMIT: usize = 128 * 1024; +const CELL_LIMIT: usize = 2_000; +const NORMALIZED_CHAR_LIMIT: usize = 32 * 1024; +const ROW_GROUP_SIZE: usize = 25; + +#[derive(Debug, Default, Clone, Serialize)] +pub struct TableStats { + pub tables_found: usize, + pub tables_raw_written: usize, + pub tables_normalized: usize, + pub tables_skipped_in_code: usize, + pub tables_skipped_nested: usize, + pub tables_skipped_too_large: usize, + pub tables_failed_extract: usize, + pub tables_failed_parse: usize, +} + +pub(crate) async fn normalize_tables( + markdown: &str, + tables_dir: &Path, +) -> Result<(String, TableStats)> { + let mut out = String::with_capacity(markdown.len()); + let mut stats = TableStats::default(); + let mut table_index = 0usize; + let mut pos = 0usize; + + while pos < markdown.len() { + let line_end_pos = line_end(markdown, pos); + let line = &markdown[pos..line_end_pos]; + + if let Some((marker, fence_len)) = fence_start(line) { + let fence_start_pos = pos; + let mut fence_end = line_end_pos; + let mut closed = false; + + while fence_end < markdown.len() { + let next_end = line_end(markdown, fence_end); + let next_line = &markdown[fence_end..next_end]; + fence_end = next_end; + if is_closing_fence_line(next_line, marker, fence_len) { + closed = true; + break; + } + } + + let block = &markdown[fence_start_pos..fence_end]; + stats.tables_skipped_in_code += count_case_insensitive_occurrences(block, " Result { + let mut out = String::with_capacity(chunk.len()); + let mut i = 0usize; + + while i < chunk.len() { + if let Some(run_len) = backtick_run_len(chunk, i) { + let end = find_matching_backtick_run(chunk, i + run_len, run_len) + .map(|offset| offset + run_len) + .unwrap_or(chunk.len()); + let code = &chunk[i..end]; + stats.tables_skipped_in_code += count_case_insensitive_occurrences(code, " { + stats.tables_failed_extract += 1; + out.push_str(&chunk[i..]); + break; + } + TableExtraction::Span { html, end, nested } => { + output::atomic_write_text(&artifact_path, &html).await?; + stats.tables_raw_written += 1; + + if nested { + stats.tables_skipped_nested += 1; + out.push_str(&render_placeholder_block( + ordinal, + &artifact_rel, + "normalization skipped (nested table detected)", + )); + } else { + match render_normalized_table(&html, ordinal, &artifact_rel) { + Ok(Some(rendered)) => { + stats.tables_normalized += 1; + out.push_str(&rendered); + } + Ok(None) => { + stats.tables_skipped_too_large += 1; + out.push_str(&render_placeholder_block( + ordinal, + &artifact_rel, + "normalization skipped (table too large)", + )); + } + Err(_) => { + stats.tables_failed_parse += 1; + out.push_str(&render_placeholder_block( + ordinal, + &artifact_rel, + "normalization skipped (parse failed)", + )); + } + } + } + + i = end; + continue; + } + } + } + + let ch = chunk[i..] + .chars() + .next() + .ok_or_else(|| anyhow!("invalid markdown boundary"))?; + out.push(ch); + i += ch.len_utf8(); + } + + Ok(out) +} + +fn render_normalized_table( + html: &str, + ordinal: usize, + artifact_rel: &str, +) -> Result> { + if html.len() > RAW_HTML_LIMIT { + return Ok(None); + } + + let parsed = parse_table_fragment(html)?; + if parsed.rows.is_empty() { + return Err(anyhow!("table has no rows")); + } + + let cell_count = parsed.columns.len() * parsed.rows.len(); + if cell_count > CELL_LIMIT { + return Ok(None); + } + + let mut out = String::new(); + write!(&mut out, "\n\n##### OCR Table {ordinal}\n").unwrap(); + writeln!(&mut out, "Source (OCR HTML): {artifact_rel}").unwrap(); + write!(&mut out, "Columns: {}\n\n", parsed.columns.join(", ")).unwrap(); + + for (row_index, row) in parsed.rows.iter().enumerate() { + if row_index > 0 && row_index % ROW_GROUP_SIZE == 0 { + out.push('\n'); + } + writeln!(&mut out, "Row: {}", render_row_json(&parsed.columns, row)?).unwrap(); + } + out.push('\n'); + + if out.len() > NORMALIZED_CHAR_LIMIT { + return Ok(None); + } + + Ok(Some(out)) +} + +fn render_placeholder_block(ordinal: usize, artifact_rel: &str, reason: &str) -> String { + format!( + "\n\n##### OCR Table {ordinal}\nSource (OCR HTML): {artifact_rel}\nStatus: {reason}\n\n" + ) +} + +fn render_row_json(columns: &[String], values: &[String]) -> Result { + let mut out = String::from("{"); + for (index, (key, value)) in columns.iter().zip(values.iter()).enumerate() { + if index > 0 { + out.push(','); + } + out.push_str(&serde_json::to_string(key)?); + out.push(':'); + out.push_str(&serde_json::to_string(value)?); + } + out.push('}'); + Ok(out) +} + +fn parse_table_fragment(fragment: &str) -> Result { + let rows = parse_rows(fragment)?; + if rows.is_empty() { + return Err(anyhow!("table has no rows")); + } + + let header_rows = rows.iter().take_while(|row| row.has_th).count(); + let expanded = expand_rows(rows)?; + let width = expanded.width; + if width == 0 { + return Err(anyhow!("table has no columns")); + } + + let columns = if header_rows == 0 { + (1..=width).map(|index| format!("col_{index}")).collect() + } else { + build_columns(&expanded.grid, header_rows) + }; + + let mut data_rows = Vec::new(); + for row in expanded.grid.into_iter().skip(header_rows) { + data_rows.push( + row.into_iter() + .map(|cell| cell.unwrap_or_default()) + .collect::>(), + ); + } + + Ok(ParsedTable { + columns, + rows: data_rows, + }) +} + +fn build_columns(grid: &[Vec>], header_rows: usize) -> Vec { + let width = grid.first().map(|row| row.len()).unwrap_or_default(); + let mut raw_keys = Vec::with_capacity(width); + + for col in 0..width { + let mut parts = Vec::new(); + for row in grid.iter().take(header_rows.min(grid.len())) { + let value = row[col].as_ref().map(|value| value.trim()).unwrap_or(""); + if value.is_empty() { + continue; + } + if parts.last().is_none_or(|last: &String| last != value) { + parts.push(value.to_string()); + } + } + let joined = parts.join(" / "); + raw_keys.push(normalize_key(&joined, col + 1)); + } + + disambiguate_keys(raw_keys) +} + +fn disambiguate_keys(keys: Vec) -> Vec { + let mut seen = HashMap::::new(); + let mut out = Vec::with_capacity(keys.len()); + + for key in keys { + let count = seen.entry(key.clone()).or_insert(0); + *count += 1; + if *count == 1 { + out.push(key); + } else { + out.push(format!("{key}_{count}")); + } + } + + out +} + +fn normalize_key(value: &str, index: usize) -> String { + let mut out = String::new(); + let mut last_was_underscore = false; + + for ch in value.chars() { + if ch.is_ascii_alphanumeric() { + out.push(ch.to_ascii_lowercase()); + last_was_underscore = false; + } else if !out.is_empty() && !last_was_underscore { + out.push('_'); + last_was_underscore = true; + } + } + + let trimmed = out.trim_matches('_'); + if trimmed.is_empty() { + format!("col_{index}") + } else { + trimmed.to_string() + } +} + +fn expand_rows(rows: Vec) -> Result { + let mut grid: Vec>> = Vec::new(); + let mut occupied: Vec> = Vec::new(); + let mut width = 0usize; + + for (row_index, row) in rows.iter().enumerate() { + ensure_row(&mut grid, &mut occupied, row_index, width); + let mut col = 0usize; + + for cell in &row.cells { + while col < width && occupied[row_index][col] { + col += 1; + } + if col >= width { + width = width.max(col + cell.colspan); + resize_width(&mut grid, &mut occupied, width); + } + + let required_width = col + cell.colspan; + if required_width > width { + width = required_width; + resize_width(&mut grid, &mut occupied, width); + } + + for row_offset in 0..cell.rowspan { + let target_row = row_index + row_offset; + ensure_row(&mut grid, &mut occupied, target_row, width); + for col_offset in 0..cell.colspan { + let target_col = col + col_offset; + occupied[target_row][target_col] = true; + grid[target_row][target_col] = Some(cell.text.clone()); + } + } + + col += cell.colspan; + } + } + + if width == 0 { + return Err(anyhow!("table has no columns")); + } + + for row in &mut grid { + if row.len() < width { + row.resize(width, None); + } + } + + Ok(ExpandedTable { grid, width }) +} + +fn ensure_row( + grid: &mut Vec>>, + occupied: &mut Vec>, + row_index: usize, + width: usize, +) { + while grid.len() <= row_index { + grid.push(vec![None; width]); + occupied.push(vec![false; width]); + } +} + +fn resize_width(grid: &mut [Vec>], occupied: &mut [Vec], width: usize) { + for row in grid.iter_mut() { + if row.len() < width { + row.resize(width, None); + } + } + for row in occupied.iter_mut() { + if row.len() < width { + row.resize(width, false); + } + } +} + +fn parse_rows(fragment: &str) -> Result> { + let mut rows = Vec::new(); + let mut pos = 0usize; + + while let Some(row_start) = find_tag(fragment, pos, "tr", false) { + let row_open_end = find_tag_end(fragment, row_start) + .ok_or_else(|| anyhow!("table row start tag was not closed"))?; + let row_close_start = find_tag(fragment, row_open_end, "tr", true) + .ok_or_else(|| anyhow!("table row end tag was not found"))?; + let row_close_end = find_tag_end(fragment, row_close_start) + .ok_or_else(|| anyhow!("table row end tag was not closed"))?; + let row_inner = &fragment[row_open_end..row_close_start]; + rows.push(parse_row(row_inner)?); + pos = row_close_end; + } + + Ok(rows) +} + +fn parse_row(row: &str) -> Result { + let mut cells = Vec::new(); + let mut pos = 0usize; + let mut has_th = false; + + while pos < row.len() { + if let Some(cell_start) = find_tag(row, pos, "th", false) { + let (cell, cell_end) = parse_cell(row, cell_start, "th")?; + has_th = true; + cells.push(cell); + pos = cell_end; + continue; + } + if let Some(cell_start) = find_tag(row, pos, "td", false) { + let (cell, cell_end) = parse_cell(row, cell_start, "td")?; + cells.push(cell); + pos = cell_end; + continue; + } + + let ch = row[pos..] + .chars() + .next() + .ok_or_else(|| anyhow!("invalid table row boundary"))?; + pos += ch.len_utf8(); + } + + Ok(ParsedRow { cells, has_th }) +} + +fn parse_cell(row: &str, start: usize, name: &str) -> Result<(ParsedCell, usize)> { + let open_end = + find_tag_end(row, start).ok_or_else(|| anyhow!("table cell start tag was not closed"))?; + let close_start = find_tag(row, open_end, name, true) + .ok_or_else(|| anyhow!("table cell end tag was not found"))?; + let close_end = find_tag_end(row, close_start) + .ok_or_else(|| anyhow!("table cell end tag was not closed"))?; + let tag = &row[start..open_end]; + let inner = &row[open_end..close_start]; + let rowspan = parse_span_attr(tag, "rowspan"); + let colspan = parse_span_attr(tag, "colspan"); + + Ok(( + ParsedCell { + text: html_fragment_to_text(inner), + rowspan, + colspan, + }, + close_end, + )) +} + +fn html_fragment_to_text(fragment: &str) -> String { + let mut out = String::with_capacity(fragment.len()); + let mut pos = 0usize; + + while pos < fragment.len() { + if fragment.as_bytes()[pos] == b'<' { + if let Some(tag_end) = find_tag_end(fragment, pos) { + let tag = fragment[pos + 1..tag_end - 1].trim(); + let lower = tag.to_ascii_lowercase(); + if lower.starts_with("br") + || lower.starts_with("/p") + || lower.starts_with("/div") + || lower.starts_with("/tr") + || lower.starts_with("/td") + || lower.starts_with("/th") + || lower.starts_with("p") + || lower.starts_with("div") + { + out.push('\n'); + } + pos = tag_end; + continue; + } + break; + } + + if fragment.as_bytes()[pos] == b'&' + && let Some((decoded, consumed)) = decode_html_entity(&fragment[pos..]) + { + out.push_str(&decoded); + pos += consumed; + continue; + } + + let ch = fragment[pos..] + .chars() + .next() + .expect("valid fragment boundary"); + out.push(ch); + pos += ch.len_utf8(); + } + + out.trim().to_string() +} + +fn decode_html_entity(fragment: &str) -> Option<(String, usize)> { + let end = fragment.find(';')?; + let entity = &fragment[..=end]; + let decoded = match entity { + "&" => "&".to_string(), + "<" => "<".to_string(), + ">" => ">".to_string(), + """ => "\"".to_string(), + "'" | "'" => "'".to_string(), + _ if entity.starts_with("&#x") || entity.starts_with("&#X") => { + let value = u32::from_str_radix(&entity[3..end], 16).ok()?; + char::from_u32(value)?.to_string() + } + _ if entity.starts_with("&#") => { + let value = entity[2..end].parse::().ok()?; + char::from_u32(value)?.to_string() + } + _ => return None, + }; + Some((decoded, end + 1)) +} + +fn parse_span_attr(tag: &str, attr: &str) -> usize { + let pattern = format!( + r#"(?i)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s/>]+))"#, + regex::escape(attr) + ); + let Ok(re) = Regex::new(&pattern) else { + return 1; + }; + let Some(caps) = re.captures(tag) else { + return 1; + }; + let value = caps + .get(1) + .or_else(|| caps.get(2)) + .or_else(|| caps.get(3)) + .map(|value| value.as_str()) + .unwrap_or("1"); + value + .trim() + .parse::() + .ok() + .filter(|value| *value > 0) + .unwrap_or(1) +} + +fn extract_table_span(text: &str, start: usize) -> TableExtraction { + let Some(open_end) = find_tag_end(text, start) else { + return TableExtraction::Failed; + }; + let mut depth = 1usize; + let mut nested = false; + let mut pos = open_end; + + while pos < text.len() { + if let Some(tag_start) = find_next_table_tag(text, pos) { + if tag_start > pos { + pos = tag_start; + } + if starts_tag(text, tag_start, "table") { + let Some(tag_end) = find_tag_end(text, tag_start) else { + return TableExtraction::Failed; + }; + depth += 1; + nested = true; + pos = tag_end; + continue; + } + if starts_tag(text, tag_start, "/table") { + let Some(tag_end) = find_tag_end(text, tag_start) else { + return TableExtraction::Failed; + }; + depth -= 1; + pos = tag_end; + if depth == 0 { + return TableExtraction::Span { + html: text[start..pos].to_string(), + end: pos, + nested, + }; + } + continue; + } + } + let ch = text[pos..].chars().next().expect("valid table boundary"); + pos += ch.len_utf8(); + } + + TableExtraction::Failed +} + +fn find_next_table_tag(text: &str, start: usize) -> Option { + let mut pos = start; + while pos < text.len() { + if text.as_bytes()[pos] == b'<' + && (starts_tag(text, pos, "table") || starts_tag(text, pos, "/table")) + { + return Some(pos); + } + let ch = text[pos..].chars().next()?; + pos += ch.len_utf8(); + } + None +} + +fn starts_tag(text: &str, start: usize, name: &str) -> bool { + if text.as_bytes().get(start) != Some(&b'<') { + return false; + } + let Some(prefix) = text.get(start + 1..start + 1 + name.len()) else { + return false; + }; + if !prefix.eq_ignore_ascii_case(name) { + return false; + } + match text.as_bytes().get(start + 1 + name.len()) { + None => true, + Some(b) if b.is_ascii_whitespace() || *b == b'>' || *b == b'/' => true, + _ => false, + } +} + +fn find_tag(text: &str, start: usize, name: &str, closing: bool) -> Option { + let target = if closing { + format!("/{name}") + } else { + name.to_string() + }; + let mut pos = start; + while pos < text.len() { + if starts_tag(text, pos, &target) { + return Some(pos); + } + let ch = text[pos..].chars().next()?; + pos += ch.len_utf8(); + } + None +} + +fn find_tag_end(text: &str, start: usize) -> Option { + let bytes = text.as_bytes(); + let mut in_single = false; + let mut in_double = false; + let mut pos = start + 1; + + while pos < bytes.len() { + match bytes[pos] { + b'\'' if !in_double => in_single = !in_single, + b'"' if !in_single => in_double = !in_double, + b'>' if !in_single && !in_double => return Some(pos + 1), + _ => {} + } + pos += 1; + } + + None +} + +fn fence_start(line: &str) -> Option<(char, usize)> { + let trimmed = line.trim_start(); + let mut chars = trimmed.chars(); + let marker = chars.next()?; + if marker != '`' && marker != '~' { + return None; + } + + let mut len = 1usize; + for ch in chars { + if ch == marker { + len += 1; + } else { + break; + } + } + + (len >= 3).then_some((marker, len)) +} + +fn is_closing_fence_line(line: &str, marker: char, len: usize) -> bool { + let trimmed = line.trim_start(); + let mut chars = trimmed.chars(); + let mut count = 0usize; + + while matches!(chars.clone().next(), Some(ch) if ch == marker) { + chars.next(); + count += 1; + } + + count >= len && chars.all(char::is_whitespace) +} + +fn backtick_run_len(text: &str, start: usize) -> Option { + let bytes = text.as_bytes(); + if bytes.get(start) != Some(&b'`') { + return None; + } + + let mut len = 1usize; + while start + len < bytes.len() && bytes[start + len] == b'`' { + len += 1; + } + Some(len) +} + +fn find_matching_backtick_run(text: &str, start: usize, run_len: usize) -> Option { + let bytes = text.as_bytes(); + let mut pos = start; + + while pos + run_len <= bytes.len() { + if bytes[pos] == b'`' && bytes[pos..pos + run_len].iter().all(|byte| *byte == b'`') { + return Some(pos); + } + pos += 1; + } + + None +} + +fn count_case_insensitive_occurrences(text: &str, needle: &str) -> usize { + if needle.is_empty() || text.len() < needle.len() { + return 0; + } + + let haystack = text.to_ascii_lowercase(); + let needle = needle.to_ascii_lowercase(); + let mut count = 0usize; + let mut start = 0usize; + + while let Some(index) = haystack[start..].find(&needle) { + count += 1; + start += index + needle.len(); + } + + count +} + +fn line_end(text: &str, start: usize) -> usize { + text[start..] + .find('\n') + .map(|offset| start + offset + 1) + .unwrap_or(text.len()) +} + +#[derive(Debug)] +struct ParsedCell { + text: String, + rowspan: usize, + colspan: usize, +} + +#[derive(Debug)] +struct ParsedRow { + cells: Vec, + has_th: bool, +} + +#[derive(Debug)] +struct ParsedTable { + columns: Vec, + rows: Vec>, +} + +#[derive(Debug)] +struct ExpandedTable { + grid: Vec>>, + width: usize, +} + +enum TableExtraction { + Span { + html: String, + end: usize, + nested: bool, + }, + Failed, +} diff --git a/src/main.rs b/src/main.rs index 3393b29..2907992 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4,7 +4,9 @@ use anyhow::Result; use clap::Parser; use futures::stream::{self, StreamExt}; use indicatif::{MultiProgress, ProgressBar, ProgressDrawTarget, ProgressStyle}; -use paperdown::core::{self, PdfSummary, ProgressCallback, ProgressEvent, collect_pdfs}; +use paperdown::core::{ + self, PdfSummary, ProcessPdfOptions, ProgressCallback, ProgressEvent, collect_pdfs, +}; use std::io::IsTerminal; use std::path::Path; use std::sync::Arc; @@ -42,10 +44,13 @@ async fn run() -> Result { &pdfs[0], &args.output, &args.env_file, - Duration::from_secs(args.timeout), - args.max_download_bytes, - args.overwrite, - progress_callback(&pdfs[0], progress.clone()), + ProcessPdfOptions { + timeout: Duration::from_secs(args.timeout), + max_download_bytes: args.max_download_bytes, + overwrite: args.overwrite, + normalize_tables: args.normalize_tables, + progress: progress_callback(&pdfs[0], progress.clone()), + }, ) .await?; print_single_summary_stdout(&summary); @@ -60,22 +65,17 @@ async fn run() -> Result { let permit_pool = semaphore.clone(); let output = args.output.clone(); let env_file = args.env_file.clone(); - let timeout = Duration::from_secs(args.timeout); - let max_download_bytes = args.max_download_bytes; - let overwrite = args.overwrite; let progress = progress.clone(); + let options = ProcessPdfOptions { + timeout: Duration::from_secs(args.timeout), + max_download_bytes: args.max_download_bytes, + overwrite: args.overwrite, + normalize_tables: args.normalize_tables, + progress: progress_callback(&pdf, progress), + }; async move { let _permit = permit_pool.acquire_owned().await.expect("semaphore"); - let res = core::process_pdf( - &pdf, - &output, - &env_file, - timeout, - max_download_bytes, - overwrite, - progress_callback(&pdf, progress), - ) - .await; + let res = core::process_pdf(&pdf, &output, &env_file, options).await; (pdf, res) } })) diff --git a/tests/core_internal.rs b/tests/core_internal.rs index ec22d14..0915f19 100644 --- a/tests/core_internal.rs +++ b/tests/core_internal.rs @@ -4,10 +4,10 @@ use httpmock::prelude::*; use paperdown::core::collect_pdfs; use paperdown::core::testing::{ - ProgressCallback, ProgressEvent, append_log, atomic_write_text, build_payload, - content_type_to_suffix, extract_image_url, fire_for_test, is_http_url, load_api_key, - prepare_output_paths, process_pdf, replace_image_urls, round3_for_test, - strip_html_img_alt_attributes, url_suffix, validate_layout_response, + ProcessPdfOptions, ProgressCallback, ProgressEvent, append_log, atomic_write_text, + build_payload, content_type_to_suffix, extract_image_url, fire_for_test, is_http_url, + load_api_key, normalize_tables, prepare_output_paths, process_pdf, replace_image_urls, + round3_for_test, strip_html_img_alt_attributes, url_suffix, validate_layout_response, }; #[cfg(feature = "net-tests")] use paperdown::core::testing::{download_figure, localize_figures}; @@ -320,6 +320,101 @@ fn round3_rounds_millis() { assert_eq!(round3_for_test(Duration::from_millis(1234)), 1.234); } +#[test] +fn normalize_tables_rewrites_table_and_writes_artifact() { + let tmp = TempDir::new().unwrap(); + let markdown = "before\n\n\n\n
Sample NameValue
Alpha1
\nafter"; + + let rt = tokio::runtime::Runtime::new().unwrap(); + let (updated, stats) = rt.block_on(normalize_tables(markdown, tmp.path())).unwrap(); + + assert!(updated.contains("##### OCR Table 1")); + assert!(updated.contains("Source (OCR HTML): tables/table_001.html")); + assert!(updated.contains("Columns: sample_name, value")); + assert!(updated.contains(r#"Row: {"sample_name":"Alpha","value":"1"}"#)); + assert!(tmp.path().join("table_001.html").exists()); + assert_eq!(stats.tables_found, 1); + assert_eq!(stats.tables_raw_written, 1); + assert_eq!(stats.tables_normalized, 1); +} + +#[test] +fn normalize_tables_skips_fenced_and_inline_code() { + let tmp = TempDir::new().unwrap(); + let markdown = "start `
inline
`\n```html\n
fenced
\n```\n
real
\nend"; + + let rt = tokio::runtime::Runtime::new().unwrap(); + let (updated, stats) = rt.block_on(normalize_tables(markdown, tmp.path())).unwrap(); + + assert!(updated.contains("
inline
")); + assert!(updated.contains("```html")); + assert!(updated.contains("##### OCR Table 1")); + assert_eq!(stats.tables_found, 1); + assert_eq!(stats.tables_skipped_in_code, 2); + assert_eq!(stats.tables_normalized, 1); +} + +#[test] +fn normalize_tables_expands_rowspan_and_colspan() { + let tmp = TempDir::new().unwrap(); + let markdown = "\n\n\n\n\n
CategoryMeasure
MeanMax
A12
34
"; + + let rt = tokio::runtime::Runtime::new().unwrap(); + let (updated, _) = rt.block_on(normalize_tables(markdown, tmp.path())).unwrap(); + + assert!(updated.contains("Columns: category, measure_mean, measure_max")); + assert!(updated.contains(r#"Row: {"category":"A","measure_mean":"1","measure_max":"2"}"#)); + assert!(updated.contains(r#"Row: {"category":"A","measure_mean":"3","measure_max":"4"}"#)); +} + +#[test] +fn normalize_tables_preserves_json_escaping() { + let tmp = TempDir::new().unwrap(); + let markdown = "
Value
a;=b\n\"c\"\\d
"; + + let rt = tokio::runtime::Runtime::new().unwrap(); + let (updated, _) = rt.block_on(normalize_tables(markdown, tmp.path())).unwrap(); + + let row_line = updated + .lines() + .find(|line| line.starts_with("Row: ")) + .expect("row line"); + let row_json = row_line.trim_start_matches("Row: "); + let parsed: Value = serde_json::from_str(row_json).unwrap(); + assert_eq!(parsed["value"], "a;=b\n\"c\"\\d"); +} + +#[test] +fn normalize_tables_leaves_unclosed_table_intact() { + let tmp = TempDir::new().unwrap(); + let markdown = "before\n").collect::(); + let markdown = format!("
broken"; + + let rt = tokio::runtime::Runtime::new().unwrap(); + let (updated, stats) = rt.block_on(normalize_tables(markdown, tmp.path())).unwrap(); + + assert_eq!(updated, markdown); + assert_eq!(stats.tables_found, 1); + assert_eq!(stats.tables_failed_extract, 1); +} + +#[test] +fn normalize_tables_uses_placeholder_for_oversized_table() { + let tmp = TempDir::new().unwrap(); + let cells = (0..2001).map(|_| "x
{cells}
"); + + let rt = tokio::runtime::Runtime::new().unwrap(); + let (updated, stats) = rt + .block_on(normalize_tables(&markdown, tmp.path())) + .unwrap(); + + assert!(updated.contains("normalization skipped (table too large)")); + assert!(tmp.path().join("table_001.html").exists()); + assert_eq!(stats.tables_found, 1); + assert_eq!(stats.tables_raw_written, 1); + assert_eq!(stats.tables_skipped_too_large, 1); +} + #[cfg(feature = "net-tests")] async fn start_chunked_image_server(chunks: Vec>) -> (String, tokio::task::JoinHandle<()>) { let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); @@ -677,7 +772,7 @@ fn prepare_output_without_overwrite_fails_on_existing_managed_artifacts() { std::fs::create_dir_all(&target).unwrap(); std::fs::write(target.join("index.md"), b"old").unwrap(); - let err = prepare_output_paths(&tmp.path().join("out"), &pdf, false) + let err = prepare_output_paths(&tmp.path().join("out"), &pdf, false, false) .unwrap_err() .to_string(); assert!(err.contains("--overwrite")); @@ -691,7 +786,7 @@ fn prepare_output_without_overwrite_fails_when_only_figures_exists() { let target = tmp.path().join("out").join("paper"); std::fs::create_dir_all(target.join("figures")).unwrap(); - let err = prepare_output_paths(&tmp.path().join("out"), &pdf, false) + let err = prepare_output_paths(&tmp.path().join("out"), &pdf, false, false) .unwrap_err() .to_string(); assert!(err.contains("figures")); @@ -707,7 +802,7 @@ fn prepare_output_without_overwrite_fails_when_both_exist() { std::fs::create_dir_all(target.join("figures")).unwrap(); std::fs::write(target.join("index.md"), b"old").unwrap(); - let err = prepare_output_paths(&tmp.path().join("out"), &pdf, false) + let err = prepare_output_paths(&tmp.path().join("out"), &pdf, false, false) .unwrap_err() .to_string(); assert!(err.contains("index.md")); @@ -727,7 +822,7 @@ fn prepare_output_with_overwrite_preserves_unrelated_files() { std::fs::write(figures.join("stale.png"), b"old").unwrap(); std::fs::write(out.join("keep.txt"), b"keep").unwrap(); - let prepared = prepare_output_paths(&tmp.path().join("out"), &pdf, true).unwrap(); + let prepared = prepare_output_paths(&tmp.path().join("out"), &pdf, true, false).unwrap(); assert!(prepared.figures_dir.exists()); assert!(!prepared.figures_dir.join("stale.png").exists()); assert!(out.join("keep.txt").exists()); @@ -742,10 +837,43 @@ fn prepare_output_with_overwrite_handles_figures_file() { std::fs::create_dir_all(&out).unwrap(); std::fs::write(out.join("figures"), b"stale").unwrap(); - let prepared = prepare_output_paths(&tmp.path().join("out"), &pdf, true).unwrap(); + let prepared = prepare_output_paths(&tmp.path().join("out"), &pdf, true, false).unwrap(); assert!(prepared.figures_dir.is_dir()); } +#[test] +fn prepare_output_with_normalize_tables_manages_tables_dir() { + let tmp = TempDir::new().unwrap(); + let pdf = tmp.path().join("paper.pdf"); + std::fs::write(&pdf, b"%PDF").unwrap(); + let out = tmp.path().join("out").join("paper"); + std::fs::create_dir_all(out.join("tables")).unwrap(); + std::fs::write(out.join("tables").join("stale.html"), b"old").unwrap(); + + let prepared = prepare_output_paths(&tmp.path().join("out"), &pdf, true, true).unwrap(); + assert!(prepared.tables_dir.as_ref().unwrap().is_dir()); + assert!( + !prepared + .tables_dir + .as_ref() + .unwrap() + .join("stale.html") + .exists() + ); +} + +#[test] +fn prepare_output_without_overwrite_ignores_stale_tables_when_disabled() { + let tmp = TempDir::new().unwrap(); + let pdf = tmp.path().join("paper.pdf"); + std::fs::write(&pdf, b"%PDF").unwrap(); + let out = tmp.path().join("out").join("paper"); + std::fs::create_dir_all(out.join("tables")).unwrap(); + + let prepared = prepare_output_paths(&tmp.path().join("out"), &pdf, false, false).unwrap(); + assert!(prepared.tables_dir.is_none()); +} + #[test] fn extract_image_url_checks_fallback_keys() { let block = json!({ @@ -819,10 +947,13 @@ fn process_pdf_checks_output_conflict_before_env_lookup() { &pdf, &output_root, &missing_env, - Duration::from_secs(1), - 1024, - false, - None, + ProcessPdfOptions { + timeout: Duration::from_secs(1), + max_download_bytes: 1024, + overwrite: false, + normalize_tables: false, + progress: None, + }, )) .unwrap_err() .to_string();