diff --git a/Cargo.lock b/Cargo.lock index e628d8c..f41113a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,6 +11,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "anstream" version = "1.0.0" @@ -288,6 +294,19 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" +[[package]] +name = "auto_encoder" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f6364e11e0270035ec392151a54f1476e6b3612ef9f4fe09d35e72a8cebcb65" +dependencies = [ + "chardetng", + "encoding_rs", + "percent-encoding", + "phf 0.11.3", + "phf_codegen 0.11.3", +] + [[package]] name = "autocfg" version = "1.5.0" @@ -396,6 +415,17 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" +[[package]] +name = "chardetng" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea" +dependencies = [ + "cfg-if", + "encoding_rs", + "memchr", +] + [[package]] name = "clap" version = "4.6.0" @@ -476,6 +506,29 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" +[[package]] +name = "cssparser" +version = "0.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dae61cf9c0abb83bd659dab65b7e4e38d8236824c85f0f804f173567bda257d2" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa", + "phf 0.13.1", + "smallvec", +] + +[[package]] +name = "cssparser-macros" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" +dependencies = [ + "quote", + "syn 2.0.117", +] + [[package]] name = "deranged" version = "0.5.8" @@ -485,6 +538,27 @@ dependencies = [ "powerfmt", ] +[[package]] +name = "derive_more" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d751e9e49156b02b44f9c1815bcb94b984cdcc4396ecc32521c739452808b134" +dependencies = [ + "derive_more-impl", +] + +[[package]] +name = "derive_more-impl" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "799a97264921d8623a957f6c3b9011f3b5492f557bbb7a5a19b7fa6d06ba8dcb" +dependencies = [ + "proc-macro2", + "quote", + "rustc_version", + "syn 2.0.117", +] + [[package]] name = "difflib" version = "0.4.0" @@ -529,6 +603,21 @@ version = "0.15.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" +[[package]] +name = "dtoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590" + +[[package]] +name = "dtoa-short" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" +dependencies = [ + "dtoa", +] + [[package]] name = "either" version = "1.15.0" @@ -550,6 +639,15 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + [[package]] name = "equivalent" version = "1.0.2" @@ -593,6 +691,21 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "fast_html2md" +version = "0.0.58" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af3a0122fee1bcf6bb9f3d73782e911cce69d95b76a5e29e930af92cd4a8e4e3" +dependencies = [ + "auto_encoder", + "futures-util", + "lazy_static", + "lol_html", + "percent-encoding", + "regex", + "url", +] + [[package]] name = "fastrand" version = "2.3.0" @@ -617,6 +730,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -771,6 +890,11 @@ name = "hashbrown" version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] [[package]] name = "heck" @@ -1232,6 +1356,25 @@ dependencies = [ "value-bag", ] +[[package]] +name = "lol_html" +version = "2.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ff94cb6aef6ee52afd2c69331e9109906d855e82bd241f3110dfdf6185899ab" +dependencies = [ + "bitflags", + "cfg-if", + "cssparser", + "encoding_rs", + "foldhash", + "hashbrown", + "memchr", + "mime", + "precomputed-hash", + "selectors", + "thiserror 2.0.18", +] + [[package]] name = "lru-slab" version = "0.1.2" @@ -1244,6 +1387,12 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + [[package]] name = "mio" version = "1.1.1" @@ -1294,6 +1443,7 @@ dependencies = [ "base64 0.22.1", "clap", "dotenvy", + "fast_html2md", "futures", "httpmock", "indicatif", @@ -1352,6 +1502,93 @@ dependencies = [ "indexmap", ] +[[package]] +name = "phf" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +dependencies = [ + "phf_macros 0.11.3", + "phf_shared 0.11.3", +] + +[[package]] +name = "phf" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" +dependencies = [ + "phf_macros 0.13.1", + "phf_shared 0.13.1", + "serde", +] + +[[package]] +name = "phf_codegen" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", +] + +[[package]] +name = "phf_codegen" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1" +dependencies = [ + "phf_generator 0.13.1", + "phf_shared 0.13.1", +] + +[[package]] +name = "phf_generator" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +dependencies = [ + "phf_shared 0.11.3", + "rand 0.8.5", +] + +[[package]] +name = "phf_generator" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" +dependencies = [ + "fastrand", + "phf_shared 0.13.1", +] + +[[package]] +name = "phf_macros" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "phf_macros" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "812f032b54b1e759ccd5f8b6677695d5268c588701effba24601f6932f8269ef" +dependencies = [ + "phf_generator 0.13.1", + "phf_shared 0.13.1", + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "phf_shared" version = "0.11.3" @@ -1361,6 +1598,15 @@ dependencies = [ "siphasher", ] +[[package]] +name = "phf_shared" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266" +dependencies = [ + "siphasher", +] + [[package]] name = "pico-args" version = "0.5.0" @@ -1505,7 +1751,7 @@ dependencies = [ "bytes", "getrandom 0.3.4", "lru-slab", - "rand", + "rand 0.9.2", "ring", "rustc-hash", "rustls", @@ -1546,6 +1792,15 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "rand_core 0.6.4", +] + [[package]] name = "rand" version = "0.9.2" @@ -1553,7 +1808,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" dependencies = [ "rand_chacha", - "rand_core", + "rand_core 0.9.5", ] [[package]] @@ -1563,9 +1818,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.9.5", ] +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" + [[package]] name = "rand_core" version = "0.9.5" @@ -1685,6 +1946,15 @@ version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "1.1.4" @@ -1760,6 +2030,31 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "selectors" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "feef350c36147532e1b79ea5c1f3791373e61cbd9a6a2615413b3807bb164fb7" +dependencies = [ + "bitflags", + "cssparser", + "derive_more", + "log", + "new_debug_unreachable", + "phf 0.13.1", + "phf_codegen 0.13.1", + "precomputed-hash", + "rustc-hash", + "servo_arc", + "smallvec", +] + +[[package]] +name = "semver" +version = "1.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" + [[package]] name = "serde" version = "1.0.228" @@ -1825,6 +2120,15 @@ dependencies = [ "serde", ] +[[package]] +name = "servo_arc" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "170fb83ab34de17dc69aa7c67482b22218ddb85da56546f9bd6b929e32a05930" +dependencies = [ + "stable_deref_trait", +] + [[package]] name = "shlex" version = "1.3.0" @@ -1899,7 +2203,7 @@ checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f" dependencies = [ "new_debug_unreachable", "parking_lot", - "phf_shared", + "phf_shared 0.11.3", "precomputed-hash", ] diff --git a/Cargo.toml b/Cargo.toml index 298c33f..d06c851 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "paperdown" -version = "0.2.0" +version = "0.2.1" authors = ["Anatoly Tsyplenkov "] edition = "2024" description = "A fast CLI tool to batch convert PDFs into Markdown using GLM-OCR." @@ -33,6 +33,7 @@ futures = "0.3" indicatif = "0.17" dotenvy = "0.15" regex = "1" +fast_html2md = "0.0.58" reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "json", "stream"] } serde = { version = "1", features = ["derive"] } serde_json = "1" diff --git a/README.md b/README.md index fb9b3aa..15fd1fc 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@

-`paperdown` converts research papers from PDF to Markdown using Z.AI's [GLM-OCR](https://github.com/zai-org/GLM-OCR) model and downloads referenced figure assets locally. +`paperdown` converts research papers from PDF to Markdown using Z.AI's [GLM-OCR](https://github.com/zai-org/GLM-OCR) model, rewrites HTML tables and images into CommonMark Markdown, and downloads referenced figure assets locally. If you work with academic papers, you know that the OCR process itself is not the most difficult part. The real challenge is cleaning up the output. Tables can disappear, their structure can become jumbled, and formulas might be converted into meaningless text. This often means you spend more time correcting the output than working with it. @@ -28,6 +28,7 @@ Therefore, this project was created because, while [`docling`](https://github.co ## Features - Async OCR requests and batch PDF processing using the Z.AI API. +- Async HTML table and image cleanup using `fast_html2md`. - Concurrent figure downloads for each PDF. - Fast processing: approximately 25 seconds per batch of 32 PDFs. Speed depends on the z.ai API availability. See the cost section for more details on spending. diff --git a/src/core.rs b/src/core.rs index be90d5d..bd6340d 100644 --- a/src/core.rs +++ b/src/core.rs @@ -82,7 +82,7 @@ pub async fn process_pdf( ) .await?; let figure_seconds = figure_started.elapsed(); - let markdown = markdown::strip_html_img_alt_attributes(&markdown); + let markdown = markdown::sanitize_html_fragments(markdown).await?; fire( &progress, @@ -220,8 +220,8 @@ pub mod testing { super::markdown::replace_image_urls(markdown, replacements) } - pub fn strip_html_img_alt_attributes(markdown: &str) -> String { - super::markdown::strip_html_img_alt_attributes(markdown) + pub async fn sanitize_html_fragments(markdown: String) -> Result { + super::markdown::sanitize_html_fragments(markdown).await } pub fn prepare_output_paths( diff --git a/src/core/markdown.rs b/src/core/markdown.rs index 37fa0c9..d6691bd 100644 --- a/src/core/markdown.rs +++ b/src/core/markdown.rs @@ -1,7 +1,11 @@ +use anyhow::Result; +use futures::stream::{self, StreamExt}; use regex::Regex; use std::collections::HashMap; use std::sync::LazyLock; +const HTML_FRAGMENT_CONCURRENCY: usize = 16; + static MARKDOWN_IMAGE_URL_PATTERN: LazyLock = LazyLock::new(|| { Regex::new(r"\((https?://[^)\s]+)\)").expect("valid markdown image URL regex") }); @@ -10,10 +14,12 @@ static HTML_IMAGE_URL_PATTERN: LazyLock = LazyLock::new(|| { Regex::new(r#"(src\s*=\s*)(['"])(https?://[^'"]+)(['"])"#).expect("valid HTML image URL regex") }); -static HTML_IMAGE_ALT_PATTERN: LazyLock = LazyLock::new(|| { - Regex::new(r#"(?is)\s+alt(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s/>]+))?"#) - .expect("valid HTML image alt regex") -}); +#[derive(Debug)] +enum Segment { + Text(String), + Code(String), + Html { index: usize, raw: String }, +} pub(crate) fn replace_image_urls(markdown: &str, replacements: &HashMap) -> String { let updated = MARKDOWN_IMAGE_URL_PATTERN @@ -42,7 +48,7 @@ pub(crate) fn replace_image_urls(markdown: &str, replacements: &HashMap String { +pub(crate) async fn sanitize_html_fragments(markdown: String) -> Result { let mut out = String::with_capacity(markdown.len()); let mut chunk_start = 0usize; let mut in_fence = false; @@ -68,7 +74,7 @@ pub(crate) fn strip_html_img_alt_attributes(markdown: &str) -> String { } if let Some((marker, len)) = fence_start(line) { - out.push_str(&sanitize_non_code_chunk(&markdown[chunk_start..i])); + out.push_str(&sanitize_non_code_chunk(&markdown[chunk_start..i]).await?); out.push_str(line); in_fence = true; fence_marker = marker; @@ -82,41 +88,132 @@ pub(crate) fn strip_html_img_alt_attributes(markdown: &str) -> String { } if !in_fence { - out.push_str(&sanitize_non_code_chunk(&markdown[chunk_start..])); + out.push_str(&sanitize_non_code_chunk(&markdown[chunk_start..]).await?); } - out + Ok(out) } -fn sanitize_non_code_chunk(chunk: &str) -> String { - let mut out = String::with_capacity(chunk.len()); +async fn sanitize_non_code_chunk(chunk: &str) -> Result { + let mut segments = Vec::new(); let mut i = 0usize; + let mut literal_start = 0usize; + let mut html_count = 0usize; while i < chunk.len() { if let Some(run_len) = backtick_run_len(chunk, i) && let Some(end) = find_matching_backtick_run(chunk, i + run_len, run_len) { - out.push_str(&chunk[i..end + run_len]); + if literal_start < i { + segments.push(Segment::Text(chunk[literal_start..i].to_string())); + } + segments.push(Segment::Code(chunk[i..end + run_len].to_string())); i = end + run_len; + literal_start = i; continue; } - if starts_html_img_tag(chunk, i) - && let Some(tag_end) = find_html_tag_end(chunk, i) - { - out.push_str(&HTML_IMAGE_ALT_PATTERN.replace_all(&chunk[i..tag_end], "")); + if let Some((tag_end, fragment)) = extract_html_fragment(chunk, i) { + if literal_start < i { + segments.push(Segment::Text(chunk[literal_start..i].to_string())); + } + segments.push(Segment::Html { + index: html_count, + raw: fragment, + }); + html_count += 1; i = tag_end; + literal_start = i; continue; } let ch = chunk[i..].chars().next().expect("valid char boundary"); - out.push(ch); i += ch.len_utf8(); } + if literal_start < chunk.len() { + segments.push(Segment::Text(chunk[literal_start..].to_string())); + } + + if html_count == 0 { + return Ok(join_segments(&segments, &[])); + } + + let html_fragments: Vec = segments + .iter() + .filter_map(|segment| match segment { + Segment::Html { raw, .. } => Some(raw.clone()), + _ => None, + }) + .collect(); + let converted = convert_html_fragments(html_fragments).await; + Ok(join_segments(&segments, &converted)) +} + +fn join_segments(segments: &[Segment], converted: &[Option]) -> String { + let mut out = String::with_capacity( + segments + .iter() + .map(|segment| match segment { + Segment::Text(text) | Segment::Code(text) => text.len(), + Segment::Html { raw, .. } => raw.len(), + }) + .sum(), + ); + + for segment in segments { + match segment { + Segment::Text(text) | Segment::Code(text) => out.push_str(text), + Segment::Html { index, raw } => { + if let Some(Some(rewritten)) = converted.get(*index) { + out.push_str(rewritten); + } else { + out.push_str(raw); + } + } + } + } + out } +fn extract_html_fragment(text: &str, start: usize) -> Option<(usize, String)> { + if starts_html_tag(text, start, "table") { + let end = find_table_fragment_end(text, start)?; + return Some((end, text[start..end].to_string())); + } + + if starts_html_tag(text, start, "img") { + let end = find_html_tag_end(text, start)?; + return Some((end, text[start..end].to_string())); + } + + None +} + +async fn convert_html_fragments(fragments: Vec) -> Vec> { + let converted = stream::iter(fragments.into_iter().enumerate().map( + |(index, fragment)| async move { + let converted = html2md::rewrite_html_streaming(&fragment, true).await; + let converted = if converted.trim().is_empty() { + None + } else { + Some(converted) + }; + (index, converted) + }, + )) + .buffer_unordered(HTML_FRAGMENT_CONCURRENCY) + .collect::>() + .await; + + let mut ordered = vec![None; converted.len()]; + for (index, item) in converted { + ordered[index] = item; + } + ordered +} + fn backtick_run_len(text: &str, start: usize) -> Option { let bytes = text.as_bytes(); if bytes.get(start) != Some(&b'`') { @@ -144,20 +241,71 @@ fn find_matching_backtick_run(text: &str, start: usize, run_len: usize) -> Optio None } -fn starts_html_img_tag(text: &str, start: usize) -> bool { +fn starts_html_tag(text: &str, start: usize, tag: &str) -> bool { let bytes = text.as_bytes(); if bytes.get(start) != Some(&b'<') { return false; } - let Some(prefix) = text.get(start + 1..start + 4) else { + let tag_start = start + 1; + let tag_end = tag_start + tag.len(); + let Some(candidate) = text.get(tag_start..tag_end) else { return false; }; - if !prefix.eq_ignore_ascii_case("img") { + if !candidate.eq_ignore_ascii_case(tag) { return false; } - !matches!(bytes.get(start + 4), Some(b) if b.is_ascii_alphanumeric() || *b == b'-') + !matches!(bytes.get(tag_end), Some(b) if b.is_ascii_alphanumeric() || *b == b'-') +} + +fn starts_end_html_tag(text: &str, start: usize, tag: &str) -> bool { + let bytes = text.as_bytes(); + if bytes.get(start) != Some(&b'<') || bytes.get(start + 1) != Some(&b'/') { + return false; + } + + let tag_start = start + 2; + let tag_end = tag_start + tag.len(); + let Some(candidate) = text.get(tag_start..tag_end) else { + return false; + }; + if !candidate.eq_ignore_ascii_case(tag) { + return false; + } + + !matches!(bytes.get(tag_end), Some(b) if b.is_ascii_alphanumeric() || *b == b'-') +} + +fn find_table_fragment_end(text: &str, start: usize) -> Option { + let mut depth = 1usize; + let mut i = find_html_tag_end(text, start)?; + + while i < text.len() { + let ch = text[i..].chars().next()?; + if ch == '<' { + if starts_html_tag(text, i, "table") { + let end = find_html_tag_end(text, i)?; + depth += 1; + i = end; + continue; + } + + if starts_end_html_tag(text, i, "table") { + let end = find_html_tag_end(text, i)?; + depth = depth.saturating_sub(1); + i = end; + if depth == 0 { + return Some(end); + } + continue; + } + } + + i += ch.len_utf8(); + } + + None } fn find_html_tag_end(text: &str, start: usize) -> Option { diff --git a/tests/core_internal.rs b/tests/core_internal.rs index ec22d14..a8d7987 100644 --- a/tests/core_internal.rs +++ b/tests/core_internal.rs @@ -7,7 +7,7 @@ use paperdown::core::testing::{ ProgressCallback, ProgressEvent, append_log, atomic_write_text, build_payload, content_type_to_suffix, extract_image_url, fire_for_test, is_http_url, load_api_key, prepare_output_paths, process_pdf, replace_image_urls, round3_for_test, - strip_html_img_alt_attributes, url_suffix, validate_layout_response, + sanitize_html_fragments, url_suffix, validate_layout_response, }; #[cfg(feature = "net-tests")] use paperdown::core::testing::{download_figure, localize_figures}; @@ -608,64 +608,78 @@ fn replace_image_urls_no_replacements_passthrough() { assert_eq!(updated, markdown); } +fn sanitize_html(markdown: &str) -> String { + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(sanitize_html_fragments(markdown.to_string())) + .unwrap() +} + #[test] -fn strip_html_img_alt_attributes_removes_alt_and_preserves_other_attrs() { - let markdown = "before OCR图片 after"; +fn sanitize_html_fragments_smoke() { + let markdown = "before OCR图片 after"; - let updated = strip_html_img_alt_attributes(markdown); + let updated = sanitize_html(markdown); - assert_eq!(updated, "before after"); + assert_eq!(updated, "before ![OCR图片](x.png) after"); } #[test] -fn strip_html_img_alt_attributes_handles_case_and_spacing() { - let markdown = ""; +fn sanitize_html_fragments_converts_tables_and_images() { + let markdown = "start
A
B
end OCR图片"; - let updated = strip_html_img_alt_attributes(markdown); + let updated = sanitize_html(markdown); - assert_eq!(updated, ""); + assert!(!updated.contains("\n```\n`OCR图片`" + "![OCR图片](https://x/a.png)\n```html\nOCR图片\n```\n`
x
`" ); } #[test] -fn strip_html_img_alt_attributes_removes_multiple_alt_attributes() { - let markdown = "one"; +fn sanitize_html_fragments_leaves_malformed_fragments_unchanged() { + let markdown = "before
broken after"; - let updated = strip_html_img_alt_attributes(markdown); + let updated = sanitize_html(markdown); - assert_eq!(updated, ""); + assert_eq!(updated, markdown); } #[test] -fn strip_html_img_alt_attributes_removes_boolean_and_unquoted_alt() { - let markdown = ""; +fn sanitize_html_fragments_keeps_nested_table_content_in_order() { + let markdown = "A
1 inside
B outside"; - let updated = strip_html_img_alt_attributes(markdown); + let updated = sanitize_html(markdown); - assert_eq!(updated, ""); + assert!(!updated.contains(""); + assert!(!updated.contains("