diff --git a/Cargo.lock b/Cargo.lock index e628d8c..76f7028 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,6 +11,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "anstream" version = "1.0.0" @@ -288,6 +294,19 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" +[[package]] +name = "auto_encoder" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f6364e11e0270035ec392151a54f1476e6b3612ef9f4fe09d35e72a8cebcb65" +dependencies = [ + "chardetng", + "encoding_rs", + "percent-encoding", + "phf 0.11.3", + "phf_codegen 0.11.3", +] + [[package]] name = "autocfg" version = "1.5.0" @@ -396,6 +415,17 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" +[[package]] +name = "chardetng" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea" +dependencies = [ + "cfg-if", + "encoding_rs", + "memchr", +] + [[package]] name = "clap" version = "4.6.0" @@ -476,6 +506,29 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" +[[package]] +name = "cssparser" +version = "0.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dae61cf9c0abb83bd659dab65b7e4e38d8236824c85f0f804f173567bda257d2" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa", + "phf 0.13.1", + "smallvec", +] + +[[package]] +name = "cssparser-macros" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" +dependencies = [ + "quote", + "syn 2.0.117", +] + [[package]] name = "deranged" version = "0.5.8" @@ -485,6 +538,27 @@ dependencies = [ "powerfmt", ] +[[package]] +name = "derive_more" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d751e9e49156b02b44f9c1815bcb94b984cdcc4396ecc32521c739452808b134" +dependencies = [ + "derive_more-impl", +] + +[[package]] +name = "derive_more-impl" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "799a97264921d8623a957f6c3b9011f3b5492f557bbb7a5a19b7fa6d06ba8dcb" +dependencies = [ + "proc-macro2", + "quote", + "rustc_version", + "syn 2.0.117", +] + [[package]] name = "difflib" version = "0.4.0" @@ -529,6 +603,21 @@ version = "0.15.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" +[[package]] +name = "dtoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590" + +[[package]] +name = "dtoa-short" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" +dependencies = [ + "dtoa", +] + [[package]] name = "either" version = "1.15.0" @@ -550,6 +639,15 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + [[package]] name = "equivalent" version = "1.0.2" @@ -593,6 +691,21 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "fast_html2md" +version = "0.0.58" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af3a0122fee1bcf6bb9f3d73782e911cce69d95b76a5e29e930af92cd4a8e4e3" +dependencies = [ + "auto_encoder", + "futures-util", + "lazy_static", + "lol_html", + "percent-encoding", + "regex", + "url", +] + [[package]] name = "fastrand" version = "2.3.0" @@ -617,6 +730,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -771,6 +890,11 @@ name = "hashbrown" version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] [[package]] name = "heck" @@ -1232,6 +1356,25 @@ dependencies = [ "value-bag", ] +[[package]] +name = "lol_html" +version = "2.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ff94cb6aef6ee52afd2c69331e9109906d855e82bd241f3110dfdf6185899ab" +dependencies = [ + "bitflags", + "cfg-if", + "cssparser", + "encoding_rs", + "foldhash", + "hashbrown", + "memchr", + "mime", + "precomputed-hash", + "selectors", + "thiserror 2.0.18", +] + [[package]] name = "lru-slab" version = "0.1.2" @@ -1244,6 +1387,12 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + [[package]] name = "mio" version = "1.1.1" @@ -1287,13 +1436,14 @@ checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] name = "paperdown" -version = "0.2.0" +version = "0.2.1-dev" dependencies = [ "anyhow", "assert_cmd", "base64 0.22.1", "clap", "dotenvy", + "fast_html2md", "futures", "httpmock", "indicatif", @@ -1352,6 +1502,93 @@ dependencies = [ "indexmap", ] +[[package]] +name = "phf" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +dependencies = [ + "phf_macros 0.11.3", + "phf_shared 0.11.3", +] + +[[package]] +name = "phf" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" +dependencies = [ + "phf_macros 0.13.1", + "phf_shared 0.13.1", + "serde", +] + +[[package]] +name = "phf_codegen" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", +] + +[[package]] +name = "phf_codegen" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1" +dependencies = [ + "phf_generator 0.13.1", + "phf_shared 0.13.1", +] + +[[package]] +name = "phf_generator" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +dependencies = [ + "phf_shared 0.11.3", + "rand 0.8.5", +] + +[[package]] +name = "phf_generator" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" +dependencies = [ + "fastrand", + "phf_shared 0.13.1", +] + +[[package]] +name = "phf_macros" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "phf_macros" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "812f032b54b1e759ccd5f8b6677695d5268c588701effba24601f6932f8269ef" +dependencies = [ + "phf_generator 0.13.1", + "phf_shared 0.13.1", + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "phf_shared" version = "0.11.3" @@ -1361,6 +1598,15 @@ dependencies = [ "siphasher", ] +[[package]] +name = "phf_shared" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266" +dependencies = [ + "siphasher", +] + [[package]] name = "pico-args" version = "0.5.0" @@ -1505,7 +1751,7 @@ dependencies = [ "bytes", "getrandom 0.3.4", "lru-slab", - "rand", + "rand 0.9.2", "ring", "rustc-hash", "rustls", @@ -1546,6 +1792,15 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "rand_core 0.6.4", +] + [[package]] name = "rand" version = "0.9.2" @@ -1553,7 +1808,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" dependencies = [ "rand_chacha", - "rand_core", + "rand_core 0.9.5", ] [[package]] @@ -1563,9 +1818,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.9.5", ] +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" + [[package]] name = "rand_core" version = "0.9.5" @@ -1685,6 +1946,15 @@ version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "1.1.4" @@ -1760,6 +2030,31 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "selectors" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "feef350c36147532e1b79ea5c1f3791373e61cbd9a6a2615413b3807bb164fb7" +dependencies = [ + "bitflags", + "cssparser", + "derive_more", + "log", + "new_debug_unreachable", + "phf 0.13.1", + "phf_codegen 0.13.1", + "precomputed-hash", + "rustc-hash", + "servo_arc", + "smallvec", +] + +[[package]] +name = "semver" +version = "1.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" + [[package]] name = "serde" version = "1.0.228" @@ -1825,6 +2120,15 @@ dependencies = [ "serde", ] +[[package]] +name = "servo_arc" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "170fb83ab34de17dc69aa7c67482b22218ddb85da56546f9bd6b929e32a05930" +dependencies = [ + "stable_deref_trait", +] + [[package]] name = "shlex" version = "1.3.0" @@ -1899,7 +2203,7 @@ checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f" dependencies = [ "new_debug_unreachable", "parking_lot", - "phf_shared", + "phf_shared 0.11.3", "precomputed-hash", ] diff --git a/Cargo.toml b/Cargo.toml index 298c33f..9923301 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "paperdown" -version = "0.2.0" +version = "0.2.1-dev" authors = ["Anatoly Tsyplenkov "] edition = "2024" description = "A fast CLI tool to batch convert PDFs into Markdown using GLM-OCR." @@ -33,6 +33,7 @@ futures = "0.3" indicatif = "0.17" dotenvy = "0.15" regex = "1" +fast_html2md = "0.0.58" reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "json", "stream"] } serde = { version = "1", features = ["derive"] } serde_json = "1" diff --git a/README.md b/README.md index fb9b3aa..eb09b97 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@

-`paperdown` converts research papers from PDF to Markdown using Z.AI's [GLM-OCR](https://github.com/zai-org/GLM-OCR) model and downloads referenced figure assets locally. +`paperdown` converts research papers from PDF to Markdown using Z.AI's [GLM-OCR](https://github.com/zai-org/GLM-OCR) model, rewrites common HTML into CommonMark Markdown, and downloads referenced figure assets locally. If you work with academic papers, you know that the OCR process itself is not the most difficult part. The real challenge is cleaning up the output. Tables can disappear, their structure can become jumbled, and formulas might be converted into meaningless text. This often means you spend more time correcting the output than working with it. @@ -28,6 +28,7 @@ Therefore, this project was created because, while [`docling`](https://github.co ## Features - Async OCR requests and batch PDF processing using the Z.AI API. +- Async HTML cleanup using `fast_html2md`. - Concurrent figure downloads for each PDF. - Fast processing: approximately 25 seconds per batch of 32 PDFs. Speed depends on the z.ai API availability. See the cost section for more details on spending. diff --git a/src/core.rs b/src/core.rs index be90d5d..bd6340d 100644 --- a/src/core.rs +++ b/src/core.rs @@ -82,7 +82,7 @@ pub async fn process_pdf( ) .await?; let figure_seconds = figure_started.elapsed(); - let markdown = markdown::strip_html_img_alt_attributes(&markdown); + let markdown = markdown::sanitize_html_fragments(markdown).await?; fire( &progress, @@ -220,8 +220,8 @@ pub mod testing { super::markdown::replace_image_urls(markdown, replacements) } - pub fn strip_html_img_alt_attributes(markdown: &str) -> String { - super::markdown::strip_html_img_alt_attributes(markdown) + pub async fn sanitize_html_fragments(markdown: String) -> Result { + super::markdown::sanitize_html_fragments(markdown).await } pub fn prepare_output_paths( diff --git a/src/core/markdown.rs b/src/core/markdown.rs index 37fa0c9..720f1b0 100644 --- a/src/core/markdown.rs +++ b/src/core/markdown.rs @@ -1,7 +1,54 @@ +use anyhow::Result; +use futures::stream::{self, StreamExt}; use regex::Regex; use std::collections::HashMap; use std::sync::LazyLock; +const HTML_FRAGMENT_CONCURRENCY: usize = 16; + +const HTML_ALLOWLIST_TAGS: &[&str] = &[ + "img", + "a", + "p", + "div", + "span", + "br", + "hr", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "ul", + "ol", + "li", + "blockquote", + "table", + "thead", + "tbody", + "tfoot", + "tr", + "th", + "td", + "strong", + "b", + "em", + "i", + "u", + "s", + "del", + "pre", + "code", +]; + +const HTML_EXCLUDED_TAGS: &[&str] = &["math", "sub", "sup"]; + +const HTML_VOID_TAGS: &[&str] = &[ + "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", + "track", "wbr", +]; + static MARKDOWN_IMAGE_URL_PATTERN: LazyLock = LazyLock::new(|| { Regex::new(r"\((https?://[^)\s]+)\)").expect("valid markdown image URL regex") }); @@ -10,10 +57,27 @@ static HTML_IMAGE_URL_PATTERN: LazyLock = LazyLock::new(|| { Regex::new(r#"(src\s*=\s*)(['"])(https?://[^'"]+)(['"])"#).expect("valid HTML image URL regex") }); -static HTML_IMAGE_ALT_PATTERN: LazyLock = LazyLock::new(|| { - Regex::new(r#"(?is)\s+alt(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s/>]+))?"#) - .expect("valid HTML image alt regex") -}); +#[derive(Debug)] +enum Segment { + Text(String), + Code(String), + Html { index: usize, raw: String }, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum HtmlTagKind { + Opening, + Closing, + Special, +} + +#[derive(Debug, Clone, Copy)] +struct ParsedHtmlTag<'a> { + name: &'a str, + kind: HtmlTagKind, + end: usize, + self_closing: bool, +} pub(crate) fn replace_image_urls(markdown: &str, replacements: &HashMap) -> String { let updated = MARKDOWN_IMAGE_URL_PATTERN @@ -42,7 +106,7 @@ pub(crate) fn replace_image_urls(markdown: &str, replacements: &HashMap String { +pub(crate) async fn sanitize_html_fragments(markdown: String) -> Result { let mut out = String::with_capacity(markdown.len()); let mut chunk_start = 0usize; let mut in_fence = false; @@ -68,7 +132,7 @@ pub(crate) fn strip_html_img_alt_attributes(markdown: &str) -> String { } if let Some((marker, len)) = fence_start(line) { - out.push_str(&sanitize_non_code_chunk(&markdown[chunk_start..i])); + out.push_str(&sanitize_non_code_chunk(&markdown[chunk_start..i]).await?); out.push_str(line); in_fence = true; fence_marker = marker; @@ -82,41 +146,141 @@ pub(crate) fn strip_html_img_alt_attributes(markdown: &str) -> String { } if !in_fence { - out.push_str(&sanitize_non_code_chunk(&markdown[chunk_start..])); + out.push_str(&sanitize_non_code_chunk(&markdown[chunk_start..]).await?); } - out + Ok(out) } -fn sanitize_non_code_chunk(chunk: &str) -> String { - let mut out = String::with_capacity(chunk.len()); +async fn sanitize_non_code_chunk(chunk: &str) -> Result { + let mut segments = Vec::new(); let mut i = 0usize; + let mut literal_start = 0usize; + let mut html_count = 0usize; while i < chunk.len() { if let Some(run_len) = backtick_run_len(chunk, i) && let Some(end) = find_matching_backtick_run(chunk, i + run_len, run_len) { - out.push_str(&chunk[i..end + run_len]); + if literal_start < i { + segments.push(Segment::Text(chunk[literal_start..i].to_string())); + } + segments.push(Segment::Code(chunk[i..end + run_len].to_string())); i = end + run_len; + literal_start = i; continue; } - if starts_html_img_tag(chunk, i) - && let Some(tag_end) = find_html_tag_end(chunk, i) - { - out.push_str(&HTML_IMAGE_ALT_PATTERN.replace_all(&chunk[i..tag_end], "")); + if let Some((tag_end, fragment)) = extract_html_fragment(chunk, i) { + if literal_start < i { + segments.push(Segment::Text(chunk[literal_start..i].to_string())); + } + segments.push(Segment::Html { + index: html_count, + raw: fragment, + }); + html_count += 1; i = tag_end; + literal_start = i; continue; } let ch = chunk[i..].chars().next().expect("valid char boundary"); - out.push(ch); i += ch.len_utf8(); } + if literal_start < chunk.len() { + segments.push(Segment::Text(chunk[literal_start..].to_string())); + } + + if html_count == 0 { + return Ok(join_segments(&segments, &[])); + } + + let html_fragments: Vec = segments + .iter() + .filter_map(|segment| match segment { + Segment::Html { raw, .. } => Some(raw.clone()), + _ => None, + }) + .collect(); + let converted = convert_html_fragments(html_fragments).await; + Ok(join_segments(&segments, &converted)) +} + +fn join_segments(segments: &[Segment], converted: &[Option]) -> String { + let mut out = String::with_capacity( + segments + .iter() + .map(|segment| match segment { + Segment::Text(text) | Segment::Code(text) => text.len(), + Segment::Html { raw, .. } => raw.len(), + }) + .sum(), + ); + + for segment in segments { + match segment { + Segment::Text(text) | Segment::Code(text) => out.push_str(text), + Segment::Html { index, raw } => { + if let Some(Some(rewritten)) = converted.get(*index) { + out.push_str(rewritten); + } else { + out.push_str(raw); + } + } + } + } + out } +fn extract_html_fragment(text: &str, start: usize) -> Option<(usize, String)> { + let tag = parse_html_tag(text, start)?; + if tag.kind != HtmlTagKind::Opening { + return None; + } + + if !is_html_allowlisted(tag.name) || is_html_excluded(tag.name) { + return None; + } + + let end = if tag.self_closing || is_html_void(tag.name) { + tag.end + } else { + find_html_region_end(text, start, tag.name)? + }; + let fragment = text[start..end].to_string(); + if contains_tex_delimiters(&fragment) || contains_excluded_math_tags(&fragment) { + return None; + } + + Some((end, fragment)) +} + +async fn convert_html_fragments(fragments: Vec) -> Vec> { + let converted = stream::iter(fragments.into_iter().enumerate().map( + |(index, fragment)| async move { + let converted = html2md::rewrite_html_streaming(&fragment, true).await; + let converted = if converted.trim().is_empty() { + None + } else { + Some(converted) + }; + (index, converted) + }, + )) + .buffer_unordered(HTML_FRAGMENT_CONCURRENCY) + .collect::>() + .await; + + let mut ordered = vec![None; converted.len()]; + for (index, item) in converted { + ordered[index] = item; + } + ordered +} + fn backtick_run_len(text: &str, start: usize) -> Option { let bytes = text.as_bytes(); if bytes.get(start) != Some(&b'`') { @@ -144,20 +308,118 @@ fn find_matching_backtick_run(text: &str, start: usize, run_len: usize) -> Optio None } -fn starts_html_img_tag(text: &str, start: usize) -> bool { +fn parse_html_tag(text: &str, start: usize) -> Option> { let bytes = text.as_bytes(); if bytes.get(start) != Some(&b'<') { - return false; + return None; } - let Some(prefix) = text.get(start + 1..start + 4) else { - return false; - }; - if !prefix.eq_ignore_ascii_case("img") { - return false; + if matches!(bytes.get(start + 1), Some(b'!') | Some(b'?')) { + let end = find_html_tag_end(text, start)?; + return Some(ParsedHtmlTag { + name: "", + kind: HtmlTagKind::Special, + end, + self_closing: false, + }); + } + + if bytes.get(start + 1) == Some(&b'/') { + let name_start = start + 2; + let name_end = name_end(text, name_start)?; + let end = find_html_tag_end(text, start)?; + return Some(ParsedHtmlTag { + name: &text[name_start..name_end], + kind: HtmlTagKind::Closing, + end, + self_closing: false, + }); } - !matches!(bytes.get(start + 4), Some(b) if b.is_ascii_alphanumeric() || *b == b'-') + let name_start = start + 1; + let name_end = name_end(text, name_start)?; + let end = find_html_tag_end(text, start)?; + let self_closing = is_self_closing_tag(text, start, end); + + Some(ParsedHtmlTag { + name: &text[name_start..name_end], + kind: HtmlTagKind::Opening, + end, + self_closing, + }) +} + +fn name_end(text: &str, start: usize) -> Option { + let bytes = text.as_bytes(); + let mut i = start; + + while i < bytes.len() { + let b = bytes[i]; + if b.is_ascii_alphanumeric() || matches!(b, b'-' | b':' | b'_') { + i += 1; + continue; + } + break; + } + + (i > start).then_some(i) +} + +fn is_self_closing_tag(text: &str, start: usize, end: usize) -> bool { + let bytes = text.as_bytes(); + let mut i = end.saturating_sub(1); + while i > start && bytes[i - 1].is_ascii_whitespace() { + i -= 1; + } + + bytes.get(i - 1) == Some(&b'/') +} + +fn find_html_region_end(text: &str, start: usize, root_tag: &str) -> Option { + let root = parse_html_tag(text, start)?; + if root.kind != HtmlTagKind::Opening { + return None; + } + + let mut stack = vec![root_tag.to_ascii_lowercase()]; + let mut i = root.end; + + while i < text.len() { + if text.as_bytes().get(i) == Some(&b'<') + && let Some(tag) = parse_html_tag(text, i) + { + match tag.kind { + HtmlTagKind::Special => { + i = tag.end; + continue; + } + HtmlTagKind::Closing => { + let current = stack.last()?; + if !current.eq_ignore_ascii_case(tag.name) { + return None; + } + stack.pop(); + i = tag.end; + if stack.is_empty() { + return Some(i); + } + continue; + } + HtmlTagKind::Opening => { + if !(tag.self_closing || is_html_void(tag.name)) { + stack.push(tag.name.to_ascii_lowercase()); + } + i = tag.end; + continue; + } + } + } + + let ch = text[i..].chars().next()?; + i += ch.len_utf8(); + } + + None } fn find_html_tag_end(text: &str, start: usize) -> Option { @@ -179,6 +441,77 @@ fn find_html_tag_end(text: &str, start: usize) -> Option { None } +fn is_html_allowlisted(tag: &str) -> bool { + HTML_ALLOWLIST_TAGS + .iter() + .any(|candidate| candidate.eq_ignore_ascii_case(tag)) +} + +fn is_html_excluded(tag: &str) -> bool { + HTML_EXCLUDED_TAGS + .iter() + .any(|candidate| candidate.eq_ignore_ascii_case(tag)) +} + +fn is_html_void(tag: &str) -> bool { + HTML_VOID_TAGS + .iter() + .any(|candidate| candidate.eq_ignore_ascii_case(tag)) +} + +fn contains_excluded_math_tags(fragment: &str) -> bool { + let mut i = 0usize; + + while i < fragment.len() { + if let Some(run_len) = backtick_run_len(fragment, i) + && let Some(end) = find_matching_backtick_run(fragment, i + run_len, run_len) + { + i = end + run_len; + continue; + } + + if let Some(tag) = parse_html_tag(fragment, i) + && matches!(tag.kind, HtmlTagKind::Opening | HtmlTagKind::Closing) + && is_html_excluded(tag.name) + { + return true; + } + + let ch = fragment[i..].chars().next().expect("valid char boundary"); + i += ch.len_utf8(); + } + + false +} + +fn contains_tex_delimiters(fragment: &str) -> bool { + let bytes = fragment.as_bytes(); + let mut i = 0usize; + let mut saw_dollar = false; + + while i < bytes.len() { + if bytes[i] == b'\\' { + i = i.saturating_add(2); + continue; + } + + if bytes[i] == b'$' { + if i + 1 < bytes.len() && bytes[i + 1] == b'$' { + return true; + } + + if saw_dollar { + return true; + } + saw_dollar = true; + } + + i += 1; + } + + false +} + fn fence_start(line: &str) -> Option<(char, usize)> { let trimmed = line.trim_start(); let mut chars = trimmed.chars(); diff --git a/tests/core_internal.rs b/tests/core_internal.rs index ec22d14..dca4c8e 100644 --- a/tests/core_internal.rs +++ b/tests/core_internal.rs @@ -7,7 +7,7 @@ use paperdown::core::testing::{ ProgressCallback, ProgressEvent, append_log, atomic_write_text, build_payload, content_type_to_suffix, extract_image_url, fire_for_test, is_http_url, load_api_key, prepare_output_paths, process_pdf, replace_image_urls, round3_for_test, - strip_html_img_alt_attributes, url_suffix, validate_layout_response, + sanitize_html_fragments, url_suffix, validate_layout_response, }; #[cfg(feature = "net-tests")] use paperdown::core::testing::{download_figure, localize_figures}; @@ -608,64 +608,102 @@ fn replace_image_urls_no_replacements_passthrough() { assert_eq!(updated, markdown); } +fn sanitize_html(markdown: &str) -> String { + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(sanitize_html_fragments(markdown.to_string())) + .unwrap() +} + #[test] -fn strip_html_img_alt_attributes_removes_alt_and_preserves_other_attrs() { - let markdown = "before OCR图片 after"; +fn sanitize_html_fragments_smoke() { + let markdown = "before OCR图片 after"; - let updated = strip_html_img_alt_attributes(markdown); + let updated = sanitize_html(markdown); - assert_eq!(updated, "before after"); + assert_eq!(updated, "before ![OCR图片](x.png) after"); } #[test] -fn strip_html_img_alt_attributes_handles_case_and_spacing() { - let markdown = ""; +fn sanitize_html_fragments_converts_common_html_regions() { + let markdown = "start

Hello world

mid

Title

end"; - let updated = strip_html_img_alt_attributes(markdown); + let updated = sanitize_html(markdown); - assert_eq!(updated, ""); + assert!(!updated.contains("\n```\n`OCR图片`" + "![OCR图片](https://x/a.png)\n```html\nOCR图片\n```\n`
x
`" ); } #[test] -fn strip_html_img_alt_attributes_removes_multiple_alt_attributes() { - let markdown = "one"; +fn sanitize_html_fragments_leaves_malformed_fragments_unchanged() { + let markdown = "before

broken after"; - let updated = strip_html_img_alt_attributes(markdown); + let updated = sanitize_html(markdown); - assert_eq!(updated, ""); + assert_eq!(updated, markdown); } #[test] -fn strip_html_img_alt_attributes_removes_boolean_and_unquoted_alt() { - let markdown = ""; +fn sanitize_html_fragments_keeps_nested_html_content_in_order() { + let markdown = "A

1 2 inside
B outside"; - let updated = strip_html_img_alt_attributes(markdown); + let updated = sanitize_html(markdown); - assert_eq!(updated, ""); + assert!(!updated.contains(""); +#[test] +fn sanitize_html_fragments_keeps_excluded_math_tags_raw() { + let markdown = "before
x + 2 y
after"; + + let updated = sanitize_html(markdown); + + assert_eq!(updated, markdown); } #[test]