diff --git a/Cargo.lock b/Cargo.lock
index e628d8c..76f7028 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -11,6 +11,12 @@ dependencies = [
"memchr",
]
+[[package]]
+name = "allocator-api2"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
+
[[package]]
name = "anstream"
version = "1.0.0"
@@ -288,6 +294,19 @@ version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
+[[package]]
+name = "auto_encoder"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f6364e11e0270035ec392151a54f1476e6b3612ef9f4fe09d35e72a8cebcb65"
+dependencies = [
+ "chardetng",
+ "encoding_rs",
+ "percent-encoding",
+ "phf 0.11.3",
+ "phf_codegen 0.11.3",
+]
+
[[package]]
name = "autocfg"
version = "1.5.0"
@@ -396,6 +415,17 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
+[[package]]
+name = "chardetng"
+version = "0.1.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea"
+dependencies = [
+ "cfg-if",
+ "encoding_rs",
+ "memchr",
+]
+
[[package]]
name = "clap"
version = "4.6.0"
@@ -476,6 +506,29 @@ version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
+[[package]]
+name = "cssparser"
+version = "0.36.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dae61cf9c0abb83bd659dab65b7e4e38d8236824c85f0f804f173567bda257d2"
+dependencies = [
+ "cssparser-macros",
+ "dtoa-short",
+ "itoa",
+ "phf 0.13.1",
+ "smallvec",
+]
+
+[[package]]
+name = "cssparser-macros"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
+dependencies = [
+ "quote",
+ "syn 2.0.117",
+]
+
[[package]]
name = "deranged"
version = "0.5.8"
@@ -485,6 +538,27 @@ dependencies = [
"powerfmt",
]
+[[package]]
+name = "derive_more"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d751e9e49156b02b44f9c1815bcb94b984cdcc4396ecc32521c739452808b134"
+dependencies = [
+ "derive_more-impl",
+]
+
+[[package]]
+name = "derive_more-impl"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "799a97264921d8623a957f6c3b9011f3b5492f557bbb7a5a19b7fa6d06ba8dcb"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "rustc_version",
+ "syn 2.0.117",
+]
+
[[package]]
name = "difflib"
version = "0.4.0"
@@ -529,6 +603,21 @@ version = "0.15.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b"
+[[package]]
+name = "dtoa"
+version = "1.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590"
+
+[[package]]
+name = "dtoa-short"
+version = "0.3.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87"
+dependencies = [
+ "dtoa",
+]
+
[[package]]
name = "either"
version = "1.15.0"
@@ -550,6 +639,15 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
+[[package]]
+name = "encoding_rs"
+version = "0.8.35"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
+dependencies = [
+ "cfg-if",
+]
+
[[package]]
name = "equivalent"
version = "1.0.2"
@@ -593,6 +691,21 @@ dependencies = [
"pin-project-lite",
]
+[[package]]
+name = "fast_html2md"
+version = "0.0.58"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af3a0122fee1bcf6bb9f3d73782e911cce69d95b76a5e29e930af92cd4a8e4e3"
+dependencies = [
+ "auto_encoder",
+ "futures-util",
+ "lazy_static",
+ "lol_html",
+ "percent-encoding",
+ "regex",
+ "url",
+]
+
[[package]]
name = "fastrand"
version = "2.3.0"
@@ -617,6 +730,12 @@ version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
+[[package]]
+name = "foldhash"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
+
[[package]]
name = "form_urlencoded"
version = "1.2.2"
@@ -771,6 +890,11 @@ name = "hashbrown"
version = "0.16.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
+dependencies = [
+ "allocator-api2",
+ "equivalent",
+ "foldhash",
+]
[[package]]
name = "heck"
@@ -1232,6 +1356,25 @@ dependencies = [
"value-bag",
]
+[[package]]
+name = "lol_html"
+version = "2.7.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ff94cb6aef6ee52afd2c69331e9109906d855e82bd241f3110dfdf6185899ab"
+dependencies = [
+ "bitflags",
+ "cfg-if",
+ "cssparser",
+ "encoding_rs",
+ "foldhash",
+ "hashbrown",
+ "memchr",
+ "mime",
+ "precomputed-hash",
+ "selectors",
+ "thiserror 2.0.18",
+]
+
[[package]]
name = "lru-slab"
version = "0.1.2"
@@ -1244,6 +1387,12 @@ version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
+[[package]]
+name = "mime"
+version = "0.3.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
+
[[package]]
name = "mio"
version = "1.1.1"
@@ -1287,13 +1436,14 @@ checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
[[package]]
name = "paperdown"
-version = "0.2.0"
+version = "0.2.1-dev"
dependencies = [
"anyhow",
"assert_cmd",
"base64 0.22.1",
"clap",
"dotenvy",
+ "fast_html2md",
"futures",
"httpmock",
"indicatif",
@@ -1352,6 +1502,93 @@ dependencies = [
"indexmap",
]
+[[package]]
+name = "phf"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
+dependencies = [
+ "phf_macros 0.11.3",
+ "phf_shared 0.11.3",
+]
+
+[[package]]
+name = "phf"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf"
+dependencies = [
+ "phf_macros 0.13.1",
+ "phf_shared 0.13.1",
+ "serde",
+]
+
+[[package]]
+name = "phf_codegen"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
+dependencies = [
+ "phf_generator 0.11.3",
+ "phf_shared 0.11.3",
+]
+
+[[package]]
+name = "phf_codegen"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1"
+dependencies = [
+ "phf_generator 0.13.1",
+ "phf_shared 0.13.1",
+]
+
+[[package]]
+name = "phf_generator"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
+dependencies = [
+ "phf_shared 0.11.3",
+ "rand 0.8.5",
+]
+
+[[package]]
+name = "phf_generator"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737"
+dependencies = [
+ "fastrand",
+ "phf_shared 0.13.1",
+]
+
+[[package]]
+name = "phf_macros"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216"
+dependencies = [
+ "phf_generator 0.11.3",
+ "phf_shared 0.11.3",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "phf_macros"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "812f032b54b1e759ccd5f8b6677695d5268c588701effba24601f6932f8269ef"
+dependencies = [
+ "phf_generator 0.13.1",
+ "phf_shared 0.13.1",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
[[package]]
name = "phf_shared"
version = "0.11.3"
@@ -1361,6 +1598,15 @@ dependencies = [
"siphasher",
]
+[[package]]
+name = "phf_shared"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266"
+dependencies = [
+ "siphasher",
+]
+
[[package]]
name = "pico-args"
version = "0.5.0"
@@ -1505,7 +1751,7 @@ dependencies = [
"bytes",
"getrandom 0.3.4",
"lru-slab",
- "rand",
+ "rand 0.9.2",
"ring",
"rustc-hash",
"rustls",
@@ -1546,6 +1792,15 @@ version = "5.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
+[[package]]
+name = "rand"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
+dependencies = [
+ "rand_core 0.6.4",
+]
+
[[package]]
name = "rand"
version = "0.9.2"
@@ -1553,7 +1808,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1"
dependencies = [
"rand_chacha",
- "rand_core",
+ "rand_core 0.9.5",
]
[[package]]
@@ -1563,9 +1818,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
dependencies = [
"ppv-lite86",
- "rand_core",
+ "rand_core 0.9.5",
]
+[[package]]
+name = "rand_core"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
+
[[package]]
name = "rand_core"
version = "0.9.5"
@@ -1685,6 +1946,15 @@ version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
+[[package]]
+name = "rustc_version"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
+dependencies = [
+ "semver",
+]
+
[[package]]
name = "rustix"
version = "1.1.4"
@@ -1760,6 +2030,31 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
+[[package]]
+name = "selectors"
+version = "0.33.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "feef350c36147532e1b79ea5c1f3791373e61cbd9a6a2615413b3807bb164fb7"
+dependencies = [
+ "bitflags",
+ "cssparser",
+ "derive_more",
+ "log",
+ "new_debug_unreachable",
+ "phf 0.13.1",
+ "phf_codegen 0.13.1",
+ "precomputed-hash",
+ "rustc-hash",
+ "servo_arc",
+ "smallvec",
+]
+
+[[package]]
+name = "semver"
+version = "1.0.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2"
+
[[package]]
name = "serde"
version = "1.0.228"
@@ -1825,6 +2120,15 @@ dependencies = [
"serde",
]
+[[package]]
+name = "servo_arc"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "170fb83ab34de17dc69aa7c67482b22218ddb85da56546f9bd6b929e32a05930"
+dependencies = [
+ "stable_deref_trait",
+]
+
[[package]]
name = "shlex"
version = "1.3.0"
@@ -1899,7 +2203,7 @@ checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f"
dependencies = [
"new_debug_unreachable",
"parking_lot",
- "phf_shared",
+ "phf_shared 0.11.3",
"precomputed-hash",
]
diff --git a/Cargo.toml b/Cargo.toml
index 298c33f..9923301 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "paperdown"
-version = "0.2.0"
+version = "0.2.1-dev"
authors = ["Anatoly Tsyplenkov "]
edition = "2024"
description = "A fast CLI tool to batch convert PDFs into Markdown using GLM-OCR."
@@ -33,6 +33,7 @@ futures = "0.3"
indicatif = "0.17"
dotenvy = "0.15"
regex = "1"
+fast_html2md = "0.0.58"
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "json", "stream"] }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
diff --git a/README.md b/README.md
index fb9b3aa..eb09b97 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@
-`paperdown` converts research papers from PDF to Markdown using Z.AI's [GLM-OCR](https://github.com/zai-org/GLM-OCR) model and downloads referenced figure assets locally.
+`paperdown` converts research papers from PDF to Markdown using Z.AI's [GLM-OCR](https://github.com/zai-org/GLM-OCR) model, rewrites common HTML into CommonMark Markdown, and downloads referenced figure assets locally.
If you work with academic papers, you know that the OCR process itself is not the most difficult part. The real challenge is cleaning up the output. Tables can disappear, their structure can become jumbled, and formulas might be converted into meaningless text. This often means you spend more time correcting the output than working with it.
@@ -28,6 +28,7 @@ Therefore, this project was created because, while [`docling`](https://github.co
## Features
- Async OCR requests and batch PDF processing using the Z.AI API.
+- Async conversion of common HTML fragments to Markdown using `fast_html2md`.
- Concurrent figure downloads for each PDF.
- Fast processing: approximately 25 seconds per batch of 32 PDFs. Speed depends on the z.ai API availability. See the cost section for more details on spending.
diff --git a/src/core.rs b/src/core.rs
index be90d5d..bd6340d 100644
--- a/src/core.rs
+++ b/src/core.rs
@@ -82,7 +82,7 @@ pub async fn process_pdf(
)
.await?;
let figure_seconds = figure_started.elapsed();
- let markdown = markdown::strip_html_img_alt_attributes(&markdown);
+ let markdown = markdown::sanitize_html_fragments(markdown).await?;
fire(
&progress,
@@ -220,8 +220,8 @@ pub mod testing {
super::markdown::replace_image_urls(markdown, replacements)
}
- pub fn strip_html_img_alt_attributes(markdown: &str) -> String {
- super::markdown::strip_html_img_alt_attributes(markdown)
+ pub async fn sanitize_html_fragments(markdown: String) -> Result { // thin passthrough exposing the pub(crate) markdown helper via the `testing` module
+ super::markdown::sanitize_html_fragments(markdown).await
}
pub fn prepare_output_paths(
diff --git a/src/core/markdown.rs b/src/core/markdown.rs
index 37fa0c9..720f1b0 100644
--- a/src/core/markdown.rs
+++ b/src/core/markdown.rs
@@ -1,7 +1,54 @@
+use anyhow::Result;
+use futures::stream::{self, StreamExt};
use regex::Regex;
use std::collections::HashMap;
use std::sync::LazyLock;
+const HTML_FRAGMENT_CONCURRENCY: usize = 16; // NOTE(review): presumably caps in-flight fragment conversions; its use is not visible in this hunk — confirm
+
+const HTML_ALLOWLIST_TAGS: &[&str] = &[ // tag names that may open a convertible fragment; presumably read by is_html_allowlisted (not shown) — confirm
+ "img",
+ "a",
+ "p",
+ "div",
+ "span",
+ "br",
+ "hr",
+ "h1",
+ "h2",
+ "h3",
+ "h4",
+ "h5",
+ "h6",
+ "ul",
+ "ol",
+ "li",
+ "blockquote",
+ "table",
+ "thead",
+ "tbody",
+ "tfoot",
+ "tr",
+ "th",
+ "td",
+ "strong",
+ "b",
+ "em",
+ "i",
+ "u",
+ "s",
+ "del",
+ "pre",
+ "code",
+];
+
+const HTML_EXCLUDED_TAGS: &[&str] = &["math", "sub", "sup"]; // NOTE(review): presumably the tags is_html_excluded / contains_excluded_math_tags reject (helpers not shown) — confirm
+
+const HTML_VOID_TAGS: &[&str] = &[ // HTML void elements (never take a closing tag); presumably read by is_html_void (not shown) — confirm
+ "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source",
+ "track", "wbr",
+];
+
static MARKDOWN_IMAGE_URL_PATTERN: LazyLock = LazyLock::new(|| {
Regex::new(r"\((https?://[^)\s]+)\)").expect("valid markdown image URL regex")
});
@@ -10,10 +57,27 @@ static HTML_IMAGE_URL_PATTERN: LazyLock = LazyLock::new(|| {
Regex::new(r#"(src\s*=\s*)(['"])(https?://[^'"]+)(['"])"#).expect("valid HTML image URL regex")
});
-static HTML_IMAGE_ALT_PATTERN: LazyLock = LazyLock::new(|| {
- Regex::new(r#"(?is)\s+alt(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s/>]+))?"#)
- .expect("valid HTML image alt regex")
-});
+#[derive(Debug)]
+enum Segment { // one piece of a chunk; sanitize_non_code_chunk pushes these in source order
+ Text(String), // plain text; join_segments copies it through unchanged
+ Code(String), // span from an opening backtick run through its matching run; copied through unchanged
+ Html { index: usize, raw: String }, // raw: original HTML (also the fallback output); index: position among Html segments, used to look up its conversion
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum HtmlTagKind { // tag classification; extract_html_fragment only accepts Opening
+ Opening,
+ Closing,
+ Special, // NOTE(review): presumably comment/doctype-like markup — the parser is not shown, confirm
+}
+
+#[derive(Debug, Clone, Copy)]
+struct ParsedHtmlTag<'a> { // fields extract_html_fragment reads from the result of parse_html_tag
+ name: &'a str, // tag name; passed to is_html_allowlisted / is_html_excluded / is_html_void
+ kind: HtmlTagKind,
+ end: usize, // byte offset used as the exclusive fragment end for void or self-closing tags
+ self_closing: bool, // NOTE(review): presumably set for `<tag ... />` syntax — confirm in the parser
+}
pub(crate) fn replace_image_urls(markdown: &str, replacements: &HashMap) -> String {
let updated = MARKDOWN_IMAGE_URL_PATTERN
@@ -42,7 +106,7 @@ pub(crate) fn replace_image_urls(markdown: &str, replacements: &HashMap String {
+pub(crate) async fn sanitize_html_fragments(markdown: String) -> Result {
let mut out = String::with_capacity(markdown.len());
let mut chunk_start = 0usize;
let mut in_fence = false;
@@ -68,7 +132,7 @@ pub(crate) fn strip_html_img_alt_attributes(markdown: &str) -> String {
}
if let Some((marker, len)) = fence_start(line) {
- out.push_str(&sanitize_non_code_chunk(&markdown[chunk_start..i]));
+ out.push_str(&sanitize_non_code_chunk(&markdown[chunk_start..i]).await?);
out.push_str(line);
in_fence = true;
fence_marker = marker;
@@ -82,41 +146,141 @@ pub(crate) fn strip_html_img_alt_attributes(markdown: &str) -> String {
}
if !in_fence {
- out.push_str(&sanitize_non_code_chunk(&markdown[chunk_start..]));
+ out.push_str(&sanitize_non_code_chunk(&markdown[chunk_start..]).await?);
}
- out
+ Ok(out)
}
-fn sanitize_non_code_chunk(chunk: &str) -> String {
- let mut out = String::with_capacity(chunk.len());
+async fn sanitize_non_code_chunk(chunk: &str) -> Result { // splits a chunk into text / inline-code / HTML segments, converts the HTML ones, then re-joins everything in order
+ let mut segments = Vec::new();
let mut i = 0usize;
+ let mut literal_start = 0usize; // start of the pending plain-text run not yet pushed as a segment
+ let mut html_count = 0usize; // Html segments pushed so far; doubles as the next segment's index
while i < chunk.len() {
if let Some(run_len) = backtick_run_len(chunk, i)
&& let Some(end) = find_matching_backtick_run(chunk, i + run_len, run_len)
{
- out.push_str(&chunk[i..end + run_len]);
+ if literal_start < i { // flush the plain text that precedes this code span
+ segments.push(Segment::Text(chunk[literal_start..i].to_string()));
+ }
+ segments.push(Segment::Code(chunk[i..end + run_len].to_string()));
i = end + run_len;
+ literal_start = i;
continue;
}
- if starts_html_img_tag(chunk, i)
- && let Some(tag_end) = find_html_tag_end(chunk, i)
- {
- out.push_str(&HTML_IMAGE_ALT_PATTERN.replace_all(&chunk[i..tag_end], ""));
+ if let Some((tag_end, fragment)) = extract_html_fragment(chunk, i) {
+ if literal_start < i { // flush the plain text that precedes this HTML fragment
+ segments.push(Segment::Text(chunk[literal_start..i].to_string()));
+ }
+ segments.push(Segment::Html {
+ index: html_count,
+ raw: fragment,
+ });
+ html_count += 1;
i = tag_end;
+ literal_start = i;
continue;
}
let ch = chunk[i..].chars().next().expect("valid char boundary");
- out.push(ch);
i += ch.len_utf8();
}
+ if literal_start < chunk.len() { // trailing plain text after the last code/HTML segment
+ segments.push(Segment::Text(chunk[literal_start..].to_string()));
+ }
+
+ if html_count == 0 { // nothing to convert: re-join without calling the converter
+ return Ok(join_segments(&segments, &[]));
+ }
+
+ let html_fragments: Vec = segments
+ .iter()
+ .filter_map(|segment| match segment {
+ Segment::Html { raw, .. } => Some(raw.clone()), // cloned because `segments` keeps the raw HTML as the fallback output
+ _ => None,
+ })
+ .collect();
+ let converted = convert_html_fragments(html_fragments).await; // join_segments looks results up by each Html segment's index
+ Ok(join_segments(&segments, &converted))
+}
+
+fn join_segments(segments: &[Segment], converted: &[Option]) -> String { // concatenates segments in order, using converted[index] for an Html segment when that entry is Some
+ let mut out = String::with_capacity( // pre-sized from the original lengths; only an estimate once HTML is rewritten
+ segments
+ .iter()
+ .map(|segment| match segment {
+ Segment::Text(text) | Segment::Code(text) => text.len(),
+ Segment::Html { raw, .. } => raw.len(),
+ })
+ .sum(),
+ );
+
+ for segment in segments {
+ match segment {
+ Segment::Text(text) | Segment::Code(text) => out.push_str(text),
+ Segment::Html { index, raw } => {
+ if let Some(Some(rewritten)) = converted.get(*index) {
+ out.push_str(rewritten);
+ } else { // entry missing or None: keep the original HTML verbatim
+ out.push_str(raw);
+ }
+ }
+ }
+ }
+
out
}
+fn extract_html_fragment(text: &str, start: usize) -> Option<(usize, String)> { // Some((end byte offset, fragment text)) when a convertible fragment starts at `start`; None lets the caller treat the position as plain text
+ let tag = parse_html_tag(text, start)?;
+ if tag.kind != HtmlTagKind::Opening { // closing and special tags never start a fragment
+ return None;
+ }
+
+ if !is_html_allowlisted(tag.name) || is_html_excluded(tag.name) {
+ return None;
+ }
+
+ let end = if tag.self_closing || is_html_void(tag.name) { // no element body: the fragment is just the tag itself
+ tag.end
+ } else {
+ find_html_region_end(text, start, tag.name)? // NOTE(review): presumably locates the matching closing tag; a None result propagates and leaves the text untouched
+ };
+ let fragment = text[start..end].to_string();
+ if contains_tex_delimiters(&fragment) || contains_excluded_math_tags(&fragment) { // such fragments are rejected here and stay as raw text
+ return None;
+ }
+
+ Some((end, fragment))
+}
+
+async fn convert_html_fragments(fragments: Vec) -> Vec