From 7f9d0a95b9b6e2059965bcec19ed3f48ad5a0aca Mon Sep 17 00:00:00 2001
From: atsyplenkov <sharp.couch9400@fastmail.com>
Date: Thu, 19 Mar 2026 13:32:48 +0300
Subject: [PATCH 1/7] feat: sanitize html output with fast_html2md

---
 Cargo.lock             | 312 ++++++++++++++++++++++++++++++++++++++++-
 Cargo.toml             |   1 +
 README.md              |   3 +-
 src/core.rs            |   6 +-
 src/core/markdown.rs   | 188 ++++++++++++++++++++++---
 tests/core_internal.rs |  70 +++++----
 6 files changed, 524 insertions(+), 56 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index e628d8c..f41113a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -11,6 +11,12 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "allocator-api2"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
+
 [[package]]
 name = "anstream"
 version = "1.0.0"
@@ -288,6 +294,19 @@ version = "1.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
 
+[[package]]
+name = "auto_encoder"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f6364e11e0270035ec392151a54f1476e6b3612ef9f4fe09d35e72a8cebcb65"
+dependencies = [
+ "chardetng",
+ "encoding_rs",
+ "percent-encoding",
+ "phf 0.11.3",
+ "phf_codegen 0.11.3",
+]
+
 [[package]]
 name = "autocfg"
 version = "1.5.0"
@@ -396,6 +415,17 @@ version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
 
+[[package]]
+name = "chardetng"
+version = "0.1.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea"
+dependencies = [
+ "cfg-if",
+ "encoding_rs",
+ "memchr",
+]
+
 [[package]]
 name = "clap"
 version = "4.6.0"
@@ -476,6 +506,29 @@ version = "0.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
 
+[[package]]
+name = "cssparser"
+version = "0.36.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dae61cf9c0abb83bd659dab65b7e4e38d8236824c85f0f804f173567bda257d2"
+dependencies = [
+ "cssparser-macros",
+ "dtoa-short",
+ "itoa",
+ "phf 0.13.1",
+ "smallvec",
+]
+
+[[package]]
+name = "cssparser-macros"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
+dependencies = [
+ "quote",
+ "syn 2.0.117",
+]
+
 [[package]]
 name = "deranged"
 version = "0.5.8"
@@ -485,6 +538,27 @@ dependencies = [
  "powerfmt",
 ]
 
+[[package]]
+name = "derive_more"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d751e9e49156b02b44f9c1815bcb94b984cdcc4396ecc32521c739452808b134"
+dependencies = [
+ "derive_more-impl",
+]
+
+[[package]]
+name = "derive_more-impl"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "799a97264921d8623a957f6c3b9011f3b5492f557bbb7a5a19b7fa6d06ba8dcb"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "rustc_version",
+ "syn 2.0.117",
+]
+
 [[package]]
 name = "difflib"
 version = "0.4.0"
@@ -529,6 +603,21 @@ version = "0.15.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b"
 
+[[package]]
+name = "dtoa"
+version = "1.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590"
+
+[[package]]
+name = "dtoa-short"
+version = "0.3.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87"
+dependencies = [
+ "dtoa",
+]
+
 [[package]]
 name = "either"
 version = "1.15.0"
@@ -550,6 +639,15 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
 
+[[package]]
+name = "encoding_rs"
+version = "0.8.35"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
+dependencies = [
+ "cfg-if",
+]
+
 [[package]]
 name = "equivalent"
 version = "1.0.2"
@@ -593,6 +691,21 @@ dependencies = [
  "pin-project-lite",
 ]
 
+[[package]]
+name = "fast_html2md"
+version = "0.0.58"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af3a0122fee1bcf6bb9f3d73782e911cce69d95b76a5e29e930af92cd4a8e4e3"
+dependencies = [
+ "auto_encoder",
+ "futures-util",
+ "lazy_static",
+ "lol_html",
+ "percent-encoding",
+ "regex",
+ "url",
+]
+
 [[package]]
 name = "fastrand"
 version = "2.3.0"
@@ -617,6 +730,12 @@ version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
 
+[[package]]
+name = "foldhash"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
+
 [[package]]
 name = "form_urlencoded"
 version = "1.2.2"
@@ -771,6 +890,11 @@ name = "hashbrown"
 version = "0.16.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
+dependencies = [
+ "allocator-api2",
+ "equivalent",
+ "foldhash",
+]
 
 [[package]]
 name = "heck"
@@ -1232,6 +1356,25 @@ dependencies = [
  "value-bag",
 ]
 
+[[package]]
+name = "lol_html"
+version = "2.7.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ff94cb6aef6ee52afd2c69331e9109906d855e82bd241f3110dfdf6185899ab"
+dependencies = [
+ "bitflags",
+ "cfg-if",
+ "cssparser",
+ "encoding_rs",
+ "foldhash",
+ "hashbrown",
+ "memchr",
+ "mime",
+ "precomputed-hash",
+ "selectors",
+ "thiserror 2.0.18",
+]
+
 [[package]]
 name = "lru-slab"
 version = "0.1.2"
@@ -1244,6 +1387,12 @@ version = "2.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
 
+[[package]]
+name = "mime"
+version = "0.3.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
+
 [[package]]
 name = "mio"
 version = "1.1.1"
@@ -1294,6 +1443,7 @@ dependencies = [
  "base64 0.22.1",
  "clap",
  "dotenvy",
+ "fast_html2md",
  "futures",
  "httpmock",
  "indicatif",
@@ -1352,6 +1502,93 @@ dependencies = [
  "indexmap",
 ]
 
+[[package]]
+name = "phf"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
+dependencies = [
+ "phf_macros 0.11.3",
+ "phf_shared 0.11.3",
+]
+
+[[package]]
+name = "phf"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf"
+dependencies = [
+ "phf_macros 0.13.1",
+ "phf_shared 0.13.1",
+ "serde",
+]
+
+[[package]]
+name = "phf_codegen"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
+dependencies = [
+ "phf_generator 0.11.3",
+ "phf_shared 0.11.3",
+]
+
+[[package]]
+name = "phf_codegen"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1"
+dependencies = [
+ "phf_generator 0.13.1",
+ "phf_shared 0.13.1",
+]
+
+[[package]]
+name = "phf_generator"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
+dependencies = [
+ "phf_shared 0.11.3",
+ "rand 0.8.5",
+]
+
+[[package]]
+name = "phf_generator"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737"
+dependencies = [
+ "fastrand",
+ "phf_shared 0.13.1",
+]
+
+[[package]]
+name = "phf_macros"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216"
+dependencies = [
+ "phf_generator 0.11.3",
+ "phf_shared 0.11.3",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "phf_macros"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "812f032b54b1e759ccd5f8b6677695d5268c588701effba24601f6932f8269ef"
+dependencies = [
+ "phf_generator 0.13.1",
+ "phf_shared 0.13.1",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
 [[package]]
 name = "phf_shared"
 version = "0.11.3"
@@ -1361,6 +1598,15 @@ dependencies = [
  "siphasher",
 ]
 
+[[package]]
+name = "phf_shared"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266"
+dependencies = [
+ "siphasher",
+]
+
 [[package]]
 name = "pico-args"
 version = "0.5.0"
@@ -1505,7 +1751,7 @@ dependencies = [
  "bytes",
  "getrandom 0.3.4",
  "lru-slab",
- "rand",
+ "rand 0.9.2",
  "ring",
  "rustc-hash",
  "rustls",
@@ -1546,6 +1792,15 @@ version = "5.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
 
+[[package]]
+name = "rand"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
+dependencies = [
+ "rand_core 0.6.4",
+]
+
 [[package]]
 name = "rand"
 version = "0.9.2"
@@ -1553,7 +1808,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1"
 dependencies = [
  "rand_chacha",
- "rand_core",
+ "rand_core 0.9.5",
 ]
 
 [[package]]
@@ -1563,9 +1818,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
 dependencies = [
  "ppv-lite86",
- "rand_core",
+ "rand_core 0.9.5",
 ]
 
+[[package]]
+name = "rand_core"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
+
 [[package]]
 name = "rand_core"
 version = "0.9.5"
@@ -1685,6 +1946,15 @@ version = "2.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
 
+[[package]]
+name = "rustc_version"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
+dependencies = [
+ "semver",
+]
+
 [[package]]
 name = "rustix"
 version = "1.1.4"
@@ -1760,6 +2030,31 @@ version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
 
+[[package]]
+name = "selectors"
+version = "0.33.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "feef350c36147532e1b79ea5c1f3791373e61cbd9a6a2615413b3807bb164fb7"
+dependencies = [
+ "bitflags",
+ "cssparser",
+ "derive_more",
+ "log",
+ "new_debug_unreachable",
+ "phf 0.13.1",
+ "phf_codegen 0.13.1",
+ "precomputed-hash",
+ "rustc-hash",
+ "servo_arc",
+ "smallvec",
+]
+
+[[package]]
+name = "semver"
+version = "1.0.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2"
+
 [[package]]
 name = "serde"
 version = "1.0.228"
@@ -1825,6 +2120,15 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "servo_arc"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "170fb83ab34de17dc69aa7c67482b22218ddb85da56546f9bd6b929e32a05930"
+dependencies = [
+ "stable_deref_trait",
+]
+
 [[package]]
 name = "shlex"
 version = "1.3.0"
@@ -1899,7 +2203,7 @@ checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f"
 dependencies = [
  "new_debug_unreachable",
  "parking_lot",
- "phf_shared",
+ "phf_shared 0.11.3",
  "precomputed-hash",
 ]
 
diff --git a/Cargo.toml b/Cargo.toml
index 298c33f..c00f3db 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -33,6 +33,7 @@ futures = "0.3"
 indicatif = "0.17"
 dotenvy = "0.15"
 regex = "1"
+fast_html2md = "0.0.58"
 reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "json", "stream"] }
 serde = { version = "1", features = ["derive"] }
 serde_json = "1"
diff --git a/README.md b/README.md
index fb9b3aa..443ac31 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@
     <br>
 </p>
 
-`paperdown` converts research papers from PDF to Markdown using Z.AI's [GLM-OCR](https://github.com/zai-org/GLM-OCR) model and downloads referenced figure assets locally.
+`paperdown` converts research papers from PDF to Markdown using Z.AI's [GLM-OCR](https://github.com/zai-org/GLM-OCR) model, rewrites HTML tables and images into Markdown, and downloads referenced figure assets locally.
 
 If you work with academic papers, you know that the OCR process itself is not the most difficult part. The real challenge is cleaning up the output. Tables can disappear, their structure can become jumbled, and formulas might be converted into meaningless text. This often means you spend more time correcting the output than working with it.
 
@@ -28,6 +28,7 @@ Therefore, this project was created because, while [`docling`](https://github.co
 ## Features
 
 - Async OCR requests and batch PDF processing using the Z.AI API.
+- Async HTML table and image cleanup using `fast_html2md`.
 - Concurrent figure downloads for each PDF.
 - Fast processing: approximately 25 seconds per batch of 32 PDFs. Speed depends on the z.ai API availability. See the cost section for more details on spending.
 
diff --git a/src/core.rs b/src/core.rs
index be90d5d..bd6340d 100644
--- a/src/core.rs
+++ b/src/core.rs
@@ -82,7 +82,7 @@ pub async fn process_pdf(
         )
         .await?;
     let figure_seconds = figure_started.elapsed();
-    let markdown = markdown::strip_html_img_alt_attributes(&markdown);
+    let markdown = markdown::sanitize_html_fragments(markdown).await?;
 
     fire(
         &progress,
@@ -220,8 +220,8 @@ pub mod testing {
         super::markdown::replace_image_urls(markdown, replacements)
     }
 
-    pub fn strip_html_img_alt_attributes(markdown: &str) -> String {
-        super::markdown::strip_html_img_alt_attributes(markdown)
+    pub async fn sanitize_html_fragments(markdown: String) -> Result<String> {
+        super::markdown::sanitize_html_fragments(markdown).await
     }
 
     pub fn prepare_output_paths(
diff --git a/src/core/markdown.rs b/src/core/markdown.rs
index 37fa0c9..d6691bd 100644
--- a/src/core/markdown.rs
+++ b/src/core/markdown.rs
@@ -1,7 +1,11 @@
+use anyhow::Result;
+use futures::stream::{self, StreamExt};
 use regex::Regex;
 use std::collections::HashMap;
 use std::sync::LazyLock;
 
+const HTML_FRAGMENT_CONCURRENCY: usize = 16;
+
 static MARKDOWN_IMAGE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
     Regex::new(r"\((https?://[^)\s]+)\)").expect("valid markdown image URL regex")
 });
@@ -10,10 +14,12 @@ static HTML_IMAGE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
     Regex::new(r#"(src\s*=\s*)(['"])(https?://[^'"]+)(['"])"#).expect("valid HTML image URL regex")
 });
 
-static HTML_IMAGE_ALT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
-    Regex::new(r#"(?is)\s+alt(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s/>]+))?"#)
-        .expect("valid HTML image alt regex")
-});
+#[derive(Debug)]
+enum Segment {
+    Text(String),
+    Code(String),
+    Html { index: usize, raw: String },
+}
 
 pub(crate) fn replace_image_urls(markdown: &str, replacements: &HashMap<String, String>) -> String {
     let updated = MARKDOWN_IMAGE_URL_PATTERN
@@ -42,7 +48,7 @@ pub(crate) fn replace_image_urls(markdown: &str, replacements: &HashMap<String,
         .into_owned()
 }
 
-pub(crate) fn strip_html_img_alt_attributes(markdown: &str) -> String {
+pub(crate) async fn sanitize_html_fragments(markdown: String) -> Result<String> {
     let mut out = String::with_capacity(markdown.len());
     let mut chunk_start = 0usize;
     let mut in_fence = false;
@@ -68,7 +74,7 @@ pub(crate) fn strip_html_img_alt_attributes(markdown: &str) -> String {
         }
 
         if let Some((marker, len)) = fence_start(line) {
-            out.push_str(&sanitize_non_code_chunk(&markdown[chunk_start..i]));
+            out.push_str(&sanitize_non_code_chunk(&markdown[chunk_start..i]).await?);
             out.push_str(line);
             in_fence = true;
             fence_marker = marker;
@@ -82,41 +88,132 @@ pub(crate) fn strip_html_img_alt_attributes(markdown: &str) -> String {
     }
 
     if !in_fence {
-        out.push_str(&sanitize_non_code_chunk(&markdown[chunk_start..]));
+        out.push_str(&sanitize_non_code_chunk(&markdown[chunk_start..]).await?);
     }
 
-    out
+    Ok(out)
 }
 
-fn sanitize_non_code_chunk(chunk: &str) -> String {
-    let mut out = String::with_capacity(chunk.len());
+async fn sanitize_non_code_chunk(chunk: &str) -> Result<String> {
+    let mut segments = Vec::new();
     let mut i = 0usize;
+    let mut literal_start = 0usize;
+    let mut html_count = 0usize;
 
     while i < chunk.len() {
         if let Some(run_len) = backtick_run_len(chunk, i)
             && let Some(end) = find_matching_backtick_run(chunk, i + run_len, run_len)
         {
-            out.push_str(&chunk[i..end + run_len]);
+            if literal_start < i {
+                segments.push(Segment::Text(chunk[literal_start..i].to_string()));
+            }
+            segments.push(Segment::Code(chunk[i..end + run_len].to_string()));
             i = end + run_len;
+            literal_start = i;
             continue;
         }
 
-        if starts_html_img_tag(chunk, i)
-            && let Some(tag_end) = find_html_tag_end(chunk, i)
-        {
-            out.push_str(&HTML_IMAGE_ALT_PATTERN.replace_all(&chunk[i..tag_end], ""));
+        if let Some((tag_end, fragment)) = extract_html_fragment(chunk, i) {
+            if literal_start < i {
+                segments.push(Segment::Text(chunk[literal_start..i].to_string()));
+            }
+            segments.push(Segment::Html {
+                index: html_count,
+                raw: fragment,
+            });
+            html_count += 1;
             i = tag_end;
+            literal_start = i;
             continue;
         }
 
         let ch = chunk[i..].chars().next().expect("valid char boundary");
-        out.push(ch);
         i += ch.len_utf8();
     }
 
+    if literal_start < chunk.len() {
+        segments.push(Segment::Text(chunk[literal_start..].to_string()));
+    }
+
+    if html_count == 0 {
+        return Ok(join_segments(&segments, &[]));
+    }
+
+    let html_fragments: Vec<String> = segments
+        .iter()
+        .filter_map(|segment| match segment {
+            Segment::Html { raw, .. } => Some(raw.clone()),
+            _ => None,
+        })
+        .collect();
+    let converted = convert_html_fragments(html_fragments).await;
+    Ok(join_segments(&segments, &converted))
+}
+
+fn join_segments(segments: &[Segment], converted: &[Option<String>]) -> String {
+    let mut out = String::with_capacity(
+        segments
+            .iter()
+            .map(|segment| match segment {
+                Segment::Text(text) | Segment::Code(text) => text.len(),
+                Segment::Html { raw, .. } => raw.len(),
+            })
+            .sum(),
+    );
+
+    for segment in segments {
+        match segment {
+            Segment::Text(text) | Segment::Code(text) => out.push_str(text),
+            Segment::Html { index, raw } => {
+                if let Some(Some(rewritten)) = converted.get(*index) {
+                    out.push_str(rewritten);
+                } else {
+                    out.push_str(raw);
+                }
+            }
+        }
+    }
+
     out
 }
 
+fn extract_html_fragment(text: &str, start: usize) -> Option<(usize, String)> {
+    if starts_html_tag(text, start, "table") {
+        let end = find_table_fragment_end(text, start)?;
+        return Some((end, text[start..end].to_string()));
+    }
+
+    if starts_html_tag(text, start, "img") {
+        let end = find_html_tag_end(text, start)?;
+        return Some((end, text[start..end].to_string()));
+    }
+
+    None
+}
+
+async fn convert_html_fragments(fragments: Vec<String>) -> Vec<Option<String>> {
+    let converted = stream::iter(fragments.into_iter().enumerate().map(
+        |(index, fragment)| async move {
+            let converted = html2md::rewrite_html_streaming(&fragment, true).await;
+            let converted = if converted.trim().is_empty() {
+                None
+            } else {
+                Some(converted)
+            };
+            (index, converted)
+        },
+    ))
+    .buffer_unordered(HTML_FRAGMENT_CONCURRENCY)
+    .collect::<Vec<_>>()
+    .await;
+
+    let mut ordered = vec![None; converted.len()];
+    for (index, item) in converted {
+        ordered[index] = item;
+    }
+    ordered
+}
+
 fn backtick_run_len(text: &str, start: usize) -> Option<usize> {
     let bytes = text.as_bytes();
     if bytes.get(start) != Some(&b'`') {
@@ -144,20 +241,71 @@ fn find_matching_backtick_run(text: &str, start: usize, run_len: usize) -> Optio
     None
 }
 
-fn starts_html_img_tag(text: &str, start: usize) -> bool {
+fn starts_html_tag(text: &str, start: usize, tag: &str) -> bool {
     let bytes = text.as_bytes();
     if bytes.get(start) != Some(&b'<') {
         return false;
     }
 
-    let Some(prefix) = text.get(start + 1..start + 4) else {
+    let tag_start = start + 1;
+    let tag_end = tag_start + tag.len();
+    let Some(candidate) = text.get(tag_start..tag_end) else {
         return false;
     };
-    if !prefix.eq_ignore_ascii_case("img") {
+    if !candidate.eq_ignore_ascii_case(tag) {
         return false;
     }
 
-    !matches!(bytes.get(start + 4), Some(b) if b.is_ascii_alphanumeric() || *b == b'-')
+    !matches!(bytes.get(tag_end), Some(b) if b.is_ascii_alphanumeric() || *b == b'-')
+}
+
+fn starts_end_html_tag(text: &str, start: usize, tag: &str) -> bool {
+    let bytes = text.as_bytes();
+    if bytes.get(start) != Some(&b'<') || bytes.get(start + 1) != Some(&b'/') {
+        return false;
+    }
+
+    let tag_start = start + 2;
+    let tag_end = tag_start + tag.len();
+    let Some(candidate) = text.get(tag_start..tag_end) else {
+        return false;
+    };
+    if !candidate.eq_ignore_ascii_case(tag) {
+        return false;
+    }
+
+    !matches!(bytes.get(tag_end), Some(b) if b.is_ascii_alphanumeric() || *b == b'-')
+}
+
+fn find_table_fragment_end(text: &str, start: usize) -> Option<usize> {
+    let mut depth = 1usize;
+    let mut i = find_html_tag_end(text, start)?;
+
+    while i < text.len() {
+        let ch = text[i..].chars().next()?;
+        if ch == '<' {
+            if starts_html_tag(text, i, "table") {
+                let end = find_html_tag_end(text, i)?;
+                depth += 1;
+                i = end;
+                continue;
+            }
+
+            if starts_end_html_tag(text, i, "table") {
+                let end = find_html_tag_end(text, i)?;
+                depth = depth.saturating_sub(1);
+                i = end;
+                if depth == 0 {
+                    return Some(end);
+                }
+                continue;
+            }
+        }
+
+        i += ch.len_utf8();
+    }
+
+    None
 }
 
 fn find_html_tag_end(text: &str, start: usize) -> Option<usize> {
diff --git a/tests/core_internal.rs b/tests/core_internal.rs
index ec22d14..a8d7987 100644
--- a/tests/core_internal.rs
+++ b/tests/core_internal.rs
@@ -7,7 +7,7 @@ use paperdown::core::testing::{
     ProgressCallback, ProgressEvent, append_log, atomic_write_text, build_payload,
     content_type_to_suffix, extract_image_url, fire_for_test, is_http_url, load_api_key,
     prepare_output_paths, process_pdf, replace_image_urls, round3_for_test,
-    strip_html_img_alt_attributes, url_suffix, validate_layout_response,
+    sanitize_html_fragments, url_suffix, validate_layout_response,
 };
 #[cfg(feature = "net-tests")]
 use paperdown::core::testing::{download_figure, localize_figures};
@@ -608,64 +608,78 @@ fn replace_image_urls_no_replacements_passthrough() {
     assert_eq!(updated, markdown);
 }
 
+fn sanitize_html(markdown: &str) -> String {
+    let rt = tokio::runtime::Runtime::new().unwrap();
+    rt.block_on(sanitize_html_fragments(markdown.to_string()))
+        .unwrap()
+}
+
 #[test]
-fn strip_html_img_alt_attributes_removes_alt_and_preserves_other_attrs() {
-    let markdown = "before <img src='x.png' alt='OCR图片' data-id='1'/> after";
+fn sanitize_html_fragments_smoke() {
+    let markdown = "before <img src='x.png' alt='OCR图片'/> after";
 
-    let updated = strip_html_img_alt_attributes(markdown);
+    let updated = sanitize_html(markdown);
 
-    assert_eq!(updated, "before <img src='x.png' data-id='1'/> after");
+    assert_eq!(updated, "before ![OCR图片](x.png) after");
 }
 
 #[test]
-fn strip_html_img_alt_attributes_handles_case_and_spacing() {
-    let markdown = "<IMG\n  SRC=\"x.png\"\n  ALT=\"OCR图片\"\n  title='keep me'>";
+fn sanitize_html_fragments_converts_tables_and_images() {
+    let markdown = "start <table><thead><tr><th>A</th></tr></thead><tbody><tr><td>B</td></tr></tbody></table> end <img src='https://x/a.png' alt='OCR图片'/>";
 
-    let updated = strip_html_img_alt_attributes(markdown);
+    let updated = sanitize_html(markdown);
 
-    assert_eq!(updated, "<IMG\n  SRC=\"x.png\"\n  title='keep me'>");
+    assert!(!updated.contains("<table"));
+    assert!(!updated.contains("<img"));
+    assert!(updated.contains("|"));
+    assert!(updated.contains("B"));
+    assert!(updated.contains("![OCR图片](https://x/a.png)"));
 }
 
 #[test]
-fn strip_html_img_alt_attributes_leaves_markdown_and_code_unchanged() {
-    let markdown = "![OCR图片](https://x/a.png)\n```html\n<img src='x.png' alt='OCR图片'/>\n```\n`<img src='y.png' alt='OCR图片'/>`";
+fn sanitize_html_fragments_preserves_code_spans_and_fences() {
+    let markdown = "![OCR图片](https://x/a.png)\n```html\n<img src='x.png' alt='OCR图片'/>\n```\n`<table><tr><td>x</td></tr></table>`";
 
-    let updated = strip_html_img_alt_attributes(markdown);
+    let updated = sanitize_html(markdown);
 
     assert_eq!(
         updated,
-        "![OCR图片](https://x/a.png)\n```html\n<img src='x.png' alt='OCR图片'/>\n```\n`<img src='y.png' alt='OCR图片'/>`"
+        "![OCR图片](https://x/a.png)\n```html\n<img src='x.png' alt='OCR图片'/>\n```\n`<table><tr><td>x</td></tr></table>`"
     );
 }
 
 #[test]
-fn strip_html_img_alt_attributes_removes_multiple_alt_attributes() {
-    let markdown = "<img alt='one' src='x.png' ALT=\"two\"/>";
+fn sanitize_html_fragments_leaves_malformed_fragments_unchanged() {
+    let markdown = "before <table><tr><td>broken after";
 
-    let updated = strip_html_img_alt_attributes(markdown);
+    let updated = sanitize_html(markdown);
 
-    assert_eq!(updated, "<img src='x.png'/>");
+    assert_eq!(updated, markdown);
 }
 
 #[test]
-fn strip_html_img_alt_attributes_removes_boolean_and_unquoted_alt() {
-    let markdown = "<img alt src='x.png' alt=x data-id='1'>";
+fn sanitize_html_fragments_keeps_nested_table_content_in_order() {
+    let markdown = "A <table><tr><td>1 <img src='x.png' alt='inside'/></td></tr></table> B <img src='y.png' alt='outside'/>";
 
-    let updated = strip_html_img_alt_attributes(markdown);
+    let updated = sanitize_html(markdown);
 
-    assert_eq!(updated, "<img src='x.png' data-id='1'>");
+    assert!(!updated.contains("<table"));
+    assert!(!updated.contains("<img"));
+    assert!(updated.contains("![inside](x.png)"));
+    assert!(updated.contains("![outside](y.png)"));
+    assert!(updated.starts_with("A "));
+    assert!(updated.ends_with("![outside](y.png)"));
 }
 
 #[test]
-fn strip_html_img_alt_attributes_keeps_localized_image_urls() {
-    let markdown = "<img src='https://x/a.png' alt='OCR图片'/>";
-    let mut replacements = HashMap::new();
-    replacements.insert("https://x/a.png".to_string(), "figures/a.png".to_string());
+fn process_pdf_sanitizes_html_output() {
+    let markdown = "start <table><thead><tr><th>A</th></tr></thead><tbody><tr><td>B</td></tr></tbody></table> end <img src='https://x/a.png' alt='OCR图片'/>";
 
-    let localized = replace_image_urls(markdown, &replacements);
-    let updated = strip_html_img_alt_attributes(&localized);
+    let updated = sanitize_html(markdown);
 
-    assert_eq!(updated, "<img src='figures/a.png'/>");
+    assert!(!updated.contains("<table"));
+    assert!(!updated.contains("<img"));
+    assert!(updated.contains("![OCR图片](https://x/a.png)"));
 }
 
 #[test]

From ea11d84cbfdbc4ecb9e81be132a5d020d2aae2e0 Mon Sep 17 00:00:00 2001
From: Anatolii Tsyplenkov <34775595+atsyplenkov@users.noreply.github.com>
Date: Thu, 19 Mar 2026 13:36:40 +0300
Subject: [PATCH 2/7] chore: bump version to 0.2.1

---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index c00f3db..d06c851 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "paperdown"
-version = "0.2.0"
+version = "0.2.1"
 authors = ["Anatoly Tsyplenkov <atsyplenkov@fastmail.com>"]
 edition = "2024"
 description = "A fast CLI tool to batch convert PDFs into Markdown using GLM-OCR."

From fd733f8745d4a49092e94f64823b584af4cfed23 Mon Sep 17 00:00:00 2001
From: Anatolii Tsyplenkov <34775595+atsyplenkov@users.noreply.github.com>
Date: Thu, 19 Mar 2026 13:37:25 +0300
Subject: [PATCH 3/7] Clarify Markdown conversion type in README

Update the README description to state that `paperdown` rewrites HTML tables and images into CommonMark Markdown specifically, rather than into an unspecified Markdown flavor.
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 443ac31..15fd1fc 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@
     <br>
 </p>
 
-`paperdown` converts research papers from PDF to Markdown using Z.AI's [GLM-OCR](https://github.com/zai-org/GLM-OCR) model, rewrites HTML tables and images into Markdown, and downloads referenced figure assets locally.
+`paperdown` converts research papers from PDF to Markdown using Z.AI's [GLM-OCR](https://github.com/zai-org/GLM-OCR) model, rewrites HTML tables and images into CommonMark Markdown, and downloads referenced figure assets locally.
 
 If you work with academic papers, you know that the OCR process itself is not the most difficult part. The real challenge is cleaning up the output. Tables can disappear, their structure can become jumbled, and formulas might be converted into meaningless text. This often means you spend more time correcting the output than working with it.
 

From 560fa591a7f0c69f3a55174104174a0cb4d3b794 Mon Sep 17 00:00:00 2001
From: atsyplenkov <sharp.couch9400@fastmail.com>
Date: Thu, 19 Mar 2026 13:46:05 +0300
Subject: [PATCH 4/7] Update Cargo.lock

---
 Cargo.lock | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.lock b/Cargo.lock
index f41113a..9d3aff9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1436,7 +1436,7 @@ checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
 
 [[package]]
 name = "paperdown"
-version = "0.2.0"
+version = "0.2.1"
 dependencies = [
  "anyhow",
  "assert_cmd",

From e401c8ec424ba9ab7bfe44fd140402a237ecbeed Mon Sep 17 00:00:00 2001
From: atsyplenkov <sharp.couch9400@fastmail.com>
Date: Thu, 19 Mar 2026 13:54:29 +0300
Subject: [PATCH 5/7] bump to dev version

---
 Cargo.lock | 2 +-
 Cargo.toml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 9d3aff9..76f7028 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1436,7 +1436,7 @@ checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
 
 [[package]]
 name = "paperdown"
-version = "0.2.1"
+version = "0.2.1-dev"
 dependencies = [
  "anyhow",
  "assert_cmd",
diff --git a/Cargo.toml b/Cargo.toml
index d06c851..9923301 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "paperdown"
-version = "0.2.1"
+version = "0.2.1-dev"
 authors = ["Anatoly Tsyplenkov <atsyplenkov@fastmail.com>"]
 edition = "2024"
 description = "A fast CLI tool to batch convert PDFs into Markdown using GLM-OCR."

From f59f3ad937f1b69daa94da48b13469ead5e45f79 Mon Sep 17 00:00:00 2001
From: atsyplenkov <sharp.couch9400@fastmail.com>
Date: Thu, 19 Mar 2026 14:27:50 +0300
Subject: [PATCH 6/7] fix: broaden html sanitization

---
 README.md              |   4 +-
 src/core/markdown.rs   | 273 ++++++++++++++++++++++++++++++++++-------
 tests/core_internal.rs |  56 ++++++---
 3 files changed, 268 insertions(+), 65 deletions(-)

diff --git a/README.md b/README.md
index 15fd1fc..eb09b97 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@
     <br>
 </p>
 
-`paperdown` converts research papers from PDF to Markdown using Z.AI's [GLM-OCR](https://github.com/zai-org/GLM-OCR) model, rewrites HTML tables and images into CommonMark Markdown, and downloads referenced figure assets locally.
+`paperdown` converts research papers from PDF to Markdown using Z.AI's [GLM-OCR](https://github.com/zai-org/GLM-OCR) model, rewrites common HTML into CommonMark Markdown, and downloads referenced figure assets locally.
 
 If you work with academic papers, you know that the OCR process itself is not the most difficult part. The real challenge is cleaning up the output. Tables can disappear, their structure can become jumbled, and formulas might be converted into meaningless text. This often means you spend more time correcting the output than working with it.
 
@@ -28,7 +28,7 @@ Therefore, this project was created because, while [`docling`](https://github.co
 ## Features
 
 - Async OCR requests and batch PDF processing using the Z.AI API.
-- Async HTML table and image cleanup using `fast_html2md`.
+- Async HTML cleanup using `fast_html2md`.
 - Concurrent figure downloads for each PDF.
 - Fast processing: approximately 25 seconds per batch of 32 PDFs. Speed depends on the z.ai API availability. See the cost section for more details on spending.
 
diff --git a/src/core/markdown.rs b/src/core/markdown.rs
index d6691bd..33ec914 100644
--- a/src/core/markdown.rs
+++ b/src/core/markdown.rs
@@ -6,6 +6,49 @@ use std::sync::LazyLock;
 
 const HTML_FRAGMENT_CONCURRENCY: usize = 16;
 
+const HTML_ALLOWLIST_TAGS: &[&str] = &[
+    "img",
+    "a",
+    "p",
+    "div",
+    "span",
+    "br",
+    "hr",
+    "h1",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "h6",
+    "ul",
+    "ol",
+    "li",
+    "blockquote",
+    "table",
+    "thead",
+    "tbody",
+    "tfoot",
+    "tr",
+    "th",
+    "td",
+    "strong",
+    "b",
+    "em",
+    "i",
+    "u",
+    "s",
+    "del",
+    "pre",
+    "code",
+];
+
+const HTML_EXCLUDED_TAGS: &[&str] = &["math", "sub", "sup"];
+
+const HTML_VOID_TAGS: &[&str] = &[
+    "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
+    "source", "track", "wbr",
+];
+
 static MARKDOWN_IMAGE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
     Regex::new(r"\((https?://[^)\s]+)\)").expect("valid markdown image URL regex")
 });
@@ -21,6 +64,21 @@ enum Segment {
     Html { index: usize, raw: String },
 }
 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum HtmlTagKind {
+    Opening,
+    Closing,
+    Special,
+}
+
+#[derive(Debug, Clone, Copy)]
+struct ParsedHtmlTag<'a> {
+    name: &'a str,
+    kind: HtmlTagKind,
+    end: usize,
+    self_closing: bool,
+}
+
 pub(crate) fn replace_image_urls(markdown: &str, replacements: &HashMap<String, String>) -> String {
     let updated = MARKDOWN_IMAGE_URL_PATTERN
         .replace_all(markdown, |caps: &regex::Captures<'_>| {
@@ -178,17 +236,26 @@ fn join_segments(segments: &[Segment], converted: &[Option<String>]) -> String {
 }
 
 fn extract_html_fragment(text: &str, start: usize) -> Option<(usize, String)> {
-    if starts_html_tag(text, start, "table") {
-        let end = find_table_fragment_end(text, start)?;
-        return Some((end, text[start..end].to_string()));
+    let tag = parse_html_tag(text, start)?;
+    if tag.kind != HtmlTagKind::Opening {
+        return None;
     }
 
-    if starts_html_tag(text, start, "img") {
-        let end = find_html_tag_end(text, start)?;
-        return Some((end, text[start..end].to_string()));
+    if !is_html_allowlisted(tag.name) || is_html_excluded(tag.name) {
+        return None;
     }
 
-    None
+    let end = if tag.self_closing || is_html_void(tag.name) {
+        tag.end
+    } else {
+        find_html_region_end(text, start, tag.name)?
+    };
+    let fragment = text[start..end].to_string();
+    if contains_tex_delimiters(&fragment) || contains_excluded_math_tags(&fragment) {
+        return None;
+    }
+
+    Some((end, fragment))
 }
 
 async fn convert_html_fragments(fragments: Vec<String>) -> Vec<Option<String>> {
@@ -241,67 +308,114 @@ fn find_matching_backtick_run(text: &str, start: usize, run_len: usize) -> Optio
     None
 }
 
-fn starts_html_tag(text: &str, start: usize, tag: &str) -> bool {
+fn parse_html_tag(text: &str, start: usize) -> Option<ParsedHtmlTag<'_>> {
     let bytes = text.as_bytes();
     if bytes.get(start) != Some(&b'<') {
-        return false;
+        return None;
     }
 
-    let tag_start = start + 1;
-    let tag_end = tag_start + tag.len();
-    let Some(candidate) = text.get(tag_start..tag_end) else {
-        return false;
-    };
-    if !candidate.eq_ignore_ascii_case(tag) {
-        return false;
+    if matches!(bytes.get(start + 1), Some(b'!') | Some(b'?')) {
+        let end = find_html_tag_end(text, start)?;
+        return Some(ParsedHtmlTag {
+            name: "",
+            kind: HtmlTagKind::Special,
+            end,
+            self_closing: false,
+        });
+    }
+
+    if bytes.get(start + 1) == Some(&b'/') {
+        let name_start = start + 2;
+        let name_end = name_end(text, name_start)?;
+        let end = find_html_tag_end(text, start)?;
+        return Some(ParsedHtmlTag {
+            name: &text[name_start..name_end],
+            kind: HtmlTagKind::Closing,
+            end,
+            self_closing: false,
+        });
     }
 
-    !matches!(bytes.get(tag_end), Some(b) if b.is_ascii_alphanumeric() || *b == b'-')
+    let name_start = start + 1;
+    let name_end = name_end(text, name_start)?;
+    let end = find_html_tag_end(text, start)?;
+    let self_closing = is_self_closing_tag(text, start, end);
+
+    Some(ParsedHtmlTag {
+        name: &text[name_start..name_end],
+        kind: HtmlTagKind::Opening,
+        end,
+        self_closing,
+    })
 }
 
-fn starts_end_html_tag(text: &str, start: usize, tag: &str) -> bool {
+fn name_end(text: &str, start: usize) -> Option<usize> {
     let bytes = text.as_bytes();
-    if bytes.get(start) != Some(&b'<') || bytes.get(start + 1) != Some(&b'/') {
-        return false;
+    let mut i = start;
+
+    while i < bytes.len() {
+        let b = bytes[i];
+        if b.is_ascii_alphanumeric() || matches!(b, b'-' | b':' | b'_') {
+            i += 1;
+            continue;
+        }
+        break;
     }
 
-    let tag_start = start + 2;
-    let tag_end = tag_start + tag.len();
-    let Some(candidate) = text.get(tag_start..tag_end) else {
-        return false;
-    };
-    if !candidate.eq_ignore_ascii_case(tag) {
-        return false;
+    (i > start).then_some(i)
+}
+
+fn is_self_closing_tag(text: &str, start: usize, end: usize) -> bool {
+    let bytes = text.as_bytes();
+    let mut i = end.saturating_sub(1);
+    while i > start && bytes[i - 1].is_ascii_whitespace() {
+        i -= 1;
     }
 
-    !matches!(bytes.get(tag_end), Some(b) if b.is_ascii_alphanumeric() || *b == b'-')
+    bytes.get(i - 1) == Some(&b'/')
 }
 
-fn find_table_fragment_end(text: &str, start: usize) -> Option<usize> {
-    let mut depth = 1usize;
-    let mut i = find_html_tag_end(text, start)?;
+fn find_html_region_end(text: &str, start: usize, root_tag: &str) -> Option<usize> {
+    let root = parse_html_tag(text, start)?;
+    if root.kind != HtmlTagKind::Opening {
+        return None;
+    }
 
-    while i < text.len() {
-        let ch = text[i..].chars().next()?;
-        if ch == '<' {
-            if starts_html_tag(text, i, "table") {
-                let end = find_html_tag_end(text, i)?;
-                depth += 1;
-                i = end;
-                continue;
-            }
+    let mut stack = vec![root_tag.to_ascii_lowercase()];
+    let mut i = root.end;
 
-            if starts_end_html_tag(text, i, "table") {
-                let end = find_html_tag_end(text, i)?;
-                depth = depth.saturating_sub(1);
-                i = end;
-                if depth == 0 {
-                    return Some(end);
+    while i < text.len() {
+        if text.as_bytes().get(i) == Some(&b'<') && let Some(tag) = parse_html_tag(text, i) {
+            match tag.kind {
+                HtmlTagKind::Special => {
+                    i = tag.end;
+                    continue;
+                }
+                HtmlTagKind::Closing => {
+                    let Some(current) = stack.last() else {
+                        return None;
+                    };
+                    if !current.eq_ignore_ascii_case(tag.name) {
+                        return None;
+                    }
+                    stack.pop();
+                    i = tag.end;
+                    if stack.is_empty() {
+                        return Some(i);
+                    }
+                    continue;
+                }
+                HtmlTagKind::Opening => {
+                    if !(tag.self_closing || is_html_void(tag.name)) {
+                        stack.push(tag.name.to_ascii_lowercase());
+                    }
+                    i = tag.end;
+                    continue;
                 }
-                continue;
             }
         }
 
+        let ch = text[i..].chars().next()?;
         i += ch.len_utf8();
     }
 
@@ -327,6 +441,71 @@ fn find_html_tag_end(text: &str, start: usize) -> Option<usize> {
     None
 }
 
+fn is_html_allowlisted(tag: &str) -> bool {
+    HTML_ALLOWLIST_TAGS.iter().any(|candidate| candidate.eq_ignore_ascii_case(tag))
+}
+
+fn is_html_excluded(tag: &str) -> bool {
+    HTML_EXCLUDED_TAGS.iter().any(|candidate| candidate.eq_ignore_ascii_case(tag))
+}
+
+fn is_html_void(tag: &str) -> bool {
+    HTML_VOID_TAGS.iter().any(|candidate| candidate.eq_ignore_ascii_case(tag))
+}
+
+fn contains_excluded_math_tags(fragment: &str) -> bool {
+    let mut i = 0usize;
+
+    while i < fragment.len() {
+        if let Some(run_len) = backtick_run_len(fragment, i)
+            && let Some(end) = find_matching_backtick_run(fragment, i + run_len, run_len)
+        {
+            i = end + run_len;
+            continue;
+        }
+
+        if let Some(tag) = parse_html_tag(fragment, i)
+            && matches!(tag.kind, HtmlTagKind::Opening | HtmlTagKind::Closing)
+            && is_html_excluded(tag.name)
+        {
+            return true;
+        }
+
+        let ch = fragment[i..].chars().next().expect("valid char boundary");
+        i += ch.len_utf8();
+    }
+
+    false
+}
+
+fn contains_tex_delimiters(fragment: &str) -> bool {
+    let bytes = fragment.as_bytes();
+    let mut i = 0usize;
+    let mut saw_dollar = false;
+
+    while i < bytes.len() {
+        if bytes[i] == b'\\' {
+            i = i.saturating_add(2);
+            continue;
+        }
+
+        if bytes[i] == b'$' {
+            if i + 1 < bytes.len() && bytes[i + 1] == b'$' {
+                return true;
+            }
+
+            if saw_dollar {
+                return true;
+            }
+            saw_dollar = true;
+        }
+
+        i += 1;
+    }
+
+    false
+}
+
 fn fence_start(line: &str) -> Option<(char, usize)> {
     let trimmed = line.trim_start();
     let mut chars = trimmed.chars();
diff --git a/tests/core_internal.rs b/tests/core_internal.rs
index a8d7987..dca4c8e 100644
--- a/tests/core_internal.rs
+++ b/tests/core_internal.rs
@@ -624,16 +624,17 @@ fn sanitize_html_fragments_smoke() {
 }
 
 #[test]
-fn sanitize_html_fragments_converts_tables_and_images() {
-    let markdown = "start <table><thead><tr><th>A</th></tr></thead><tbody><tr><td>B</td></tr></tbody></table> end <img src='https://x/a.png' alt='OCR图片'/>";
+fn sanitize_html_fragments_converts_common_html_regions() {
+    let markdown = "start <p>Hello <a href='https://x'>world</a></p> mid <h2>Title</h2> end";
 
     let updated = sanitize_html(markdown);
 
-    assert!(!updated.contains("<table"));
-    assert!(!updated.contains("<img"));
-    assert!(updated.contains("|"));
-    assert!(updated.contains("B"));
-    assert!(updated.contains("![OCR图片](https://x/a.png)"));
+    assert!(!updated.contains("<p"));
+    assert!(!updated.contains("<a "));
+    assert!(!updated.contains("<h2"));
+    assert!(updated.contains("Hello"));
+    assert!(updated.contains("[world](https://x)"));
+    assert!(updated.contains("## Title"));
 }
 
 #[test]
@@ -650,7 +651,7 @@ fn sanitize_html_fragments_preserves_code_spans_and_fences() {
 
 #[test]
 fn sanitize_html_fragments_leaves_malformed_fragments_unchanged() {
-    let markdown = "before <table><tr><td>broken after";
+    let markdown = "before <div><p>broken after";
 
     let updated = sanitize_html(markdown);
 
@@ -658,13 +659,16 @@ fn sanitize_html_fragments_leaves_malformed_fragments_unchanged() {
 }
 
 #[test]
-fn sanitize_html_fragments_keeps_nested_table_content_in_order() {
-    let markdown = "A <table><tr><td>1 <img src='x.png' alt='inside'/></td></tr></table> B <img src='y.png' alt='outside'/>";
+fn sanitize_html_fragments_keeps_nested_html_content_in_order() {
+    let markdown = "A <div>1 <span>2 <img src='x.png' alt='inside'/></span></div> B <img src='y.png' alt='outside'/>";
 
     let updated = sanitize_html(markdown);
 
-    assert!(!updated.contains("<table"));
+    assert!(!updated.contains("<div"));
+    assert!(!updated.contains("<span"));
     assert!(!updated.contains("<img"));
+    assert!(updated.contains("1"));
+    assert!(updated.contains("2"));
     assert!(updated.contains("![inside](x.png)"));
     assert!(updated.contains("![outside](y.png)"));
     assert!(updated.starts_with("A "));
@@ -672,14 +676,34 @@ fn sanitize_html_fragments_keeps_nested_table_content_in_order() {
 }
 
 #[test]
-fn process_pdf_sanitizes_html_output() {
-    let markdown = "start <table><thead><tr><th>A</th></tr></thead><tbody><tr><td>B</td></tr></tbody></table> end <img src='https://x/a.png' alt='OCR图片'/>";
+fn sanitize_html_fragments_preserves_markdown_around_html() {
+    let markdown = "before **bold** <p>text <em>ok</em></p> after";
 
     let updated = sanitize_html(markdown);
 
-    assert!(!updated.contains("<table"));
-    assert!(!updated.contains("<img"));
-    assert!(updated.contains("![OCR图片](https://x/a.png)"));
+    assert!(updated.contains("**bold**"));
+    assert!(updated.contains("text"));
+    assert!(updated.contains("*ok*"));
+    assert!(updated.starts_with("before "));
+    assert!(updated.ends_with(" after"));
+}
+
+#[test]
+fn sanitize_html_fragments_keeps_math_sensitive_html_raw() {
+    let markdown = "before <p>$K_{\\mathrm{TC\\_FP}}$</p> after";
+
+    let updated = sanitize_html(markdown);
+
+    assert_eq!(updated, markdown);
+}
+
+#[test]
+fn sanitize_html_fragments_keeps_excluded_math_tags_raw() {
+    let markdown = "before <div><sub>x</sub> + <sup>2</sup> <math>y</math></div> after";
+
+    let updated = sanitize_html(markdown);
+
+    assert_eq!(updated, markdown);
 }
 
 #[test]

From a579b31120036c15a471ccd167f9e37624f3f564 Mon Sep 17 00:00:00 2001
From: atsyplenkov <sharp.couch9400@fastmail.com>
Date: Thu, 19 Mar 2026 14:39:16 +0300
Subject: [PATCH 7/7] lint: fix formatting and lint warnings

---
 src/core/markdown.rs | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/src/core/markdown.rs b/src/core/markdown.rs
index 33ec914..720f1b0 100644
--- a/src/core/markdown.rs
+++ b/src/core/markdown.rs
@@ -45,8 +45,8 @@ const HTML_ALLOWLIST_TAGS: &[&str] = &[
 const HTML_EXCLUDED_TAGS: &[&str] = &["math", "sub", "sup"];
 
 const HTML_VOID_TAGS: &[&str] = &[
-    "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
-    "source", "track", "wbr",
+    "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source",
+    "track", "wbr",
 ];
 
 static MARKDOWN_IMAGE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
@@ -385,16 +385,16 @@ fn find_html_region_end(text: &str, start: usize, root_tag: &str) -> Option<usiz
     let mut i = root.end;
 
     while i < text.len() {
-        if text.as_bytes().get(i) == Some(&b'<') && let Some(tag) = parse_html_tag(text, i) {
+        if text.as_bytes().get(i) == Some(&b'<')
+            && let Some(tag) = parse_html_tag(text, i)
+        {
             match tag.kind {
                 HtmlTagKind::Special => {
                     i = tag.end;
                     continue;
                 }
                 HtmlTagKind::Closing => {
-                    let Some(current) = stack.last() else {
-                        return None;
-                    };
+                    let current = stack.last()?;
                     if !current.eq_ignore_ascii_case(tag.name) {
                         return None;
                     }
@@ -442,15 +442,21 @@ fn find_html_tag_end(text: &str, start: usize) -> Option<usize> {
 }
 
 fn is_html_allowlisted(tag: &str) -> bool {
-    HTML_ALLOWLIST_TAGS.iter().any(|candidate| candidate.eq_ignore_ascii_case(tag))
+    HTML_ALLOWLIST_TAGS
+        .iter()
+        .any(|candidate| candidate.eq_ignore_ascii_case(tag))
 }
 
 fn is_html_excluded(tag: &str) -> bool {
-    HTML_EXCLUDED_TAGS.iter().any(|candidate| candidate.eq_ignore_ascii_case(tag))
+    HTML_EXCLUDED_TAGS
+        .iter()
+        .any(|candidate| candidate.eq_ignore_ascii_case(tag))
 }
 
 fn is_html_void(tag: &str) -> bool {
-    HTML_VOID_TAGS.iter().any(|candidate| candidate.eq_ignore_ascii_case(tag))
+    HTML_VOID_TAGS
+        .iter()
+        .any(|candidate| candidate.eq_ignore_ascii_case(tag))
 }
 
 fn contains_excluded_math_tags(fragment: &str) -> bool {