From 7f9d0a95b9b6e2059965bcec19ed3f48ad5a0aca Mon Sep 17 00:00:00 2001
From: atsyplenkov
Date: Thu, 19 Mar 2026 13:32:48 +0300
Subject: [PATCH 1/3] feat: sanitize html output with fast_html2md
---
Cargo.lock | 312 ++++++++++++++++++++++++++++++++++++++++-
Cargo.toml | 1 +
README.md | 3 +-
src/core.rs | 6 +-
src/core/markdown.rs | 188 ++++++++++++++++++++++---
tests/core_internal.rs | 70 +++++----
6 files changed, 524 insertions(+), 56 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index e628d8c..f41113a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -11,6 +11,12 @@ dependencies = [
"memchr",
]
+[[package]]
+name = "allocator-api2"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
+
[[package]]
name = "anstream"
version = "1.0.0"
@@ -288,6 +294,19 @@ version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
+[[package]]
+name = "auto_encoder"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f6364e11e0270035ec392151a54f1476e6b3612ef9f4fe09d35e72a8cebcb65"
+dependencies = [
+ "chardetng",
+ "encoding_rs",
+ "percent-encoding",
+ "phf 0.11.3",
+ "phf_codegen 0.11.3",
+]
+
[[package]]
name = "autocfg"
version = "1.5.0"
@@ -396,6 +415,17 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
+[[package]]
+name = "chardetng"
+version = "0.1.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea"
+dependencies = [
+ "cfg-if",
+ "encoding_rs",
+ "memchr",
+]
+
[[package]]
name = "clap"
version = "4.6.0"
@@ -476,6 +506,29 @@ version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
+[[package]]
+name = "cssparser"
+version = "0.36.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dae61cf9c0abb83bd659dab65b7e4e38d8236824c85f0f804f173567bda257d2"
+dependencies = [
+ "cssparser-macros",
+ "dtoa-short",
+ "itoa",
+ "phf 0.13.1",
+ "smallvec",
+]
+
+[[package]]
+name = "cssparser-macros"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
+dependencies = [
+ "quote",
+ "syn 2.0.117",
+]
+
[[package]]
name = "deranged"
version = "0.5.8"
@@ -485,6 +538,27 @@ dependencies = [
"powerfmt",
]
+[[package]]
+name = "derive_more"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d751e9e49156b02b44f9c1815bcb94b984cdcc4396ecc32521c739452808b134"
+dependencies = [
+ "derive_more-impl",
+]
+
+[[package]]
+name = "derive_more-impl"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "799a97264921d8623a957f6c3b9011f3b5492f557bbb7a5a19b7fa6d06ba8dcb"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "rustc_version",
+ "syn 2.0.117",
+]
+
[[package]]
name = "difflib"
version = "0.4.0"
@@ -529,6 +603,21 @@ version = "0.15.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b"
+[[package]]
+name = "dtoa"
+version = "1.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590"
+
+[[package]]
+name = "dtoa-short"
+version = "0.3.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87"
+dependencies = [
+ "dtoa",
+]
+
[[package]]
name = "either"
version = "1.15.0"
@@ -550,6 +639,15 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
+[[package]]
+name = "encoding_rs"
+version = "0.8.35"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
+dependencies = [
+ "cfg-if",
+]
+
[[package]]
name = "equivalent"
version = "1.0.2"
@@ -593,6 +691,21 @@ dependencies = [
"pin-project-lite",
]
+[[package]]
+name = "fast_html2md"
+version = "0.0.58"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af3a0122fee1bcf6bb9f3d73782e911cce69d95b76a5e29e930af92cd4a8e4e3"
+dependencies = [
+ "auto_encoder",
+ "futures-util",
+ "lazy_static",
+ "lol_html",
+ "percent-encoding",
+ "regex",
+ "url",
+]
+
[[package]]
name = "fastrand"
version = "2.3.0"
@@ -617,6 +730,12 @@ version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
+[[package]]
+name = "foldhash"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
+
[[package]]
name = "form_urlencoded"
version = "1.2.2"
@@ -771,6 +890,11 @@ name = "hashbrown"
version = "0.16.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
+dependencies = [
+ "allocator-api2",
+ "equivalent",
+ "foldhash",
+]
[[package]]
name = "heck"
@@ -1232,6 +1356,25 @@ dependencies = [
"value-bag",
]
+[[package]]
+name = "lol_html"
+version = "2.7.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ff94cb6aef6ee52afd2c69331e9109906d855e82bd241f3110dfdf6185899ab"
+dependencies = [
+ "bitflags",
+ "cfg-if",
+ "cssparser",
+ "encoding_rs",
+ "foldhash",
+ "hashbrown",
+ "memchr",
+ "mime",
+ "precomputed-hash",
+ "selectors",
+ "thiserror 2.0.18",
+]
+
[[package]]
name = "lru-slab"
version = "0.1.2"
@@ -1244,6 +1387,12 @@ version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
+[[package]]
+name = "mime"
+version = "0.3.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
+
[[package]]
name = "mio"
version = "1.1.1"
@@ -1294,6 +1443,7 @@ dependencies = [
"base64 0.22.1",
"clap",
"dotenvy",
+ "fast_html2md",
"futures",
"httpmock",
"indicatif",
@@ -1352,6 +1502,93 @@ dependencies = [
"indexmap",
]
+[[package]]
+name = "phf"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
+dependencies = [
+ "phf_macros 0.11.3",
+ "phf_shared 0.11.3",
+]
+
+[[package]]
+name = "phf"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf"
+dependencies = [
+ "phf_macros 0.13.1",
+ "phf_shared 0.13.1",
+ "serde",
+]
+
+[[package]]
+name = "phf_codegen"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
+dependencies = [
+ "phf_generator 0.11.3",
+ "phf_shared 0.11.3",
+]
+
+[[package]]
+name = "phf_codegen"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1"
+dependencies = [
+ "phf_generator 0.13.1",
+ "phf_shared 0.13.1",
+]
+
+[[package]]
+name = "phf_generator"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
+dependencies = [
+ "phf_shared 0.11.3",
+ "rand 0.8.5",
+]
+
+[[package]]
+name = "phf_generator"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737"
+dependencies = [
+ "fastrand",
+ "phf_shared 0.13.1",
+]
+
+[[package]]
+name = "phf_macros"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216"
+dependencies = [
+ "phf_generator 0.11.3",
+ "phf_shared 0.11.3",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
+[[package]]
+name = "phf_macros"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "812f032b54b1e759ccd5f8b6677695d5268c588701effba24601f6932f8269ef"
+dependencies = [
+ "phf_generator 0.13.1",
+ "phf_shared 0.13.1",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
[[package]]
name = "phf_shared"
version = "0.11.3"
@@ -1361,6 +1598,15 @@ dependencies = [
"siphasher",
]
+[[package]]
+name = "phf_shared"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266"
+dependencies = [
+ "siphasher",
+]
+
[[package]]
name = "pico-args"
version = "0.5.0"
@@ -1505,7 +1751,7 @@ dependencies = [
"bytes",
"getrandom 0.3.4",
"lru-slab",
- "rand",
+ "rand 0.9.2",
"ring",
"rustc-hash",
"rustls",
@@ -1546,6 +1792,15 @@ version = "5.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
+[[package]]
+name = "rand"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
+dependencies = [
+ "rand_core 0.6.4",
+]
+
[[package]]
name = "rand"
version = "0.9.2"
@@ -1553,7 +1808,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1"
dependencies = [
"rand_chacha",
- "rand_core",
+ "rand_core 0.9.5",
]
[[package]]
@@ -1563,9 +1818,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
dependencies = [
"ppv-lite86",
- "rand_core",
+ "rand_core 0.9.5",
]
+[[package]]
+name = "rand_core"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
+
[[package]]
name = "rand_core"
version = "0.9.5"
@@ -1685,6 +1946,15 @@ version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
+[[package]]
+name = "rustc_version"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
+dependencies = [
+ "semver",
+]
+
[[package]]
name = "rustix"
version = "1.1.4"
@@ -1760,6 +2030,31 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
+[[package]]
+name = "selectors"
+version = "0.33.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "feef350c36147532e1b79ea5c1f3791373e61cbd9a6a2615413b3807bb164fb7"
+dependencies = [
+ "bitflags",
+ "cssparser",
+ "derive_more",
+ "log",
+ "new_debug_unreachable",
+ "phf 0.13.1",
+ "phf_codegen 0.13.1",
+ "precomputed-hash",
+ "rustc-hash",
+ "servo_arc",
+ "smallvec",
+]
+
+[[package]]
+name = "semver"
+version = "1.0.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2"
+
[[package]]
name = "serde"
version = "1.0.228"
@@ -1825,6 +2120,15 @@ dependencies = [
"serde",
]
+[[package]]
+name = "servo_arc"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "170fb83ab34de17dc69aa7c67482b22218ddb85da56546f9bd6b929e32a05930"
+dependencies = [
+ "stable_deref_trait",
+]
+
[[package]]
name = "shlex"
version = "1.3.0"
@@ -1899,7 +2203,7 @@ checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f"
dependencies = [
"new_debug_unreachable",
"parking_lot",
- "phf_shared",
+ "phf_shared 0.11.3",
"precomputed-hash",
]
diff --git a/Cargo.toml b/Cargo.toml
index 298c33f..c00f3db 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -33,6 +33,7 @@ futures = "0.3"
indicatif = "0.17"
dotenvy = "0.15"
regex = "1"
+fast_html2md = "0.0.58"
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "json", "stream"] }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
diff --git a/README.md b/README.md
index fb9b3aa..443ac31 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@
-`paperdown` converts research papers from PDF to Markdown using Z.AI's [GLM-OCR](https://github.com/zai-org/GLM-OCR) model and downloads referenced figure assets locally.
+`paperdown` converts research papers from PDF to Markdown using Z.AI's [GLM-OCR](https://github.com/zai-org/GLM-OCR) model, rewrites HTML tables and images into Markdown, and downloads referenced figure assets locally.
If you work with academic papers, you know that the OCR process itself is not the most difficult part. The real challenge is cleaning up the output. Tables can disappear, their structure can become jumbled, and formulas might be converted into meaningless text. This often means you spend more time correcting the output than working with it.
@@ -28,6 +28,7 @@ Therefore, this project was created because, while [`docling`](https://github.co
## Features
- Async OCR requests and batch PDF processing using the Z.AI API.
+- Async HTML table and image cleanup using `fast_html2md`.
- Concurrent figure downloads for each PDF.
- Fast processing: approximately 25 seconds per batch of 32 PDFs. Speed depends on the z.ai API availability. See the cost section for more details on spending.
diff --git a/src/core.rs b/src/core.rs
index be90d5d..bd6340d 100644
--- a/src/core.rs
+++ b/src/core.rs
@@ -82,7 +82,7 @@ pub async fn process_pdf(
)
.await?;
let figure_seconds = figure_started.elapsed();
- let markdown = markdown::strip_html_img_alt_attributes(&markdown);
+ let markdown = markdown::sanitize_html_fragments(markdown).await?;
fire(
&progress,
@@ -220,8 +220,8 @@ pub mod testing {
super::markdown::replace_image_urls(markdown, replacements)
}
- pub fn strip_html_img_alt_attributes(markdown: &str) -> String {
- super::markdown::strip_html_img_alt_attributes(markdown)
+ pub async fn sanitize_html_fragments(markdown: String) -> Result<String> {
+ super::markdown::sanitize_html_fragments(markdown).await
}
pub fn prepare_output_paths(
diff --git a/src/core/markdown.rs b/src/core/markdown.rs
index 37fa0c9..d6691bd 100644
--- a/src/core/markdown.rs
+++ b/src/core/markdown.rs
@@ -1,7 +1,11 @@
+use anyhow::Result;
+use futures::stream::{self, StreamExt};
use regex::Regex;
use std::collections::HashMap;
use std::sync::LazyLock;
+const HTML_FRAGMENT_CONCURRENCY: usize = 16;
+
static MARKDOWN_IMAGE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"\((https?://[^)\s]+)\)").expect("valid markdown image URL regex")
});
@@ -10,10 +14,12 @@ static HTML_IMAGE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r#"(src\s*=\s*)(['"])(https?://[^'"]+)(['"])"#).expect("valid HTML image URL regex")
});
-static HTML_IMAGE_ALT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
- Regex::new(r#"(?is)\s+alt(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s/>]+))?"#)
- .expect("valid HTML image alt regex")
-});
+#[derive(Debug)]
+enum Segment {
+ Text(String),
+ Code(String),
+ Html { index: usize, raw: String },
+}
pub(crate) fn replace_image_urls(markdown: &str, replacements: &HashMap) -> String {
let updated = MARKDOWN_IMAGE_URL_PATTERN
@@ -42,7 +48,7 @@ pub(crate) fn replace_image_urls(markdown: &str, replacements: &HashMap String {
+pub(crate) async fn sanitize_html_fragments(markdown: String) -> Result<String> {
let mut out = String::with_capacity(markdown.len());
let mut chunk_start = 0usize;
let mut in_fence = false;
@@ -68,7 +74,7 @@ pub(crate) fn strip_html_img_alt_attributes(markdown: &str) -> String {
}
if let Some((marker, len)) = fence_start(line) {
- out.push_str(&sanitize_non_code_chunk(&markdown[chunk_start..i]));
+ out.push_str(&sanitize_non_code_chunk(&markdown[chunk_start..i]).await?);
out.push_str(line);
in_fence = true;
fence_marker = marker;
@@ -82,41 +88,132 @@ pub(crate) fn strip_html_img_alt_attributes(markdown: &str) -> String {
}
if !in_fence {
- out.push_str(&sanitize_non_code_chunk(&markdown[chunk_start..]));
+ out.push_str(&sanitize_non_code_chunk(&markdown[chunk_start..]).await?);
}
- out
+ Ok(out)
}
-fn sanitize_non_code_chunk(chunk: &str) -> String {
- let mut out = String::with_capacity(chunk.len());
+async fn sanitize_non_code_chunk(chunk: &str) -> Result<String> {
+ let mut segments = Vec::new();
let mut i = 0usize;
+ let mut literal_start = 0usize;
+ let mut html_count = 0usize;
while i < chunk.len() {
if let Some(run_len) = backtick_run_len(chunk, i)
&& let Some(end) = find_matching_backtick_run(chunk, i + run_len, run_len)
{
- out.push_str(&chunk[i..end + run_len]);
+ if literal_start < i {
+ segments.push(Segment::Text(chunk[literal_start..i].to_string()));
+ }
+ segments.push(Segment::Code(chunk[i..end + run_len].to_string()));
i = end + run_len;
+ literal_start = i;
continue;
}
- if starts_html_img_tag(chunk, i)
- && let Some(tag_end) = find_html_tag_end(chunk, i)
- {
- out.push_str(&HTML_IMAGE_ALT_PATTERN.replace_all(&chunk[i..tag_end], ""));
+ if let Some((tag_end, fragment)) = extract_html_fragment(chunk, i) {
+ if literal_start < i {
+ segments.push(Segment::Text(chunk[literal_start..i].to_string()));
+ }
+ segments.push(Segment::Html {
+ index: html_count,
+ raw: fragment,
+ });
+ html_count += 1;
i = tag_end;
+ literal_start = i;
continue;
}
let ch = chunk[i..].chars().next().expect("valid char boundary");
- out.push(ch);
i += ch.len_utf8();
}
+ if literal_start < chunk.len() {
+ segments.push(Segment::Text(chunk[literal_start..].to_string()));
+ }
+
+ if html_count == 0 {
+ return Ok(join_segments(&segments, &[]));
+ }
+
+ let html_fragments: Vec<String> = segments
+ .iter()
+ .filter_map(|segment| match segment {
+ Segment::Html { raw, .. } => Some(raw.clone()),
+ _ => None,
+ })
+ .collect();
+ let converted = convert_html_fragments(html_fragments).await;
+ Ok(join_segments(&segments, &converted))
+}
+
+fn join_segments(segments: &[Segment], converted: &[Option<String>]) -> String {
+ let mut out = String::with_capacity(
+ segments
+ .iter()
+ .map(|segment| match segment {
+ Segment::Text(text) | Segment::Code(text) => text.len(),
+ Segment::Html { raw, .. } => raw.len(),
+ })
+ .sum(),
+ );
+
+ for segment in segments {
+ match segment {
+ Segment::Text(text) | Segment::Code(text) => out.push_str(text),
+ Segment::Html { index, raw } => {
+ if let Some(Some(rewritten)) = converted.get(*index) {
+ out.push_str(rewritten);
+ } else {
+ out.push_str(raw);
+ }
+ }
+ }
+ }
+
out
}
+fn extract_html_fragment(text: &str, start: usize) -> Option<(usize, String)> {
+ if starts_html_tag(text, start, "table") {
+ let end = find_table_fragment_end(text, start)?;
+ return Some((end, text[start..end].to_string()));
+ }
+
+ if starts_html_tag(text, start, "img") {
+ let end = find_html_tag_end(text, start)?;
+ return Some((end, text[start..end].to_string()));
+ }
+
+ None
+}
+
+async fn convert_html_fragments(fragments: Vec) -> Vec