broken after";
- let updated = strip_html_img_alt_attributes(markdown);
+ let updated = sanitize_html(markdown);
- assert_eq!(updated, " ");
+ assert_eq!(updated, markdown);
}
#[test]
-fn strip_html_img_alt_attributes_removes_boolean_and_unquoted_alt() {
- let markdown = " ";
+fn sanitize_html_fragments_keeps_nested_table_content_in_order() {
+ let markdown = "A 1  | B ";
- let updated = strip_html_img_alt_attributes(markdown);
+ let updated = sanitize_html(markdown);
- assert_eq!(updated, " ");
+ assert!(!updated.contains("");
+ assert!(!updated.contains("
Date: Thu, 19 Mar 2026 13:36:40 +0300
Subject: [PATCH 2/7] chore: bump version to 0.2.1
---
Cargo.toml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Cargo.toml b/Cargo.toml
index c00f3db..d06c851 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "paperdown"
-version = "0.2.0"
+version = "0.2.1"
authors = ["Anatoly Tsyplenkov "]
edition = "2024"
description = "A fast CLI tool to batch convert PDFs into Markdown using GLM-OCR."
From fd733f8745d4a49092e94f64823b584af4cfed23 Mon Sep 17 00:00:00 2001
From: Anatolii Tsyplenkov <34775595+atsyplenkov@users.noreply.github.com>
Date: Thu, 19 Mar 2026 13:37:25 +0300
Subject: [PATCH 3/7] Clarify Markdown conversion type in README
Updated the description to specify that paperdown converts HTML tables and images into CommonMark Markdown instead of just Markdown.
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index 443ac31..15fd1fc 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@
-`paperdown` converts research papers from PDF to Markdown using Z.AI's [GLM-OCR](https://github.com/zai-org/GLM-OCR) model, rewrites HTML tables and images into Markdown, and downloads referenced figure assets locally.
+`paperdown` converts research papers from PDF to Markdown using Z.AI's [GLM-OCR](https://github.com/zai-org/GLM-OCR) model, rewrites HTML tables and images into CommonMark Markdown, and downloads referenced figure assets locally.
If you work with academic papers, you know that the OCR process itself is not the most difficult part. The real challenge is cleaning up the output. Tables can disappear, their structure can become jumbled, and formulas might be converted into meaningless text. This often means you spend more time correcting the output than working with it.
From 560fa591a7f0c69f3a55174104174a0cb4d3b794 Mon Sep 17 00:00:00 2001
From: atsyplenkov
Date: Thu, 19 Mar 2026 13:46:05 +0300
Subject: [PATCH 4/7] Update Cargo.lock
---
Cargo.lock | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Cargo.lock b/Cargo.lock
index f41113a..9d3aff9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1436,7 +1436,7 @@ checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
[[package]]
name = "paperdown"
-version = "0.2.0"
+version = "0.2.1"
dependencies = [
"anyhow",
"assert_cmd",
From e401c8ec424ba9ab7bfe44fd140402a237ecbeed Mon Sep 17 00:00:00 2001
From: atsyplenkov
Date: Thu, 19 Mar 2026 13:54:29 +0300
Subject: [PATCH 5/7] bump to dev version
---
Cargo.lock | 2 +-
Cargo.toml | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 9d3aff9..76f7028 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1436,7 +1436,7 @@ checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
[[package]]
name = "paperdown"
-version = "0.2.1"
+version = "0.2.1-dev"
dependencies = [
"anyhow",
"assert_cmd",
diff --git a/Cargo.toml b/Cargo.toml
index d06c851..9923301 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "paperdown"
-version = "0.2.1"
+version = "0.2.1-dev"
authors = ["Anatoly Tsyplenkov "]
edition = "2024"
description = "A fast CLI tool to batch convert PDFs into Markdown using GLM-OCR."
From f59f3ad937f1b69daa94da48b13469ead5e45f79 Mon Sep 17 00:00:00 2001
From: atsyplenkov
Date: Thu, 19 Mar 2026 14:27:50 +0300
Subject: [PATCH 6/7] fix: broaden html sanitization
---
README.md | 4 +-
src/core/markdown.rs | 273 ++++++++++++++++++++++++++++++++++-------
tests/core_internal.rs | 56 ++++++---
3 files changed, 268 insertions(+), 65 deletions(-)
diff --git a/README.md b/README.md
index 15fd1fc..eb09b97 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@
-`paperdown` converts research papers from PDF to Markdown using Z.AI's [GLM-OCR](https://github.com/zai-org/GLM-OCR) model, rewrites HTML tables and images into CommonMark Markdown, and downloads referenced figure assets locally.
+`paperdown` converts research papers from PDF to Markdown using Z.AI's [GLM-OCR](https://github.com/zai-org/GLM-OCR) model, rewrites common HTML into CommonMark Markdown, and downloads referenced figure assets locally.
If you work with academic papers, you know that the OCR process itself is not the most difficult part. The real challenge is cleaning up the output. Tables can disappear, their structure can become jumbled, and formulas might be converted into meaningless text. This often means you spend more time correcting the output than working with it.
@@ -28,7 +28,7 @@ Therefore, this project was created because, while [`docling`](https://github.co
## Features
- Async OCR requests and batch PDF processing using the Z.AI API.
-- Async HTML table and image cleanup using `fast_html2md`.
+- Async HTML cleanup using `fast_html2md`.
- Concurrent figure downloads for each PDF.
- Fast processing: approximately 25 seconds per batch of 32 PDFs. Speed depends on the z.ai API availability. See the cost section for more details on spending.
diff --git a/src/core/markdown.rs b/src/core/markdown.rs
index d6691bd..33ec914 100644
--- a/src/core/markdown.rs
+++ b/src/core/markdown.rs
@@ -6,6 +6,49 @@ use std::sync::LazyLock;
const HTML_FRAGMENT_CONCURRENCY: usize = 16;
+const HTML_ALLOWLIST_TAGS: &[&str] = &[
+ "img",
+ "a",
+ "p",
+ "div",
+ "span",
+ "br",
+ "hr",
+ "h1",
+ "h2",
+ "h3",
+ "h4",
+ "h5",
+ "h6",
+ "ul",
+ "ol",
+ "li",
+ "blockquote",
+ "table",
+ "thead",
+ "tbody",
+ "tfoot",
+ "tr",
+ "th",
+ "td",
+ "strong",
+ "b",
+ "em",
+ "i",
+ "u",
+ "s",
+ "del",
+ "pre",
+ "code",
+];
+
+const HTML_EXCLUDED_TAGS: &[&str] = &["math", "sub", "sup"];
+
+const HTML_VOID_TAGS: &[&str] = &[
+ "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
+ "source", "track", "wbr",
+];
+
static MARKDOWN_IMAGE_URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"\((https?://[^)\s]+)\)").expect("valid markdown image URL regex")
});
@@ -21,6 +64,21 @@ enum Segment {
Html { index: usize, raw: String },
}
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum HtmlTagKind {
+ Opening,
+ Closing,
+ Special,
+}
+
+#[derive(Debug, Clone, Copy)]
+struct ParsedHtmlTag<'a> {
+ name: &'a str,
+ kind: HtmlTagKind,
+ end: usize,
+ self_closing: bool,
+}
+
pub(crate) fn replace_image_urls(markdown: &str, replacements: &HashMap) -> String {
let updated = MARKDOWN_IMAGE_URL_PATTERN
.replace_all(markdown, |caps: &regex::Captures<'_>| {
@@ -178,17 +236,26 @@ fn join_segments(segments: &[Segment], converted: &[Option]) -> String {
}
fn extract_html_fragment(text: &str, start: usize) -> Option<(usize, String)> {
- if starts_html_tag(text, start, "table") {
- let end = find_table_fragment_end(text, start)?;
- return Some((end, text[start..end].to_string()));
+ let tag = parse_html_tag(text, start)?;
+ if tag.kind != HtmlTagKind::Opening {
+ return None;
}
- if starts_html_tag(text, start, "img") {
- let end = find_html_tag_end(text, start)?;
- return Some((end, text[start..end].to_string()));
+ if !is_html_allowlisted(tag.name) || is_html_excluded(tag.name) {
+ return None;
}
- None
+ let end = if tag.self_closing || is_html_void(tag.name) {
+ tag.end
+ } else {
+ find_html_region_end(text, start, tag.name)?
+ };
+ let fragment = text[start..end].to_string();
+ if contains_tex_delimiters(&fragment) || contains_excluded_math_tags(&fragment) {
+ return None;
+ }
+
+ Some((end, fragment))
}
async fn convert_html_fragments(fragments: Vec) -> Vec |