diff --git a/README.md b/README.md
index fb9b3aa..784ee98 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@ If you work with academic papers, you know that the OCR process itself is not th
 
 I used to rely on [`marker`](https://github.com/datalab-to/marker) for PDF parsing and thought it was great. However, after converting the [Batista et al. (2022)](https://hess.copernicus.org/articles/26/3753/2022/) article one day, I discovered that Table 4 was missing, regardless of the settings or LLMs I used (via the `--use-llm` flag). I then switched to [`docling`](https://github.com/docling-project/docling), and Table 4 reappeared, but all the formulas were gone. Furthermore, both tools require a GPU, and even on a Google Colab T4 instance, processing one article takes 4 to 5 minutes.
 
-Therefore, this project was created because, while [`docling`](https://github.com/docling-project/docling) and [`marker`](https://github.com/datalab-to/marker) are both good tools, they can sometimes miss tables or mix up table structures in ways that require manual correction. I wanted a simple, reliable process that produces a single Markdown file I can trust, a local `figures/` folder, and the ability to process my entire library quickly on my laptop.
+Therefore, this project was created because, while [`docling`](https://github.com/docling-project/docling) and [`marker`](https://github.com/datalab-to/marker) are both good tools, they can sometimes miss tables or mix up table structures in ways that require manual correction. I wanted a simple, reliable process that produces a Markdown index file I can trust, local `figures/` and optional `tables/` folders, and the ability to process my entire library quickly on my laptop.
 
 ## Features
 
@@ -71,6 +71,7 @@ paperdown converts one PDF or a directory of PDFs into markdown output folders.
 For each PDF, it creates:
 - <output>/<pdf_stem>/index.md
 - <output>/<pdf_stem>/figures/
+- <output>/<pdf_stem>/tables/ (when `--normalize-tables` is enabled)
 - <output>/<pdf_stem>/log.jsonl
 
 API key lookup order:
@@ -87,7 +88,8 @@ Options:
       --max-download-bytes <MAX_DOWNLOAD_BYTES>  Maximum allowed size (bytes) for each downloaded figure file. [default: 20971520]
       --workers <WORKERS>                        Maximum number of PDFs processed concurrently in batch mode. [default: 32]
   -v, --verbose                                  Enable verbose progress messages on stderr.
-      --overwrite                                Replace existing managed output artifacts (index.md and figures/).
+      --overwrite                                Replace existing managed output artifacts (index.md, figures/, and tables/ when enabled).
+      --normalize-tables                         Normalize OCR HTML tables into Markdown and store raw HTML under tables/.
   -h, --help                                     Print help (see a summary with '-h')
   -V, --version                                  Print version
 ```
diff --git a/src/cli.rs b/src/cli.rs
index 47e0f77..9872e53 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -10,6 +10,7 @@ use std::path::PathBuf;
 For each PDF, it creates:\n\
 - <output>/<pdf_stem>/index.md\n\
 - <output>/<pdf_stem>/figures/\n\
+- <output>/<pdf_stem>/tables/ (when --normalize-tables is enabled)\n\
 - <output>/<pdf_stem>/log.jsonl\n\n\
 API key lookup order:\n\
 1) ZAI_API_KEY from --env-file\n\
@@ -17,9 +18,11 @@ API key lookup order:\n\
     after_help = "Examples:\n  \
 paperdown --input pdf/paper.pdf\n  \
 paperdown --input pdf/ --output md/ --workers 4\n  \
-paperdown --input pdf/ --output md/ --overwrite\n\n\
+paperdown --input pdf/ --output md/ --overwrite\n  \
+paperdown --input pdf/ --output md/ --normalize-tables\n\n\
 Notes:\n  \
 Without --overwrite, existing index.md or figures/ causes a failure.\n  \
+When --normalize-tables is enabled, existing tables/ also causes a failure.\n  \
 Progress bars are shown on stderr only when running in a TTY."
 )]
 pub struct Cli {
@@ -83,6 +86,13 @@ pub struct Cli {
         help = "Replace existing managed output artifacts (index.md and figures/)."
     )]
     pub overwrite: bool,
+
+    #[arg(
+        long = "normalize-tables",
+        action = ArgAction::SetTrue,
+        help = "Normalize OCR HTML tables into Markdown and store raw HTML under tables/."
+    )]
+    pub normalize_tables: bool,
 }
 
 pub fn default_workers() -> usize {
@@ -124,6 +134,7 @@ mod tests {
         assert_eq!(cli.workers, default_workers());
         assert!(!cli.verbose);
         assert!(!cli.overwrite);
+        assert!(!cli.normalize_tables);
     }
 
     #[test]
@@ -148,6 +159,7 @@ mod tests {
         let help = cmd.render_long_help().to_string();
         assert!(help.contains("Examples:"));
         assert!(help.contains("--overwrite"));
+        assert!(help.contains("--normalize-tables"));
         let file_first = help.find("1) ZAI_API_KEY from --env-file");
         let env_second = help.find("2) ZAI_API_KEY from environment");
         assert!(file_first.is_some());
diff --git a/src/core.rs b/src/core.rs
index be90d5d..4ca44b9 100644
--- a/src/core.rs
+++ b/src/core.rs
@@ -12,6 +12,7 @@ mod input;
 mod markdown;
 mod ocr;
 mod output;
+mod table_normalization;
 
 pub fn collect_pdfs(input_path: &Path) -> Result<Vec<std::path::PathBuf>> {
     input::collect_pdfs(input_path)
@@ -29,6 +30,15 @@ pub enum ProgressEvent {
 
 pub type ProgressCallback = Arc<dyn Fn(ProgressEvent) + Send + Sync>;
 
+#[derive(Clone)]
+pub struct ProcessPdfOptions { // per-PDF settings handed to `process_pdf` as a single value
+    pub timeout: Duration, // timeout set on the reqwest client built in `process_pdf`
+    pub max_download_bytes: u64, // size limit forwarded to the figure handling step
+    pub overwrite: bool, // forwarded to `output::prepare_output_paths`
+    pub normalize_tables: bool, // when true, the markdown is passed through `table_normalization`
+    pub progress: Option<ProgressCallback>, // optional receiver of `ProgressEvent` notifications
+}
+
 #[derive(Debug, Serialize, Clone)]
 pub struct PdfSummary {
     pub pdf: String,
@@ -45,10 +55,7 @@ pub async fn process_pdf(
     pdf_path: &Path,
     output_root: &Path,
     env_file: &Path,
-    timeout: Duration,
-    max_download_bytes: u64,
-    overwrite: bool,
-    progress: Option<ProgressCallback>,
+    options: ProcessPdfOptions,
 ) -> Result<PdfSummary> {
     let run_started = Instant::now();
     let pdf_path = pdf_path
@@ -57,16 +64,23 @@ pub async fn process_pdf(
     if !pdf_path.is_file() || !input::is_pdf_path(&pdf_path) {
         return Err(anyhow!("Input must be a PDF: {}", pdf_path.display()));
     }
-    let prepared = output::prepare_output_paths(output_root, &pdf_path, overwrite)?;
-    let client = reqwest::Client::builder().timeout(timeout).build()?;
+    let prepared = output::prepare_output_paths(
+        output_root,
+        &pdf_path,
+        options.overwrite,
+        options.normalize_tables,
+    )?;
+    let client = reqwest::Client::builder()
+        .timeout(options.timeout)
+        .build()?;
 
     let api_key = input::load_api_key(env_file)?;
     let payload = ocr::build_payload(&pdf_path).await?;
-    fire(&progress, ProgressEvent::OcrStarted);
+    fire(&options.progress, ProgressEvent::OcrStarted);
     let ocr_started = Instant::now();
     let response = ocr::call_layout_parsing(&client, &api_key, payload).await?;
     let ocr_seconds = ocr_started.elapsed();
-    fire(&progress, ProgressEvent::OcrFinished);
+    fire(&options.progress, ProgressEvent::OcrFinished);
 
     let (markdown, layout_details, usage) = ocr::validate_layout_response(response)?;
 
@@ -77,22 +91,31 @@ pub async fn process_pdf(
             &layout_details,
             &client,
             &prepared.figures_dir,
-            max_download_bytes,
-            progress.clone(),
+            options.max_download_bytes,
+            options.progress.clone(),
         )
         .await?;
     let figure_seconds = figure_started.elapsed();
     let markdown = markdown::strip_html_img_alt_attributes(&markdown);
+    let (markdown, table_stats) = if options.normalize_tables {
+        let tables_dir = prepared
+            .tables_dir
+            .as_ref()
+            .expect("tables_dir must exist when normalize_tables is enabled");
+        table_normalization::normalize_tables(&markdown, tables_dir).await?
+    } else {
+        (markdown, table_normalization::TableStats::default())
+    };
 
     fire(
-        &progress,
+        &options.progress,
         ProgressEvent::MarkdownWriteStarted {
             bytes: markdown.len(),
         },
     );
     let write_started = Instant::now();
     output::atomic_write_text(&prepared.markdown_path, &markdown).await?;
-    fire(&progress, ProgressEvent::MarkdownWriteFinished);
+    fire(&options.progress, ProgressEvent::MarkdownWriteFinished);
 
     output::append_log(
         &prepared.log_path,
@@ -104,6 +127,14 @@ pub async fn process_pdf(
             "downloaded_figures": downloaded_figures,
             "remote_figure_links": remote_figure_links,
             "image_blocks": image_blocks,
+            "tables_found": table_stats.tables_found,
+            "tables_raw_written": table_stats.tables_raw_written,
+            "tables_normalized": table_stats.tables_normalized,
+            "tables_skipped_in_code": table_stats.tables_skipped_in_code,
+            "tables_skipped_nested": table_stats.tables_skipped_nested,
+            "tables_skipped_too_large": table_stats.tables_skipped_too_large,
+            "tables_failed_extract": table_stats.tables_failed_extract,
+            "tables_failed_parse": table_stats.tables_failed_parse,
             "usage": usage,
             "timing": {
                 "ocr_call_s": round3(ocr_seconds),
@@ -122,6 +153,7 @@ pub async fn process_pdf(
         downloaded_figures,
         remote_figure_links,
         image_blocks,
+        // Table stats are logged but not surfaced in the summary.
         usage,
         log_path: prepared.log_path.display().to_string(),
     })
@@ -140,9 +172,11 @@ fn round3(duration: Duration) -> f64 {
 #[cfg(feature = "internal-testing")]
 #[doc(hidden)]
 pub mod testing {
+    pub use super::ProcessPdfOptions;
     pub use super::ProgressCallback;
     pub use super::ProgressEvent;
     pub use super::process_pdf;
+    pub use super::table_normalization::TableStats;
     use anyhow::Result;
     use serde_json::Value;
     use std::collections::HashMap;
@@ -153,6 +187,7 @@ pub mod testing {
     pub struct PreparedOutputPaths {
         pub output_dir: std::path::PathBuf,
         pub figures_dir: std::path::PathBuf,
+        pub tables_dir: Option<std::path::PathBuf>,
         pub markdown_path: std::path::PathBuf,
         pub log_path: std::path::PathBuf,
     }
@@ -228,16 +263,30 @@ pub mod testing {
         output_root: &Path,
         pdf_path: &Path,
         overwrite: bool,
+        normalize_tables: bool,
     ) -> Result<PreparedOutputPaths> {
-        let prepared = super::output::prepare_output_paths(output_root, pdf_path, overwrite)?;
+        let prepared = super::output::prepare_output_paths(
+            output_root,
+            pdf_path,
+            overwrite,
+            normalize_tables,
+        )?;
         Ok(PreparedOutputPaths {
             output_dir: prepared.output_dir,
             figures_dir: prepared.figures_dir,
+            tables_dir: prepared.tables_dir,
             markdown_path: prepared.markdown_path,
             log_path: prepared.log_path,
         })
     }
 
+    pub async fn normalize_tables( // passthrough exposed by the `internal-testing` module
+        markdown: &str,
+        tables_dir: &Path,
+    ) -> Result<(String, TableStats)> {
+        super::table_normalization::normalize_tables(markdown, tables_dir).await
+    }
+
     pub async fn append_log(log_path: &Path, entry: Value) -> Result<()> {
         super::output::append_log(log_path, entry).await
     }
diff --git a/src/core/output.rs b/src/core/output.rs
index f66d3c4..902eb4a 100644
--- a/src/core/output.rs
+++ b/src/core/output.rs
@@ -9,6 +9,7 @@ use tokio::io::AsyncWriteExt;
 pub(crate) struct PreparedOutput {
     pub(crate) output_dir: PathBuf,
     pub(crate) figures_dir: PathBuf,
+    pub(crate) tables_dir: Option<PathBuf>,
     pub(crate) markdown_path: PathBuf,
     pub(crate) log_path: PathBuf,
 }
@@ -17,6 +18,7 @@ pub(crate) fn prepare_output_paths(
     output_root: &Path,
     pdf_path: &Path,
     overwrite: bool,
+    normalize_tables: bool,
 ) -> Result<PreparedOutput> {
     let stem = pdf_path
         .file_stem()
@@ -28,6 +30,7 @@ pub(crate) fn prepare_output_paths(
 
     let markdown_path = output_dir.join("index.md");
     let figures_dir = output_dir.join("figures");
+    let tables_dir = output_dir.join("tables");
     let log_path = output_dir.join("log.jsonl");
 
     if !overwrite {
@@ -43,6 +46,12 @@ pub(crate) fn prepare_output_paths(
                 figures_dir.display()
             ));
         }
+        if normalize_tables && tables_dir.exists() {
+            return Err(anyhow::anyhow!(
+                "Output already exists: {}. Re-run with --overwrite",
+                tables_dir.display()
+            ));
+        }
     } else {
         if markdown_path.exists() {
             std::fs::remove_file(&markdown_path)?;
@@ -54,13 +63,27 @@ pub(crate) fn prepare_output_paths(
                 std::fs::remove_file(&figures_dir)?;
             }
         }
+        if normalize_tables && tables_dir.exists() {
+            if tables_dir.is_dir() {
+                std::fs::remove_dir_all(&tables_dir)?;
+            } else {
+                std::fs::remove_file(&tables_dir)?;
+            }
+        }
     }
 
     std::fs::create_dir_all(&figures_dir)?;
+    let tables_dir = if normalize_tables {
+        std::fs::create_dir_all(&tables_dir)?;
+        Some(tables_dir)
+    } else {
+        None
+    };
 
     Ok(PreparedOutput {
         output_dir,
         figures_dir,
+        tables_dir,
         markdown_path,
         log_path,
     })
diff --git a/src/core/table_normalization.rs b/src/core/table_normalization.rs
new file mode 100644
index 0000000..985a160
--- /dev/null
+++ b/src/core/table_normalization.rs
@@ -0,0 +1,796 @@
+use anyhow::{Result, anyhow};
+use regex::Regex;
+use serde::Serialize;
+use std::collections::HashMap;
+use std::fmt::Write as _;
+use std::path::Path;
+
+use super::output;
+
+const RAW_HTML_LIMIT: usize = 128 * 1024; // raw table HTML longer than this (bytes) is not normalized
+const CELL_LIMIT: usize = 2_000; // maximum columns x data rows before a table counts as too large
+const NORMALIZED_CHAR_LIMIT: usize = 32 * 1024; // maximum byte length of one rendered table block
+const ROW_GROUP_SIZE: usize = 25; // a blank line separates every group of this many rendered rows
+
+#[derive(Debug, Default, Clone, Serialize)]
+pub struct TableStats { // counters filled in while `normalize_tables` walks the markdown
+    pub tables_found: usize, // `<table` start tags seen outside code
+    pub tables_raw_written: usize, // raw HTML artifacts written to the tables directory
+    pub tables_normalized: usize, // tables replaced by a rendered row listing
+    pub tables_skipped_in_code: usize, // `<table` occurrences inside fenced blocks or backtick spans
+    pub tables_skipped_nested: usize, // placeholders emitted because a nested table was detected
+    pub tables_skipped_too_large: usize, // placeholders emitted because a size limit was exceeded
+    pub tables_failed_extract: usize, // `<table` tags for which no complete element was found
+    pub tables_failed_parse: usize, // placeholders emitted because parsing returned an error
+}
+
+pub(crate) async fn normalize_tables( // rewrites HTML tables that sit outside code in `markdown`
+    markdown: &str,
+    tables_dir: &Path, // directory that receives the raw `table_NNN.html` artifacts
+) -> Result<(String, TableStats)> { // -> (rewritten markdown, counters for this run)
+    let mut out = String::with_capacity(markdown.len());
+    let mut stats = TableStats::default();
+    let mut table_index = 0usize; // running table ordinal, shared by all chunks
+    let mut pos = 0usize; // byte offset of the line currently examined
+
+    while pos < markdown.len() {
+        let line_end_pos = line_end(markdown, pos);
+        let line = &markdown[pos..line_end_pos];
+
+        if let Some((marker, fence_len)) = fence_start(line) { // fenced block: copied verbatim
+            let fence_start_pos = pos;
+            let mut fence_end = line_end_pos;
+            let mut closed = false;
+
+            while fence_end < markdown.len() { // consume lines up to and including the closing fence
+                let next_end = line_end(markdown, fence_end);
+                let next_line = &markdown[fence_end..next_end];
+                fence_end = next_end;
+                if is_closing_fence_line(next_line, marker, fence_len) {
+                    closed = true;
+                    break;
+                }
+            }
+
+            let block = &markdown[fence_start_pos..fence_end];
+            stats.tables_skipped_in_code += count_case_insensitive_occurrences(block, "<table");
+            out.push_str(block);
+            pos = fence_end;
+
+            if !closed { // an unclosed fence ran to the end of the input, nothing is left
+                break;
+            }
+            continue;
+        }
+
+        let mut chunk_end = line_end_pos; // grow the chunk until the next fence opener or the end
+        while chunk_end < markdown.len() {
+            let next_end = line_end(markdown, chunk_end);
+            let next_line = &markdown[chunk_end..next_end];
+            if fence_start(next_line).is_some() {
+                break;
+            }
+            chunk_end = next_end;
+        }
+
+        let chunk = &markdown[pos..chunk_end];
+        let rewritten =
+            rewrite_non_code_chunk(chunk, tables_dir, &mut table_index, &mut stats).await?;
+        out.push_str(&rewritten);
+        pos = chunk_end;
+    }
+
+    Ok((out, stats))
+}
+
+async fn rewrite_non_code_chunk( // rewrites the tables of one chunk that holds no fenced code
+    chunk: &str,
+    tables_dir: &Path,
+    table_index: &mut usize, // running ordinal used to name `table_NNN.html`
+    stats: &mut TableStats,
+) -> Result<String> {
+    let mut out = String::with_capacity(chunk.len());
+    let mut i = 0usize; // byte offset into `chunk`
+
+    while i < chunk.len() {
+        if let Some(run_len) = backtick_run_len(chunk, i) { // backtick span: copied verbatim
+            let end = find_matching_backtick_run(chunk, i + run_len, run_len)
+                .map(|offset| offset + run_len)
+                .unwrap_or(chunk.len()); // NOTE(review): an unmatched run swallows the rest of the chunk
+            let code = &chunk[i..end];
+            stats.tables_skipped_in_code += count_case_insensitive_occurrences(code, "<table");
+            out.push_str(code);
+            i = end;
+            continue;
+        }
+
+        if starts_tag(chunk, i, "table") {
+            stats.tables_found += 1;
+            *table_index += 1; // the ordinal is consumed even when extraction fails below
+            let ordinal = *table_index;
+            let artifact_name = format!("table_{ordinal:03}.html");
+            let artifact_rel = format!("tables/{artifact_name}"); // path quoted in the markdown
+            let artifact_path = tables_dir.join(&artifact_name);
+
+            match extract_table_span(chunk, i) {
+                TableExtraction::Failed => { // keep the remaining text untouched and stop
+                    stats.tables_failed_extract += 1;
+                    out.push_str(&chunk[i..]);
+                    break;
+                }
+                TableExtraction::Span { html, end, nested } => {
+                    output::atomic_write_text(&artifact_path, &html).await?; // raw HTML is always kept
+                    stats.tables_raw_written += 1;
+
+                    if nested {
+                        stats.tables_skipped_nested += 1;
+                        out.push_str(&render_placeholder_block(
+                            ordinal,
+                            &artifact_rel,
+                            "normalization skipped (nested table detected)",
+                        ));
+                    } else {
+                        match render_normalized_table(&html, ordinal, &artifact_rel) {
+                            Ok(Some(rendered)) => {
+                                stats.tables_normalized += 1;
+                                out.push_str(&rendered);
+                            }
+                            Ok(None) => { // a size limit was exceeded
+                                stats.tables_skipped_too_large += 1;
+                                out.push_str(&render_placeholder_block(
+                                    ordinal,
+                                    &artifact_rel,
+                                    "normalization skipped (table too large)",
+                                ));
+                            }
+                            Err(_) => { // the parse error itself is not reported, only counted
+                                stats.tables_failed_parse += 1;
+                                out.push_str(&render_placeholder_block(
+                                    ordinal,
+                                    &artifact_rel,
+                                    "normalization skipped (parse failed)",
+                                ));
+                            }
+                        }
+                    }
+
+                    i = end;
+                    continue;
+                }
+            }
+        }
+
+        let ch = chunk[i..] // ordinary text: copy one character and move on
+            .chars()
+            .next()
+            .ok_or_else(|| anyhow!("invalid markdown boundary"))?;
+        out.push(ch);
+        i += ch.len_utf8();
+    }
+
+    Ok(out)
+}
+
+fn render_normalized_table( // renders one table as a heading, a column list and one `Row:` line per row
+    html: &str,
+    ordinal: usize,
+    artifact_rel: &str,
+) -> Result<Option<String>> { // Ok(None) = a size limit was exceeded; Err = the table could not be parsed
+    if html.len() > RAW_HTML_LIMIT {
+        return Ok(None);
+    }
+
+    let parsed = parse_table_fragment(html)?;
+    if parsed.rows.is_empty() { // e.g. a table that consists of header rows only
+        return Err(anyhow!("table has no rows"));
+    }
+
+    let cell_count = parsed.columns.len() * parsed.rows.len();
+    if cell_count > CELL_LIMIT {
+        return Ok(None);
+    }
+
+    let mut out = String::new();
+    write!(&mut out, "\n\n##### OCR Table {ordinal}\n").unwrap();
+    writeln!(&mut out, "Source (OCR HTML): {artifact_rel}").unwrap();
+    write!(&mut out, "Columns: {}\n\n", parsed.columns.join(", ")).unwrap();
+
+    for (row_index, row) in parsed.rows.iter().enumerate() {
+        if row_index > 0 && row_index % ROW_GROUP_SIZE == 0 { // blank line between row groups
+            out.push('\n');
+        }
+        writeln!(&mut out, "Row: {}", render_row_json(&parsed.columns, row)?).unwrap();
+    }
+    out.push('\n');
+
+    if out.len() > NORMALIZED_CHAR_LIMIT { // checked on the finished block, in bytes
+        return Ok(None);
+    }
+
+    Ok(Some(out))
+}
+
+fn render_placeholder_block(ordinal: usize, artifact_rel: &str, reason: &str) -> String { // heading + source + status, no rows
+    format!(
+        "\n\n##### OCR Table {ordinal}\nSource (OCR HTML): {artifact_rel}\nStatus: {reason}\n\n"
+    )
+}
+
+fn render_row_json(columns: &[String], values: &[String]) -> Result<String> { // one row as a compact JSON object
+    let mut out = String::from("{");
+    for (index, (key, value)) in columns.iter().zip(values.iter()).enumerate() { // zip stops at the shorter slice
+        if index > 0 {
+            out.push(',');
+        }
+        out.push_str(&serde_json::to_string(key)?); // keys and values are escaped by serde_json
+        out.push(':');
+        out.push_str(&serde_json::to_string(value)?);
+    }
+    out.push('}');
+    Ok(out)
+}
+
+fn parse_table_fragment(fragment: &str) -> Result<ParsedTable> { // table HTML -> column keys + data rows
+    let rows = parse_rows(fragment)?;
+    if rows.is_empty() {
+        return Err(anyhow!("table has no rows"));
+    }
+
+    let header_rows = rows.iter().take_while(|row| row.has_th).count(); // leading rows flagged `has_th`
+    let expanded = expand_rows(rows)?; // NOTE(review): if every row is flagged `has_th`, no data rows remain
+    let width = expanded.width;
+    if width == 0 {
+        return Err(anyhow!("table has no columns"));
+    }
+
+    let columns = if header_rows == 0 { // no header rows: fall back to positional names
+        (1..=width).map(|index| format!("col_{index}")).collect()
+    } else {
+        build_columns(&expanded.grid, header_rows)
+    };
+
+    let mut data_rows = Vec::new();
+    for row in expanded.grid.into_iter().skip(header_rows) {
+        data_rows.push(
+            row.into_iter()
+                .map(|cell| cell.unwrap_or_default()) // slots no cell covered become empty strings
+                .collect::<Vec<_>>(),
+        );
+    }
+
+    Ok(ParsedTable {
+        columns,
+        rows: data_rows,
+    })
+}
+
+fn build_columns(grid: &[Vec<Option<String>>], header_rows: usize) -> Vec<String> { // one key per grid column
+    let width = grid.first().map(|row| row.len()).unwrap_or_default();
+    let mut raw_keys = Vec::with_capacity(width);
+
+    for col in 0..width {
+        let mut parts = Vec::new(); // header texts of this column, top to bottom
+        for row in grid.iter().take(header_rows.min(grid.len())) {
+            let value = row[col].as_ref().map(|value| value.trim()).unwrap_or("");
+            if value.is_empty() {
+                continue;
+            }
+            if parts.last().is_none_or(|last: &String| last != value) { // drop consecutive repeats
+                parts.push(value.to_string());
+            }
+        }
+        let joined = parts.join(" / ");
+        raw_keys.push(normalize_key(&joined, col + 1)); // 1-based index feeds the `col_N` fallback
+    }
+
+    disambiguate_keys(raw_keys)
+}
+
+fn disambiguate_keys(keys: Vec<String>) -> Vec<String> { // same order and length, every key unique
+    let mut seen = HashMap::<String, usize>::new(); // base key -> highest suffix tried so far
+    let mut used = std::collections::HashSet::with_capacity(keys.len()); // keys already emitted
+    let mut out = Vec::with_capacity(keys.len());
+    for key in keys {
+        let count = seen.entry(key.clone()).or_insert(1);
+        let mut candidate = key.clone(); // the first occurrence keeps its name when it is free
+        while !used.insert(candidate.clone()) { // taken, e.g. by a literal "a_2" header
+            *count += 1;
+            candidate = format!("{key}_{count}");
+        }
+        out.push(candidate);
+    }
+
+    out
+}
+
+fn normalize_key(value: &str, index: usize) -> String { // header text -> lowercase snake-style key
+    let mut out = String::new();
+    let mut last_was_underscore = false;
+
+    for ch in value.chars() {
+        if ch.is_ascii_alphanumeric() { // anything else, including non-ASCII letters, is a separator
+            out.push(ch.to_ascii_lowercase());
+            last_was_underscore = false;
+        } else if !out.is_empty() && !last_was_underscore { // one `_` per separator run, none leading
+            out.push('_');
+            last_was_underscore = true;
+        }
+    }
+
+    let trimmed = out.trim_matches('_'); // removes a trailing separator
+    if trimmed.is_empty() { // nothing usable in the header: positional fallback
+        format!("col_{index}")
+    } else {
+        trimmed.to_string()
+    }
+}
+
+/// Lays parsed rows out on a rectangular grid, copying each cell's text into every slot
+/// covered by its `colspan`/`rowspan`. Fails when the table has no columns.
+fn expand_rows(rows: Vec<ParsedRow>) -> Result<ExpandedTable> {
+    let mut grid: Vec<Vec<Option<String>>> = Vec::new();
+    let mut occupied: Vec<Vec<bool>> = Vec::new();
+    let mut width = 0usize;
+
+    for (row_index, row) in rows.iter().enumerate() {
+        ensure_row(&mut grid, &mut occupied, row_index, width);
+        let mut col = 0usize;
+
+        for cell in &row.cells {
+            while col < width && occupied[row_index][col] {
+                col += 1;
+            }
+            let required_width = col + cell.colspan;
+            if required_width > width {
+                width = required_width;
+                resize_width(&mut grid, &mut occupied, width);
+            }
+
+            // A rowspan cannot extend below the last real row (HTML clips it the same
+            // way). Without the clip an oversized value appends phantom data rows, and
+            // a bogus attribute such as rowspan="100000000" allocates that many rows.
+            for row_offset in 0..cell.rowspan.min(rows.len() - row_index) {
+                let target_row = row_index + row_offset;
+                ensure_row(&mut grid, &mut occupied, target_row, width);
+                for col_offset in 0..cell.colspan {
+                    let target_col = col + col_offset;
+                    occupied[target_row][target_col] = true;
+                    grid[target_row][target_col] = Some(cell.text.clone());
+                }
+            }
+
+            col += cell.colspan;
+        }
+    }
+
+    if width == 0 {
+        return Err(anyhow!("table has no columns"));
+    }
+
+    for row in &mut grid {
+        if row.len() < width {
+            row.resize(width, None);
+        }
+    }
+
+    Ok(ExpandedTable { grid, width })
+}
+
+fn ensure_row( // appends empty rows until `row_index` exists in both tables
+    grid: &mut Vec<Vec<Option<String>>>,
+    occupied: &mut Vec<Vec<bool>>,
+    row_index: usize,
+    width: usize, // length given to every newly created row
+) {
+    while grid.len() <= row_index {
+        grid.push(vec![None; width]);
+        occupied.push(vec![false; width]);
+    }
+}
+
+fn resize_width(grid: &mut [Vec<Option<String>>], occupied: &mut [Vec<bool>], width: usize) { // pads short rows
+    for row in grid.iter_mut() {
+        if row.len() < width { // rows are only ever extended, never truncated
+            row.resize(width, None);
+        }
+    }
+    for row in occupied.iter_mut() {
+        if row.len() < width {
+            row.resize(width, false);
+        }
+    }
+}
+
+fn parse_rows(fragment: &str) -> Result<Vec<ParsedRow>> { // one `ParsedRow` per `<tr>...</tr>` pair
+    let mut rows = Vec::new();
+    let mut pos = 0usize;
+
+    while let Some(row_start) = find_tag(fragment, pos, "tr", false) {
+        let row_open_end = find_tag_end(fragment, row_start)
+            .ok_or_else(|| anyhow!("table row start tag was not closed"))?;
+        let row_close_start = find_tag(fragment, row_open_end, "tr", true) // a missing `</tr>` is an error
+            .ok_or_else(|| anyhow!("table row end tag was not found"))?;
+        let row_close_end = find_tag_end(fragment, row_close_start)
+            .ok_or_else(|| anyhow!("table row end tag was not closed"))?;
+        let row_inner = &fragment[row_open_end..row_close_start]; // text between the two tags
+        rows.push(parse_row(row_inner)?);
+        pos = row_close_end; // resume the search after `</tr>`
+    }
+
+    Ok(rows)
+}
+
+/// Parses the `<th>`/`<td>` cells inside one `<tr>` body, in document order.
+///
+/// `has_th` is set when at least one cell is a `<th>`. Errors from `parse_cell` propagate.
+fn parse_row(row: &str) -> Result<ParsedRow> {
+    let mut cells = Vec::new();
+    let mut pos = 0usize;
+    let mut has_th = false;
+
+    while pos < row.len() {
+        // Look for both kinds of cell and take whichever opens first. Always trying `<th>`
+        // before `<td>` would jump over, and silently drop, any `<td>` that precedes a
+        // `<th>` in the same row.
+        // NOTE(review): relies on `find_tag` returning the first match at or after `pos`.
+        let next_th = find_tag(row, pos, "th", false);
+        let next_td = find_tag(row, pos, "td", false);
+        let (cell_start, name) = match (next_th, next_td) {
+            (Some(th), Some(td)) if th < td => (th, "th"),
+            (Some(th), None) => (th, "th"),
+            (_, Some(td)) => (td, "td"),
+            (None, None) => break, // no cell left; trailing text is ignored
+        };
+        let (cell, cell_end) = parse_cell(row, cell_start, name)?;
+        has_th |= name == "th";
+        cells.push(cell);
+        pos = cell_end;
+    }
+
+    Ok(ParsedRow { cells, has_th })
+}
+
+fn parse_cell(row: &str, start: usize, name: &str) -> Result<(ParsedCell, usize)> { // -> (cell, offset past its end tag)
+    let open_end =
+        find_tag_end(row, start).ok_or_else(|| anyhow!("table cell start tag was not closed"))?;
+    let close_start = find_tag(row, open_end, name, true) // an omitted end tag is reported as an error
+        .ok_or_else(|| anyhow!("table cell end tag was not found"))?;
+    let close_end = find_tag_end(row, close_start)
+        .ok_or_else(|| anyhow!("table cell end tag was not closed"))?;
+    let tag = &row[start..open_end]; // the start tag, attributes included
+    let inner = &row[open_end..close_start]; // everything between start and end tag
+    let rowspan = parse_span_attr(tag, "rowspan");
+    let colspan = parse_span_attr(tag, "colspan");
+
+    Ok((
+        ParsedCell {
+            text: html_fragment_to_text(inner),
+            rowspan,
+            colspan,
+        },
+        close_end,
+    ))
+}
+
+/// Converts a cell's inner HTML to trimmed plain text: tags are dropped (line-breaking ones
+/// become newlines) and entities known to `decode_html_entity` are decoded.
+fn html_fragment_to_text(fragment: &str) -> String {
+    let mut out = String::with_capacity(fragment.len());
+    let mut pos = 0usize;
+
+    while pos < fragment.len() {
+        // A `<` without a tag end (as in "p < 0.05") is not markup: fall through and keep
+        // it as text instead of discarding the rest of the cell.
+        if fragment.as_bytes()[pos] == b'<'
+            && let Some(tag_end) = find_tag_end(fragment, pos)
+        {
+            let tag = fragment[pos + 1..tag_end - 1].trim().to_ascii_lowercase();
+            let closing = tag.starts_with('/');
+            let is_name_end = |ch: char| !ch.is_ascii_alphanumeric();
+            let name = tag.trim_start_matches('/');
+            let name = name.split(is_name_end).next().unwrap_or(""); // whole name: <pre> is not <p>
+            let breaks_line = matches!(name, "br" | "p" | "div")
+                || (closing && matches!(name, "tr" | "td" | "th"));
+            if breaks_line {
+                out.push('\n');
+            }
+            pos = tag_end;
+            continue;
+        }
+
+        if fragment.as_bytes()[pos] == b'&'
+            && let Some((decoded, consumed)) = decode_html_entity(&fragment[pos..])
+        {
+            out.push_str(&decoded);
+            pos += consumed;
+            continue;
+        }
+
+        let ch = fragment[pos..]
+            .chars()
+            .next()
+            .expect("valid fragment boundary");
+        out.push(ch);
+        pos += ch.len_utf8();
+    }
+
+    out.trim().to_string()
+}
+
+fn decode_html_entity(fragment: &str) -> Option<(String, usize)> { // -> (decoded text, bytes consumed)
+    let end = fragment.bytes().take(32).position(|byte| byte == b';')?; // entities are short
+    let entity = &fragment[..=end];
+    let decoded = match entity {
+        "&amp;" => '&',
+        "&lt;" => '<',
+        "&gt;" => '>',
+        "&quot;" => '"',
+        "&#39;" | "&#x27;" | "&apos;" => '\'',
+        "&nbsp;" => '\u{a0}', // frequent in table cells; previously left undecoded
+        _ => {
+            let digits = entity.strip_prefix("&#")?.strip_suffix(';')?; // numeric reference or nothing
+            let code = match digits.strip_prefix(['x', 'X']) {
+                Some(hex) => u32::from_str_radix(hex, 16).ok()?,
+                None => digits.parse::<u32>().ok()?,
+            };
+            char::from_u32(code)?
+        }
+    };
+    Some((decoded.to_string(), end + 1))
+}
+
+fn parse_span_attr(tag: &str, attr: &str) -> usize { // span value of `attr` in a start tag, 1 by default
+    let pattern = format!(
+        r#"(?i)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s/>]+))"#,
+        regex::escape(attr)
+    );
+    let Ok(re) = Regex::new(&pattern) else { // NOTE(review): compiled on every call; worth caching
+        return 1;
+    };
+    let Some(caps) = re.captures(tag) else {
+        return 1;
+    };
+    let value = caps // double-quoted, single-quoted or bare value, whichever matched
+        .get(1)
+        .or_else(|| caps.get(2))
+        .or_else(|| caps.get(3))
+        .map(|value| value.as_str())
+        .unwrap_or("1");
+    value
+        .trim()
+        .parse::<usize>()
+        .ok()
+        .filter(|value| *value > 0) // zero or unparsable falls back to 1
+        .map_or(1, |value| value.min(1_000)) // cap (HTML's colspan limit) so a bogus value cannot size the grid
+}
+
+/// Extracts the complete `<table>…</table>` element whose opening tag begins at `start`.
+///
+/// Walks forward over nested `<table>` / `</table>` tags, tracking depth, until the
+/// opening tag is balanced. On success returns [`TableExtraction::Span`] with the
+/// element's HTML, the byte offset just past its closing tag, and whether any nested
+/// table was seen. Returns [`TableExtraction::Failed`] when a tag is unterminated or the
+/// table is never closed.
+fn extract_table_span(text: &str, start: usize) -> TableExtraction {
+    let Some(open_end) = find_tag_end(text, start) else {
+        return TableExtraction::Failed;
+    };
+    let mut depth = 1usize;
+    let mut nested = false;
+    let mut pos = open_end;
+
+    // Once no table tag remains the element can never be balanced, so stop right away
+    // instead of re-scanning the remaining text from every following character.
+    while let Some(tag_start) = find_next_table_tag(text, pos) {
+        let Some(tag_end) = find_tag_end(text, tag_start) else {
+            return TableExtraction::Failed;
+        };
+        pos = tag_end;
+
+        // `find_next_table_tag` only yields `<table` or `</table`, so anything that is
+        // not an opening tag is a closing one.
+        if starts_tag(text, tag_start, "table") {
+            depth += 1;
+            nested = true;
+            continue;
+        }
+
+        depth -= 1;
+        if depth == 0 {
+            return TableExtraction::Span {
+                html: text[start..pos].to_string(),
+                end: pos,
+                nested,
+            };
+        }
+    }
+
+    TableExtraction::Failed
+}
+
+/// Finds the byte offset of the first `<table` or `</table` tag at or after `start`.
+///
+/// Returns `None` when neither tag occurs in the rest of `text`.
+fn find_next_table_tag(text: &str, start: usize) -> Option<usize> {
+    if start > text.len() {
+        return None;
+    }
+    text[start..]
+        .char_indices()
+        .map(|(offset, _)| start + offset)
+        .find(|&at| starts_tag(text, at, "table") || starts_tag(text, at, "/table"))
+}
+
+/// Reports whether an HTML tag named `name` begins at byte offset `start` of `text`.
+///
+/// The name is compared ASCII case-insensitively (pass e.g. `"/table"` for a closing tag)
+/// and must be followed by ASCII whitespace, `>`, `/`, or the end of `text`, so `<tablet>`
+/// is not mistaken for `<table>`.
+fn starts_tag(text: &str, start: usize, name: &str) -> bool {
+    let bytes = text.as_bytes();
+    if bytes.get(start).copied() != Some(b'<') {
+        return false;
+    }
+
+    let name_end = start + 1 + name.len();
+    let name_matches = text
+        .get(start + 1..name_end)
+        .is_some_and(|candidate| candidate.eq_ignore_ascii_case(name));
+    if !name_matches {
+        return false;
+    }
+
+    // The byte after the name must terminate it, unless the text ends right there.
+    bytes
+        .get(name_end)
+        .copied()
+        .is_none_or(|next| next.is_ascii_whitespace() || matches!(next, b'>' | b'/'))
+}
+
+/// Finds the byte offset of the first `<name` tag (or `</name` when `closing` is set) at
+/// or after `start`, returning `None` if there is no such tag.
+fn find_tag(text: &str, start: usize, name: &str, closing: bool) -> Option<usize> {
+    let slash = if closing { "/" } else { "" };
+    let target = format!("{slash}{name}");
+    if start > text.len() {
+        return None;
+    }
+    text[start..]
+        .char_indices()
+        .map(|(offset, _)| start + offset)
+        .find(|&at| starts_tag(text, at, &target))
+}
+
+/// Finds the end of the tag that opens at byte offset `start`.
+///
+/// Scans from the byte after `start` for the first `>` that is outside single- or
+/// double-quoted attribute values and returns the offset just past it, or `None` if no
+/// such `>` exists.
+fn find_tag_end(text: &str, start: usize) -> Option<usize> {
+    // Quote character of the attribute value currently being skipped, if any.
+    let mut quote: Option<u8> = None;
+
+    for (offset, &byte) in text.as_bytes().iter().enumerate().skip(start + 1) {
+        match (quote, byte) {
+            (None, b'>') => return Some(offset + 1),
+            (None, b'\'' | b'"') => quote = Some(byte),
+            (Some(open), _) if open == byte => quote = None,
+            _ => {}
+        }
+    }
+
+    None
+}
+
+/// Detects a Markdown code-fence opener on `line`.
+///
+/// After skipping leading whitespace, the line must begin with at least three identical
+/// `` ` `` or `~` characters. Returns the fence character and the length of that run.
+fn fence_start(line: &str) -> Option<(char, usize)> {
+    let trimmed = line.trim_start();
+    let marker = trimmed
+        .chars()
+        .next()
+        .filter(|&ch| ch == '`' || ch == '~')?;
+    let len = trimmed.chars().take_while(|&ch| ch == marker).count();
+
+    (len >= 3).then_some((marker, len))
+}
+
+/// Reports whether `line` closes a fence opened with `len` repetitions of `marker`.
+///
+/// Ignoring leading whitespace, the line must consist of at least `len` `marker`
+/// characters followed only by whitespace.
+fn is_closing_fence_line(line: &str, marker: char, len: usize) -> bool {
+    let trimmed = line.trim_start();
+    let tail = trimmed.trim_start_matches(marker);
+    let run = (trimmed.len() - tail.len()) / marker.len_utf8();
+
+    run >= len && tail.chars().all(char::is_whitespace)
+}
+
+/// Measures the run of consecutive backticks that begins at byte offset `start`.
+///
+/// Returns `None` when the byte at `start` is not a backtick or `start` is out of range.
+fn backtick_run_len(text: &str, start: usize) -> Option<usize> {
+    let tail = text.as_bytes().get(start..)?;
+    let run = tail.iter().take_while(|&&byte| byte == b'`').count();
+
+    (run > 0).then_some(run)
+}
+
+/// Finds the next run of exactly `run_len` backticks at or after byte offset `start`.
+///
+/// A Markdown code span is closed only by a backtick string of the same length as the
+/// one that opened it, so longer (or shorter) runs are skipped as a whole rather than
+/// matched by their first `run_len` characters.
+///
+/// `start` should not point into the middle of a backtick run; a run is measured from
+/// the first backtick encountered. Returns the byte offset where the matching run begins,
+/// or `None` if there is none.
+fn find_matching_backtick_run(text: &str, start: usize, run_len: usize) -> Option<usize> {
+    let bytes = text.as_bytes();
+    let mut pos = start;
+
+    while pos < bytes.len() {
+        if bytes[pos] != b'`' {
+            pos += 1;
+            continue;
+        }
+
+        // Measure the whole run so a longer run is not mistaken for a match.
+        let run_start = pos;
+        while pos < bytes.len() && bytes[pos] == b'`' {
+            pos += 1;
+        }
+        if pos - run_start == run_len {
+            return Some(run_start);
+        }
+    }
+
+    None
+}
+
+/// Counts non-overlapping occurrences of `needle` in `text`, ignoring ASCII case.
+///
+/// An empty `needle` counts as zero occurrences.
+fn count_case_insensitive_occurrences(text: &str, needle: &str) -> usize {
+    if needle.is_empty() || needle.len() > text.len() {
+        return 0;
+    }
+
+    let folded_text = text.to_ascii_lowercase();
+    let folded_needle = needle.to_ascii_lowercase();
+    folded_text.matches(folded_needle.as_str()).count()
+}
+
+/// Returns the byte offset just past the first `\n` at or after `start`, or `text.len()`
+/// when no newline follows.
+fn line_end(text: &str, start: usize) -> usize {
+    match text[start..].find('\n') {
+        Some(offset) => start + offset + 1,
+        None => text.len(),
+    }
+}
+
+/// A single parsed table cell.
+#[derive(Debug)]
+struct ParsedCell {
+    /// Text content of the cell.
+    text: String,
+    /// Number of rows the cell covers.
+    // NOTE(review): presumably the `rowspan` attribute as read by `parse_span_attr`
+    // (which never yields 0) — confirm at the construction site.
+    rowspan: usize,
+    /// Number of columns the cell covers; same caveat as `rowspan`.
+    colspan: usize,
+}
+
+/// The cells parsed from one table row.
+#[derive(Debug)]
+struct ParsedRow {
+    cells: Vec<ParsedCell>,
+    // NOTE(review): presumably set when the row contains a `<th>` cell — confirm
+    // against the row parser.
+    has_th: bool,
+}
+
+/// A table reduced to column names plus rows of cell text.
+// NOTE(review): each row presumably holds one value per entry in `columns` — confirm
+// where the table is built.
+#[derive(Debug)]
+struct ParsedTable {
+    columns: Vec<String>,
+    rows: Vec<Vec<String>>,
+}
+
+/// A grid of optional cell texts together with a width.
+// NOTE(review): presumably the result of expanding rowspan/colspan, with `None` marking
+// slots no cell covers and `width` being the column count — confirm against the
+// expansion code.
+#[derive(Debug)]
+struct ExpandedTable {
+    grid: Vec<Vec<Option<String>>>,
+    width: usize,
+}
+
+/// Outcome of `extract_table_span`.
+enum TableExtraction {
+    /// A balanced `<table>…</table>` element was found.
+    Span {
+        /// The element's text, from its opening tag through its closing tag.
+        html: String,
+        /// Byte offset in the scanned text just past the closing tag.
+        end: usize,
+        /// Whether another `<table>` opened inside this one.
+        nested: bool,
+    },
+    /// A tag was unterminated or the table was never closed.
+    Failed,
+}
diff --git a/src/main.rs b/src/main.rs
index 3393b29..2907992 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -4,7 +4,9 @@ use anyhow::Result;
 use clap::Parser;
 use futures::stream::{self, StreamExt};
 use indicatif::{MultiProgress, ProgressBar, ProgressDrawTarget, ProgressStyle};
-use paperdown::core::{self, PdfSummary, ProgressCallback, ProgressEvent, collect_pdfs};
+use paperdown::core::{
+    self, PdfSummary, ProcessPdfOptions, ProgressCallback, ProgressEvent, collect_pdfs,
+};
 use std::io::IsTerminal;
 use std::path::Path;
 use std::sync::Arc;
@@ -42,10 +44,13 @@ async fn run() -> Result<i32> {
             &pdfs[0],
             &args.output,
             &args.env_file,
-            Duration::from_secs(args.timeout),
-            args.max_download_bytes,
-            args.overwrite,
-            progress_callback(&pdfs[0], progress.clone()),
+            ProcessPdfOptions {
+                timeout: Duration::from_secs(args.timeout),
+                max_download_bytes: args.max_download_bytes,
+                overwrite: args.overwrite,
+                normalize_tables: args.normalize_tables,
+                progress: progress_callback(&pdfs[0], progress.clone()),
+            },
         )
         .await?;
         print_single_summary_stdout(&summary);
@@ -60,22 +65,17 @@ async fn run() -> Result<i32> {
         let permit_pool = semaphore.clone();
         let output = args.output.clone();
         let env_file = args.env_file.clone();
-        let timeout = Duration::from_secs(args.timeout);
-        let max_download_bytes = args.max_download_bytes;
-        let overwrite = args.overwrite;
         let progress = progress.clone();
+        let options = ProcessPdfOptions {
+            timeout: Duration::from_secs(args.timeout),
+            max_download_bytes: args.max_download_bytes,
+            overwrite: args.overwrite,
+            normalize_tables: args.normalize_tables,
+            progress: progress_callback(&pdf, progress),
+        };
         async move {
             let _permit = permit_pool.acquire_owned().await.expect("semaphore");
-            let res = core::process_pdf(
-                &pdf,
-                &output,
-                &env_file,
-                timeout,
-                max_download_bytes,
-                overwrite,
-                progress_callback(&pdf, progress),
-            )
-            .await;
+            let res = core::process_pdf(&pdf, &output, &env_file, options).await;
             (pdf, res)
         }
     }))
diff --git a/tests/core_internal.rs b/tests/core_internal.rs
index ec22d14..0915f19 100644
--- a/tests/core_internal.rs
+++ b/tests/core_internal.rs
@@ -4,10 +4,10 @@
 use httpmock::prelude::*;
 use paperdown::core::collect_pdfs;
 use paperdown::core::testing::{
-    ProgressCallback, ProgressEvent, append_log, atomic_write_text, build_payload,
-    content_type_to_suffix, extract_image_url, fire_for_test, is_http_url, load_api_key,
-    prepare_output_paths, process_pdf, replace_image_urls, round3_for_test,
-    strip_html_img_alt_attributes, url_suffix, validate_layout_response,
+    ProcessPdfOptions, ProgressCallback, ProgressEvent, append_log, atomic_write_text,
+    build_payload, content_type_to_suffix, extract_image_url, fire_for_test, is_http_url,
+    load_api_key, normalize_tables, prepare_output_paths, process_pdf, replace_image_urls,
+    round3_for_test, strip_html_img_alt_attributes, url_suffix, validate_layout_response,
 };
 #[cfg(feature = "net-tests")]
 use paperdown::core::testing::{download_figure, localize_figures};
@@ -320,6 +320,101 @@ fn round3_rounds_millis() {
     assert_eq!(round3_for_test(Duration::from_millis(1234)), 1.234);
 }
 
+#[test]
+fn normalize_tables_rewrites_table_and_writes_artifact() {
+    // A single well-formed table is replaced by a normalized block and its raw HTML is
+    // written next to it.
+    let dir = TempDir::new().unwrap();
+    let source = "before\n<table>\n<tr><th>Sample Name</th><th>Value</th></tr>\n<tr><td>Alpha</td><td>1</td></tr>\n</table>\nafter";
+
+    let runtime = tokio::runtime::Runtime::new().unwrap();
+    let (rewritten, stats) = runtime
+        .block_on(normalize_tables(source, dir.path()))
+        .unwrap();
+
+    assert!(rewritten.contains("##### OCR Table 1"));
+    assert!(rewritten.contains("Source (OCR HTML): tables/table_001.html"));
+    assert!(rewritten.contains("Columns: sample_name, value"));
+    assert!(rewritten.contains(r#"Row: {"sample_name":"Alpha","value":"1"}"#));
+    assert!(dir.path().join("table_001.html").exists());
+    assert_eq!(stats.tables_found, 1);
+    assert_eq!(stats.tables_raw_written, 1);
+    assert_eq!(stats.tables_normalized, 1);
+}
+
+#[test]
+fn normalize_tables_skips_fenced_and_inline_code() {
+    // Tables inside inline code or a fenced block are counted as skipped; only the bare
+    // table is normalized.
+    let dir = TempDir::new().unwrap();
+    let source = "start `<table><tr><td>inline</td></tr></table>`\n```html\n<table><tr><td>fenced</td></tr></table>\n```\n<table><tr><td>real</td></tr></table>\nend";
+
+    let runtime = tokio::runtime::Runtime::new().unwrap();
+    let (rewritten, stats) = runtime
+        .block_on(normalize_tables(source, dir.path()))
+        .unwrap();
+
+    assert!(rewritten.contains("<table><tr><td>inline</td></tr></table>"));
+    assert!(rewritten.contains("```html"));
+    assert!(rewritten.contains("##### OCR Table 1"));
+    assert_eq!(stats.tables_found, 1);
+    assert_eq!(stats.tables_skipped_in_code, 2);
+    assert_eq!(stats.tables_normalized, 1);
+}
+
+#[test]
+fn normalize_tables_expands_rowspan_and_colspan() {
+    // Two header rows collapse into joined column names, and the body cell with
+    // `rowspan="2"` shows up in both rows it covers.
+    let dir = TempDir::new().unwrap();
+    let source = "<table>\n<tr><th rowspan=\"2\">Category</th><th colspan=\"2\">Measure</th></tr>\n<tr><th>Mean</th><th>Max</th></tr>\n<tr><td rowspan=\"2\">A</td><td>1</td><td>2</td></tr>\n<tr><td>3</td><td>4</td></tr>\n</table>";
+
+    let runtime = tokio::runtime::Runtime::new().unwrap();
+    let (rewritten, _) = runtime
+        .block_on(normalize_tables(source, dir.path()))
+        .unwrap();
+
+    assert!(rewritten.contains("Columns: category, measure_mean, measure_max"));
+    assert!(rewritten.contains(r#"Row: {"category":"A","measure_mean":"1","measure_max":"2"}"#));
+    assert!(rewritten.contains(r#"Row: {"category":"A","measure_mean":"3","measure_max":"4"}"#));
+}
+
+#[test]
+fn normalize_tables_preserves_json_escaping() {
+    // Cell text containing a newline, quotes and a backslash must survive a round trip
+    // through the emitted `Row:` JSON.
+    let dir = TempDir::new().unwrap();
+    let source = "<table><tr><th>Value</th></tr><tr><td>a;=b\n\"c\"\\d</td></tr></table>";
+
+    let runtime = tokio::runtime::Runtime::new().unwrap();
+    let (rewritten, _) = runtime
+        .block_on(normalize_tables(source, dir.path()))
+        .unwrap();
+
+    let json_text = rewritten
+        .lines()
+        .find(|line| line.starts_with("Row: "))
+        .expect("row line")
+        .trim_start_matches("Row: ");
+    let row: Value = serde_json::from_str(json_text).unwrap();
+    assert_eq!(row["value"], "a;=b\n\"c\"\\d");
+}
+
+#[test]
+fn normalize_tables_leaves_unclosed_table_intact() {
+    // An unterminated table leaves the markdown untouched and is counted as a failed
+    // extraction.
+    let dir = TempDir::new().unwrap();
+    let source = "before\n<table><tr><td>broken";
+
+    let runtime = tokio::runtime::Runtime::new().unwrap();
+    let (rewritten, stats) = runtime
+        .block_on(normalize_tables(source, dir.path()))
+        .unwrap();
+
+    assert_eq!(rewritten, source);
+    assert_eq!(stats.tables_found, 1);
+    assert_eq!(stats.tables_failed_extract, 1);
+}
+
+#[test]
+fn normalize_tables_uses_placeholder_for_oversized_table() {
+    // A single row of 2001 cells gets a "too large" placeholder, while the raw HTML
+    // artifact is still written.
+    let dir = TempDir::new().unwrap();
+    let row_cells = "<td>x</td>".repeat(2001);
+    let source = format!("<table><tr>{row_cells}</tr></table>");
+
+    let runtime = tokio::runtime::Runtime::new().unwrap();
+    let (rewritten, stats) = runtime
+        .block_on(normalize_tables(&source, dir.path()))
+        .unwrap();
+
+    assert!(rewritten.contains("normalization skipped (table too large)"));
+    assert!(dir.path().join("table_001.html").exists());
+    assert_eq!(stats.tables_found, 1);
+    assert_eq!(stats.tables_raw_written, 1);
+    assert_eq!(stats.tables_skipped_too_large, 1);
+}
+
 #[cfg(feature = "net-tests")]
 async fn start_chunked_image_server(chunks: Vec<Vec<u8>>) -> (String, tokio::task::JoinHandle<()>) {
     let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
@@ -677,7 +772,7 @@ fn prepare_output_without_overwrite_fails_on_existing_managed_artifacts() {
     std::fs::create_dir_all(&target).unwrap();
     std::fs::write(target.join("index.md"), b"old").unwrap();
 
-    let err = prepare_output_paths(&tmp.path().join("out"), &pdf, false)
+    let err = prepare_output_paths(&tmp.path().join("out"), &pdf, false, false)
         .unwrap_err()
         .to_string();
     assert!(err.contains("--overwrite"));
@@ -691,7 +786,7 @@ fn prepare_output_without_overwrite_fails_when_only_figures_exists() {
     let target = tmp.path().join("out").join("paper");
     std::fs::create_dir_all(target.join("figures")).unwrap();
 
-    let err = prepare_output_paths(&tmp.path().join("out"), &pdf, false)
+    let err = prepare_output_paths(&tmp.path().join("out"), &pdf, false, false)
         .unwrap_err()
         .to_string();
     assert!(err.contains("figures"));
@@ -707,7 +802,7 @@ fn prepare_output_without_overwrite_fails_when_both_exist() {
     std::fs::create_dir_all(target.join("figures")).unwrap();
     std::fs::write(target.join("index.md"), b"old").unwrap();
 
-    let err = prepare_output_paths(&tmp.path().join("out"), &pdf, false)
+    let err = prepare_output_paths(&tmp.path().join("out"), &pdf, false, false)
         .unwrap_err()
         .to_string();
     assert!(err.contains("index.md"));
@@ -727,7 +822,7 @@ fn prepare_output_with_overwrite_preserves_unrelated_files() {
     std::fs::write(figures.join("stale.png"), b"old").unwrap();
     std::fs::write(out.join("keep.txt"), b"keep").unwrap();
 
-    let prepared = prepare_output_paths(&tmp.path().join("out"), &pdf, true).unwrap();
+    let prepared = prepare_output_paths(&tmp.path().join("out"), &pdf, true, false).unwrap();
     assert!(prepared.figures_dir.exists());
     assert!(!prepared.figures_dir.join("stale.png").exists());
     assert!(out.join("keep.txt").exists());
@@ -742,10 +837,43 @@ fn prepare_output_with_overwrite_handles_figures_file() {
     std::fs::create_dir_all(&out).unwrap();
     std::fs::write(out.join("figures"), b"stale").unwrap();
 
-    let prepared = prepare_output_paths(&tmp.path().join("out"), &pdf, true).unwrap();
+    let prepared = prepare_output_paths(&tmp.path().join("out"), &pdf, true, false).unwrap();
     assert!(prepared.figures_dir.is_dir());
 }
 
+#[test]
+fn prepare_output_with_normalize_tables_manages_tables_dir() {
+    // With both trailing flags set, a pre-existing `tables/` directory ends up as a
+    // directory that no longer contains the stale file.
+    let scratch = TempDir::new().unwrap();
+    let pdf = scratch.path().join("paper.pdf");
+    std::fs::write(&pdf, b"%PDF").unwrap();
+    let stale_tables = scratch.path().join("out").join("paper").join("tables");
+    std::fs::create_dir_all(&stale_tables).unwrap();
+    std::fs::write(stale_tables.join("stale.html"), b"old").unwrap();
+
+    let prepared = prepare_output_paths(&scratch.path().join("out"), &pdf, true, true).unwrap();
+    let tables_dir = prepared.tables_dir.as_ref().unwrap();
+    assert!(tables_dir.is_dir());
+    assert!(!tables_dir.join("stale.html").exists());
+}
+
+#[test]
+fn prepare_output_without_overwrite_ignores_stale_tables_when_disabled() {
+    // With both trailing flags off, an existing `tables/` directory is not treated as a
+    // conflict and no tables directory is reported.
+    let scratch = TempDir::new().unwrap();
+    let pdf = scratch.path().join("paper.pdf");
+    std::fs::write(&pdf, b"%PDF").unwrap();
+    let paper_dir = scratch.path().join("out").join("paper");
+    std::fs::create_dir_all(paper_dir.join("tables")).unwrap();
+
+    let prepared = prepare_output_paths(&scratch.path().join("out"), &pdf, false, false).unwrap();
+    assert!(prepared.tables_dir.is_none());
+}
+
 #[test]
 fn extract_image_url_checks_fallback_keys() {
     let block = json!({
@@ -819,10 +947,13 @@ fn process_pdf_checks_output_conflict_before_env_lookup() {
             &pdf,
             &output_root,
             &missing_env,
-            Duration::from_secs(1),
-            1024,
-            false,
-            None,
+            ProcessPdfOptions {
+                timeout: Duration::from_secs(1),
+                max_download_bytes: 1024,
+                overwrite: false,
+                normalize_tables: false,
+                progress: None,
+            },
         ))
         .unwrap_err()
         .to_string();