From 66a8cf1d84d99c9757a48a15149def6358eea7c0 Mon Sep 17 00:00:00 2001 From: atsyplenkov Date: Wed, 25 Mar 2026 17:55:19 +0300 Subject: [PATCH 1/7] test: add failing coverage for skip-by-log and skipped summary --- tests/cli_coverage.rs | 38 +++++++++++++++++++++++++++- tests/cli_existing_output.rs | 49 +++++++++++++++++++++++++++++++++++- 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/tests/cli_coverage.rs b/tests/cli_coverage.rs index 16ec953..216d444 100644 --- a/tests/cli_coverage.rs +++ b/tests/cli_coverage.rs @@ -66,7 +66,43 @@ fn cli_batch_reports_failed_count() { assert!(!output.status.success()); let stdout = String::from_utf8_lossy(&output.stdout); let stderr = String::from_utf8_lossy(&output.stderr); - assert!(stdout.contains("Batch Complete processed: 0 failed: 2 figures: 0")); + assert!(stdout.contains("Batch Complete processed: 0 skipped: 0 failed: 2 figures: 0")); assert!(stderr.contains("failed:")); assert!(stderr.contains("OCR concurrency: 1")); } + +#[test] +fn cli_single_pdf_skips_when_log_exists_and_env_missing() { + let tmp = TempDir::new().unwrap(); + let pdf = tmp.path().join("paper.pdf"); + std::fs::write(&pdf, b"%PDF-1.7\n").unwrap(); + + let output_dir = tmp.path().join("output"); + let paper_dir = output_dir.join("paper"); + std::fs::create_dir_all(&paper_dir).unwrap(); + std::fs::write(paper_dir.join("log.jsonl"), b"{}\n").unwrap(); + + let env_file = tmp.path().join("missing.env"); + + let mut cmd = Command::cargo_bin("paperdown").unwrap(); + let output = cmd + .current_dir(tmp.path()) + .args([ + "--input", + pdf.to_str().unwrap(), + "--output", + output_dir.to_str().unwrap(), + "--env-file", + env_file.to_str().unwrap(), + ]) + .env_remove("ZAI_API_KEY") + .output() + .unwrap(); + + assert!(output.status.success()); + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!(stdout.contains("Skipped")); + assert!(stdout.contains("paper.pdf")); + assert!(!stderr.contains("ZAI_API_KEY")); +} diff --git a/tests/cli_existing_output.rs b/tests/cli_existing_output.rs index e6e11f0..66e2eb1 100644 --- a/tests/cli_existing_output.rs +++ b/tests/cli_existing_output.rs @@ -41,7 +41,7 @@ fn batch_existing_outputs_fail_before_env_or_ocr() { let stdout = String::from_utf8_lossy(&output.stdout); let stderr = String::from_utf8_lossy(&output.stderr); - assert!(stdout.contains("Batch Complete processed: 0 failed: 2 figures: 0")); + assert!(stdout.contains("Batch Complete processed: 0 skipped: 0 failed: 2 figures: 0")); assert!(stderr.contains("failed:")); assert!(stderr.contains("a.pdf")); @@ -53,3 +53,50 @@ fn batch_existing_outputs_fail_before_env_or_ocr() { assert!(!stdout.contains("\u{1b}[")); assert!(!stderr.contains("\u{1b}[")); } + +#[test] +fn batch_existing_log_outputs_skip_without_env_or_ocr() { + let temp = tempfile::tempdir().expect("tempdir"); + let pdf_dir = temp.path().join("pdf"); + let out_dir = temp.path().join("output"); + fs::create_dir_all(&pdf_dir).expect("pdf dir"); + fs::create_dir_all(&out_dir).expect("output dir"); + + let pdf_a = pdf_dir.join("a.pdf"); + let pdf_b = pdf_dir.join("b.pdf"); + fs::write(&pdf_a, b"%PDF").expect("pdf a"); + fs::write(&pdf_b, b"%PDF").expect("pdf b"); + + fs::create_dir_all(out_dir.join("a")).expect("out a"); + fs::create_dir_all(out_dir.join("b")).expect("out b"); + fs::write(out_dir.join("a/log.jsonl"), b"{}\n").expect("log a"); + fs::write(out_dir.join("b/log.jsonl"), b"{}\n").expect("log b"); + + let missing_env = temp.path().join("missing.env"); + + let output = Command::cargo_bin("paperdown") + .expect("binary") + .args([ + "--input", + pdf_dir.to_str().expect("pdf path"), + "--output", + out_dir.to_str().expect("out path"), + "--workers", + "2", + "--env-file", + missing_env.to_str().expect("env path"), + ]) + .env_remove("ZAI_API_KEY") + .output() + .expect("run"); + + assert!(output.status.success()); + + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + + assert!(stdout.contains("Batch Complete processed: 0 skipped: 2 failed: 0 figures: 0")); + assert!(!stderr.contains("ZAI_API_KEY")); + assert!(!stderr.contains("OCR concurrency:")); + assert!(!stderr.contains("failed:")); +} From 6d154c8ca8ed4cc54eee97019ce217d81a258a04 Mon Sep 17 00:00:00 2001 From: atsyplenkov Date: Wed, 25 Mar 2026 18:14:04 +0300 Subject: [PATCH 2/7] feat: skip existing outputs by log marker and report skipped counts --- src/main.rs | 158 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 149 insertions(+), 9 deletions(-) diff --git a/src/main.rs b/src/main.rs index a3ca449..1625640 100644 --- a/src/main.rs +++ b/src/main.rs @@ -37,6 +37,10 @@ async fn run() -> Result { }; if pdfs.len() == 1 { + if !args.overwrite && has_existing_log_marker(&args.output, &pdfs[0]) { + print_single_skip_summary_stdout(&pdfs[0]); + return Ok(0); + } if args.verbose { eprintln!("Processing 1 PDF: {}", pdfs[0].display()); } @@ -57,18 +61,40 @@ async fn run() -> Result { return Ok(0); } - let workers = args.workers.min(pdfs.len()).max(1); + let total_inputs = pdfs.len(); + let mut skipped_count = 0usize; + let mut process_pdfs = Vec::new(); + for pdf in pdfs { + if !args.overwrite && has_existing_log_marker(&args.output, &pdf) { + skipped_count += 1; + } else { + process_pdfs.push(pdf); + } + } + + if process_pdfs.is_empty() { + let counts = batch_accounting(total_inputs, 0, skipped_count, 0, 0); + print_batch_summary_stdout( + counts.processed, + counts.skipped, + counts.failed, + counts.figures, + ); + return Ok(0); + } + + let workers = args.workers.min(process_pdfs.len()).max(1); let ocr_workers = effective_ocr_workers(workers, args.ocr_workers); eprintln!( "Processing {} PDFs with {} workers (OCR concurrency: {})...", - pdfs.len(), + process_pdfs.len(), workers, ocr_workers ); let semaphore = Arc::new(Semaphore::new(workers)); let ocr_semaphore = Arc::new(Semaphore::new(ocr_workers)); - let results = stream::iter(pdfs.into_iter().map(|pdf| { + let results = stream::iter(process_pdfs.into_iter().map(|pdf| { let permit_pool = semaphore.clone(); let ocr_limiter = ocr_semaphore.clone(); let output = args.output.clone(); @@ -118,8 +144,20 @@ async fn run() -> Result { } } - print_batch_summary_stdout(success_count, failed_count, downloaded_figures); - Ok(if failed_count > 0 { 1 } else { 0 }) + let counts = batch_accounting( + total_inputs, + success_count, + skipped_count, + failed_count, + downloaded_figures, + ); + print_batch_summary_stdout( + counts.processed, + counts.skipped, + counts.failed, + counts.figures, + ); + Ok(if counts.failed > 0 { 1 } else { 0 }) } fn stderr_is_tty() -> bool { @@ -141,6 +179,39 @@ fn stdout_is_tty() -> bool { std::io::stdout().is_terminal() } +fn has_existing_log_marker(output_root: &Path, pdf: &Path) -> bool { + let Some(stem) = pdf.file_stem() else { + return false; + }; + let log_path = output_root.join(stem).join("log.jsonl"); + if !log_path.is_file() { + return false; + } + + let Ok(contents) = std::fs::read_to_string(&log_path) else { + return true; + }; + let Some(last_line) = contents.lines().rev().find(|line| !line.trim().is_empty()) else { + return true; + }; + let Ok(entry) = serde_json::from_str::(last_line) else { + return true; + }; + let Some(pdf_path) = entry.get("pdf_path").and_then(|value| value.as_str()) else { + return true; + }; + + pdf_path == pdf.display().to_string() +} + +fn print_single_skip_summary_stdout(pdf: &Path) { + if stdout_is_tty() { + println!("\x1b[1;33mSkipped\x1b[0m {}", display_path(pdf)); + } else { + println!("Skipped {}", display_path(pdf)); + } +} + fn print_single_summary_stdout(summary: &PdfSummary) { if stdout_is_tty() { println!( @@ -165,7 +236,35 @@ fn print_single_summary_stdout(summary: &PdfSummary) { } } -fn print_batch_summary_stdout(processed: usize, failed: usize, figures: usize) { +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +struct BatchAccounting { + processed: usize, + skipped: usize, + failed: usize, + figures: usize, +} + +fn batch_accounting( + total_inputs: usize, + processed: usize, + skipped: usize, + failed: usize, + figures: usize, +) -> BatchAccounting { + assert_eq!( + processed + skipped + failed, + total_inputs, + "batch accounting invariant violated" + ); + BatchAccounting { + processed, + skipped, + failed, + figures, + } +} + +fn print_batch_summary_stdout(processed: usize, skipped: usize, failed: usize, figures: usize) { if stdout_is_tty() { let color = if failed == 0 { "\x1b[1;32m" @@ -173,10 +272,12 @@ fn print_batch_summary_stdout(processed: usize, failed: usize, figures: usize) { "\x1b[1;33m" }; println!( - "{color}Batch Complete\x1b[0m processed: \x1b[1m{processed}\x1b[0m failed: \x1b[1m{failed}\x1b[0m figures: \x1b[1m{figures}\x1b[0m" + "{color}Batch Complete\x1b[0m processed: \x1b[1m{processed}\x1b[0m skipped: \x1b[1m{skipped}\x1b[0m failed: \x1b[1m{failed}\x1b[0m figures: \x1b[1m{figures}\x1b[0m" ); } else { - println!("Batch Complete processed: {processed} failed: {failed} figures: {figures}"); + println!( + "Batch Complete processed: {processed} skipped: {skipped} failed: {failed} figures: {figures}" + ); } } @@ -298,7 +399,8 @@ mod tests { log_path: "/tmp/out/paper/log.jsonl".to_string(), }; print_single_summary_stdout(&summary); - print_batch_summary_stdout(2, 1, 4); + print_single_skip_summary_stdout(Path::new(&summary.pdf)); + print_batch_summary_stdout(2, 1, 1, 4); } #[test] @@ -325,4 +427,42 @@ mod tests { assert_eq!(effective_ocr_workers(8, 32), 8); assert_eq!(effective_ocr_workers(1, 2), 1); } + + #[test] + fn has_existing_log_marker_returns_false_for_pdf_path_mismatch() { + let temp = tempfile::tempdir().expect("tempdir"); + let output_root = temp.path(); + let pdf = Path::new("/input/current/paper.pdf"); + let log_path = output_root.join("paper").join("log.jsonl"); + std::fs::create_dir_all(log_path.parent().expect("log parent")).expect("create log dir"); + std::fs::write( + &log_path, + "{\"pdf_path\":\"/input/current/paper.pdf\"}\n\n{\"pdf_path\":\"/input/other/paper.pdf\"}\n", + ) + .expect("write log marker"); + + assert!(!has_existing_log_marker(output_root, pdf)); + } + + mod main { + use super::*; + + mod tests { + use super::*; + + #[test] + fn batch_accounting_mixed_outcomes_is_consistent() { + let counts = batch_accounting(5, 2, 1, 2, 7); + assert_eq!( + counts, + BatchAccounting { + processed: 2, + skipped: 1, + failed: 2, + figures: 7 + } + ); + } + } + } } From f2cbd7d5a4d604530e865e6d45c0e2546c6c0897 Mon Sep 17 00:00:00 2001 From: atsyplenkov Date: Wed, 25 Mar 2026 18:27:46 +0300 Subject: [PATCH 3/7] fix: make overwrite replace full per-pdf output directory --- src/core/output.rs | 66 ++++++++++++++++++++++++++++++------------ tests/core_internal.rs | 41 ++++++++++++++++++++++++-- 2 files changed, 87 insertions(+), 20 deletions(-) diff --git a/src/core/output.rs b/src/core/output.rs index 902eb4a..0c627c3 100644 --- a/src/core/output.rs +++ b/src/core/output.rs @@ -14,6 +14,18 @@ pub(crate) struct PreparedOutput { pub(crate) log_path: PathBuf, } +fn validate_output_stem(stem: &str) -> Result<()> { + if stem.is_empty() + || stem == "." + || stem == ".." + || stem.contains('/') + || stem.contains('\\') + { + return Err(anyhow::anyhow!("Invalid output stem: {stem}")); + } + Ok(()) +} + pub(crate) fn prepare_output_paths( output_root: &Path, pdf_path: &Path, @@ -24,8 +36,22 @@ pub(crate) fn prepare_output_paths( .file_stem() .and_then(|s| s.to_str()) .ok_or_else(|| anyhow::anyhow!("Invalid PDF filename: {}", pdf_path.display()))?; + validate_output_stem(stem)?; let output_dir = output_root.join(stem); + if overwrite { + match std::fs::symlink_metadata(&output_dir) { + Ok(metadata) => { + if metadata.is_dir() { + std::fs::remove_dir_all(&output_dir)?; + } else { + std::fs::remove_file(&output_dir)?; + } + } + Err(err) if err.kind() == std::io::ErrorKind::NotFound => {} + Err(err) => return Err(err.into()), + } + } std::fs::create_dir_all(&output_dir)?; let markdown_path = output_dir.join("index.md"); @@ -52,24 +78,6 @@ pub(crate) fn prepare_output_paths( tables_dir.display() )); } - } else { - if markdown_path.exists() { - std::fs::remove_file(&markdown_path)?; - } - if figures_dir.exists() { - if figures_dir.is_dir() { - std::fs::remove_dir_all(&figures_dir)?; - } else { - std::fs::remove_file(&figures_dir)?; - } - } - if normalize_tables && tables_dir.exists() { - if tables_dir.is_dir() { - std::fs::remove_dir_all(&tables_dir)?; - } else { - std::fs::remove_file(&tables_dir)?; - } - } } std::fs::create_dir_all(&figures_dir)?; @@ -125,3 +133,25 @@ pub(crate) async fn atomic_write_bytes(path: &Path, content: &[u8]) -> Result<() fs::rename(&temp_path, path).await?; Ok(()) } + +#[cfg(test)] +mod tests { + use super::validate_output_stem; + + #[test] + fn validate_output_stem_rejects_backslash() { + let err = validate_output_stem("a\\b").unwrap_err().to_string(); + assert!(err.contains("Invalid output stem")); + } + + #[test] + fn validate_output_stem_rejects_forward_slash() { + let err = validate_output_stem("a/b").unwrap_err().to_string(); + assert!(err.contains("Invalid output stem")); + } + + #[test] + fn validate_output_stem_accepts_normal_stem() { + assert!(validate_output_stem("paper").is_ok()); + } +} diff --git a/tests/core_internal.rs b/tests/core_internal.rs index 0915f19..9b3a3b0 100644 --- a/tests/core_internal.rs +++ b/tests/core_internal.rs @@ -810,7 +810,7 @@ fn prepare_output_without_overwrite_fails_when_both_exist() { } #[test] -fn prepare_output_with_overwrite_preserves_unrelated_files() { +fn prepare_output_with_overwrite_removes_unrelated_files() { let tmp = TempDir::new().unwrap(); let pdf = tmp.path().join("paper.pdf"); std::fs::write(&pdf, b"%PDF").unwrap(); @@ -825,7 +825,22 @@ fn prepare_output_with_overwrite_preserves_unrelated_files() { let prepared = prepare_output_paths(&tmp.path().join("out"), &pdf, true, false).unwrap(); assert!(prepared.figures_dir.exists()); assert!(!prepared.figures_dir.join("stale.png").exists()); - assert!(out.join("keep.txt").exists()); + assert!(!out.join("keep.txt").exists()); +} + +#[test] +fn prepare_output_with_overwrite_replaces_output_file_path() { + let tmp = TempDir::new().unwrap(); + let pdf = tmp.path().join("paper.pdf"); + std::fs::write(&pdf, b"%PDF").unwrap(); + + let out_file = tmp.path().join("out").join("paper"); + std::fs::create_dir_all(tmp.path().join("out")).unwrap(); + std::fs::write(&out_file, b"stale").unwrap(); + + let prepared = prepare_output_paths(&tmp.path().join("out"), &pdf, true, false).unwrap(); + assert!(prepared.output_dir.is_dir()); + assert!(prepared.figures_dir.is_dir()); } #[test] @@ -874,6 +889,28 @@ fn prepare_output_without_overwrite_ignores_stale_tables_when_disabled() { assert!(prepared.tables_dir.is_none()); } +#[test] +fn prepare_output_rejects_unsafe_stems() { + let tmp = TempDir::new().unwrap(); + let output_root = tmp.path().join("out"); + std::fs::create_dir_all(&output_root).unwrap(); + + let dot_stem_pdf = tmp.path().join("..pdf"); + std::fs::write(&dot_stem_pdf, b"%PDF").unwrap(); + let err = prepare_output_paths(&output_root, &dot_stem_pdf, false, false) + .unwrap_err() + .to_string(); + assert!(err.contains("Invalid output stem")); + + let dotdot_stem_pdf = tmp.path().join("...pdf"); + std::fs::write(&dotdot_stem_pdf, b"%PDF").unwrap(); + let err = prepare_output_paths(&output_root, &dotdot_stem_pdf, false, false) + .unwrap_err() + .to_string(); + assert!(err.contains("Invalid output stem")); + +} + #[test] fn extract_image_url_checks_fallback_keys() { let block = json!({ From c1f5ee11f23967126f2d9ea231a697a2956e08a4 Mon Sep 17 00:00:00 2001 From: atsyplenkov Date: Wed, 25 Mar 2026 18:43:50 +0300 Subject: [PATCH 4/7] docs: clarify skip marker and overwrite folder replacement behavior --- CHANGELOG.md | 1 + README.md | 4 +++- src/cli.rs | 20 +++++++++++++++++--- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e33cdc..569a19c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ Possible sections are: ### Fixed: - avoid Z.AI OCR rate-limit failures in large batch runs by introducing OCR-specific concurrency control (`--ocr-workers`) and clearer HTTP 429 guidance ([#7](https://github.com/atsyplenkov/paperdown/issues/7)) +- align CLI help and README overwrite guidance with marker-based skip behavior (existing `//log.jsonl` skips by default; skip is not applied when the last marker entry `pdf_path` explicitly points to a different PDF) and conflict recovery instructions ([#11](https://github.com/atsyplenkov/paperdown/issues/11)) ## [0.2.0] - 2026-03-18 diff --git a/README.md b/README.md index 7201307..70f2fd7 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,8 @@ paperdown --input pdf/ --output md/ --workers 32 --ocr-workers 2 --overwrite `--workers` controls how many PDFs are processed concurrently in batch mode. `--ocr-workers` controls concurrent OCR API calls. Effective OCR concurrency is `min(--workers, --ocr-workers)`. +Without `--overwrite`, an existing `//log.jsonl` marker skips the PDF. Exception: skip is not applied if the marker's last non-empty entry has `pdf_path` explicitly pointing to a different PDF. When skip is not applied, existing managed artifacts can still trigger a conflict error; rerun with `--overwrite` to replace the folder. With `--overwrite`, `paperdown` replaces the whole `//` folder before processing. + ## Installation Install from crates.io: @@ -91,7 +93,7 @@ Options: --workers Maximum number of PDFs processed concurrently in batch mode. [default: 32] --ocr-workers Maximum number of concurrent OCR API calls in batch mode; effective OCR concurrency is min(--workers, --ocr-workers). [default: 2] -v, --verbose Enable verbose progress messages on stderr. - --overwrite Replace existing managed output artifacts (index.md, figures/, and tables/ when enabled). + --overwrite Replace the whole // folder before processing. --normalize-tables Normalize OCR HTML tables into Markdown and store raw HTML under tables/. -h, --help Print help (see a summary with '-h') -V, --version Print version diff --git a/src/cli.rs b/src/cli.rs index 0340e00..b1ace06 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -21,8 +21,10 @@ paperdown --input pdf/ --output md/ --workers 4\n \ paperdown --input pdf/ --output md/ --overwrite\n \ paperdown --input pdf/ --output md/ --normalize-tables\n\n\ Notes:\n \ -Without --overwrite, existing index.md or figures/ causes a failure.\n \ -When --normalize-tables is enabled, existing tables/ also causes a failure.\n \ +Without --overwrite, an existing //log.jsonl marker skips the PDF.\n \ +Exception: skip is not applied if the marker's last non-empty entry has pdf_path explicitly pointing to a different PDF.\n \ +When skip is not applied, existing managed artifacts can still trigger a conflict error; rerun with --overwrite to replace the folder.\n \ +With --overwrite, the whole // folder is replaced.\n \ Progress bars are shown on stderr only when running in a TTY." )] pub struct Cli { @@ -91,7 +93,7 @@ pub struct Cli { #[arg( long, action = ArgAction::SetTrue, - help = "Replace existing managed output artifacts (index.md and figures/)." + help = "Replace the whole // folder before processing." )] pub overwrite: bool, @@ -168,6 +170,18 @@ mod tests { assert!(help.contains("Examples:")); assert!(help.contains("--overwrite")); assert!(help.contains("--normalize-tables")); + assert!(help.contains( + "Without --overwrite, an existing //log.jsonl marker skips the PDF." + )); + assert!(help.contains( + "Exception: skip is not applied if the marker's last non-empty entry has pdf_path explicitly pointing to a different PDF." + )); + assert!(help.contains( + "When skip is not applied, existing managed artifacts can still trigger a conflict error; rerun with --overwrite to replace the folder." + )); + assert!(help.contains( + "With --overwrite, the whole // folder is replaced." + )); let file_first = help.find("1) ZAI_API_KEY from --env-file"); let env_second = help.find("2) ZAI_API_KEY from environment"); assert!(file_first.is_some()); From 9ba62eff78d03d05bc0776fe310e739f22e093d0 Mon Sep 17 00:00:00 2001 From: atsyplenkov Date: Wed, 25 Mar 2026 18:55:45 +0300 Subject: [PATCH 5/7] fix: canonicalize log marker matching for symlinked pdf paths --- src/cli.rs | 6 +++--- src/core/output.rs | 7 +------ src/main.rs | 40 +++++++++++++++++++++++++++++++++++++++- tests/core_internal.rs | 1 - 4 files changed, 43 insertions(+), 11 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index b1ace06..cc6424c 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -179,9 +179,9 @@ mod tests { assert!(help.contains( "When skip is not applied, existing managed artifacts can still trigger a conflict error; rerun with --overwrite to replace the folder." )); - assert!(help.contains( - "With --overwrite, the whole // folder is replaced." - )); + assert!( + help.contains("With --overwrite, the whole // folder is replaced.") + ); let file_first = help.find("1) ZAI_API_KEY from --env-file"); let env_second = help.find("2) ZAI_API_KEY from environment"); assert!(file_first.is_some()); diff --git a/src/core/output.rs b/src/core/output.rs index 0c627c3..5da9596 100644 --- a/src/core/output.rs +++ b/src/core/output.rs @@ -15,12 +15,7 @@ pub(crate) struct PreparedOutput { } fn validate_output_stem(stem: &str) -> Result<()> { - if stem.is_empty() - || stem == "." - || stem == ".." - || stem.contains('/') - || stem.contains('\\') - { + if stem.is_empty() || stem == "." || stem == ".." || stem.contains('/') || stem.contains('\\') { return Err(anyhow::anyhow!("Invalid output stem: {stem}")); } Ok(()) diff --git a/src/main.rs b/src/main.rs index 1625640..5a987df 100644 --- a/src/main.rs +++ b/src/main.rs @@ -201,7 +201,17 @@ fn has_existing_log_marker(output_root: &Path, pdf: &Path) -> bool { return true; }; - pdf_path == pdf.display().to_string() + let current_pdf = match pdf.canonicalize() { + Ok(path) => path, + Err(_) => return pdf_path == pdf.display().to_string(), + }; + + let marker_pdf = Path::new(pdf_path); + if let Ok(marker_canonical) = marker_pdf.canonicalize() { + return marker_canonical == current_pdf; + } + + pdf_path == current_pdf.display().to_string() } fn print_single_skip_summary_stdout(pdf: &Path) { @@ -444,6 +454,34 @@ mod tests { assert!(!has_existing_log_marker(output_root, pdf)); } + #[cfg(unix)] + #[test] + fn has_existing_log_marker_treats_symlink_and_real_path_as_same_file() { + use std::os::unix::fs::symlink; + + let temp = tempfile::tempdir().expect("tempdir"); + let real_dir = temp.path().join("real"); + let link_dir = temp.path().join("link"); + std::fs::create_dir_all(&real_dir).expect("create real dir"); + std::fs::create_dir_all(&link_dir).expect("create link dir"); + + let real_pdf = real_dir.join("paper.pdf"); + std::fs::write(&real_pdf, b"%PDF-1.4").expect("create real pdf"); + let symlink_pdf = link_dir.join("paper.pdf"); + symlink(&real_pdf, &symlink_pdf).expect("create symlink"); + + let output_root = temp.path().join("output"); + let log_path = output_root.join("paper").join("log.jsonl"); + std::fs::create_dir_all(log_path.parent().expect("log parent")).expect("create log dir"); + std::fs::write( + &log_path, + format!("{{\"pdf_path\":\"{}\"}}\n", real_pdf.display()), + ) + .expect("write log marker"); + + assert!(has_existing_log_marker(&output_root, &symlink_pdf)); + } + mod main { use super::*; diff --git a/tests/core_internal.rs b/tests/core_internal.rs index 9b3a3b0..7303bd7 100644 --- a/tests/core_internal.rs +++ b/tests/core_internal.rs @@ -908,7 +908,6 @@ fn prepare_output_rejects_unsafe_stems() { .unwrap_err() .to_string(); assert!(err.contains("Invalid output stem")); - } #[test] From a3bfc5e7234a5a2cb57b185bdac6d9bfb47d482a Mon Sep 17 00:00:00 2001 From: atsyplenkov Date: Wed, 25 Mar 2026 19:08:39 +0300 Subject: [PATCH 6/7] fix: skip only when log marker file exists --- CHANGELOG.md | 2 +- README.md | 2 +- src/cli.rs | 8 ++---- src/main.rs | 71 ++++++++-------------------------------------------- 4 files changed, 15 insertions(+), 68 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 569a19c..259ce97 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ Possible sections are: ### Fixed: - avoid Z.AI OCR rate-limit failures in large batch runs by introducing OCR-specific concurrency control (`--ocr-workers`) and clearer HTTP 429 guidance ([#7](https://github.com/atsyplenkov/paperdown/issues/7)) -- align CLI help and README overwrite guidance with marker-based skip behavior (existing `//log.jsonl` skips by default; skip is not applied when the last marker entry `pdf_path` explicitly points to a different PDF) and conflict recovery instructions ([#11](https://github.com/atsyplenkov/paperdown/issues/11)) +- align CLI help and README overwrite guidance with marker-based skip behavior (skip only when `//log.jsonl` exists) and conflict recovery instructions ([#11](https://github.com/atsyplenkov/paperdown/issues/11)) ## [0.2.0] - 2026-03-18 diff --git a/README.md b/README.md index 70f2fd7..a0ca3c4 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ paperdown --input pdf/ --output md/ --workers 32 --ocr-workers 2 --overwrite `--workers` controls how many PDFs are processed concurrently in batch mode. `--ocr-workers` controls concurrent OCR API calls. Effective OCR concurrency is `min(--workers, --ocr-workers)`. -Without `--overwrite`, an existing `//log.jsonl` marker skips the PDF. Exception: skip is not applied if the marker's last non-empty entry has `pdf_path` explicitly pointing to a different PDF. When skip is not applied, existing managed artifacts can still trigger a conflict error; rerun with `--overwrite` to replace the folder. With `--overwrite`, `paperdown` replaces the whole `//` folder before processing. +Without `--overwrite`, an existing `//log.jsonl` marker skips the PDF. If the log marker is missing, existing managed artifacts can still trigger a conflict error; rerun with `--overwrite` to replace the folder. With `--overwrite`, `paperdown` replaces the whole `//` folder before processing. ## Installation diff --git a/src/cli.rs b/src/cli.rs index cc6424c..51d240a 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -22,8 +22,7 @@ paperdown --input pdf/ --output md/ --overwrite\n \ paperdown --input pdf/ --output md/ --normalize-tables\n\n\ Notes:\n \ Without --overwrite, an existing //log.jsonl marker skips the PDF.\n \ -Exception: skip is not applied if the marker's last non-empty entry has pdf_path explicitly pointing to a different PDF.\n \ -When skip is not applied, existing managed artifacts can still trigger a conflict error; rerun with --overwrite to replace the folder.\n \ +If the log marker is missing, existing managed artifacts can still trigger a conflict error; rerun with --overwrite to replace the folder.\n \ With --overwrite, the whole // folder is replaced.\n \ Progress bars are shown on stderr only when running in a TTY." )] @@ -174,10 +173,7 @@ mod tests { "Without --overwrite, an existing //log.jsonl marker skips the PDF." )); assert!(help.contains( - "Exception: skip is not applied if the marker's last non-empty entry has pdf_path explicitly pointing to a different PDF." - )); - assert!(help.contains( - "When skip is not applied, existing managed artifacts can still trigger a conflict error; rerun with --overwrite to replace the folder." + "If the log marker is missing, existing managed artifacts can still trigger a conflict error; rerun with --overwrite to replace the folder." )); assert!( help.contains("With --overwrite, the whole // folder is replaced.") diff --git a/src/main.rs b/src/main.rs index 5a987df..ba302b8 100644 --- a/src/main.rs +++ b/src/main.rs @@ -183,35 +183,7 @@ fn has_existing_log_marker(output_root: &Path, pdf: &Path) -> bool { let Some(stem) = pdf.file_stem() else { return false; }; - let log_path = output_root.join(stem).join("log.jsonl"); - if !log_path.is_file() { - return false; - } - - let Ok(contents) = std::fs::read_to_string(&log_path) else { - return true; - }; - let Some(last_line) = contents.lines().rev().find(|line| !line.trim().is_empty()) else { - return true; - }; - let Ok(entry) = serde_json::from_str::(last_line) else { - return true; - }; - let Some(pdf_path) = entry.get("pdf_path").and_then(|value| value.as_str()) else { - return true; - }; - - let current_pdf = match pdf.canonicalize() { - Ok(path) => path, - Err(_) => return pdf_path == pdf.display().to_string(), - }; - - let marker_pdf = Path::new(pdf_path); - if let Ok(marker_canonical) = marker_pdf.canonicalize() { - return marker_canonical == current_pdf; - } - - pdf_path == current_pdf.display().to_string() + output_root.join(stem).join("log.jsonl").is_file() } fn print_single_skip_summary_stdout(pdf: &Path) { @@ -439,47 +411,26 @@ mod tests { } #[test] - fn has_existing_log_marker_returns_false_for_pdf_path_mismatch() { + fn has_existing_log_marker_returns_true_when_log_file_exists() { let temp = tempfile::tempdir().expect("tempdir"); let output_root = temp.path(); - let pdf = Path::new("/input/current/paper.pdf"); + let pdf = temp.path().join("paper.pdf"); + std::fs::write(&pdf, b"%PDF-1.4").expect("create pdf"); let log_path = output_root.join("paper").join("log.jsonl"); std::fs::create_dir_all(log_path.parent().expect("log parent")).expect("create log dir"); - std::fs::write( - &log_path, - "{\"pdf_path\":\"/input/current/paper.pdf\"}\n\n{\"pdf_path\":\"/input/other/paper.pdf\"}\n", - ) - .expect("write log marker"); + std::fs::write(&log_path, b"{}\n").expect("write log marker"); - assert!(!has_existing_log_marker(output_root, pdf)); + assert!(has_existing_log_marker(output_root, &pdf)); } - #[cfg(unix)] #[test] - fn has_existing_log_marker_treats_symlink_and_real_path_as_same_file() { - use std::os::unix::fs::symlink; - + fn has_existing_log_marker_returns_false_when_log_file_missing() { let temp = tempfile::tempdir().expect("tempdir"); - let real_dir = temp.path().join("real"); - let link_dir = temp.path().join("link"); - std::fs::create_dir_all(&real_dir).expect("create real dir"); - std::fs::create_dir_all(&link_dir).expect("create link dir"); - - let real_pdf = real_dir.join("paper.pdf"); - std::fs::write(&real_pdf, b"%PDF-1.4").expect("create real pdf"); - let symlink_pdf = link_dir.join("paper.pdf"); - symlink(&real_pdf, &symlink_pdf).expect("create symlink"); - - let output_root = temp.path().join("output"); - let log_path = output_root.join("paper").join("log.jsonl"); - std::fs::create_dir_all(log_path.parent().expect("log parent")).expect("create log dir"); - std::fs::write( - &log_path, - format!("{{\"pdf_path\":\"{}\"}}\n", real_pdf.display()), - ) - .expect("write log marker"); + let output_root = temp.path(); + let pdf = temp.path().join("paper.pdf"); + std::fs::write(&pdf, b"%PDF-1.4").expect("create pdf"); - assert!(has_existing_log_marker(&output_root, &symlink_pdf)); + assert!(!has_existing_log_marker(output_root, &pdf)); } mod main { From 6ebda1f66ee0359b3b7a1c559b168bb54c15ec68 Mon Sep 17 00:00:00 2001 From: atsyplenkov Date: Wed, 25 Mar 2026 21:11:08 +0300 Subject: [PATCH 7/7] fix: refresh stale outputs when log marker is missing --- CHANGELOG.md | 2 +- README.md | 2 +- src/cli.rs | 4 +- src/core/output.rs | 46 +++++++-------- tests/cli_existing_output.rs | 10 +++- tests/core_internal.rs | 108 ++++++++++++++++++++++++++++++----- 6 files changed, 126 insertions(+), 46 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 259ce97..fcd19d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ Possible sections are: ### Fixed: - avoid Z.AI OCR rate-limit failures in large batch runs by introducing OCR-specific concurrency control (`--ocr-workers`) and clearer HTTP 429 guidance ([#7](https://github.com/atsyplenkov/paperdown/issues/7)) -- align CLI help and README overwrite guidance with marker-based skip behavior (skip only when `//log.jsonl` exists) and conflict recovery instructions ([#11](https://github.com/atsyplenkov/paperdown/issues/11)) +- align skip and output-reuse behavior with marker-based semantics: skip only when `//log.jsonl` exists; otherwise refresh managed artifacts and continue processing ([#11](https://github.com/atsyplenkov/paperdown/issues/11)) ## [0.2.0] - 2026-03-18 diff --git a/README.md b/README.md index a0ca3c4..8d28c5f 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ paperdown --input pdf/ --output md/ --workers 32 --ocr-workers 2 --overwrite `--workers` controls how many PDFs are processed concurrently in batch mode. `--ocr-workers` controls concurrent OCR API calls. Effective OCR concurrency is `min(--workers, --ocr-workers)`. -Without `--overwrite`, an existing `//log.jsonl` marker skips the PDF. If the log marker is missing, existing managed artifacts can still trigger a conflict error; rerun with `--overwrite` to replace the folder. With `--overwrite`, `paperdown` replaces the whole `//` folder before processing. +Without `--overwrite`, an existing `//log.jsonl` marker skips the PDF. If the log marker is missing, `paperdown` treats the PDF as unprocessed and refreshes managed artifacts (`index.md`, `figures/`, and `tables/` when `--normalize-tables` is enabled). With `--overwrite`, `paperdown` replaces the whole `//` folder before processing. ## Installation diff --git a/src/cli.rs b/src/cli.rs index 51d240a..d39a9d1 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -22,7 +22,7 @@ paperdown --input pdf/ --output md/ --overwrite\n \ paperdown --input pdf/ --output md/ --normalize-tables\n\n\ Notes:\n \ Without --overwrite, an existing //log.jsonl marker skips the PDF.\n \ -If the log marker is missing, existing managed artifacts can still trigger a conflict error; rerun with --overwrite to replace the folder.\n \ +If the log marker is missing, paperdown treats the PDF as unprocessed and refreshes managed artifacts (index.md, figures/, and tables/ when enabled).\n \ With --overwrite, the whole // folder is replaced.\n \ Progress bars are shown on stderr only when running in a TTY." )] @@ -173,7 +173,7 @@ mod tests { "Without --overwrite, an existing //log.jsonl marker skips the PDF." )); assert!(help.contains( - "If the log marker is missing, existing managed artifacts can still trigger a conflict error; rerun with --overwrite to replace the folder." + "If the log marker is missing, paperdown treats the PDF as unprocessed and refreshes managed artifacts (index.md, figures/, and tables/ when enabled)." )); assert!( help.contains("With --overwrite, the whole // folder is replaced.") diff --git a/src/core/output.rs b/src/core/output.rs index 5da9596..afa2fc9 100644 --- a/src/core/output.rs +++ b/src/core/output.rs @@ -14,6 +14,21 @@ pub(crate) struct PreparedOutput { pub(crate) log_path: PathBuf, } +fn remove_path_if_exists(path: &Path) -> Result<()> { + match std::fs::symlink_metadata(path) { + Ok(metadata) => { + if metadata.is_dir() { + std::fs::remove_dir_all(path)?; + } else { + std::fs::remove_file(path)?; + } + } + Err(err) if err.kind() == std::io::ErrorKind::NotFound => {} + Err(err) => return Err(err.into()), + } + Ok(()) +} + fn validate_output_stem(stem: &str) -> Result<()> { if stem.is_empty() || stem == "." || stem == ".." || stem.contains('/') || stem.contains('\\') { return Err(anyhow::anyhow!("Invalid output stem: {stem}")); @@ -35,17 +50,7 @@ pub(crate) fn prepare_output_paths( let output_dir = output_root.join(stem); if overwrite { - match std::fs::symlink_metadata(&output_dir) { - Ok(metadata) => { - if metadata.is_dir() { - std::fs::remove_dir_all(&output_dir)?; - } else { - std::fs::remove_file(&output_dir)?; - } - } - Err(err) if err.kind() == std::io::ErrorKind::NotFound => {} - Err(err) => return Err(err.into()), - } + remove_path_if_exists(&output_dir)?; } std::fs::create_dir_all(&output_dir)?; @@ -55,23 +60,16 @@ pub(crate) fn prepare_output_paths( let log_path = output_dir.join("log.jsonl"); if !overwrite { - if markdown_path.exists() { + if log_path.is_file() { return Err(anyhow::anyhow!( "Output already exists: {}. Re-run with --overwrite", - markdown_path.display() + log_path.display() )); } - if figures_dir.exists() { - return Err(anyhow::anyhow!( - "Output already exists: {}. Re-run with --overwrite", - figures_dir.display() - )); - } - if normalize_tables && tables_dir.exists() { - return Err(anyhow::anyhow!( - "Output already exists: {}. Re-run with --overwrite", - tables_dir.display() - )); + remove_path_if_exists(&markdown_path)?; + remove_path_if_exists(&figures_dir)?; + if normalize_tables { + remove_path_if_exists(&tables_dir)?; } } diff --git a/tests/cli_existing_output.rs b/tests/cli_existing_output.rs index 66e2eb1..f9577f5 100644 --- a/tests/cli_existing_output.rs +++ b/tests/cli_existing_output.rs @@ -2,7 +2,7 @@ use assert_cmd::Command; use std::fs; #[test] -fn batch_existing_outputs_fail_before_env_or_ocr() { +fn batch_without_log_marker_reaches_env_lookup_even_with_stale_outputs() { let temp = tempfile::tempdir().expect("tempdir"); let pdf_dir = temp.path().join("pdf"); let out_dir = temp.path().join("md"); @@ -17,6 +17,10 @@ fn batch_existing_outputs_fail_before_env_or_ocr() { fs::create_dir_all(out_dir.join("b")).expect("out b"); fs::write(out_dir.join("a/index.md"), b"old").expect("index a"); fs::write(out_dir.join("b/index.md"), b"old").expect("index b"); + fs::create_dir_all(out_dir.join("a/figures")).expect("figures a"); + fs::create_dir_all(out_dir.join("b/figures")).expect("figures b"); + fs::write(out_dir.join("a/figures/stale.png"), b"old").expect("stale fig a"); + fs::write(out_dir.join("b/figures/stale.png"), b"old").expect("stale fig b"); let missing_env = temp.path().join("missing.env"); @@ -46,10 +50,10 @@ fn batch_existing_outputs_fail_before_env_or_ocr() { assert!(stderr.contains("failed:")); assert!(stderr.contains("a.pdf")); assert!(stderr.contains("b.pdf")); - assert!(stderr.contains("Re-run with --overwrite")); assert!(stderr.contains("OCR concurrency:")); - assert!(!stderr.contains("ZAI_API_KEY")); + assert!(stderr.contains("ZAI_API_KEY")); + assert!(!stderr.contains("Re-run with --overwrite")); assert!(!stdout.contains("\u{1b}[")); assert!(!stderr.contains("\u{1b}[")); } diff --git a/tests/core_internal.rs b/tests/core_internal.rs index 7303bd7..08424be 100644 --- a/tests/core_internal.rs +++ b/tests/core_internal.rs @@ -764,7 +764,7 @@ fn strip_html_img_alt_attributes_keeps_localized_image_urls() { } #[test] -fn prepare_output_without_overwrite_fails_on_existing_managed_artifacts() { +fn prepare_output_without_overwrite_replaces_existing_index_when_log_missing() { let tmp = TempDir::new().unwrap(); let pdf = tmp.path().join("paper.pdf"); std::fs::write(&pdf, b"%PDF").unwrap(); @@ -772,41 +772,59 @@ fn prepare_output_without_overwrite_fails_on_existing_managed_artifacts() { std::fs::create_dir_all(&target).unwrap(); std::fs::write(target.join("index.md"), b"old").unwrap(); - let err = prepare_output_paths(&tmp.path().join("out"), &pdf, false, false) - .unwrap_err() - .to_string(); - assert!(err.contains("--overwrite")); + let prepared = prepare_output_paths(&tmp.path().join("out"), &pdf, false, false).unwrap(); + assert_eq!(prepared.markdown_path, target.join("index.md")); + assert!(!prepared.markdown_path.exists()); + assert!(prepared.figures_dir.is_dir()); } #[test] -fn prepare_output_without_overwrite_fails_when_only_figures_exists() { +fn prepare_output_without_overwrite_cleans_stale_figures_when_log_missing() { let tmp = TempDir::new().unwrap(); let pdf = tmp.path().join("paper.pdf"); std::fs::write(&pdf, b"%PDF").unwrap(); let target = tmp.path().join("out").join("paper"); std::fs::create_dir_all(target.join("figures")).unwrap(); + std::fs::write(target.join("figures").join("stale.png"), b"old").unwrap(); - let err = prepare_output_paths(&tmp.path().join("out"), &pdf, false, false) - .unwrap_err() - .to_string(); - assert!(err.contains("figures")); - assert!(err.contains("--overwrite")); + let prepared = prepare_output_paths(&tmp.path().join("out"), &pdf, false, false).unwrap(); + assert!(prepared.figures_dir.is_dir()); + assert!(!prepared.figures_dir.join("stale.png").exists()); +} + +#[test] +fn prepare_output_without_overwrite_cleans_index_and_figures_when_log_missing() { + let tmp = TempDir::new().unwrap(); + let pdf = tmp.path().join("paper.pdf"); + std::fs::write(&pdf, b"%PDF").unwrap(); + let target = tmp.path().join("out").join("paper"); + std::fs::create_dir_all(target.join("figures")).unwrap(); + std::fs::write(target.join("figures").join("stale.png"), b"old").unwrap(); + std::fs::write(target.join("index.md"), b"old").unwrap(); + + let prepared = prepare_output_paths(&tmp.path().join("out"), &pdf, false, false).unwrap(); + assert!(!prepared.markdown_path.exists()); + assert!(!prepared.figures_dir.join("stale.png").exists()); } #[test] -fn prepare_output_without_overwrite_fails_when_both_exist() { +fn prepare_output_without_overwrite_preserves_completed_output_when_log_exists() { let tmp = TempDir::new().unwrap(); let pdf = tmp.path().join("paper.pdf"); std::fs::write(&pdf, b"%PDF").unwrap(); let target = tmp.path().join("out").join("paper"); std::fs::create_dir_all(target.join("figures")).unwrap(); + std::fs::write(target.join("figures").join("stale.png"), b"old").unwrap(); std::fs::write(target.join("index.md"), b"old").unwrap(); + std::fs::write(target.join("log.jsonl"), b"{}\n").unwrap(); let err = prepare_output_paths(&tmp.path().join("out"), &pdf, false, false) .unwrap_err() .to_string(); - assert!(err.contains("index.md")); + assert!(err.contains("log.jsonl")); assert!(err.contains("--overwrite")); + assert!(target.join("index.md").exists()); + assert!(target.join("figures").join("stale.png").exists()); } #[test] @@ -877,6 +895,27 @@ fn prepare_output_with_normalize_tables_manages_tables_dir() { ); } +#[test] +fn prepare_output_without_overwrite_cleans_tables_when_enabled_and_log_missing() { + let tmp = TempDir::new().unwrap(); + let pdf = tmp.path().join("paper.pdf"); + std::fs::write(&pdf, b"%PDF").unwrap(); + let out = tmp.path().join("out").join("paper"); + std::fs::create_dir_all(out.join("tables")).unwrap(); + std::fs::write(out.join("tables").join("stale.html"), b"old").unwrap(); + + let prepared = prepare_output_paths(&tmp.path().join("out"), &pdf, false, true).unwrap(); + assert!(prepared.tables_dir.as_ref().unwrap().is_dir()); + assert!( + !prepared + .tables_dir + .as_ref() + .unwrap() + .join("stale.html") + .exists() + ); +} + #[test] fn prepare_output_without_overwrite_ignores_stale_tables_when_disabled() { let tmp = TempDir::new().unwrap(); @@ -961,7 +1000,7 @@ fn load_api_key_parses_quoted_value() { } #[test] -fn process_pdf_checks_output_conflict_before_env_lookup() { +fn process_pdf_checks_log_conflict_before_env_lookup() { let _guard = env_lock().lock().unwrap(); unsafe { std::env::remove_var("ZAI_API_KEY"); @@ -974,7 +1013,7 @@ fn process_pdf_checks_output_conflict_before_env_lookup() { let output_root = tmp.path().join("out"); let output_dir = output_root.join("paper"); std::fs::create_dir_all(&output_dir).unwrap(); - std::fs::write(output_dir.join("index.md"), b"existing").unwrap(); + std::fs::write(output_dir.join("log.jsonl"), b"{}\n").unwrap(); let missing_env = tmp.path().join("missing.env"); let rt = tokio::runtime::Runtime::new().unwrap(); @@ -997,3 +1036,42 @@ fn process_pdf_checks_output_conflict_before_env_lookup() { assert!(err.contains("Re-run with --overwrite")); assert!(!err.contains("ZAI_API_KEY")); } + +#[test] +fn process_pdf_reaches_env_lookup_when_log_missing_despite_stale_outputs() { + let _guard = env_lock().lock().unwrap(); + unsafe { + std::env::remove_var("ZAI_API_KEY"); + } + + let tmp = TempDir::new().unwrap(); + let pdf = tmp.path().join("paper.pdf"); + std::fs::write(&pdf, b"%PDF").unwrap(); + + let output_root = tmp.path().join("out"); + let output_dir = output_root.join("paper"); + std::fs::create_dir_all(output_dir.join("figures")).unwrap(); + std::fs::write(output_dir.join("index.md"), b"existing").unwrap(); + std::fs::write(output_dir.join("figures").join("stale.png"), b"old").unwrap(); + + let missing_env = tmp.path().join("missing.env"); + let rt = tokio::runtime::Runtime::new().unwrap(); + let err = rt + .block_on(process_pdf( + &pdf, + &output_root, + &missing_env, + ProcessPdfOptions { + timeout: Duration::from_secs(1), + max_download_bytes: 1024, + overwrite: false, + normalize_tables: false, + progress: None, + }, + )) + .unwrap_err() + .to_string(); + + assert!(err.contains("ZAI_API_KEY")); + assert!(!err.contains("Re-run with --overwrite")); +}