From ecc8cefb866cca4e9a0feaa782c611d6f287d233 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Sat, 17 Jan 2026 09:15:00 +0000 Subject: [PATCH 1/2] Harden PDF loading and handle edge cases Identified and handled several edge cases in `load_pdf`: - Added type validation for input path. - Added check to ensure path is a file, not a directory. - Added explicit check for encrypted PDFs. - Wrapped PyMuPDF (fitz) open call to gracefully handle corrupt files and raise clearer ValueErrors. Added `tests/test_edge_cases.py` to cover: - Encrypted files - Corrupt files - Wrong input types - Directory paths - Image-only and empty files --- src/docblocks/io/pdf_reader.py | 38 +++++++++++----- tests/test_edge_cases.py | 83 ++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+), 12 deletions(-) create mode 100644 tests/test_edge_cases.py diff --git a/src/docblocks/io/pdf_reader.py b/src/docblocks/io/pdf_reader.py index 5f988f6..aa9e341 100644 --- a/src/docblocks/io/pdf_reader.py +++ b/src/docblocks/io/pdf_reader.py @@ -1,12 +1,26 @@ -import fitz -import os - -def load_pdf(path: str): - if not path.lower().endswith(".pdf"): - raise ValueError("Input file must be a .pdf file") - - if not os.path.exists(path): - raise FileNotFoundError(f"File not found: {path}") - - doc = fitz.open(path) - return [page for page in doc] +import fitz +import os + +def load_pdf(path: str): + if not isinstance(path, str): + raise TypeError(f"Input path must be a string, got {type(path).__name__}") + + if not path.lower().endswith(".pdf"): + raise ValueError("Input file must be a .pdf file") + + if not os.path.exists(path): + raise FileNotFoundError(f"File not found: {path}") + + if not os.path.isfile(path): + raise ValueError(f"Path is not a file: {path}") + + try: + doc = fitz.open(path) + except Exception as e: + # Wrap fitz errors for corrupt files + raise ValueError(f"Failed to open PDF file: {e}") from e + + if 
doc.is_encrypted: + raise ValueError("PDF is encrypted and cannot be processed without a password") + + return [page for page in doc] diff --git a/tests/test_edge_cases.py b/tests/test_edge_cases.py new file mode 100644 index 0000000..3f09376 --- /dev/null +++ b/tests/test_edge_cases.py @@ -0,0 +1,83 @@ +import pytest +import os +import fitz +import docblocks +from docblocks.io.pdf_reader import load_pdf + +# Helper to create PDF files +@pytest.fixture +def pdf_factory(tmp_path): + def _create_pdf(name, content="text", encrypted=False, corrupt=False, image_only=False, zero_pages=False): + p = tmp_path / name + if corrupt: + p.write_text("Not a PDF") + return str(p) + + doc = fitz.open() + if not zero_pages: + page = doc.new_page() + if image_only: + # Create a minimal image + img_data = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x06\x00\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\nIDATx\x9cc\x00\x01\x00\x00\x05\x00\x01\r\n-\xb4\x00\x00\x00\x00IEND\xaeB`\x82' + page.insert_image(page.rect, stream=img_data) + else: + page.insert_text((50, 50), content) + + if encrypted: + doc.save(str(p), encryption=fitz.PDF_ENCRYPT_AES_256, user_pw="user", owner_pw="owner") + else: + doc.save(str(p)) + doc.close() + return str(p) + return _create_pdf + +def test_extract_normal(pdf_factory): + path = pdf_factory("normal.pdf", content="Hello World") + text = docblocks.extract(path) + assert "Hello World" in text + assert "PAGE 1" in text + +def test_extract_encrypted(pdf_factory): + path = pdf_factory("encrypted.pdf", encrypted=True) + with pytest.raises(ValueError, match="encrypted"): + docblocks.extract(path) + +def test_extract_corrupt(pdf_factory): + path = pdf_factory("corrupt.pdf", corrupt=True) + with pytest.raises(ValueError, match="Failed to open PDF file"): + docblocks.extract(path) + +def test_extract_wrong_type(): + with pytest.raises(TypeError, match="must be a string"): + docblocks.extract(123) + with pytest.raises(TypeError, match="must be a 
string"): + docblocks.extract(None) + +def test_extract_directory(tmp_path): + d = tmp_path / "folder.pdf" + d.mkdir() + # Should raise IsADirectoryError or our custom error + with pytest.raises((IsADirectoryError, ValueError), match="file"): + docblocks.extract(str(d)) + +def test_extract_image_only(pdf_factory): + path = pdf_factory("image.pdf", image_only=True) + text = docblocks.extract(path) + # Should be empty of content text, but contain page markers + assert "PAGE 1" in text + # The text content between markers should be empty or just newlines + lines = text.splitlines() + # PAGE 1 header, empty line, footer, empty line + # Depending on formatting. + # Just check that no text is found (we didn't add any). + assert "Hello" not in text + +def test_extract_missing_file(): + with pytest.raises(FileNotFoundError): + docblocks.extract("non_existent.pdf") + +def test_extract_wrong_extension(tmp_path): + p = tmp_path / "test.txt" + p.write_text("content") + with pytest.raises(ValueError, match="must be a .pdf file"): + docblocks.extract(str(p)) From 1316c9cf370160c4abeefa2c70e84398f3500f42 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Sat, 17 Jan 2026 09:29:29 +0000 Subject: [PATCH 2/2] Harden PDF loading and enable PyPI publishing - Update `load_pdf` in `src/docblocks/io/pdf_reader.py` to handle edge cases: - Validate input type (must be string). - Ensure path is a file (not a directory). - Check for file existence. - Check for `.pdf` extension. - Handle corrupt files by catching `fitz` exceptions. - Explicitly check for encrypted PDFs. - Add `tests/test_edge_cases.py` to test these scenarios. - Remove `tests/test_run.py` (manual test script). - Add `.github/workflows/publish.yml` for automated PyPI publishing on release. 
--- .github/workflows/publish.yml | 32 ++++++++++++++++++++++++++++++++ tests/test_run.py | 22 ---------------------- 2 files changed, 32 insertions(+), 22 deletions(-) create mode 100644 .github/workflows/publish.yml delete mode 100644 tests/test_run.py diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..c7b65e2 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,32 @@ +name: Publish to PyPI + +on: + release: + types: [published] + +jobs: + build-and-publish: + name: Build and publish to PyPI + runs-on: ubuntu-latest + permissions: + id-token: write # IMPORTANT: this permission is mandatory for trusted publishing + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Install build tools + run: | + python -m pip install --upgrade pip + pip install build + + - name: Build package + run: python -m build + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/tests/test_run.py b/tests/test_run.py deleted file mode 100644 index 1994b3d..0000000 --- a/tests/test_run.py +++ /dev/null @@ -1,22 +0,0 @@ -from docblocks.io.pdf_reader import load_pdf - -PDF_PATH = "tests/sample.pdf" -OUT_PATH = "sample_output.txt" - -pages = load_pdf(PDF_PATH) - -output_pages = [] - -for i, p in enumerate(pages, start=1): - # extract raw text from page exactly as-is - raw = p.get_text("text") - - header = f"┌──────── PAGE {i} ─────────┐" - footer = "└──────────────────────────┘" - - output_pages.append(f"{header}\n{raw}\n{footer}") - -with open(OUT_PATH, "w", encoding="utf-8") as f: - f.write("\n\n".join(output_pages)) - -print(f"Done! Saved to: {OUT_PATH}")