From ecc8cefb866cca4e9a0feaa782c611d6f287d233 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Sat, 17 Jan 2026 09:15:00 +0000
Subject: [PATCH 1/2] Harden PDF loading and handle edge cases

Identified and handled several edge cases in `load_pdf`:
- Added type validation for input path.
- Added check to ensure path is a file, not a directory.
- Added explicit check for encrypted PDFs.
- Wrapped PyMuPDF (fitz) open call to gracefully handle corrupt files and raise clearer ValueErrors.

Added `tests/test_edge_cases.py` to cover:
- Encrypted files
- Corrupt files
- Wrong input types
- Directory paths
- Image-only files (the `pdf_factory` fixture can also build zero-page PDFs, but no test exercises that yet)
---
 src/docblocks/io/pdf_reader.py | 38 +++++++++++-----
 tests/test_edge_cases.py       | 83 ++++++++++++++++++++++++++++++++++
 2 files changed, 109 insertions(+), 12 deletions(-)
 create mode 100644 tests/test_edge_cases.py

diff --git a/src/docblocks/io/pdf_reader.py b/src/docblocks/io/pdf_reader.py
index 5f988f6..aa9e341 100644
--- a/src/docblocks/io/pdf_reader.py
+++ b/src/docblocks/io/pdf_reader.py
@@ -1,12 +1,26 @@
-import fitz
-import os
-
-def load_pdf(path: str):
-    if not path.lower().endswith(".pdf"):
-        raise ValueError("Input file must be a .pdf file")
-
-    if not os.path.exists(path):
-        raise FileNotFoundError(f"File not found: {path}")
-
-    doc = fitz.open(path)
-    return [page for page in doc]
+import fitz
+import os
+
+def load_pdf(path: str):
+    if not isinstance(path, str):
+        raise TypeError(f"Input path must be a string, got {type(path).__name__}")
+
+    if not path.lower().endswith(".pdf"):
+        raise ValueError("Input file must be a .pdf file")
+
+    if not os.path.exists(path):
+        raise FileNotFoundError(f"File not found: {path}")
+
+    if not os.path.isfile(path):
+        raise ValueError(f"Path is not a file: {path}")
+
+    try:
+        doc = fitz.open(path)
+    except Exception as e:
+        # Wrap fitz errors for corrupt files
+        raise ValueError(f"Failed to open PDF file: {e}") from e
+
+    if doc.is_encrypted:
+        raise ValueError("PDF is encrypted and cannot be processed without a password")
+
+    return [page for page in doc]
diff --git a/tests/test_edge_cases.py b/tests/test_edge_cases.py
new file mode 100644
index 0000000..3f09376
--- /dev/null
+++ b/tests/test_edge_cases.py
@@ -0,0 +1,83 @@
+import pytest
+import os
+import fitz
+import docblocks
+from docblocks.io.pdf_reader import load_pdf
+
+# Helper to create PDF files
+@pytest.fixture
+def pdf_factory(tmp_path):
+    def _create_pdf(name, content="text", encrypted=False, corrupt=False, image_only=False, zero_pages=False):
+        p = tmp_path / name
+        if corrupt:
+            p.write_text("Not a PDF")
+            return str(p)
+
+        doc = fitz.open()
+        if not zero_pages:
+            page = doc.new_page()
+            if image_only:
+                # Create a minimal image
+                img_data = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x06\x00\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\nIDATx\x9cc\x00\x01\x00\x00\x05\x00\x01\r\n-\xb4\x00\x00\x00\x00IEND\xaeB`\x82'
+                page.insert_image(page.rect, stream=img_data)
+            else:
+                page.insert_text((50, 50), content)
+
+        if encrypted:
+            doc.save(str(p), encryption=fitz.PDF_ENCRYPT_AES_256, user_pw="user", owner_pw="owner")
+        else:
+            doc.save(str(p))
+        doc.close()
+        return str(p)
+    return _create_pdf
+
+def test_extract_normal(pdf_factory):
+    path = pdf_factory("normal.pdf", content="Hello World")
+    text = docblocks.extract(path)
+    assert "Hello World" in text
+    assert "PAGE 1" in text
+
+def test_extract_encrypted(pdf_factory):
+    path = pdf_factory("encrypted.pdf", encrypted=True)
+    with pytest.raises(ValueError, match="encrypted"):
+        docblocks.extract(path)
+
+def test_extract_corrupt(pdf_factory):
+    path = pdf_factory("corrupt.pdf", corrupt=True)
+    with pytest.raises(ValueError, match="Failed to open PDF file"):
+        docblocks.extract(path)
+
+def test_extract_wrong_type():
+    with pytest.raises(TypeError, match="must be a string"):
+        docblocks.extract(123)
+    with pytest.raises(TypeError, match="must be a string"):
+        docblocks.extract(None)
+
+def test_extract_directory(tmp_path):
+    d = tmp_path / "folder.pdf"
+    d.mkdir()
+    # Should raise IsADirectoryError or our custom error
+    with pytest.raises((IsADirectoryError, ValueError), match="file"):
+        docblocks.extract(str(d))
+
+def test_extract_image_only(pdf_factory):
+    path = pdf_factory("image.pdf", image_only=True)
+    text = docblocks.extract(path)
+    # Should be empty of content text, but contain page markers
+    assert "PAGE 1" in text
+    # The text content between markers should be empty or just newlines
+    lines = text.splitlines()
+    # The exact layout around the page markers depends on the output formatting,
+    # so do not assert on individual lines; only check that no body text shows up
+    # (the fixture inserted an image and no text).
+    assert "Hello" not in text
+
+def test_extract_missing_file():
+    with pytest.raises(FileNotFoundError):
+        docblocks.extract("non_existent.pdf")
+
+def test_extract_wrong_extension(tmp_path):
+    p = tmp_path / "test.txt"
+    p.write_text("content")
+    with pytest.raises(ValueError, match="must be a .pdf file"):
+        docblocks.extract(str(p))

From 1316c9cf370160c4abeefa2c70e84398f3500f42 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Sat, 17 Jan 2026 09:29:29 +0000
Subject: [PATCH 2/2] Harden PDF loading and enable PyPI publishing

- Update `load_pdf` in `src/docblocks/io/pdf_reader.py` to handle edge cases (these changes are in patch 1/2 of this series):
    - Validate input type (must be string).
    - Ensure path is a file (not a directory).
    - Check for file existence.
    - Check for `.pdf` extension.
    - Handle corrupt files by catching `fitz` exceptions.
    - Explicitly check for encrypted PDFs.
- Add `tests/test_edge_cases.py` to test these scenarios (added in patch 1/2 of this series).
- Remove `tests/test_run.py` (manual test script).
- Add `.github/workflows/publish.yml` for automated PyPI publishing on release.
---
 .github/workflows/publish.yml | 32 ++++++++++++++++++++++++++++++++
 tests/test_run.py             | 22 ----------------------
 2 files changed, 32 insertions(+), 22 deletions(-)
 create mode 100644 .github/workflows/publish.yml
 delete mode 100644 tests/test_run.py

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
new file mode 100644
index 0000000..c7b65e2
--- /dev/null
+++ b/.github/workflows/publish.yml
@@ -0,0 +1,32 @@
+name: Publish to PyPI
+
+on:
+  release:
+    types: [published]
+
+jobs:
+  build-and-publish:
+    name: Build and publish to PyPI
+    runs-on: ubuntu-latest
+    permissions:
+      id-token: write  # IMPORTANT: this permission is mandatory for trusted publishing
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.12'
+
+    - name: Install build tools
+      run: |
+        python -m pip install --upgrade pip
+        pip install build
+
+    - name: Build package
+      run: python -m build
+
+    - name: Publish to PyPI
+      uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/tests/test_run.py b/tests/test_run.py
deleted file mode 100644
index 1994b3d..0000000
--- a/tests/test_run.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from docblocks.io.pdf_reader import load_pdf
-
-PDF_PATH = "tests/sample.pdf"
-OUT_PATH = "sample_output.txt"
-
-pages = load_pdf(PDF_PATH)
-
-output_pages = []
-
-for i, p in enumerate(pages, start=1):
-    # extract raw text from page exactly as-is
-    raw = p.get_text("text")
-
-    header = f"┌──────── PAGE {i} ─────────┐"
-    footer = "└──────────────────────────┘"
-
-    output_pages.append(f"{header}\n{raw}\n{footer}")
-
-with open(OUT_PATH, "w", encoding="utf-8") as f:
-    f.write("\n\n".join(output_pages))
-
-print(f"Done! Saved to: {OUT_PATH}")