Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Build the package and upload it to PyPI whenever a GitHub release is published.
name: Publish to PyPI

on:
  release:
    types: [published]

jobs:
  build-and-publish:
    name: Build and publish to PyPI
    runs-on: ubuntu-latest
    permissions:
      # Declaring any permission sets every unlisted scope to "none", so the
      # read access that actions/checkout relies on is granted explicitly.
      contents: read
      id-token: write # IMPORTANT: this permission is mandatory for trusted publishing

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install build tools
        run: |
          python -m pip install --upgrade pip
          pip install build

      - name: Build package
        run: python -m build

      - name: Publish to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
38 changes: 26 additions & 12 deletions src/docblocks/io/pdf_reader.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,26 @@
import fitz
import os

def load_pdf(path: str):
    """Open the PDF at ``path`` and return its pages as a list.

    Raises:
        ValueError: If ``path`` does not end with ``.pdf`` (case-insensitive).
        FileNotFoundError: If nothing exists at ``path``.
    """
    has_pdf_suffix = path.lower().endswith(".pdf")
    if not has_pdf_suffix:
        raise ValueError("Input file must be a .pdf file")

    path_exists = os.path.exists(path)
    if not path_exists:
        raise FileNotFoundError(f"File not found: {path}")

    document = fitz.open(path)
    return list(document)
import fitz
import os

def load_pdf(path: str):
    """Validate ``path``, open it as a PDF and return its pages as a list.

    Args:
        path: Filesystem path of the PDF to open. Must be a ``str``.

    Returns:
        A list with one page object per page of the document.

    Raises:
        TypeError: If ``path`` is not a string.
        ValueError: If ``path`` does not end with ``.pdf`` (case-insensitive),
            is not a regular file, cannot be opened as a PDF, or is encrypted.
        FileNotFoundError: If nothing exists at ``path``.
    """
    if not isinstance(path, str):
        raise TypeError(f"Input path must be a string, got {type(path).__name__}")

    if not path.lower().endswith(".pdf"):
        raise ValueError("Input file must be a .pdf file")

    if not os.path.exists(path):
        raise FileNotFoundError(f"File not found: {path}")

    if not os.path.isfile(path):
        raise ValueError(f"Path is not a file: {path}")

    try:
        doc = fitz.open(path)
    except Exception as e:
        # Deliberately broad: whatever the PDF backend raises for an unreadable
        # or corrupt file is surfaced as a single ValueError, with the original
        # exception chained for debugging.
        raise ValueError(f"Failed to open PDF file: {e}") from e

    if doc.is_encrypted:
        # Release the document before bailing out so the handle is not leaked.
        doc.close()
        raise ValueError("PDF is encrypted and cannot be processed without a password")

    return list(doc)
83 changes: 83 additions & 0 deletions tests/test_edge_cases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import pytest
import os
import fitz
import docblocks
from docblocks.io.pdf_reader import load_pdf

# Helper to create PDF files
@pytest.fixture
def pdf_factory(tmp_path):
    """Return a factory that writes small test files under ``tmp_path``.

    The returned callable accepts:
        name: File name to create inside ``tmp_path``.
        content: Text inserted on the page when a text page is created.
        encrypted: Save with AES-256 encryption (user password ``"user"``,
            owner password ``"owner"``).
        corrupt: Write the plain text ``"Not a PDF"`` instead of a PDF.
        image_only: Place a 1x1 PNG on the page instead of text.
        zero_pages: Do not add any page before saving.

    It returns the path of the created file as a string.
    """
    def _create_pdf(name, content="text", encrypted=False, corrupt=False, image_only=False, zero_pages=False):
        target = tmp_path / name
        if corrupt:
            target.write_text("Not a PDF")
            return str(target)

        doc = fitz.open()
        try:
            if not zero_pages:
                page = doc.new_page()
                if image_only:
                    # Create a minimal image
                    img_data = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x06\x00\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\nIDATx\x9cc\x00\x01\x00\x00\x05\x00\x01\r\n-\xb4\x00\x00\x00\x00IEND\xaeB`\x82'
                    page.insert_image(page.rect, stream=img_data)
                else:
                    page.insert_text((50, 50), content)

            if encrypted:
                doc.save(str(target), encryption=fitz.PDF_ENCRYPT_AES_256, user_pw="user", owner_pw="owner")
            else:
                doc.save(str(target))
        finally:
            # Close the document even when building or saving fails, so a
            # failing test does not leave the handle open.
            doc.close()
        return str(target)
    return _create_pdf

def test_extract_normal(pdf_factory):
    """A one-page text PDF yields its text together with a page marker."""
    pdf_path = pdf_factory("normal.pdf", content="Hello World")
    extracted = docblocks.extract(pdf_path)
    for expected in ("Hello World", "PAGE 1"):
        assert expected in extracted

def test_extract_encrypted(pdf_factory):
    """Extracting an encrypted PDF raises a ValueError mentioning encryption."""
    locked_pdf = pdf_factory("encrypted.pdf", encrypted=True)
    expect_failure = pytest.raises(ValueError, match="encrypted")
    with expect_failure:
        docblocks.extract(locked_pdf)

def test_extract_corrupt(pdf_factory):
    """A ``.pdf`` file holding plain text is reported as an unopenable PDF."""
    broken_pdf = pdf_factory("corrupt.pdf", corrupt=True)
    expect_failure = pytest.raises(ValueError, match="Failed to open PDF file")
    with expect_failure:
        docblocks.extract(broken_pdf)

def test_extract_wrong_type():
    """Non-string inputs are rejected with a TypeError."""
    for not_a_path in (123, None):
        with pytest.raises(TypeError, match="must be a string"):
            docblocks.extract(not_a_path)

def test_extract_directory(tmp_path):
    """A directory whose name ends in ``.pdf`` is rejected as not being a file."""
    d = tmp_path / "folder.pdf"
    d.mkdir()
    # Pin the directory guard's own message: a bare "file" pattern would also
    # match the "Failed to open PDF file" and "Input file must be ..." errors,
    # letting the test pass even if the directory check regressed.
    with pytest.raises(ValueError, match="Path is not a file"):
        docblocks.extract(str(d))

def test_extract_image_only(pdf_factory):
    """An image-only PDF still yields a page marker and no unexpected text."""
    path = pdf_factory("image.pdf", image_only=True)
    text = docblocks.extract(path)
    # The page marker must be present even though the page has no text.
    assert "PAGE 1" in text
    # The exact layout between markers depends on formatting, so only check
    # that text which was never inserted does not appear.
    assert "Hello" not in text

def test_extract_missing_file(tmp_path):
    """A ``.pdf`` path that does not exist raises FileNotFoundError."""
    # Build the path inside the per-test temp directory so the outcome does not
    # depend on what happens to exist in the current working directory.
    missing = tmp_path / "non_existent.pdf"
    with pytest.raises(FileNotFoundError):
        docblocks.extract(str(missing))

def test_extract_wrong_extension(tmp_path):
    """An existing file without a ``.pdf`` extension is rejected."""
    p = tmp_path / "test.txt"
    p.write_text("content")
    # ``match`` is a regular expression, so the dot is escaped to match a
    # literal "." rather than any character.
    with pytest.raises(ValueError, match=r"must be a \.pdf file"):
        docblocks.extract(str(p))
22 changes: 0 additions & 22 deletions tests/test_run.py

This file was deleted.