Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 6 additions & 10 deletions pageindex/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,10 @@
import concurrent.futures
from pathlib import Path

import PyPDF2

from .page_index import page_index
from .page_index_md import md_to_tree
from .retrieve import get_document, get_document_structure, get_page_content
from .utils import ConfigLoader, remove_fields
from .utils import ConfigLoader, read_pdf_pages, remove_fields

META_INDEX = "_meta.json"

Expand All @@ -32,7 +30,8 @@ class PageIndexClient:

For agent-based QA, see examples/agentic_vectorless_rag_demo.py.
"""
def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None, workspace: str = None):
def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None,
workspace: str = None):
if api_key:
os.environ["OPENAI_API_KEY"] = api_key
elif not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"):
Expand Down Expand Up @@ -74,14 +73,11 @@ def index(self, file_path: str, mode: str = "auto") -> str:
if_add_node_summary='yes',
if_add_node_text='yes',
if_add_node_id='yes',
if_add_doc_description='yes'
if_add_doc_description='yes',
)
# Extract per-page text so queries don't need the original PDF
pages = []
with open(file_path, 'rb') as f:
pdf_reader = PyPDF2.PdfReader(f)
for i, page in enumerate(pdf_reader.pages, 1):
pages.append({'page': i, 'content': page.extract_text() or ''})
page_texts = read_pdf_pages(file_path)
pages = [{'page': i, 'content': text} for i, text in enumerate(page_texts, 1)]

self.documents[doc_id] = {
'id': doc_id,
Expand Down
6 changes: 3 additions & 3 deletions pageindex/page_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1065,9 +1065,9 @@ async def tree_parser(page_list, opt, doc=None, logger=None):

def page_index_main(doc, opt=None):
logger = JsonLogger(doc)

is_valid_pdf = (
(isinstance(doc, str) and os.path.isfile(doc) and doc.lower().endswith(".pdf")) or
(isinstance(doc, str) and os.path.isfile(doc) and doc.lower().endswith(".pdf")) or
isinstance(doc, BytesIO)
)
if not is_valid_pdf:
Expand Down Expand Up @@ -1112,7 +1112,7 @@ async def page_index_builder():

def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None,
if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None):

user_opt = {
arg: value for arg, value in locals().items()
if arg != "doc" and value is not None
Expand Down
21 changes: 9 additions & 12 deletions pageindex/retrieve.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import json
import PyPDF2

try:
from .utils import get_number_of_pages, remove_fields
from .utils import get_number_of_pages, read_pdf_pages, remove_fields
except ImportError:
from utils import get_number_of_pages, remove_fields
from utils import get_number_of_pages, read_pdf_pages, remove_fields


# ── Helpers ──────────────────────────────────────────────────────────────────
Expand Down Expand Up @@ -42,15 +41,13 @@ def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
{'page': p, 'content': page_map[p]}
for p in page_nums if p in page_map
]
path = doc_info['path']
with open(path, 'rb') as f:
pdf_reader = PyPDF2.PdfReader(f)
total = len(pdf_reader.pages)
valid_pages = [p for p in page_nums if 1 <= p <= total]
return [
{'page': p, 'content': pdf_reader.pages[p - 1].extract_text() or ''}
for p in valid_pages
]
all_pages = read_pdf_pages(doc_info['path'])
total = len(all_pages)
valid_pages = [p for p in page_nums if 1 <= p <= total]
return [
{'page': p, 'content': all_pages[p - 1]}
for p in valid_pages
]


def _get_md_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
Expand Down
83 changes: 58 additions & 25 deletions pageindex/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,32 +384,65 @@ def add_preface_if_needed(data):



def get_page_tokens(pdf_path, model=None, pdf_parser="PyPDF2"):
if pdf_parser == "PyPDF2":
pdf_reader = PyPDF2.PdfReader(pdf_path)
page_list = []
for page_num in range(len(pdf_reader.pages)):
page = pdf_reader.pages[page_num]
page_text = page.extract_text()
token_length = litellm.token_counter(model=model, text=page_text)
page_list.append((page_text, token_length))
return page_list
elif pdf_parser == "PyMuPDF":
if isinstance(pdf_path, BytesIO):
pdf_stream = pdf_path
doc = pymupdf.open(stream=pdf_stream, filetype="pdf")
elif isinstance(pdf_path, str) and os.path.isfile(pdf_path) and pdf_path.lower().endswith(".pdf"):
doc = pymupdf.open(pdf_path)
page_list = []
for page in doc:
page_text = page.get_text()
token_length = litellm.token_counter(model=model, text=page_text)
page_list.append((page_text, token_length))
return page_list
else:
raise ValueError(f"Unsupported PDF parser: {pdf_parser}")
SUPPORTED_PDF_PARSERS = ("PyPDF2", "pypdfium2", "PyMuPDF")

# Module-level setting. Override by mutating this attribute or setting
# PAGEINDEX_PDF_PARSER in the environment before import.
DEFAULT_PDF_PARSER = os.getenv("PAGEINDEX_PDF_PARSER") or SUPPORTED_PDF_PARSERS[0]


def read_pdf_pages(doc, pdf_parser=None):
    """Extract per-page text from a PDF using the selected parser backend.

    Args:
        doc: Path to a PDF file (str) or an in-memory ``BytesIO`` stream.
        pdf_parser: One of ``SUPPORTED_PDF_PARSERS``; when ``None``, falls
            back to the module-level ``DEFAULT_PDF_PARSER``.

    Returns:
        list[str]: One text string per page, in page order. Pages with no
        extractable text yield an empty string.

    Raises:
        ImportError: If ``'pypdfium2'`` is selected but not installed
            (it is an optional dependency).
        ValueError: If the parser name is not supported.
    """
    parser = pdf_parser or DEFAULT_PDF_PARSER

    if parser == "PyPDF2":
        reader = PyPDF2.PdfReader(doc)
        # extract_text() can return None (e.g. image-only pages); normalize to "".
        return [(p.extract_text() or "") for p in reader.pages]

    if parser == "pypdfium2":
        try:
            import pypdfium2 as pdfium
        except ImportError as e:
            raise ImportError(
                "DEFAULT_PDF_PARSER='pypdfium2' requires the optional dependency. "
                "Install it with: pip install pypdfium2"
            ) from e
        # pdfium accepts raw bytes or a filesystem path.
        source = doc.getvalue() if isinstance(doc, BytesIO) else str(doc)
        pdf = pdfium.PdfDocument(source)
        try:
            pages = []
            for i in range(len(pdf)):
                page = pdf[i]
                try:
                    tp = page.get_textpage()
                    try:
                        # pdfium emits \r\n line endings; normalize to \n.
                        text = (tp.get_text_bounded() or "").replace("\r\n", "\n")
                    finally:
                        tp.close()
                finally:
                    # Fix: previously the page handle leaked when
                    # get_textpage() or extraction raised, because
                    # page.close() sat on the normal path only.
                    page.close()
                pages.append(text)
            return pages
        finally:
            pdf.close()

    if parser == "PyMuPDF":
        if isinstance(doc, BytesIO):
            d = pymupdf.open(stream=doc, filetype="pdf")
        else:
            d = pymupdf.open(str(doc))
        try:
            return [p.get_text() for p in d]
        finally:
            d.close()

    raise ValueError(
        f"Unsupported pdf_parser={parser!r}. Choose from {SUPPORTED_PDF_PARSERS}."
    )


def get_page_tokens(pdf_path, model=None, pdf_parser=None):
    """Return ``(page_text, token_count)`` tuples for every page of a PDF.

    Page extraction is delegated to ``read_pdf_pages`` (honoring the
    module-level default parser when *pdf_parser* is None); token counts
    are computed per page with ``litellm.token_counter`` for *model*.
    """
    page_list = []
    for page_text in read_pdf_pages(pdf_path, pdf_parser=pdf_parser):
        token_length = litellm.token_counter(model=model, text=page_text)
        page_list.append((page_text, token_length))
    return page_list



def get_text_of_pdf_pages(pdf_pages, start_page, end_page):
text = ""
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
litellm==1.83.7
# openai-agents # optional: required for examples/agentic_vectorless_rag_demo.py
pymupdf==1.26.4
# pypdfium2 # optional: alternative PDF parser
PyPDF2==3.0.1
python-dotenv==1.2.2
pyyaml==6.0.2
18 changes: 12 additions & 6 deletions run_pageindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import os
import json
from pageindex import *
import pageindex.utils as pageindex_utils
Comment thread
github-code-quality[bot] marked this conversation as resolved.
Fixed
from pageindex.page_index_md import md_to_tree
from pageindex.utils import ConfigLoader

if __name__ == "__main__":
# Set up argument parser
Expand All @@ -28,7 +28,10 @@
help='Whether to add doc description to the doc')
parser.add_argument('--if-add-node-text', type=str, default=None,
help='Whether to add text to the node')

parser.add_argument('--pdf-parser', type=str, default=None,
choices=pageindex_utils.SUPPORTED_PDF_PARSERS,
help='PDF text extractor to use')

# Markdown specific arguments
parser.add_argument('--if-thinning', type=str, default='no',
help='Whether to apply tree thinning for markdown (markdown only)')
Expand Down Expand Up @@ -62,7 +65,11 @@
'if_add_doc_description': args.if_add_doc_description,
'if_add_node_text': args.if_add_node_text,
}
opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None})
opt = pageindex_utils.ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None})

# CLI flag overrides the module-level default (and env var PAGEINDEX_PDF_PARSER).
if args.pdf_parser:
pageindex_utils.DEFAULT_PDF_PARSER = args.pdf_parser

# Process the PDF
toc_with_page_number = page_index_main(args.pdf_path, opt)
Expand Down Expand Up @@ -93,8 +100,7 @@
import asyncio

# Use ConfigLoader to get consistent defaults (matching PDF behavior)
from pageindex.utils import ConfigLoader
config_loader = ConfigLoader()
config_loader = pageindex_utils.ConfigLoader()

# Create options dict with user args
user_opt = {
Expand Down Expand Up @@ -131,4 +137,4 @@
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)

print(f'Tree structure saved to: {output_file}')
print(f'Tree structure saved to: {output_file}')
Loading