-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocument_processor.py
More file actions
60 lines (44 loc) · 1.58 KB
/
document_processor.py
File metadata and controls
60 lines (44 loc) · 1.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
"""
Document Processor Module
-------------------------
Handles loading PDF files and splitting them into manageable text chunks.
"""
import os
import logging
from typing import List
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Single splitter instance, built once at import time and reused for every
# document that process_pdf splits.
# NOTE(review): per the LangChain text-splitter docs, chunk_size and
# chunk_overlap are measured with the splitter's length function (character
# count by default), and add_start_index records each chunk's start offset in
# its metadata -- confirm against the installed version.
_TEXT_SPLITTER = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)
def _validate_path(pdf_path: str) -> None:
    """
    Ensure ``pdf_path`` refers to an existing regular file.

    Args:
        pdf_path: Path to the PDF file to check.

    Raises:
        FileNotFoundError: If nothing exists at ``pdf_path``, or the path
            exists but is not a regular file (for example, a directory).
    """
    # isfile() rather than exists(): a directory "exists" but cannot be read
    # as a PDF, so reject it here with the same error as a missing file
    # instead of letting it fail later inside the loader.
    if not os.path.isfile(pdf_path):
        logger.error(f"File not found: {pdf_path}")
        raise FileNotFoundError(f"The file '{pdf_path}' was not found.")
def _load_pdf(pdf_path: str) -> List[Document]:
    """
    Read the PDF at ``pdf_path`` and return its documents as a list.

    The loader's lazy iterator is fully consumed, so everything it yields is
    held in memory when this function returns. Progress is logged before and
    after loading; the second message reports the number of documents as
    pages.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The documents produced by ``PyPDFLoader`` for this file.
    """
    logger.info(f"Loading PDF: {pdf_path}")

    loader = PyPDFLoader(pdf_path)
    # Drain the lazy iterator into a concrete list so len() works below.
    docs = [*loader.lazy_load()]

    logger.info(f"Loaded {len(docs)} pages from '{pdf_path}'")
    return docs
def process_pdf(pdf_path: str) -> List[Document]:
    """
    Loads a PDF and splits it into text chunks.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        List of LangChain Document chunks, or an empty list if no text is
        extracted (no pages, or every page has only whitespace).

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    _validate_path(pdf_path)
    docs = _load_pdf(pdf_path)

    # Check the extracted text itself, not just the page count: a loader may
    # return page documents whose text is empty (e.g. scanned/image-only
    # pages -- TODO confirm for PyPDFLoader). any() over an empty list is
    # False, so the "no pages at all" case is still covered.
    if not any(doc.page_content.strip() for doc in docs):
        logger.warning(f"No text extracted from '{pdf_path}'. PDF may be empty or image-based.")
        return []

    chunks = _TEXT_SPLITTER.split_documents(docs)
    logger.info(f"Split {len(docs)} pages into {len(chunks)} chunks.")
    return chunks