-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpdf.py
More file actions
94 lines (71 loc) · 2.33 KB
/
pdf.py
File metadata and controls
94 lines (71 loc) · 2.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# -*- coding: UTF-8 -*-
from fragments import *
from document import Document
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
class PdfProcessor:
    """Extract the text of a PDF file and wrap it in a ``Document``.

    The PDF is parsed on the first call to ``document()``; the resulting
    object is cached on the instance and returned by every later call.
    """

    # A line ending in one of these characters closes the current paragraph.
    _SENTENCE_ENDINGS = ('.', '?', '!')
    # A line shorter than this is treated as the last line of a paragraph.
    _SHORT_LINE_LENGTH = 60
    # Value written to the class-level ``debug`` attribute of the pdfminer
    # classes used during extraction.
    # NOTE(review): this is a process-wide side effect inherited from the
    # original code; confirm whether debug output is really wanted.
    _PDFMINER_DEBUG = 1

    def __init__(self, aPdfDocument):
        """Remember which PDF to process; nothing is read yet.

        aPdfDocument -- value later passed to ``open(..., 'rb')``.
        """
        self._pdfDocument = aPdfDocument
        self._document = None

    #---------------------------------------------------------
    @staticmethod
    def _isEndOfParagraph(line):
        """Return True when ``line`` is the last line of a paragraph.

        A line ends a paragraph when it finishes with '.', '?' or '!', or
        when it is shorter than 60 characters (the empty string included).
        """
        return (line.endswith(PdfProcessor._SENTENCE_ENDINGS)
                or len(line) < PdfProcessor._SHORT_LINE_LENGTH)

    @staticmethod
    def _mergeSameParagraphLines(lines):
        """Concatenate consecutive lines into paragraphs.

        Lines are joined without any separator until one of them satisfies
        ``_isEndOfParagraph``; that line closes the paragraph. Lines left
        over at the end of the input form a final paragraph.

        lines -- iterable of strings
        Returns a list of strings, one per paragraph.
        """
        paragraphs = []
        pending = []
        for line in lines:
            pending.append(line)
            if PdfProcessor._isEndOfParagraph(line):
                paragraphs.append(''.join(pending))
                pending = []
        # Anything still pending consists only of lines of 60+ characters,
        # so the joined remainder can never be empty.
        if pending:
            paragraphs.append(''.join(pending))
        return paragraphs

    def _extractContent(self):
        """Parse the PDF and return its text as a list of paragraphs.

        Raises PDFTextExtractionNotAllowed when the document reports that
        it is not extractable. Errors from ``open()`` and from pdfminer
        propagate unchanged.
        """
        # The file must stay open while pdfminer reads pages from it, so
        # all parsing happens inside the ``with`` block, which also
        # guarantees the handle is closed on error.
        with open(self._pdfDocument, 'rb') as pdfFile:
            pdfParser = PDFParser(pdfFile)
            document = PDFDocument()
            pdfParser.set_document(document)
            document.set_parser(pdfParser)
            document.initialize()
            if not document.is_extractable:
                # Imported here because the module only binds individual
                # names from pdfminer, never the ``pdfminer`` package itself.
                from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
                raise PDFTextExtractionNotAllowed(
                    'Text extraction is not allowed: %r' % (self._pdfDocument,))

            resourceManager = PDFResourceManager()
            for pdfminerClass in (PDFDocument, PDFParser, PDFResourceManager,
                                  PDFPageInterpreter, PDFDevice):
                pdfminerClass.debug = self._PDFMINER_DEBUG

            pdfContent = StringIO()
            try:
                laparams = LAParams()
                laparams.all_texts = True
                laparams.detect_vertical = True
                device = TextConverter(resourceManager, pdfContent,
                                       codec='utf-8', laparams=laparams)
                interpreter = PDFPageInterpreter(resourceManager, device)
                for page in document.get_pages():
                    interpreter.process_page(page)
                text = pdfContent.getvalue()
            finally:
                pdfContent.close()

        return self._mergeSameParagraphLines(text.split('\n'))

    def document(self):
        """Return the ``Document`` built from the PDF, parsing it at most once.

        The paragraphs extracted from the PDF are passed as the first
        argument of ``Document().initWithDocumentInfo``; the other two
        arguments are ``None``. Whatever that call returns is cached and
        returned.

        Raises PDFTextExtractionNotAllowed when the PDF is not extractable.
        """
        # Compare against None explicitly so that a cached but falsy
        # document is not parsed again.
        if self._document is None:
            content = self._extractContent()
            self._document = Document().initWithDocumentInfo(content, None, None)
        return self._document