metadata-processor/pdf.py at master · streetlib/metadata-processor · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# -*- coding: UTF-8 -*-

from	fragments	import	*
from	document	import	Document

from	pdfminer.pdfparser	import	PDFParser, PDFDocument
from	pdfminer.pdfinterp	import	PDFResourceManager, PDFPageInterpreter, process_pdf
from	pdfminer.pdfdevice	import	PDFDevice
from	pdfminer.converter	import	TextConverter
from	pdfminer.layout		import	LAParams
from	cStringIO			import	StringIO

class PdfProcessor:
	"""Extract the text of a PDF file with pdfminer and wrap it in a Document.

	The PDF is only read on the first call to document(); the object built
	there is cached on the instance and returned by later calls.
	"""

	def __init__ (self, aPdfDocument):
		"""Remember which PDF to process; no I/O happens here.

		aPdfDocument -- path of the PDF file (it is handed to open() in
		                document()).
		"""
		self._pdfDocument = aPdfDocument
		self._document = None

	#---------------------------------------------------------

	def document (self):
		"""Return the Document built from the text of the PDF.

		On the first call the PDF is parsed with pdfminer, its text is split
		into lines, the lines are merged into paragraphs and the result is
		passed to Document().initWithDocumentInfo(content, None, None). The
		returned object is cached; the cache test is a truthiness test, so a
		falsy cached value triggers a new parse.

		Raises PDFTextExtractionNotAllowed when the PDF reports that it is
		not extractable, and whatever open() raises when the file cannot be
		read.
		"""

		def mergeSameParagraphLines (lines):
			"""Concatenate consecutive lines until one looks like a paragraph end.

			Returns the list of merged paragraphs. Lines are concatenated
			directly, with no separator inserted between them.
			NOTE(review): confirm whether a space should be inserted when two
			lines are joined.
			"""
			def isEndOfParagraph (line):
				# A line closes the paragraph when it ends with sentence
				# punctuation or is shorter than 60 characters (this includes
				# empty lines).
				return line[-1:] in ['.', '?', '!'] or len(line) < 60

			result = []
			currentLine = ''

			for line in lines:
				currentLine += line
				if isEndOfParagraph(line):
					result.append(currentLine)
					currentLine = ''

			# Keep a trailing paragraph that never reached an end marker.
			if currentLine != '':
				result.append(currentLine)

			return result

		if not self._document:
			pdfFile = open(self._pdfDocument, 'rb')
			try:
				pdfParser = PDFParser(pdfFile)
				document = PDFDocument()

				# Parser and document must be linked both ways before
				# initialize() is called.
				pdfParser.set_document(document)
				document.set_parser(pdfParser)
				document.initialize()

				if not document.is_extractable:
					# The module only imports names *from* pdfminer, so the
					# exception class has to be imported explicitly; the
					# dotted name pdfminer.pdfparser.* is not bound here.
					from pdfminer.pdfparser import PDFTextExtractionNotAllowed
					raise PDFTextExtractionNotAllowed

				resourceManager = PDFResourceManager()

				# NOTE(review): these are class attributes, so the debug level
				# is changed for every user of these pdfminer classes in the
				# process, not only for this extraction.
				debug = 1
				PDFDocument.debug = debug
				PDFParser.debug = debug
				PDFResourceManager.debug = debug
				PDFPageInterpreter.debug = debug
				PDFDevice.debug = debug

				laparams = LAParams()
				laparams.all_texts = True
				laparams.detect_vertical = True

				pdfContent = StringIO()
				try:
					device = TextConverter(resourceManager, pdfContent, codec='utf-8', laparams=laparams)
					interpreter = PDFPageInterpreter(resourceManager, device)
					for page in document.get_pages():
						interpreter.process_page(page)
					content = mergeSameParagraphLines(pdfContent.getvalue().split('\n'))
				finally:
					# Release the text buffer even if a page fails to process.
					pdfContent.close()

				# Best-effort outline extraction: any failure (presumably a
				# PDF without an outline) leaves toc with what was collected.
				# NOTE(review): toc is currently not passed on to Document.
				toc = []
				try:
					for (level, title, destination, a, se) in document.get_outlines():
						toc.append((level, title))
				except Exception:
					pass
			finally:
				# Always release the file handle, including on error paths.
				pdfFile.close()

			self._document = Document().initWithDocumentInfo(content, None, None)

		return self._document