12345678910111213141516171819202122232425262728293031323334353637383940414243 |
def ingdiba_pdf_file_to_text(pdf_file):
    """Return the text of the second page of a PDF, one string per text row.

    The page is rendered with pdfminer without layout analysis, and every
    individual character (``LTChar``) is placed on a grid keyed by its
    lower-left coordinates.  Characters sharing exactly the same ``y0`` form
    one output line; lines are emitted top-to-bottom (descending ``y0``) and
    the characters inside a line left-to-right (ascending ``x0``).  If two
    characters have identical ``(y0, x0)``, the later one wins.

    NOTE(review): judging by the name this targets ING-DiBa statements,
    which would explain why only the second page is read -- confirm with
    the caller.

    Args:
        pdf_file: File-like object handed to ``pdfminer.pdfparser.PDFParser``
            (presumably opened in binary mode -- verify against caller).

    Returns:
        The reconstructed lines joined by ``'\\n'`` with a trailing ``'\\n'``.
        A page without any characters yields just ``'\\n'``.

    Raises:
        AssertionError: If the document does not permit text extraction.
        IndexError: If the document has fewer than two pages.
    """
    import collections
    import itertools

    import pdfminer.converter
    # Imported explicitly: LTChar is referenced below, and relying on
    # pdfminer.converter to pull this submodule in is an accident.
    import pdfminer.layout
    import pdfminer.pdfdocument
    import pdfminer.pdfinterp
    import pdfminer.pdfpage
    import pdfminer.pdfparser

    parser = pdfminer.pdfparser.PDFParser(pdf_file)
    doc = pdfminer.pdfdocument.PDFDocument(parser)
    # Raised explicitly instead of using ``assert`` so the check survives
    # ``python -O``; the exception type is kept for existing callers.
    if not doc.is_extractable:
        raise AssertionError('PDF document does not allow text extraction')

    resource_manager = pdfminer.pdfinterp.PDFResourceManager()
    device = pdfminer.converter.PDFPageAggregator(resource_manager)
    interpreter = pdfminer.pdfinterp.PDFPageInterpreter(resource_manager, device)

    # Only the second page (index 1) is needed, so stop iterating there
    # rather than materialising every page of the document.
    pages = pdfminer.pdfpage.PDFPage.create_pages(doc)
    page = next(itertools.islice(pages, 1, None), None)
    if page is None:
        raise IndexError('PDF document has fewer than two pages')

    # The bbox value is (x0, y0, x1, y1):
    #
    #   x0: distance from the left of the page to the left edge of the box.
    #   y0: distance from the bottom of the page to the lower edge of the box.
    #   x1: distance from the left of the page to the right edge of the box.
    #   y1: distance from the bottom of the page to the upper edge of the box.
    #
    # Remember that in PDF the page origin is the *bottom left corner*, so
    # the bottom left is (0, 0) and the top right corner is somewhere like
    # (612, 792).
    #
    # https://groups.google.com/forum/#!topic/pdfminer-users/wOvDSW23B4M
    interpreter.process_page(page)

    # rows[y0][x0] -> character text
    rows = collections.defaultdict(dict)
    for item in device.get_result():
        if isinstance(item, pdfminer.layout.LTChar):
            rows[item.y0][item.x0] = item.get_text()

    # Larger y0 is higher on the page, hence the descending sort.
    lines = []
    for y0 in sorted(rows, reverse=True):
        row = rows[y0]
        lines.append(''.join(row[x0] for x0 in sorted(row)))
    return '\n'.join(lines) + '\n'
|