# ingdiba.py (1.6 KB)

  1. def ingdiba_pdf_file_to_text(pdf_file):
  2. import pdfminer.pdfparser
  3. parser = pdfminer.pdfparser.PDFParser(pdf_file)
  4. import pdfminer.pdfdocument
  5. doc = pdfminer.pdfdocument.PDFDocument(parser)
  6. assert doc.is_extractable
  7. import pdfminer.pdfinterp
  8. resource_manager = pdfminer.pdfinterp.PDFResourceManager()
  9. import pdfminer.converter
  10. device = pdfminer.converter.PDFPageAggregator(resource_manager)
  11. interpreter = pdfminer.pdfinterp.PDFPageInterpreter(resource_manager, device)
  12. import pdfminer.pdfpage
  13. page = [p for p in pdfminer.pdfpage.PDFPage.create_pages(doc)][1]
  14. """ The bbox value is (x0,y0,x1,y1).
  15. x0: the distance from the left of the page to the left edge of the box.
  16. y0: the distance from the bottom of the page to the lower edge of the box.
  17. x1: the distance from the left of the page to the right edge of the box.
  18. y1: the distance from the bottom of the page to the upper edge of the box.
  19. Remember in PDF the page origin is the *bottom left corner*.
  20. So the bottom left is (0,0) and the top right corner is
  21. somewhere like (612,792) in the case of A4 paper.
  22. https://groups.google.com/forum/#!topic/pdfminer-users/wOvDSW23B4M
  23. """
  24. interpreter.process_page(page)
  25. chars = {}
  26. for ltchar in device.get_result():
  27. if isinstance(ltchar, pdfminer.layout.LTChar):
  28. if not ltchar.y0 in chars:
  29. chars[ltchar.y0] = {}
  30. chars[ltchar.y0][ltchar.x0] = ltchar.get_text()
  31. return '\n'.join([''.join([chars[y0][x0] for x0 in sorted(chars[y0])]) for y0 in sorted(chars)[::-1]]) + '\n'