def ingdiba_pdf_file_to_text(pdf_file, page_index: int = 1) -> str:
    """Return the text of one PDF page, rebuilt from individual glyph positions.

    The page is rendered with a ``PDFPageAggregator`` that is given no layout
    parameters, and only the ``LTChar`` items found directly in its result are
    used.  Glyphs are grouped into lines by their exact ``y0`` coordinate;
    lines are emitted from the top of the page to the bottom and the glyphs
    within a line from left to right.

    Coordinate background (from the pdfminer-users mailing list,
    https://groups.google.com/forum/#!topic/pdfminer-users/wOvDSW23B4M):
    a bbox is ``(x0, y0, x1, y1)`` where

    * ``x0`` is the distance from the left of the page to the left edge,
    * ``y0`` is the distance from the bottom of the page to the lower edge,
    * ``x1`` is the distance from the left of the page to the right edge,
    * ``y1`` is the distance from the bottom of the page to the upper edge.

    The PDF page origin is the *bottom left corner*, so the bottom left is
    (0, 0) and the top right corner is somewhere like (612, 792) for US
    Letter or roughly (595, 842) for A4 paper.  This is why a larger ``y0``
    means "further up the page".

    Args:
        pdf_file: Binary file object containing the PDF; it is handed
            straight to ``pdfminer.pdfparser.PDFParser``.
        page_index: Zero-based index of the page to extract (negative values
            count from the end).  Defaults to ``1``, i.e. the second page,
            which is what this function always extracted before the
            parameter existed.
            NOTE(review): presumably the second page is where the data of
            interest lives for this document type -- confirm with callers.

    Returns:
        The page text, one line per distinct ``y0`` value, each line
        terminated by ``'\\n'``.  A page without any ``LTChar`` items yields
        a single ``'\\n'``.

    Raises:
        AssertionError: If the document does not permit text extraction.
        IndexError: If the document has no page at ``page_index``.
    """
    # pdfminer is imported inside the function, as in the original code, so
    # importing the enclosing module does not require the package.
    import pdfminer.converter
    # Imported explicitly: previously ``pdfminer.layout`` was only reachable
    # as a side effect of importing ``pdfminer.converter``.
    import pdfminer.layout
    import pdfminer.pdfdocument
    import pdfminer.pdfinterp
    import pdfminer.pdfpage
    import pdfminer.pdfparser

    parser = pdfminer.pdfparser.PDFParser(pdf_file)
    doc = pdfminer.pdfdocument.PDFDocument(parser)
    # An explicit raise instead of ``assert`` so the check survives
    # ``python -O``; AssertionError is kept so existing callers that catch
    # it continue to work.
    if not doc.is_extractable:
        raise AssertionError('PDF document does not allow text extraction')

    resource_manager = pdfminer.pdfinterp.PDFResourceManager()
    device = pdfminer.converter.PDFPageAggregator(resource_manager)
    interpreter = pdfminer.pdfinterp.PDFPageInterpreter(
        resource_manager, device)

    pages = list(pdfminer.pdfpage.PDFPage.create_pages(doc))
    interpreter.process_page(pages[page_index])

    # y0 -> {x0 -> glyph text}.  Two glyphs at the identical (y0, x0)
    # position collapse into one entry; the one seen last wins.
    glyphs_by_line = {}
    for item in device.get_result():
        if isinstance(item, pdfminer.layout.LTChar):
            glyphs_by_line.setdefault(item.y0, {})[item.x0] = item.get_text()

    # Highest y0 first: the origin is at the bottom of the page.
    lines = [
        ''.join(row[x0] for x0 in sorted(row))
        for _, row in sorted(glyphs_by_line.items(), reverse=True)
    ]
    return '\n'.join(lines) + '\n'