# rutilio/finoex (forked from fphammerle/finoex)

							def ingdiba_pdf_file_to_text(pdf_file, page_index=1):
							    """Return the text of one PDF page, rebuilt from character positions.

							    The page is rendered with pdfminer without layout analysis; every
							    top-level ``LTChar`` of the result is grouped into a line by its ``y0``
							    coordinate. Lines are emitted from the highest ``y0`` to the lowest
							    (top of the page first) and the characters inside a line are ordered
							    by ascending ``x0`` (left to right).

							    Args:
							        pdf_file: File object handed to ``pdfminer.pdfparser.PDFParser``
							            (presumably opened in binary mode -- confirm against callers).
							        page_index: Zero-based index of the page to extract. Defaults to
							            ``1``, i.e. the second page, which is what this function always
							            used.  NOTE(review): judging by the function name this targets
							            ING-DiBa statements whose content starts on page two -- confirm.

							    Returns:
							        The extracted lines joined with ``'\\n'`` and terminated by a
							        trailing ``'\\n'``. A page without characters yields ``'\\n'``.

							    Raises:
							        AssertionError: If the document does not allow text extraction.
							        IndexError: If the document has no page at ``page_index``.
							    """
							    import pdfminer.converter
							    # Imported explicitly: LTChar is used below and must not depend on
							    # pdfminer.converter pulling in pdfminer.layout as a side effect.
							    import pdfminer.layout
							    import pdfminer.pdfdocument
							    import pdfminer.pdfinterp
							    import pdfminer.pdfpage
							    import pdfminer.pdfparser

							    parser = pdfminer.pdfparser.PDFParser(pdf_file)
							    doc = pdfminer.pdfdocument.PDFDocument(parser)
							    # Raised explicitly (instead of a bare ``assert``) so the check is
							    # not removed when Python runs with -O; the exception type is kept.
							    if not doc.is_extractable:
							        raise AssertionError('PDF document does not allow text extraction')

							    resource_manager = pdfminer.pdfinterp.PDFResourceManager()
							    device = pdfminer.converter.PDFPageAggregator(resource_manager)
							    interpreter = pdfminer.pdfinterp.PDFPageInterpreter(
							        resource_manager, device)

							    page = list(pdfminer.pdfpage.PDFPage.create_pages(doc))[page_index]
							    interpreter.process_page(page)

							    # The bbox value is (x0, y0, x1, y1):
							    #   x0: distance from the left of the page to the left edge of the box
							    #   y0: distance from the bottom of the page to the lower edge of the box
							    #   x1: distance from the left of the page to the right edge of the box
							    #   y1: distance from the bottom of the page to the upper edge of the box
							    # In PDF the page origin is the *bottom left corner*, so the bottom
							    # left is (0, 0) and the top right corner is somewhere like (612, 792).
							    # https://groups.google.com/forum/#!topic/pdfminer-users/wOvDSW23B4M
							    #
							    # chars maps y0 -> {x0 -> text}.  Two characters sharing the exact
							    # same (y0, x0) overwrite each other; the last one wins.
							    chars = {}
							    for item in device.get_result():
							        # Only characters found directly in the page result are used;
							        # other layout items (and anything nested in them) are skipped.
							        if isinstance(item, pdfminer.layout.LTChar):
							            chars.setdefault(item.y0, {})[item.x0] = item.get_text()

							    # Larger y0 is higher on the page, hence the descending sort.
							    lines = []
							    for y0 in sorted(chars, reverse=True):
							        row = chars[y0]
							        lines.append(''.join(row[x0] for x0 in sorted(row)))

							    return '\n'.join(lines) + '\n'