9 years ago · cc18a50bf1
--- a/dingguo/parser/__init__.py
+++ b/dingguo/parser/__init__.py
--- a/dingguo/parser/ingdiba.py
+++ b/dingguo/parser/ingdiba.py
@@ -0,0 +1,43 @@
 
				+def ingdiba_pdf_file_to_text(pdf_file):
			
 
				+
			
 
				+    import pdfminer.pdfparser
			
 
				+    parser = pdfminer.pdfparser.PDFParser(pdf_file)
			
 
				+
			
 
				+    import pdfminer.pdfdocument
			
 
				+    doc = pdfminer.pdfdocument.PDFDocument(parser)
			
 
				+    assert doc.is_extractable
			
 
				+
			
 
				+    import pdfminer.pdfinterp
			
 
				+    resource_manager = pdfminer.pdfinterp.PDFResourceManager()
			
 
				+    import pdfminer.converter
			
 
				+    device = pdfminer.converter.PDFPageAggregator(resource_manager)
			
 
				+    interpreter = pdfminer.pdfinterp.PDFPageInterpreter(resource_manager, device)
			
 
				+    
			
 
				+    import pdfminer.pdfpage
			
 
				+    page = [p for p in pdfminer.pdfpage.PDFPage.create_pages(doc)][1]
			
 
				+    
			
 
				+    """ The bbox value is (x0,y0,x1,y1).
			
 
				+    
			
 
				+    x0: the distance from the left of the page to the left edge of the box.
			
 
				+    y0: the distance from the bottom of the page to the lower edge of the box.
			
 
				+    x1: the distance from the left of the page to the right edge of the box.
			
 
				+    y1: the distance from the bottom of the page to the upper edge of the box.
			
 
				+    
			
 
				+    Remember in PDF the page origin is the *bottom left corner*.
			
 
				+    So the bottom left is (0,0) and the top right corner is
			
 
				+    somewhere like (612,792) in the case of A4 paper. 
			
 
				+    
			
 
				+    https://groups.google.com/forum/#!topic/pdfminer-users/wOvDSW23B4M
			
 
				+    
			
 
				+    """
			
 
				+    
			
 
				+    interpreter.process_page(page)
			
 
				+
			
 
				+    chars = {}
			
 
				+    for char in device.get_result():
			
 
				+        if isinstance(char, pdfminer.layout.LTChar):
			
 
				+            if not char.y0 in chars:
			
 
				+                chars[char.y0] = {}
			
 
				+            chars[char.y0][char.x0] = char.get_text()
			
 
				+
			
 
				+    return '\n'.join([''.join([chars[y0][x0] for x0 in sorted(chars[y0])]) for y0 in sorted(chars)[::-1]]) + '\n'
			
--- a/scripts/ingdiba-pdf-to-text
+++ b/scripts/ingdiba-pdf-to-text
@@ -0,0 +1,33 @@
 
				+#!/usr/bin/env python
			
 
				+# PYTHON_ARGCOMPLETE_OK
			
 
				+
			
 
				+import sys
			
 
				+import dingguo.parser.ingdiba
			
 
				+
			
 
				+def compute(file):
			
 
				+    
			
 
				+    sys.stdout.write(dingguo.parser.ingdiba.ingdiba_pdf_file_to_text(file).encode('utf-8'))
			
 
				+
			
 
				+def _init_argparser():
			
 
				+
			
 
				+    import argparse
			
 
				+    argparser = argparse.ArgumentParser(description = None)
			
 
				+    argparser.add_argument('file', type = argparse.FileType('rb'))
			
 
				+    return argparser
			
 
				+
			
 
				+def main(argv):
			
 
				+
			
 
				+    argparser = _init_argparser()
			
 
				+    try:
			
 
				+        import argcomplete
			
 
				+        argcomplete.autocomplete(argparser)
			
 
				+    except ImportError:
			
 
				+        pass
			
 
				+    args = argparser.parse_args(argv)
			
 
				+
			
 
				+    compute(**vars(args))
			
 
				+
			
 
				+    return 0
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    sys.exit(main(sys.argv[1:]))
			
--- a/setup.py
+++ b/setup.py
@@ -12,8 +12,11 @@ setup(
 
				     # download_url = '',
			
 
				     # keywords = [],
			
 
				     # classifiers = [],
			
 
				-    packages = ['dingguo'],
			
 
				+    packages = [
			
 
				+        'dingguo', 
			
 
				+        'dingguo.parser',
			
 
				+        ],
			
 
				     scripts = glob.glob('scripts/*'),
			
 
				-    # install_requires = [],
			
 
				+    install_requires = ['pdfminer>=20140328'],
			
 
				     tests_require = ['pytest']
			
 
				     )
			
--- a/tests/test_parser_ingdiba.py
+++ b/tests/test_parser_ingdiba.py
@@ -0,0 +1,45 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+
			
 
				+import pytest
			
 
				+
			
 
				+import os
			
 
				+import dingguo.parser.ingdiba
			
 
				+
			
 
				+project_root_path = os.path.realpath(os.path.join(__file__, '..', '..'))
			
 
				+test_data_path = os.path.join(project_root_path, 'tests', 'data', 'ing-diba')
			
 
				+
			
 
				+def test_ingdiba_pdf_file_to_text_purchase():
			
 
				+    pdf_name = 'ING-DiBa_Postbox_2013-03-25_WP-Kauf - 60631041.pdf'
			
 
				+    with open(os.path.join(test_data_path, pdf_name)) as pdf_file:
			
 
				+        with open(os.path.join(test_data_path, os.path.splitext(pdf_name)[0] + '.txt')) as text_file:
			
 
				+            assert dingguo.parser.ingdiba.ingdiba_pdf_file_to_text(pdf_file) == text_file.read().decode('utf-8')
			
 
				+
			
 
				+def test_ingdiba_pdf_file_to_text_tax():
			
 
				+    pdf_name = 'ING-DiBa_Postbox_2013-12-27_KESt auf Erträge ausländischer Fonds.pdf'
			
 
				+    with open(os.path.join(test_data_path, pdf_name)) as pdf_file:
			
 
				+        with open(os.path.join(test_data_path, os.path.splitext(pdf_name)[0] + '.txt')) as text_file:
			
 
				+            assert dingguo.parser.ingdiba.ingdiba_pdf_file_to_text(pdf_file) == text_file.read().decode('utf-8')
			
 
				+
			
 
				+def test_ingdiba_pdf_file_to_text_statement():
			
 
				+    pdf_name = 'ING-DiBa_Postbox_2014-01-01_Depotauszug - 60631041.pdf'
			
 
				+    with open(os.path.join(test_data_path, pdf_name)) as pdf_file:
			
 
				+        with open(os.path.join(test_data_path, os.path.splitext(pdf_name)[0] + '.txt')) as text_file:
			
 
				+            assert dingguo.parser.ingdiba.ingdiba_pdf_file_to_text(pdf_file) == text_file.read().decode('utf-8')
			
 
				+
			
 
				+def test_ingdiba_pdf_file_to_text_divestiture():
			
 
				+    pdf_name = 'ING-DiBa_Postbox_2014-02-07_WP-Verkauf - 60631041.pdf'
			
 
				+    with open(os.path.join(test_data_path, pdf_name)) as pdf_file:
			
 
				+        with open(os.path.join(test_data_path, os.path.splitext(pdf_name)[0] + '.txt')) as text_file:
			
 
				+            assert dingguo.parser.ingdiba.ingdiba_pdf_file_to_text(pdf_file) == text_file.read().decode('utf-8')
			
 
				+
			
 
				+def test_ingdiba_pdf_file_to_text_loss_compensation():
			
 
				+    pdf_name = 'ING-DiBa_Postbox_2014-04-03_Verlustausgleich 2013.pdf'
			
 
				+    with open(os.path.join(test_data_path, pdf_name)) as pdf_file:
			
 
				+        with open(os.path.join(test_data_path, os.path.splitext(pdf_name)[0] + '.txt')) as text_file:
			
 
				+            assert dingguo.parser.ingdiba.ingdiba_pdf_file_to_text(pdf_file) == text_file.read().decode('utf-8')
			
 
				+
			
 
				+def test_ingdiba_pdf_file_to_text_dividend_distribution():
			
 
				+    pdf_name = 'ING-DiBa_Postbox_2014-04-09_Jährl. Bestätigung Ausschüttung Fonds.pdf'
			
 
				+    with open(os.path.join(test_data_path, pdf_name)) as pdf_file:
			
 
				+        with open(os.path.join(test_data_path, os.path.splitext(pdf_name)[0] + '.txt')) as text_file:
			
 
				+            assert dingguo.parser.ingdiba.ingdiba_pdf_file_to_text(pdf_file) == text_file.read().decode('utf-8')