Ver Fonte

added script ingdiba-pdf-to-text

Fabian Peter Hammerle há 8 anos atrás
pai
commit
cc18a50bf1

+ 0 - 0
dingguo/parser/__init__.py


+ 43 - 0
dingguo/parser/ingdiba.py

@@ -0,0 +1,43 @@
+def ingdiba_pdf_file_to_text(pdf_file):
+    """Extract the text of one page of an ING-DiBa PDF document.
+
+    Only the page at index 1 of the document (its second page) is
+    processed. Every character found on that page is positioned by the
+    lower left corner of its bounding box: characters sharing the same y0
+    form one line, lines are emitted from the top of the page to the
+    bottom and the characters of a line from left to right.
+
+    pdf_file: file object of the PDF document; it is handed to pdfminer's
+        PDFParser.
+
+    Returns the lines joined by newlines, followed by a trailing newline.
+
+    Raises AssertionError if the document does not allow text extraction
+    and IndexError if the document has fewer than two pages.
+    """
+
+    import pdfminer.converter
+    # Imported explicitly: LTChar is looked up in pdfminer.layout below and
+    # should not depend on another pdfminer module importing it first.
+    import pdfminer.layout
+    import pdfminer.pdfdocument
+    import pdfminer.pdfinterp
+    import pdfminer.pdfpage
+    import pdfminer.pdfparser
+
+    parser = pdfminer.pdfparser.PDFParser(pdf_file)
+    doc = pdfminer.pdfdocument.PDFDocument(parser)
+    # Raised explicitly instead of via an `assert` statement so that the
+    # check is not stripped when Python runs with -O.
+    if not doc.is_extractable:
+        raise AssertionError('text extraction is not allowed for this document')
+
+    resource_manager = pdfminer.pdfinterp.PDFResourceManager()
+    device = pdfminer.converter.PDFPageAggregator(resource_manager)
+    interpreter = pdfminer.pdfinterp.PDFPageInterpreter(resource_manager, device)
+
+    # NOTE(review): only the page at index 1 is converted - presumably the
+    # relevant content of these documents is on the second page; confirm.
+    page = list(pdfminer.pdfpage.PDFPage.create_pages(doc))[1]
+    interpreter.process_page(page)
+
+    # The bbox value of a layout object is (x0, y0, x1, y1).
+    #
+    # x0: the distance from the left of the page to the left edge of the box.
+    # y0: the distance from the bottom of the page to the lower edge of the box.
+    # x1: the distance from the left of the page to the right edge of the box.
+    # y1: the distance from the bottom of the page to the upper edge of the box.
+    #
+    # Remember in PDF the page origin is the *bottom left corner*.
+    # So the bottom left is (0, 0) and the top right corner is somewhere
+    # like (612, 792) in the case of US Letter paper, or (595, 842) in the
+    # case of A4 paper.
+    #
+    # https://groups.google.com/forum/#!topic/pdfminer-users/wOvDSW23B4M
+
+    # Maps y0 -> {x0 -> text}. A character with the same (y0, x0) as an
+    # earlier one replaces it.
+    lines = {}
+    for item in device.get_result():
+        if isinstance(item, pdfminer.layout.LTChar):
+            lines.setdefault(item.y0, {})[item.x0] = item.get_text()
+
+    # y grows upwards, so the largest y0 is the topmost line.
+    text_lines = []
+    for y0 in sorted(lines, reverse=True):
+        line = lines[y0]
+        text_lines.append(''.join(line[x0] for x0 in sorted(line)))
+
+    return '\n'.join(text_lines) + '\n'

+ 33 - 0
scripts/ingdiba-pdf-to-text

@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+# PYTHON_ARGCOMPLETE_OK
+
+import sys
+import dingguo.parser.ingdiba
+
+def compute(file):
+    """Write the text extracted from the PDF `file` to stdout as UTF-8.
+
+    file: file object of the PDF document. The parameter name has to match
+        the name of the parsed command line argument, because main()
+        passes the parsed arguments on as keywords.
+    """
+
+    text = dingguo.parser.ingdiba.ingdiba_pdf_file_to_text(file)
+    # Text streams of Python 3 reject bytes but expose the underlying
+    # binary stream as `buffer`. If sys.stdout has no such attribute (as on
+    # Python 2) the encoded string is written to sys.stdout directly.
+    stdout = getattr(sys.stdout, 'buffer', sys.stdout)
+    stdout.write(text.encode('utf-8'))
+
+def _init_argparser():
+    """Build the command line parser of this script.
+
+    The parser takes a single positional argument `file`, which argparse
+    opens for reading in binary mode.
+    """
+
+    from argparse import ArgumentParser, FileType
+
+    parser = ArgumentParser(description=None)
+    parser.add_argument('file', type=FileType('rb'))
+    return parser
+
+def main(argv):
+    """Parse `argv`, convert the given PDF file and return exit status 0.
+
+    argv: list of command line arguments, without the program name.
+    """
+
+    parser = _init_argparser()
+
+    # argcomplete is optional: shell completion is skipped when the package
+    # cannot be imported.
+    try:
+        import argcomplete
+        argcomplete.autocomplete(parser)
+    except ImportError:
+        pass
+
+    namespace = parser.parse_args(argv)
+    # Every parsed argument is handed to compute() as a keyword argument.
+    keyword_args = vars(namespace)
+    compute(**keyword_args)
+
+    return 0
+
+# Run the command line interface when the file is executed as a script.
+if __name__ == '__main__':
+    exit_status = main(sys.argv[1:])
+    sys.exit(exit_status)

+ 5 - 2
setup.py

@@ -12,8 +12,11 @@ setup(
     # download_url = '',
     # keywords = [],
     # classifiers = [],
-    packages = ['dingguo'],
+    packages = [
+        'dingguo', 
+        'dingguo.parser',
+        ],
     scripts = glob.glob('scripts/*'),
-    # install_requires = [],
+    install_requires = ['pdfminer>=20140328'],
     tests_require = ['pytest']
     )

+ 45 - 0
tests/test_parser_ingdiba.py

@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+import os
+import dingguo.parser.ingdiba
+
+project_root_path = os.path.realpath(os.path.join(__file__, '..', '..'))
+test_data_path = os.path.join(project_root_path, 'tests', 'data', 'ing-diba')
+
+def _assert_pdf_converts_to_expected_text(pdf_name):
+    """Check the conversion of a PDF fixture against its expected text.
+
+    pdf_name: file name of a PDF in the test data directory. The expected
+        text is read from the file with the same name and the extension
+        '.txt' in that directory, decoded as UTF-8.
+    """
+    pdf_path = os.path.join(test_data_path, pdf_name)
+    text_path = os.path.join(test_data_path, os.path.splitext(pdf_name)[0] + '.txt')
+    # Both files are opened in binary mode: a PDF is binary data and must
+    # not pass through newline translation or text decoding, and the
+    # expected text is decoded explicitly below.
+    with open(pdf_path, 'rb') as pdf_file:
+        actual = dingguo.parser.ingdiba.ingdiba_pdf_file_to_text(pdf_file)
+    with open(text_path, 'rb') as text_file:
+        expected = text_file.read().decode('utf-8')
+    assert actual == expected
+
+def test_ingdiba_pdf_file_to_text_purchase():
+    _assert_pdf_converts_to_expected_text('ING-DiBa_Postbox_2013-03-25_WP-Kauf - 60631041.pdf')
+
+def test_ingdiba_pdf_file_to_text_tax():
+    _assert_pdf_converts_to_expected_text('ING-DiBa_Postbox_2013-12-27_KESt auf Erträge ausländischer Fonds.pdf')
+
+def test_ingdiba_pdf_file_to_text_statement():
+    _assert_pdf_converts_to_expected_text('ING-DiBa_Postbox_2014-01-01_Depotauszug - 60631041.pdf')
+
+def test_ingdiba_pdf_file_to_text_divestiture():
+    _assert_pdf_converts_to_expected_text('ING-DiBa_Postbox_2014-02-07_WP-Verkauf - 60631041.pdf')
+
+def test_ingdiba_pdf_file_to_text_loss_compensation():
+    _assert_pdf_converts_to_expected_text('ING-DiBa_Postbox_2014-04-03_Verlustausgleich 2013.pdf')
+
+def test_ingdiba_pdf_file_to_text_dividend_distribution():
+    _assert_pdf_converts_to_expected_text('ING-DiBa_Postbox_2014-04-09_Jährl. Bestätigung Ausschüttung Fonds.pdf')