#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Source repository: fphammerle/finoex
# PYTHON_ARGCOMPLETE_OK

import re
import os
import sys
import yaml
import email
import pprint
import random
import locale
import argparse
import datetime
import traceback
import subprocess
import HTMLParser
import argcomplete

# datetime.strptime() resolves month names through the process locale.
# parse_amazon() below parses dates with '%B', so the German locale is
# selected for the whole process at import time.
# NOTE(review): presumably fails with locale.Error on systems where
# 'de_DE.UTF-8' is not installed - confirm on the deployment target.
locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')

def parse_amazon(msg):

    order = {
        'platform': 'amazon.de',
        }

    msg_text = msg.get_payload()[0].get_payload(decode = True)

    order['order_id'] = re.search(r'Bestellnummer #(.+)', msg_text).group(1)

    order_date = datetime.datetime.strptime(
        re.search(r'Aufgegeben am (.+)', msg_text).group(1),
        '%d. %B %Y'
        )
    order['order_date'] = order_date.strftime('%Y-%m-%d')

    order['articles'] = []
    articles_text = msg_text.split('Bestellte(r) Artikel:')[1].split('_' * 10)[0].strip()
    for article_text in articles_text.split('\n\n'):
        article_match = re.match(
            ur' *(?P<name>.*)\n'
                + ur'( *von (?P<authors>.*)\n)?'
                + ur' *(?P<price_brutto_currency>[A-Z]+) (?P<price_brutto>\d+,\d+)\n'
                + ur'( *Zustand: (?P<state>.*)\n)?'
                + ur' *Verkauft von: (?P<reseller>.*)'
                + ur'(\n *Versand durch (?P<shipper>.*))?',
            article_text,
            re.MULTILINE | re.UNICODE
            )
        if article_match is None:
            sys.stderr.write(repr(article_text) + '\n')
            raise Exception('could not match article')
        article = article_match.groupdict()
        if article['authors']:
            article['authors'] = article['authors'].split(',')
        else:
            del article['authors']
        article['price_brutto'] = float(article['price_brutto'].replace(',', '.'))
        order['articles'].append(article)

    return order

def parse_oebb(msg):
    """Extract order data from an oebb.at booking confirmation mail.

    The text is taken from the first MIME part of *msg* and decoded as
    UTF-8.

    Returns a dict with the keys ``order_id``, ``customer_id``,
    ``order_date`` (formatted ``YYYY-MM-DD``), ``platform`` and
    ``articles``. The single article carries ``name``, ``price_brutto``
    (float), ``price_brutto_currency``, ``departure_point`` and
    ``destination_point``.

    Raises an Exception when the booking or the article cannot be found
    in the text, or when the price is not given in euro.
    """
    msg_text = msg.get_payload()[0].get_payload(decode = True).decode('utf8')

    # The patterns are pure ASCII, so plain raw strings match exactly like
    # unicode raw strings would, while being valid on Python 2 and 3 alike.
    order_match = re.search(
        r'Booking code:\s+(?P<order_id>[\d ]+)\s+'
            + r'Customer number:\s+(?P<customer_id>PV\d+)\s+'
            + r'Booking date:\s+(?P<order_date>.* \d{4})\s',
        msg_text,
        re.MULTILINE | re.UNICODE
        )
    if order_match is None:
        raise Exception('could not match order')
    order = order_match.groupdict()
    order['platform'] = 'oebb.at'

    # '%b' needs English month abbreviations. Switch only LC_TIME to the
    # always available 'C' locale and restore the previous setting right
    # away, so that other locale dependent parsing in this process is not
    # affected by this call.
    previous_time_locale = locale.setlocale(locale.LC_TIME)
    locale.setlocale(locale.LC_TIME, 'C')
    try:
        order_date = datetime.datetime.strptime(
            order['order_date'],
            '%b %d, %Y'
            )
    finally:
        locale.setlocale(locale.LC_TIME, previous_time_locale)
    order['order_date'] = order_date.strftime('%Y-%m-%d')

    article_match = re.search(
        r'(?P<price_brutto_currency>.)(?P<price_brutto>\d+\.\d+)'
            + r'[\W\w]+'
            + r'Your Booking\s+'
            + r'(?P<departure_point>.*)\s+>\s+(?P<destination_point>.*)',
        msg_text,
        re.MULTILINE | re.UNICODE
        )
    if article_match is None:
        raise Exception('could not match article')
    article = article_match.groupdict()
    article['name'] = 'Train Ticket'
    article['price_brutto'] = float(article['price_brutto'])
    # Only the euro sign is recognised as currency symbol.
    if article['price_brutto_currency'] == u'€':
        article['price_brutto_currency'] = 'EUR'
    else:
        raise Exception('currency %s is not supported' % article['price_brutto_currency'])
    order['articles'] = [article]

    return order

def parse_mytaxi(msg):
    """Extract order data from a mytaxi receipt mail.

    The second MIME part of *msg* is piped through the external ``pdftk``
    tool (``pdftk - output - uncompress``); the uncompressed output is
    decoded as latin-1 and searched with regular expressions.

    Returns a dict with the keys ``order_id``, ``platform``,
    ``order_date`` (formatted ``YYYY-MM-DD``) and ``articles``. The single
    article carries ``name``, ``driver``, ``departure_point``,
    ``destination_point``, ``arrival_time`` (``YYYY-MM-DD HH:MM``),
    ``price_brutto`` (float) and ``price_brutto_currency``.

    Raises an Exception when the invoice number or the ride details
    cannot be found, or when the price is not given in euro.
    """
    pdf_compressed = msg.get_payload()[1].get_payload(decode = True)
    # Pass a real argument vector instead of a single command string run
    # through a shell; no shell feature is needed here.
    pdftk = subprocess.Popen(
        ['pdftk', '-', 'output', '-', 'uncompress'],
        stdin = subprocess.PIPE,
        stdout = subprocess.PIPE,
        )
    pdf_uncompressed = pdftk.communicate(
        input = pdf_compressed,
        )[0].decode('latin-1')

    # The patterns are pure ASCII, so plain raw strings match exactly like
    # unicode raw strings would, while being valid on Python 2 and 3 alike.
    order_match = re.search(
        r'Rechnungsnummer:[^\(]+\((?P<order_id>\w+)\)',
        pdf_uncompressed,
        re.MULTILINE | re.UNICODE
        )
    if order_match is None:
        raise Exception('could not match order')
    order = order_match.groupdict()
    order['platform'] = 'mytaxi'

    # Values of interest are the parenthesised strings of the PDF text.
    article_match = re.search(
        r'\(Bruttobetrag\)'
            + r'[^\(]+'
            + r'\((?P<price_brutto>\d+,\d+) (?P<price_brutto_currency>.+)\)'
            + r'[\w\W]+'
            + r'\((?P<driver>[^\(]+)\)'
            + r'[^\(]+'
            + r'\(\d+,\d+ .\)'
            + r'[^\(]+'
            + r'\((?P<name>Taxifahrt)'
            + r'[^\(]+'
            + r'\(von: (?P<departure_point>[^\)]+)'
            + r'[^\(]+'
            + r'\(nach: (?P<destination_point>[^\)]+)'
            + r'[\w\W]+'
            + r'Belegdatum \\\(Leistungszeitpunkt\\\):[^\(]+\((?P<arrival_time>\d\d.\d\d.\d\d \d\d:\d\d)\)',
        pdf_uncompressed,
        re.MULTILINE | re.UNICODE
        )
    if article_match is None:
        raise Exception('could not match article')
    article = article_match.groupdict()

    # The format below consists of numeric directives only, so parsing
    # does not depend on the process locale and the locale is left alone.
    arrival_time = datetime.datetime.strptime(
        article['arrival_time'],
        '%d.%m.%y %H:%M'
        )
    article['arrival_time'] = arrival_time.strftime('%Y-%m-%d %H:%M')
    order['order_date'] = arrival_time.strftime('%Y-%m-%d')
    # Prices use a decimal comma.
    article['price_brutto'] = float(article['price_brutto'].replace(',', '.'))
    # u'\x80' is presumably a Windows-1252 euro sign as it appears after
    # the latin-1 decode above.
    if article['price_brutto_currency'] in [u'€', u'\x80']:
        article['price_brutto_currency'] = 'EUR'
    else:
        raise Exception('currency %s is not supported' % article['price_brutto_currency'])
    order['articles'] = [article]

    return order

def parse(msg):
    """Return the order extracted from *msg* by the first parser that succeeds.

    The amazon, oebb and mytaxi parsers are tried in that order. A parser
    that raises is treated as "not responsible for this mail" and its
    traceback is remembered.

    Raises an Exception when every parser failed; the collected
    tracebacks are printed before it is raised.
    """
    parsers = (
        ('amazon', parse_amazon),
        ('oebb', parse_oebb),
        ('mytaxi', parse_mytaxi),
        )

    failures = []
    for parser_name, parser in parsers:
        try:
            return parser(msg)
        except Exception:
            # Deliberately broad, since a mismatching mail can fail in many
            # ways, but unlike a bare except this lets KeyboardInterrupt
            # and SystemExit propagate.
            failures.append((parser_name, traceback.format_exc()))

    for parser_name, parser_traceback in failures:
        print('%s parser: \n%s' % (parser_name, parser_traceback))

    raise Exception('failed to parse')

def compute():
    """Read a mail from standard input and print the parsed order as YAML.

    The YAML document is written in block style.
    """
    raw_mail = sys.stdin.read()
    order = parse(email.message_from_string(raw_mail))
    document = yaml.safe_dump(order, default_flow_style = False)
    print(document)

def _init_argparser():
    """Create the command line parser.

    No arguments of its own are registered; the parser has no description.
    """
    return argparse.ArgumentParser(description = None)

def main(argv):
    """Parse the command line arguments *argv*, run compute() and return 0.

    The parsed arguments are forwarded to compute() as keyword arguments.
    """
    parser = _init_argparser()
    # Hook for shell completion; must run before the arguments are parsed.
    argcomplete.autocomplete(parser)
    options = vars(parser.parse_args(argv))
    compute(**options)
    return 0

if __name__ == "__main__":
    # Run main() with the command line arguments (program name stripped)
    # and use its return value as the process exit status.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)