#!/usr/bin/env python # -*- coding: utf-8 -*- # PYTHON_ARGCOMPLETE_OK import re import os import sys import yaml import email import pprint import random import locale import argparse import datetime import traceback import subprocess import HTMLParser import argcomplete class Order(object): def __init__(self, platform, order_id, order_date, customer_id = None): assert type(platform) is unicode self.platform = platform assert type(order_id) is unicode self.order_id = order_id assert type(order_date) is datetime.datetime self.order_date = order_date assert customer_id is None or type(customer_id) is unicode self.customer_id = customer_id self.articles = [] def dict_repr(self): return {k: v for (k, v) in { 'articles': self.articles, 'customer_id': self.customer_id, 'order_date': self.order_date.strftime('%Y-%m-%d'), 'order_id': self.order_id, 'platform': self.platform, }.items() if v is not None} yaml.SafeDumper.add_representer(Order, lambda dumper, order: dumper.represent_dict(order.dict_repr())) def parse_amazon(msg): msg_text = msg.get_payload()[0].get_payload(decode = True).decode('utf-8') order_id = re.search(r'Bestellnummer #(.+)', msg_text).group(1) order_date_formatted = re.search(ur'Aufgegeben am (.+)', msg_text, re.UNICODE).group(1) locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8') order_date = datetime.datetime.strptime(order_date_formatted.encode('utf-8'), '%d. %B %Y') order = Order( u'amazon.de', order_id, order_date ) articles = [] articles_text = msg_text.split('Bestellte(r) Artikel:')[1].split('_' * 10)[0].strip() for article_text in articles_text.split('\n\n'): article_match = re.match( ur' *(?P.*)\n' + ur'( *von (?P.*)\n)?' + ur' *(?P[A-Z]+) (?P\d+,\d+)\n' + ur'( *Zustand: (?P.*)\n)?' + ur' *Verkauft von: (?P.*)' + ur'(\n *Versand durch (?P.*))?', article_text, re.MULTILINE | re.UNICODE ) if article_match is None: sys.stderr.write(repr(article_text) + '\n') raise Exception('could not match article') article = article_match.groupdict() if article['authors']: article['authors'] = article['authors'].split(',') else: del article['authors'] article['price_brutto'] = float(article['price_brutto'].replace(',', '.')) order.articles.append(article) return order def parse_oebb(msg): msg_text = msg.get_payload()[0].get_payload(decode = True).decode('utf8') # msg_text = re.sub( # r'<[^>]+>', # '', # HTMLParser.HTMLParser().unescape(msg.get_payload(decode = True).decode('utf8')) # ) order_match = re.search( ur'Booking code:\s+(?P[\d ]+)\s+' + ur'Customer number:\s+(?PPV\d+)\s+' + ur'Booking date:\s+(?P.* \d{4})\s', msg_text, re.MULTILINE | re.UNICODE ) order_match_groups = order_match.groupdict() locale.setlocale(locale.LC_ALL, 'en_US.UTF-8') order_date = datetime.datetime.strptime( order_match_groups['order_date'], '%b %d, %Y' ) order = Order( u'oebb', order_match_groups['order_id'], order_date, customer_id = order_match_groups['customer_id'], ) article_match = re.search( ur'(?P.)(?P\d+\.\d+)' + ur'[\W\w]+' + ur'Your Booking\s+' + ur'(?P.*)\s+>\s+(?P.*)', msg_text, re.MULTILINE | re.UNICODE ) article = article_match.groupdict() article['name'] = 'Train Ticket' article['price_brutto'] = float(article['price_brutto']) if article['price_brutto_currency'] == u'€': article['price_brutto_currency'] = 'EUR' else: raise Exception('currency %s is not supported' % article['price_brutto_currency']) order.articles.append(article) return order def parse_mytaxi(msg): pdf_compressed = msg.get_payload()[1].get_payload(decode = True) pdftk = subprocess.Popen( ['pdftk - output - uncompress'], shell = True, stdin = subprocess.PIPE, stdout = subprocess.PIPE, ) pdf_uncompressed = pdftk.communicate( input = pdf_compressed, )[0].decode('latin-1') assert type(pdf_uncompressed) is unicode order_match = re.search( ur'Rechnungsnummer:[^\(]+\((?P\w+)\)', pdf_uncompressed, re.MULTILINE | re.UNICODE ) order_id = order_match.groupdict()['order_id'] article_match = re.search( ur'\(Bruttobetrag\)' + ur'[^\(]+' + ur'\((?P\d+,\d+) (?P.+)\)' + ur'[\w\W]+' + ur'\((?P[^\(]+)\)' + ur'[^\(]+' + ur'\(\d+,\d+ .\)' + ur'[^\(]+' + ur'\((?PTaxifahrt)' + ur'[^\(]+' + ur'\(von: (?P[^\)]+)' + ur'[^\(]+' + ur'\(nach: (?P[^\)]+)' + ur'[\w\W]+' + ur'Belegdatum \\\(Leistungszeitpunkt\\\):[^\(]+\((?P\d\d.\d\d.\d\d \d\d:\d\d)\)', pdf_uncompressed, re.MULTILINE | re.UNICODE ) article = article_match.groupdict() locale.setlocale(locale.LC_ALL, 'en_US.UTF-8') arrival_time = datetime.datetime.strptime( article['arrival_time'], '%d.%m.%y %H:%M' ) article['arrival_time'] = arrival_time.strftime('%Y-%m-%d %H:%M') article['price_brutto'] = float(article['price_brutto'].replace(',', '.')) if article['price_brutto_currency'] in [u'€', u'\x80']: article['price_brutto_currency'] = 'EUR' else: raise Exception('currency %s is not supported' % article['price_brutto_currency']) order = Order( u'mytaxi', order_id, arrival_time, ) order.articles.append(article) return order def parse(msg): tracebacks = {} try: return parse_amazon(msg) except: tracebacks['amazon'] = traceback.format_exc() try: return parse_oebb(msg) except: tracebacks['oebb'] = traceback.format_exc() try: return parse_mytaxi(msg) except: tracebacks['mytaxi'] = traceback.format_exc() for parser_name in tracebacks: sys.stderr.write('%s parser: \n%s\n' % (parser_name, tracebacks[parser_name])) raise Exception('failed to parse') def compute(): msg = email.message_from_string(sys.stdin.read()) order = parse(msg) print(yaml.safe_dump(order, default_flow_style = False)) def _init_argparser(): argparser = argparse.ArgumentParser(description = None) return argparser def main(argv): argparser = _init_argparser() argcomplete.autocomplete(argparser) args = argparser.parse_args(argv) compute(**vars(args)) return 0 if __name__ == "__main__": sys.exit(main(sys.argv[1:]))