123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215 |
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- # PYTHON_ARGCOMPLETE_OK
- import re
- import os
- import sys
- import yaml
- import email
- import pprint
- import random
- import locale
- import argparse
- import datetime
- import traceback
- import subprocess
- import HTMLParser
- import argcomplete
# The process-wide locale controls how strptime() interprets month names
# (the '%B' directive used in parse_amazon). German is selected here,
# presumably because amazon.de mails spell the month out in German --
# TODO confirm against a sample mail.
locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')
def parse_amazon(msg):
    """Parse an amazon.de order confirmation mail into an order dict.

    Args:
        msg: object offering the ``email.message.Message`` payload API.
            The decoded payload of its first sub-part is used as the
            plain-text body.

    Returns:
        dict with the keys ``platform`` ('amazon.de'), ``order_id``,
        ``order_date`` ('YYYY-MM-DD') and ``articles``. Each article is a
        dict with ``name``, ``price_brutto`` (float),
        ``price_brutto_currency``, ``state``, ``reseller``, ``shipper``
        (``None`` when the optional line is absent) and, only when an
        author line is present, ``authors`` (list of names).

    Raises:
        Exception: if an article paragraph does not have the expected
            layout (the offending text is written to stderr first).
        AttributeError, IndexError, ValueError: if the order number, the
            order date or the article section cannot be found or parsed.
    """
    msg_text = msg.get_payload()[0].get_payload(decode=True)

    order = {
        'platform': 'amazon.de',
        'order_id': re.search(r'Bestellnummer #(.+)', msg_text).group(1),
    }

    # '%B' is resolved with the process-wide locale, so the month name in
    # the mail has to be spelled in the language of that locale.
    order_date = datetime.datetime.strptime(
        re.search(r'Aufgegeben am (.+)', msg_text).group(1),
        '%d. %B %Y'
    )
    order['order_date'] = order_date.strftime('%Y-%m-%d')

    # Compiled once instead of on every loop iteration.
    article_pattern = re.compile(
        r' *(?P<name>.*)\n'
        r'( *von (?P<authors>.*)\n)?'
        r' *(?P<price_brutto_currency>[A-Z]+) (?P<price_brutto>\d+,\d+)\n'
        r'( *Zustand: (?P<state>.*)\n)?'
        r' *Verkauft von: (?P<reseller>.*)'
        r'(\n *Versand durch (?P<shipper>.*))?',
        re.MULTILINE | re.UNICODE
    )

    # The article list sits between its heading and a line of underscores;
    # the individual articles are separated by blank lines.
    articles_text = (
        msg_text
        .split('Bestellte(r) Artikel:')[1]
        .split('_' * 10)[0]
        .strip()
    )

    order['articles'] = []
    for article_text in articles_text.split('\n\n'):
        article_match = article_pattern.match(article_text)
        if article_match is None:
            sys.stderr.write(repr(article_text) + '\n')
            raise Exception('could not match article')
        article = article_match.groupdict()
        if article['authors']:
            # Strip each name: splitting "A, B" on ',' alone would leave a
            # leading blank on every author but the first.
            article['authors'] = [
                author.strip() for author in article['authors'].split(',')
            ]
        else:
            del article['authors']
        # German decimal comma -> float.
        article['price_brutto'] = float(
            article['price_brutto'].replace(',', '.')
        )
        order['articles'].append(article)
    return order
def parse_oebb(msg):
    """Parse an oebb.at booking confirmation mail into an order dict.

    Args:
        msg: object offering the ``email.message.Message`` payload API.
            The decoded payload of its first sub-part is read as UTF-8
            text.

    Returns:
        dict with the keys ``platform`` ('oebb.at'), ``order_id``,
        ``customer_id``, ``order_date`` ('YYYY-MM-DD') and ``articles``,
        a one-element list holding a dict with ``name``
        ('Train Ticket'), ``price_brutto`` (float),
        ``price_brutto_currency`` ('EUR'), ``departure_point`` and
        ``destination_point``.

    Raises:
        Exception: if the price is given in a currency other than euro.
        AttributeError, IndexError, ValueError: if the mail does not have
            the expected layout.
    """
    msg_text = msg.get_payload()[0].get_payload(decode=True).decode('utf8')

    order_match = re.search(
        r'Booking code:\s+(?P<order_id>[\d ]+)\s+'
        r'Customer number:\s+(?P<customer_id>PV\d+)\s+'
        r'Booking date:\s+(?P<order_date>.* \d{4})\s',
        msg_text,
        re.MULTILINE | re.UNICODE
    )
    order = order_match.groupdict()
    order['platform'] = 'oebb.at'

    # The booking date uses English month abbreviations ('%b'). Only the
    # time category is switched, to the always-available 'C' locale (whose
    # month abbreviations are the English ones), and the previous setting
    # is restored afterwards so the process-wide locale is not left
    # changed for whatever runs next.
    previous_time_locale = locale.setlocale(locale.LC_TIME)
    locale.setlocale(locale.LC_TIME, 'C')
    try:
        booking_date = datetime.datetime.strptime(
            order['order_date'],
            '%b %d, %Y'
        )
    finally:
        locale.setlocale(locale.LC_TIME, previous_time_locale)
    order['order_date'] = booking_date.strftime('%Y-%m-%d')

    # The first "<symbol><amount>" occurrence is taken as the price; the
    # route follows the last "Your Booking" heading after it.
    article_match = re.search(
        r'(?P<price_brutto_currency>.)(?P<price_brutto>\d+\.\d+)'
        r'[\W\w]+'
        r'Your Booking\s+'
        r'(?P<departure_point>.*)\s+>\s+(?P<destination_point>.*)',
        msg_text,
        re.MULTILINE | re.UNICODE
    )
    article = article_match.groupdict()
    article['name'] = 'Train Ticket'
    article['price_brutto'] = float(article['price_brutto'])
    if article['price_brutto_currency'] == u'€':
        article['price_brutto_currency'] = 'EUR'
    else:
        raise Exception(
            'currency %s is not supported' % article['price_brutto_currency']
        )

    order['articles'] = [article]
    return order
def parse_mytaxi(msg):
    """Parse a mytaxi receipt mail into an order dict.

    The second sub-part of the mail is treated as a PDF: it is piped
    through ``pdftk - output - uncompress`` and the uncompressed PDF source
    is searched for the receipt fields.

    Args:
        msg: object offering the ``email.message.Message`` payload API
            with a PDF as its second sub-part.

    Returns:
        dict with the keys ``platform`` ('mytaxi'), ``order_id``,
        ``order_date`` ('YYYY-MM-DD', taken from the arrival time) and
        ``articles``, a one-element list holding a dict with ``name``,
        ``driver``, ``departure_point``, ``destination_point``,
        ``arrival_time`` ('YYYY-MM-DD HH:MM'), ``price_brutto`` (float)
        and ``price_brutto_currency`` ('EUR').

    Raises:
        Exception: if the price is given in a currency other than euro.
        OSError: if ``pdftk`` cannot be started.
        AttributeError, IndexError, ValueError: if the mail or the PDF
            does not have the expected layout.
    """
    pdf_compressed = msg.get_payload()[1].get_payload(decode=True)

    # Argument list without a shell: nothing needs shell interpretation.
    pdftk = subprocess.Popen(
        ['pdftk', '-', 'output', '-', 'uncompress'],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )
    # latin-1 maps every byte to a code point, so decoding cannot fail on
    # the binary parts of the PDF.
    pdf_uncompressed = pdftk.communicate(
        input=pdf_compressed,
    )[0].decode('latin-1')

    order_match = re.search(
        r'Rechnungsnummer:[^\(]+\((?P<order_id>\w+)\)',
        pdf_uncompressed,
        re.MULTILINE | re.UNICODE
    )
    order = order_match.groupdict()
    order['platform'] = 'mytaxi'

    # The values are pulled out of the parenthesised "(...)" strings of
    # the PDF source, in the order in which they appear there.
    article_match = re.search(
        r'\(Bruttobetrag\)'
        r'[^\(]+'
        r'\((?P<price_brutto>\d+,\d+) (?P<price_brutto_currency>.+)\)'
        r'[\w\W]+'
        r'\((?P<driver>[^\(]+)\)'
        r'[^\(]+'
        r'\(\d+,\d+ .\)'
        r'[^\(]+'
        r'\((?P<name>Taxifahrt)'
        r'[^\(]+'
        r'\(von: (?P<departure_point>[^\)]+)'
        r'[^\(]+'
        r'\(nach: (?P<destination_point>[^\)]+)'
        r'[\w\W]+'
        r'Belegdatum \\\(Leistungszeitpunkt\\\):[^\(]+\((?P<arrival_time>\d\d.\d\d.\d\d \d\d:\d\d)\)',
        pdf_uncompressed,
        re.MULTILINE | re.UNICODE
    )
    article = article_match.groupdict()

    # The format consists of numeric directives only, so the result does
    # not depend on the active locale and no locale switch is needed.
    arrival_time = datetime.datetime.strptime(
        article['arrival_time'],
        '%d.%m.%y %H:%M'
    )
    article['arrival_time'] = arrival_time.strftime('%Y-%m-%d %H:%M')
    order['order_date'] = arrival_time.strftime('%Y-%m-%d')

    # German decimal comma -> float.
    article['price_brutto'] = float(article['price_brutto'].replace(',', '.'))

    # u'\x80' is the euro sign in Windows-1252, which shows up as that
    # code point after the latin-1 decoding above.
    if article['price_brutto_currency'] in (u'€', u'\x80'):
        article['price_brutto_currency'] = 'EUR'
    else:
        raise Exception(
            'currency %s is not supported' % article['price_brutto_currency']
        )

    order['articles'] = [article]
    return order
def parse(msg):
    """Parse an order mail by trying every known parser in turn.

    The parsers are tried in the order amazon, oebb, mytaxi; the result of
    the first one that does not raise is returned.

    Args:
        msg: the mail, as expected by the individual ``parse_*`` functions.

    Returns:
        The order dict produced by the first successful parser.

    Raises:
        Exception: if every parser failed. The traceback of each attempt
            is printed beforehand, in the order the parsers were tried.
    """
    parsers = (
        ('amazon', parse_amazon),
        ('oebb', parse_oebb),
        ('mytaxi', parse_mytaxi),
    )

    failures = []
    for parser_name, parser in parsers:
        try:
            return parser(msg)
        # A failing parser only means "not this kind of mail".
        # KeyboardInterrupt and SystemExit are deliberately not caught so
        # the user can still abort the program.
        except Exception:
            failures.append((parser_name, traceback.format_exc()))

    for parser_name, formatted_traceback in failures:
        print('%s parser: \n%s' % (parser_name, formatted_traceback))
    raise Exception('failed to parse')
def compute():
    """Read one mail from stdin, parse it and print the order as YAML."""
    raw_mail = sys.stdin.read()
    parsed_order = parse(email.message_from_string(raw_mail))
    # Block style (one key per line) rather than inline flow style.
    yaml_text = yaml.safe_dump(parsed_order, default_flow_style=False)
    print(yaml_text)
def _init_argparser():
    """Create the command line parser; no options of its own are defined."""
    return argparse.ArgumentParser(description=None)
def main(argv):
    """Run the command line interface.

    Args:
        argv: command line arguments without the program name.

    Returns:
        0, to be used as the process exit status.
    """
    parser = _init_argparser()
    # Hook for shell tab completion; must run before the arguments are parsed.
    argcomplete.autocomplete(parser)
    options = vars(parser.parse_args(argv))
    compute(**options)
    return 0
# Script entry point: main()'s return value becomes the exit status.
if __name__ == "__main__":
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)
|