order-confirmation-mail-parser 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # PYTHON_ARGCOMPLETE_OK
  4. import re
  5. import os
  6. import sys
  7. import yaml
  8. import email
  9. import pprint
  10. import random
  11. import locale
  12. import argparse
  13. import datetime
  14. import traceback
  15. import subprocess
  16. import HTMLParser
  17. import argcomplete
  18. class Order(object):
  19. def __init__(self, platform, order_id, order_date, customer_id = None):
  20. assert type(platform) is unicode
  21. self.platform = platform
  22. assert type(order_id) is unicode
  23. self.order_id = order_id
  24. assert type(order_date) is datetime.datetime
  25. self.order_date = order_date
  26. assert customer_id is None or type(customer_id) is unicode
  27. self.customer_id = customer_id
  28. self.articles = []
  29. def dict_repr(self):
  30. return {k: v for (k, v) in {
  31. 'articles': self.articles,
  32. 'customer_id': self.customer_id,
  33. 'order_date': self.order_date.strftime('%Y-%m-%d'),
  34. 'order_id': self.order_id,
  35. 'platform': self.platform,
  36. }.items() if v is not None}
  37. yaml.SafeDumper.add_representer(Order, lambda dumper, order: dumper.represent_dict(order.dict_repr()))
  38. def parse_amazon(msg):
  39. msg_text = msg.get_payload()[0].get_payload(decode = True).decode('utf-8')
  40. order_id = re.search(r'Bestellnummer #(.+)', msg_text).group(1)
  41. order_date_formatted = re.search(ur'Aufgegeben am (.+)', msg_text, re.UNICODE).group(1)
  42. locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')
  43. order_date = datetime.datetime.strptime(order_date_formatted.encode('utf-8'), '%d. %B %Y')
  44. order = Order(
  45. u'amazon.de',
  46. order_id,
  47. order_date
  48. )
  49. articles = []
  50. articles_text = msg_text.split('Bestellte(r) Artikel:')[1].split('_' * 10)[0].strip()
  51. for article_text in articles_text.split('\n\n'):
  52. article_match = re.match(
  53. ur' *(?P<name>.*)\n'
  54. + ur'( *von (?P<authors>.*)\n)?'
  55. + ur' *(?P<price_brutto_currency>[A-Z]+) (?P<price_brutto>\d+,\d+)\n'
  56. + ur'( *Zustand: (?P<state>.*)\n)?'
  57. + ur' *Verkauft von: (?P<reseller>.*)'
  58. + ur'(\n *Versand durch (?P<shipper>.*))?',
  59. article_text,
  60. re.MULTILINE | re.UNICODE
  61. )
  62. if article_match is None:
  63. sys.stderr.write(repr(article_text) + '\n')
  64. raise Exception('could not match article')
  65. article = article_match.groupdict()
  66. if article['authors']:
  67. article['authors'] = article['authors'].split(',')
  68. else:
  69. del article['authors']
  70. article['price_brutto'] = float(article['price_brutto'].replace(',', '.'))
  71. order.articles.append(article)
  72. return order
  73. def parse_oebb(msg):
  74. msg_text = msg.get_payload()[0].get_payload(decode = True).decode('utf8')
  75. # msg_text = re.sub(
  76. # r'<[^>]+>',
  77. # '',
  78. # HTMLParser.HTMLParser().unescape(msg.get_payload(decode = True).decode('utf8'))
  79. # )
  80. order_match = re.search(
  81. ur'Booking code:\s+(?P<order_id>[\d ]+)\s+'
  82. + ur'Customer number:\s+(?P<customer_id>PV\d+)\s+'
  83. + ur'Booking date:\s+(?P<order_date>.* \d{4})\s',
  84. msg_text,
  85. re.MULTILINE | re.UNICODE
  86. )
  87. order_match_groups = order_match.groupdict()
  88. locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
  89. order_date = datetime.datetime.strptime(
  90. order_match_groups['order_date'],
  91. '%b %d, %Y'
  92. )
  93. order = Order(
  94. u'oebb',
  95. order_match_groups['order_id'],
  96. order_date,
  97. customer_id = order_match_groups['customer_id'],
  98. )
  99. article_match = re.search(
  100. ur'(?P<price_brutto_currency>.)(?P<price_brutto>\d+\.\d+)'
  101. + ur'[\W\w]+'
  102. + ur'Your Booking\s+'
  103. + ur'(?P<departure_point>.*)\s+>\s+(?P<destination_point>.*)',
  104. msg_text,
  105. re.MULTILINE | re.UNICODE
  106. )
  107. article = article_match.groupdict()
  108. article['name'] = 'Train Ticket'
  109. article['price_brutto'] = float(article['price_brutto'])
  110. if article['price_brutto_currency'] == u'€':
  111. article['price_brutto_currency'] = 'EUR'
  112. else:
  113. raise Exception('currency %s is not supported' % article['price_brutto_currency'])
  114. order.articles.append(article)
  115. return order
  116. def parse_mytaxi(msg):
  117. pdf_compressed = msg.get_payload()[1].get_payload(decode = True)
  118. pdftk = subprocess.Popen(
  119. ['pdftk - output - uncompress'],
  120. shell = True,
  121. stdin = subprocess.PIPE,
  122. stdout = subprocess.PIPE,
  123. )
  124. pdf_uncompressed = pdftk.communicate(
  125. input = pdf_compressed,
  126. )[0].decode('latin-1')
  127. assert type(pdf_uncompressed) is unicode
  128. order_match = re.search(
  129. ur'Rechnungsnummer:[^\(]+\((?P<order_id>\w+)\)',
  130. pdf_uncompressed,
  131. re.MULTILINE | re.UNICODE
  132. )
  133. order_id = order_match.groupdict()['order_id']
  134. article_match = re.search(
  135. ur'\(Bruttobetrag\)'
  136. + ur'[^\(]+'
  137. + ur'\((?P<price_brutto>\d+,\d+) (?P<price_brutto_currency>.+)\)'
  138. + ur'[\w\W]+'
  139. + ur'\((?P<driver>[^\(]+)\)'
  140. + ur'[^\(]+'
  141. + ur'\(\d+,\d+ .\)'
  142. + ur'[^\(]+'
  143. + ur'\((?P<name>Taxifahrt)'
  144. + ur'[^\(]+'
  145. + ur'\(von: (?P<departure_point>[^\)]+)'
  146. + ur'[^\(]+'
  147. + ur'\(nach: (?P<destination_point>[^\)]+)'
  148. + ur'[\w\W]+'
  149. + ur'Belegdatum \\\(Leistungszeitpunkt\\\):[^\(]+\((?P<arrival_time>\d\d.\d\d.\d\d \d\d:\d\d)\)',
  150. pdf_uncompressed,
  151. re.MULTILINE | re.UNICODE
  152. )
  153. article = article_match.groupdict()
  154. locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
  155. arrival_time = datetime.datetime.strptime(
  156. article['arrival_time'],
  157. '%d.%m.%y %H:%M'
  158. )
  159. article['arrival_time'] = arrival_time.strftime('%Y-%m-%d %H:%M')
  160. article['price_brutto'] = float(article['price_brutto'].replace(',', '.'))
  161. if article['price_brutto_currency'] in [u'€', u'\x80']:
  162. article['price_brutto_currency'] = 'EUR'
  163. else:
  164. raise Exception('currency %s is not supported' % article['price_brutto_currency'])
  165. order = Order(
  166. u'mytaxi',
  167. order_id,
  168. arrival_time,
  169. )
  170. order.articles.append(article)
  171. return order
  172. def parse(msg):
  173. tracebacks = {}
  174. try:
  175. return parse_amazon(msg)
  176. except:
  177. tracebacks['amazon'] = traceback.format_exc()
  178. try:
  179. return parse_oebb(msg)
  180. except:
  181. tracebacks['oebb'] = traceback.format_exc()
  182. try:
  183. return parse_mytaxi(msg)
  184. except:
  185. tracebacks['mytaxi'] = traceback.format_exc()
  186. for parser_name in tracebacks:
  187. sys.stderr.write('%s parser: \n%s\n' % (parser_name, tracebacks[parser_name]))
  188. raise Exception('failed to parse')
  189. def compute():
  190. msg = email.message_from_string(sys.stdin.read())
  191. order = parse(msg)
  192. print(yaml.safe_dump(order, default_flow_style = False))
  193. def _init_argparser():
  194. argparser = argparse.ArgumentParser(description = None)
  195. return argparser
  196. def main(argv):
  197. argparser = _init_argparser()
  198. argcomplete.autocomplete(argparser)
  199. args = argparser.parse_args(argv)
  200. compute(**vars(args))
  201. return 0
  202. if __name__ == "__main__":
  203. sys.exit(main(sys.argv[1:]))