order-confirmation-mail-parser 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # PYTHON_ARGCOMPLETE_OK
  4. import re
  5. import os
  6. import sys
  7. import yaml
  8. import email
  9. import pprint
  10. import random
  11. import locale
  12. import argparse
  13. import datetime
  14. import traceback
  15. import subprocess
  16. import HTMLParser
  17. import argcomplete
  18. class Order(object):
  19. def __init__(self, platform, order_id, order_date, customer_id = None):
  20. assert type(platform) is unicode
  21. self.platform = platform
  22. assert type(order_id) is unicode
  23. self.order_id = order_id
  24. assert type(order_date) is datetime.datetime
  25. self.order_date = order_date
  26. assert customer_id is None or type(customer_id) is unicode
  27. self.customer_id = customer_id
  28. self.items = []
  29. def dict_repr(self):
  30. return {k: v for (k, v) in {
  31. 'articles': self.items,
  32. 'customer_id': self.customer_id,
  33. 'order_date': self.order_date.strftime('%Y-%m-%d'),
  34. 'order_id': self.order_id,
  35. 'platform': self.platform,
  36. }.items() if v is not None}
  37. yaml.SafeDumper.add_representer(Order, lambda dumper, order: dumper.represent_dict(order.dict_repr()))
  38. class Item(object):
  39. def __init__(
  40. self,
  41. name = None,
  42. price_brutto = None,
  43. price_brutto_currency = None,
  44. ):
  45. assert type(name) is unicode
  46. self.name = name
  47. assert type(price_brutto) is float
  48. self.price_brutto = price_brutto
  49. if price_brutto_currency == u'€':
  50. price_brutto_currency = u'EUR'
  51. assert type(price_brutto_currency) is unicode
  52. assert price_brutto_currency in [u'EUR']
  53. self.price_brutto_currency = price_brutto_currency
  54. def dict_repr(self):
  55. return {
  56. 'name': self.name,
  57. 'price_brutto': self.price_brutto,
  58. 'price_brutto_currency': self.price_brutto_currency,
  59. }
  60. yaml.SafeDumper.add_representer(Item, lambda dumper, item: dumper.represent_dict(item.dict_repr()))
  61. class Article(Item):
  62. def __init__(
  63. self,
  64. authors = [],
  65. state = None,
  66. reseller = None,
  67. shipper = None,
  68. **kwargs
  69. ):
  70. super(Article, self).__init__(**kwargs)
  71. assert type(authors) is list
  72. self.authors = authors
  73. assert state is None or type(state) is unicode
  74. self.state = state
  75. assert reseller is None or type(reseller) is unicode
  76. self.reseller = reseller
  77. assert shipper is None or type(shipper) is unicode
  78. self.shipper = shipper
  79. def dict_repr(self):
  80. attr = Item.dict_repr(self)
  81. attr.update({
  82. 'state': self.state,
  83. 'reseller': self.reseller,
  84. 'shipper': self.shipper,
  85. })
  86. if len(self.authors) > 0:
  87. attr['authors'] = self.authors
  88. return attr
  89. yaml.SafeDumper.add_representer(Article, lambda dumper, article: dumper.represent_dict(article.dict_repr()))
  90. class Transportation(Item):
  91. def __init__(self, departure_point = None, destination_point = None, **kwargs):
  92. super(Transportation, self).__init__(**kwargs)
  93. assert type(departure_point) is unicode
  94. self.departure_point = departure_point
  95. assert type(destination_point) is unicode
  96. self.destination_point = destination_point
  97. def dict_repr(self):
  98. attr = Item.dict_repr(self)
  99. attr.update({
  100. 'departure_point': self.departure_point,
  101. 'destination_point': self.destination_point,
  102. })
  103. return attr
  104. yaml.SafeDumper.add_representer(Transportation, lambda dumper, transportation: dumper.represent_dict(transportation.dict_repr()))
  105. class TaxiRide(Transportation):
  106. def __init__(self, driver = None, arrival_time = None, **kwargs):
  107. super(TaxiRide, self).__init__(name = u'Taxi Ride', **kwargs)
  108. assert type(driver) is unicode
  109. self.driver = driver
  110. assert type(arrival_time) is datetime.datetime
  111. self.arrival_time = arrival_time
  112. def dict_repr(self):
  113. attr = Transportation.dict_repr(self)
  114. attr.update({
  115. 'driver': self.driver,
  116. 'arrival_time': self.arrival_time.strftime('%Y-%m-%d %H:%M'),
  117. })
  118. return attr
  119. yaml.SafeDumper.add_representer(TaxiRide, lambda dumper, taxi_ride: dumper.represent_dict(taxi_ride.dict_repr()))
  120. def parse_amazon(msg):
  121. msg_text = msg.get_payload()[0].get_payload(decode = True).decode('utf-8')
  122. order_id = re.search(r'Bestellnummer #(.+)', msg_text).group(1)
  123. order_date_formatted = re.search(ur'Aufgegeben am (.+)', msg_text, re.UNICODE).group(1)
  124. locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')
  125. order_date = datetime.datetime.strptime(order_date_formatted.encode('utf-8'), '%d. %B %Y')
  126. order = Order(
  127. u'amazon.de',
  128. order_id,
  129. order_date
  130. )
  131. articles_text = msg_text.split('Bestellte(r) Artikel:')[1].split('_' * 10)[0].strip()
  132. for article_text in articles_text.split('\n\n'):
  133. article_match = re.match(
  134. ur' *(?P<name>.*)\n'
  135. + ur'( *von (?P<authors>.*)\n)?'
  136. + ur' *(?P<price_brutto_currency>[A-Z]+) (?P<price_brutto>\d+,\d+)\n'
  137. + ur'( *Zustand: (?P<state>.*)\n)?'
  138. + ur' *Verkauft von: (?P<reseller>.*)'
  139. + ur'(\n *Versand durch (?P<shipper>.*))?',
  140. article_text,
  141. re.MULTILINE | re.UNICODE
  142. )
  143. if article_match is None:
  144. sys.stderr.write(repr(article_text) + '\n')
  145. raise Exception('could not match article')
  146. article = article_match.groupdict()
  147. order.items.append(Article(
  148. name = article['name'],
  149. price_brutto = float(article['price_brutto'].replace(',', '.')),
  150. price_brutto_currency = article['price_brutto_currency'],
  151. authors = article['authors'].split(',') if article['authors'] else [],
  152. state = article['state'],
  153. reseller = article['reseller'],
  154. shipper = article['shipper'],
  155. ))
  156. return order
  157. def parse_oebb(msg):
  158. msg_text = msg.get_payload()[0].get_payload(decode = True).decode('utf8')
  159. # msg_text = re.sub(
  160. # r'<[^>]+>',
  161. # '',
  162. # HTMLParser.HTMLParser().unescape(msg.get_payload(decode = True).decode('utf8'))
  163. # )
  164. order_match = re.search(
  165. ur'Booking code:\s+(?P<order_id>[\d ]+)\s+'
  166. + ur'Customer number:\s+(?P<customer_id>PV\d+)\s+'
  167. + ur'Booking date:\s+(?P<order_date>.* \d{4})\s',
  168. msg_text,
  169. re.MULTILINE | re.UNICODE
  170. )
  171. order_match_groups = order_match.groupdict()
  172. locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
  173. order_date = datetime.datetime.strptime(
  174. order_match_groups['order_date'],
  175. '%b %d, %Y'
  176. )
  177. order = Order(
  178. u'oebb',
  179. order_match_groups['order_id'],
  180. order_date,
  181. customer_id = order_match_groups['customer_id'],
  182. )
  183. item_match = re.search(
  184. ur'(?P<price_brutto_currency>.)(?P<price_brutto>\d+\.\d+)'
  185. + ur'[\W\w]+'
  186. + ur'Your Booking\s+'
  187. + ur'(?P<departure_point>.*)\s+>\s+(?P<destination_point>.*)',
  188. msg_text,
  189. re.MULTILINE | re.UNICODE
  190. )
  191. item = item_match.groupdict()
  192. order.items.append(Transportation(
  193. name = u'Train Ticket',
  194. price_brutto = float(item['price_brutto']),
  195. price_brutto_currency = item['price_brutto_currency'],
  196. departure_point = item['departure_point'],
  197. destination_point = item['destination_point'],
  198. ))
  199. return order
  200. def parse_mytaxi(msg):
  201. pdf_compressed = msg.get_payload()[1].get_payload(decode = True)
  202. pdftk = subprocess.Popen(
  203. ['pdftk - output - uncompress'],
  204. shell = True,
  205. stdin = subprocess.PIPE,
  206. stdout = subprocess.PIPE,
  207. )
  208. pdf_uncompressed = pdftk.communicate(
  209. input = pdf_compressed,
  210. )[0].decode('latin-1')
  211. assert type(pdf_uncompressed) is unicode
  212. order_match = re.search(
  213. ur'Rechnungsnummer:[^\(]+\((?P<order_id>\w+)\)',
  214. pdf_uncompressed,
  215. re.MULTILINE | re.UNICODE
  216. )
  217. order_id = order_match.groupdict()['order_id']
  218. ride_match_groups = re.search(
  219. ur'\(Bruttobetrag\)'
  220. + ur'[^\(]+'
  221. + ur'\((?P<price_brutto>\d+,\d+) (?P<price_brutto_currency>.+)\)'
  222. + ur'[\w\W]+'
  223. + ur'\((?P<driver>[^\(]+)\)'
  224. + ur'[^\(]+'
  225. + ur'\(\d+,\d+ .\)'
  226. + ur'[^\(]+'
  227. + ur'\((?P<name>Taxifahrt)'
  228. + ur'[^\(]+'
  229. + ur'\(von: (?P<departure_point>[^\)]+)'
  230. + ur'[^\(]+'
  231. + ur'\(nach: (?P<destination_point>[^\)]+)'
  232. + ur'[\w\W]+'
  233. + ur'Belegdatum \\\(Leistungszeitpunkt\\\):[^\(]+\((?P<arrival_time>\d\d.\d\d.\d\d \d\d:\d\d)\)',
  234. pdf_uncompressed,
  235. re.MULTILINE | re.UNICODE
  236. ).groupdict()
  237. arrival_time = datetime.datetime.strptime(
  238. ride_match_groups['arrival_time'],
  239. '%d.%m.%y %H:%M'
  240. )
  241. order = Order(
  242. u'mytaxi',
  243. order_id,
  244. arrival_time,
  245. )
  246. locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
  247. order.items.append(TaxiRide(
  248. price_brutto = float(ride_match_groups['price_brutto'].replace(',', '.')),
  249. # why 0x80 ?
  250. price_brutto_currency = u'EUR'
  251. if (ride_match_groups['price_brutto_currency'] == u'\x80')
  252. else ride_match_groups['price_brutto_currency'],
  253. departure_point = ride_match_groups['departure_point'],
  254. destination_point = ride_match_groups['destination_point'],
  255. driver = ride_match_groups['driver'],
  256. arrival_time = arrival_time,
  257. ))
  258. return order
  259. def parse(msg):
  260. tracebacks = {}
  261. try:
  262. return parse_amazon(msg)
  263. except:
  264. tracebacks['amazon'] = traceback.format_exc()
  265. try:
  266. return parse_oebb(msg)
  267. except:
  268. tracebacks['oebb'] = traceback.format_exc()
  269. try:
  270. return parse_mytaxi(msg)
  271. except:
  272. tracebacks['mytaxi'] = traceback.format_exc()
  273. for parser_name in tracebacks:
  274. sys.stderr.write('%s parser: \n%s\n' % (parser_name, tracebacks[parser_name]))
  275. raise Exception('failed to parse')
  276. def compute():
  277. msg = email.message_from_string(sys.stdin.read())
  278. order = parse(msg)
  279. print(yaml.safe_dump(order, default_flow_style = False))
  280. def _init_argparser():
  281. argparser = argparse.ArgumentParser(description = None)
  282. return argparser
  283. def main(argv):
  284. argparser = _init_argparser()
  285. argcomplete.autocomplete(argparser)
  286. args = argparser.parse_args(argv)
  287. compute(**vars(args))
  288. return 0
  289. if __name__ == "__main__":
  290. sys.exit(main(sys.argv[1:]))