order-confirmation-mail-parser 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # PYTHON_ARGCOMPLETE_OK
  4. import re
  5. import os
  6. import sys
  7. import yaml
  8. import email
  9. import pprint
  10. import random
  11. import locale
  12. import argparse
  13. import datetime
  14. import traceback
  15. import subprocess
  16. import HTMLParser
  17. import argcomplete
  18. import BeautifulSoup
  19. class Order(object):
  20. def __init__(self, platform, order_id, order_date, customer_id = None):
  21. assert type(platform) is unicode
  22. self.platform = platform
  23. assert type(order_id) is unicode
  24. self.order_id = order_id
  25. assert type(order_date) is datetime.datetime
  26. self.order_date = order_date
  27. assert customer_id is None or type(customer_id) is unicode
  28. self.customer_id = customer_id
  29. self.items = []
  30. self.discounts = []
  31. def dict_repr(self):
  32. return {k: v for (k, v) in {
  33. 'articles': self.items,
  34. 'customer_id': self.customer_id,
  35. 'discounts': self.discounts,
  36. 'order_date': self.order_date.strftime('%Y-%m-%d'),
  37. 'order_id': self.order_id,
  38. 'platform': self.platform,
  39. }.items() if v is not None}
  40. yaml.SafeDumper.add_representer(Order, lambda dumper, order: dumper.represent_dict(order.dict_repr()))
  41. class Sum(object):
  42. def __init__(self, value, currency):
  43. assert type(value) is float
  44. self.value = value
  45. if currency == u'€':
  46. currency = u'EUR'
  47. assert type(currency) is unicode
  48. assert currency in [u'EUR']
  49. self.currency = currency
  50. class Discount(object):
  51. def __init__(
  52. self,
  53. name = None,
  54. amount = None,
  55. ):
  56. assert type(name) is unicode
  57. self.name = name
  58. assert type(amount) is Sum
  59. assert amount.value >= 0
  60. self.amount = amount
  61. def dict_repr(self):
  62. return {
  63. 'name': self.name,
  64. 'value': self.amount.value,
  65. 'value_currency': self.amount.currency,
  66. }
  67. yaml.SafeDumper.add_representer(Discount, lambda dumper, discount: dumper.represent_dict(discount.dict_repr()))
  68. class Item(object):
  69. def __init__(
  70. self,
  71. name = None,
  72. price_brutto = None,
  73. ):
  74. assert type(name) is unicode
  75. self.name = name
  76. assert type(price_brutto) is Sum
  77. self.price_brutto = price_brutto
  78. def dict_repr(self):
  79. return {
  80. 'name': self.name,
  81. 'price_brutto': self.price_brutto.value,
  82. 'price_brutto_currency': self.price_brutto.currency,
  83. }
  84. yaml.SafeDumper.add_representer(Item, lambda dumper, item: dumper.represent_dict(item.dict_repr()))
  85. class Article(Item):
  86. def __init__(
  87. self,
  88. quantity = None,
  89. authors = [],
  90. state = None,
  91. reseller = None,
  92. shipper = None,
  93. **kwargs
  94. ):
  95. super(Article, self).__init__(**kwargs)
  96. assert type(quantity) is int
  97. self.quantity = quantity
  98. assert type(authors) is list
  99. self.authors = authors
  100. assert state is None or type(state) is unicode
  101. self.state = state
  102. assert reseller is None or type(reseller) is unicode
  103. self.reseller = reseller
  104. assert shipper is None or type(shipper) is unicode
  105. self.shipper = shipper
  106. def dict_repr(self):
  107. attr = Item.dict_repr(self)
  108. attr.update({
  109. 'quantity': self.quantity,
  110. 'reseller': self.reseller,
  111. 'shipper': self.shipper,
  112. 'state': self.state,
  113. })
  114. if len(self.authors) > 0:
  115. attr['authors'] = self.authors
  116. return attr
  117. yaml.SafeDumper.add_representer(Article, lambda dumper, article: dumper.represent_dict(article.dict_repr()))
  118. class Transportation(Item):
  119. def __init__(self, departure_point = None, destination_point = None, **kwargs):
  120. super(Transportation, self).__init__(**kwargs)
  121. assert type(departure_point) is unicode
  122. self.departure_point = departure_point
  123. assert type(destination_point) is unicode
  124. self.destination_point = destination_point
  125. def dict_repr(self):
  126. attr = Item.dict_repr(self)
  127. attr.update({
  128. 'departure_point': self.departure_point,
  129. 'destination_point': self.destination_point,
  130. })
  131. return attr
  132. yaml.SafeDumper.add_representer(Transportation, lambda dumper, transportation: dumper.represent_dict(transportation.dict_repr()))
  133. class TaxiRide(Transportation):
  134. def __init__(self, driver = None, arrival_time = None, **kwargs):
  135. super(TaxiRide, self).__init__(name = u'Taxi Ride', **kwargs)
  136. assert type(driver) is unicode
  137. self.driver = driver
  138. assert type(arrival_time) is datetime.datetime
  139. self.arrival_time = arrival_time
  140. def dict_repr(self):
  141. attr = Transportation.dict_repr(self)
  142. attr.update({
  143. 'driver': self.driver,
  144. 'arrival_time': self.arrival_time.strftime('%Y-%m-%d %H:%M'),
  145. })
  146. return attr
  147. yaml.SafeDumper.add_representer(TaxiRide, lambda dumper, taxi_ride: dumper.represent_dict(taxi_ride.dict_repr()))
  148. def parse_amazon(msg):
  149. msg_text = msg.get_payload()[0].get_payload(decode = True).decode('utf-8')
  150. order_id = re.search(r'Bestellnummer #(.+)', msg_text).group(1)
  151. order_date_formatted = re.search(ur'Aufgegeben am (.+)', msg_text, re.UNICODE).group(1)
  152. locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')
  153. order_date = datetime.datetime.strptime(order_date_formatted.encode('utf-8'), '%d. %B %Y')
  154. order = Order(
  155. u'amazon.de',
  156. order_id,
  157. order_date
  158. )
  159. articles_text = msg_text.split('Bestellte(r) Artikel:')[1].split('_' * 10)[0].strip()
  160. for article_text in re.split(ur'\n\t*\n', articles_text):
  161. article_match = re.match(
  162. ur' *(?P<name>.*)\n'
  163. + ur'( *von (?P<authors>.*)\n)?'
  164. + ur' *(?P<price_brutto_currency>[A-Z]+) (?P<price_brutto>\d+,\d+)\n'
  165. + ur'( *Zustand: (?P<state>.*)\n)?'
  166. + ur' *Verkauft von: (?P<reseller>.*)'
  167. + ur'(\n *Versand durch (?P<shipper>.*))?',
  168. article_text,
  169. re.MULTILINE | re.UNICODE
  170. )
  171. if article_match is None:
  172. sys.stderr.write(repr(article_text) + '\n')
  173. raise Exception('could not match article')
  174. article = article_match.groupdict()
  175. order.items.append(Article(
  176. name = article['name'],
  177. price_brutto = Sum(
  178. float(article['price_brutto'].replace(',', '.')),
  179. article['price_brutto_currency']
  180. ),
  181. quantity = 1,
  182. authors = article['authors'].split(',') if article['authors'] else [],
  183. state = article['state'],
  184. reseller = article['reseller'],
  185. shipper = article['shipper'],
  186. ))
  187. return order
  188. def parse_oebb(msg):
  189. msg_text = msg.get_payload()[0].get_payload(decode = True).decode('utf8')
  190. # msg_text = re.sub(
  191. # r'<[^>]+>',
  192. # '',
  193. # HTMLParser.HTMLParser().unescape(msg.get_payload(decode = True).decode('utf8'))
  194. # )
  195. order_match = re.search(
  196. ur'Booking code:\s+(?P<order_id>[\d ]+)\s+'
  197. + ur'Customer number:\s+(?P<customer_id>PV\d+)\s+'
  198. + ur'Booking date:\s+(?P<order_date>.* \d{4})\s',
  199. msg_text,
  200. re.MULTILINE | re.UNICODE
  201. )
  202. order_match_groups = order_match.groupdict()
  203. locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
  204. order_date = datetime.datetime.strptime(
  205. order_match_groups['order_date'],
  206. '%b %d, %Y'
  207. )
  208. order = Order(
  209. u'oebb',
  210. order_match_groups['order_id'],
  211. order_date,
  212. customer_id = order_match_groups['customer_id'],
  213. )
  214. item_match = re.search(
  215. ur'(?P<price_brutto_currency>.)(?P<price_brutto>\d+\.\d+)'
  216. + ur'[\W\w]+'
  217. + ur'Your Booking\s+'
  218. + ur'(?P<departure_point>.*)\s+>\s+(?P<destination_point>.*)',
  219. msg_text,
  220. re.MULTILINE | re.UNICODE
  221. )
  222. item = item_match.groupdict()
  223. order.items.append(Transportation(
  224. name = u'Train Ticket',
  225. price_brutto = Sum(
  226. float(item['price_brutto']),
  227. item['price_brutto_currency'],
  228. ),
  229. departure_point = item['departure_point'],
  230. destination_point = item['destination_point'],
  231. ))
  232. return order
  233. def parse_mytaxi(msg):
  234. if not 'mytaxi' in msg.get_payload()[0].get_payload()[0].get_payload(decode = True):
  235. raise Exception('no mytaxi mail')
  236. pdf_compressed = msg.get_payload()[1].get_payload(decode = True)
  237. pdftk = subprocess.Popen(
  238. ['pdftk - output - uncompress'],
  239. shell = True,
  240. stdin = subprocess.PIPE,
  241. stdout = subprocess.PIPE,
  242. )
  243. pdf_uncompressed = pdftk.communicate(
  244. input = pdf_compressed,
  245. )[0].decode('latin-1')
  246. assert type(pdf_uncompressed) is unicode
  247. order_match = re.search(
  248. ur'Rechnungsnummer:[^\(]+\((?P<order_id>\w+)\)',
  249. pdf_uncompressed,
  250. re.MULTILINE | re.UNICODE
  251. )
  252. order_id = order_match.groupdict()['order_id']
  253. ride_match_groups = re.search(
  254. ur'\(Bruttobetrag\)'
  255. + ur'[^\(]+'
  256. + ur'\((?P<price_brutto>\d+,\d+) (?P<price_brutto_currency>.+)\)'
  257. + ur'[\w\W]+'
  258. + ur'\((?P<driver>[^\(]+)\)'
  259. + ur'[^\(]+'
  260. + ur'\(\d+,\d+ .\)'
  261. + ur'[^\(]+'
  262. + ur'\((?P<name>Taxifahrt)'
  263. + ur'[^\(]+'
  264. + ur'\(von: (?P<departure_point>[^\)]+)'
  265. + ur'[^\(]+'
  266. + ur'\(nach: (?P<destination_point>[^\)]+)'
  267. + ur'[\w\W]+'
  268. + ur'Belegdatum \\\(Leistungszeitpunkt\\\):[^\(]+\((?P<arrival_time>\d\d.\d\d.\d\d \d\d:\d\d)\)',
  269. pdf_uncompressed,
  270. re.MULTILINE | re.UNICODE
  271. ).groupdict()
  272. arrival_time = datetime.datetime.strptime(
  273. ride_match_groups['arrival_time'],
  274. '%d.%m.%y %H:%M'
  275. )
  276. order = Order(
  277. u'mytaxi',
  278. order_id,
  279. arrival_time,
  280. )
  281. locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
  282. order.items.append(TaxiRide(
  283. price_brutto = Sum(
  284. float(ride_match_groups['price_brutto'].replace(',', '.')),
  285. # why 0x80 ?
  286. u'EUR' if (ride_match_groups['price_brutto_currency'] == u'\x80')
  287. else ride_match_groups['price_brutto_currency'],
  288. ),
  289. departure_point = ride_match_groups['departure_point'],
  290. destination_point = ride_match_groups['destination_point'],
  291. driver = ride_match_groups['driver'],
  292. arrival_time = arrival_time,
  293. ))
  294. return order
  295. def parse_yipbee(msg):
  296. html = msg.get_payload()[0].get_payload()[1].get_payload(decode = True)
  297. if not 'yipbee' in html:
  298. raise Exception('no yipbee confirmation')
  299. doc = BeautifulSoup.BeautifulSoup(html, convertEntities = BeautifulSoup.BeautifulSoup.HTML_ENTITIES)
  300. content_table = doc.find('table')
  301. order_match_groups = re.search(
  302. ur'Bestellung:(?P<order_id>\w+) vom (?P<order_time>\d\d.\d\d.\d{4} \d\d:\d\d:\d\d)',
  303. content_table.find('table').findAll('tr')[3].text,
  304. re.UNICODE
  305. ).groupdict()
  306. order = Order(
  307. u'yipbee',
  308. order_match_groups['order_id'],
  309. datetime.datetime.strptime(order_match_groups['order_time'], '%d.%m.%Y %H:%M:%S'),
  310. )
  311. articles_table = content_table.find('table').find('tbody').findAll('tr', recursive = False)[4].find('table')
  312. for article_row in articles_table.find('tbody').findAll('tr', recursive = False)[1:]:
  313. article_columns = article_row.findAll('td', recursive = False)
  314. (price, currency) = re.sub(ur'\s+', ' ', article_columns[2].text.replace(u',', u'.')).split(' ')
  315. order.items.append(Article(
  316. name = article_columns[1].text,
  317. price_brutto = Sum(float(price), currency),
  318. quantity = int(article_columns[3].text),
  319. reseller = u'yipbee',
  320. shipper = u'yipbee',
  321. ))
  322. discount_row = content_table.find('table').find('tbody').findAll('tr', recursive = False)[6]
  323. (discount_name, discount_value_with_currency) = [c.text for c in discount_row.findAll('td', recursive = False)]
  324. (discount_value, discount_currency) = discount_value_with_currency.split(' ')
  325. order.discounts.append(Discount(
  326. name = discount_name,
  327. amount = Sum(float(discount_value.replace(',', '.')) * -1, discount_currency)
  328. ))
  329. shipping_costs_table = content_table.find('tbody').findAll('tr', recursive = False)[3].findAll('table')[1]
  330. (shipping_price, shipping_currency) = shipping_costs_table.text.replace(',', '.').split(' ')
  331. order.items.append(Item(
  332. name = u'Delivery',
  333. price_brutto = Sum(float(shipping_price), shipping_currency),
  334. ))
  335. return order
  336. def parse(msg):
  337. tracebacks = {}
  338. try:
  339. return parse_amazon(msg)
  340. except:
  341. tracebacks['amazon'] = traceback.format_exc()
  342. try:
  343. return parse_oebb(msg)
  344. except:
  345. tracebacks['oebb'] = traceback.format_exc()
  346. try:
  347. return parse_mytaxi(msg)
  348. except:
  349. tracebacks['mytaxi'] = traceback.format_exc()
  350. try:
  351. return parse_yipbee(msg)
  352. except:
  353. tracebacks['yipbee'] = traceback.format_exc()
  354. for parser_name in tracebacks:
  355. sys.stderr.write('%s parser: \n%s\n' % (parser_name, tracebacks[parser_name]))
  356. raise Exception('failed to parse')
  357. def compute(register_path):
  358. msg = email.message_from_string(sys.stdin.read())
  359. order = parse(msg)
  360. if register_path:
  361. with open(register_path, 'r') as register:
  362. orders = yaml.load(register.read().decode('utf-8'))
  363. if not orders:
  364. orders = {}
  365. if order.platform not in orders:
  366. orders[order.platform] = {}
  367. if order.order_id in orders[order.platform]:
  368. raise Exception('already registered')
  369. orders[order.platform][order.order_id] = order
  370. with open(register_path, 'w') as register:
  371. register.write(yaml.safe_dump(orders, default_flow_style = False))
  372. else:
  373. print(yaml.safe_dump(order, default_flow_style = False))
  374. def _init_argparser():
  375. argparser = argparse.ArgumentParser(description = None)
  376. argparser.add_argument('--register', metavar = 'path', dest = 'register_path')
  377. return argparser
  378. def main(argv):
  379. argparser = _init_argparser()
  380. argcomplete.autocomplete(argparser)
  381. args = argparser.parse_args(argv)
  382. compute(**vars(args))
  383. return 0
  384. if __name__ == "__main__":
  385. sys.exit(main(sys.argv[1:]))