order-confirmation-mail-parser 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # PYTHON_ARGCOMPLETE_OK
  4. import re
  5. import os
  6. import sys
  7. import yaml
  8. import email
  9. import pprint
  10. import random
  11. import locale
  12. import argparse
  13. import datetime
  14. import traceback
  15. import subprocess
  16. import HTMLParser
  17. import argcomplete
  18. import BeautifulSoup
  19. class Order(object):
  20. def __init__(self, platform, order_id, order_date, customer_id = None):
  21. assert type(platform) is unicode
  22. self.platform = platform
  23. assert type(order_id) is unicode
  24. self.order_id = order_id
  25. assert type(order_date) is datetime.datetime
  26. self.order_date = order_date
  27. assert customer_id is None or type(customer_id) is unicode
  28. self.customer_id = customer_id
  29. self.items = []
  30. self.discounts = []
  31. def dict_repr(self):
  32. return {k: v for (k, v) in {
  33. 'articles': self.items,
  34. 'customer_id': self.customer_id,
  35. 'discounts': self.discounts,
  36. 'order_date': self.order_date.strftime('%Y-%m-%d'),
  37. 'order_id': self.order_id,
  38. 'platform': self.platform,
  39. }.items() if v is not None}
  40. yaml.SafeDumper.add_representer(Order, lambda dumper, order: dumper.represent_dict(order.dict_repr()))
  41. class Sum(object):
  42. def __init__(self, value, currency):
  43. assert type(value) is float
  44. self.value = value
  45. if currency == u'€':
  46. currency = u'EUR'
  47. assert type(currency) is unicode
  48. assert currency in [u'EUR']
  49. self.currency = currency
  50. class Discount(object):
  51. def __init__(
  52. self,
  53. name = None,
  54. amount = None,
  55. ):
  56. assert type(name) is unicode
  57. self.name = name
  58. assert type(amount) is Sum
  59. assert amount.value >= 0
  60. self.amount = amount
  61. def dict_repr(self):
  62. return {
  63. 'name': self.name,
  64. 'value': self.amount.value,
  65. 'value_currency': self.amount.currency,
  66. }
  67. yaml.SafeDumper.add_representer(Discount, lambda dumper, discount: dumper.represent_dict(discount.dict_repr()))
  68. class Item(object):
  69. def __init__(
  70. self,
  71. name = None,
  72. price_brutto = None,
  73. ):
  74. assert type(name) is unicode
  75. self.name = name
  76. assert type(price_brutto) is Sum
  77. self.price_brutto = price_brutto
  78. def dict_repr(self):
  79. return {
  80. 'name': self.name,
  81. 'price_brutto': self.price_brutto.value,
  82. 'price_brutto_currency': self.price_brutto.currency,
  83. }
  84. yaml.SafeDumper.add_representer(Item, lambda dumper, item: dumper.represent_dict(item.dict_repr()))
  85. class Article(Item):
  86. def __init__(
  87. self,
  88. quantity = None,
  89. authors = [],
  90. state = None,
  91. reseller = None,
  92. shipper = None,
  93. **kwargs
  94. ):
  95. super(Article, self).__init__(**kwargs)
  96. assert type(quantity) is int
  97. self.quantity = quantity
  98. assert type(authors) is list
  99. self.authors = authors
  100. assert state is None or type(state) is unicode
  101. self.state = state
  102. assert reseller is None or type(reseller) is unicode
  103. self.reseller = reseller
  104. assert shipper is None or type(shipper) is unicode
  105. self.shipper = shipper
  106. self.delivery_date = None
  107. def dict_repr(self):
  108. attr = Item.dict_repr(self)
  109. attr.update({
  110. 'delivery_date': self.delivery_date,
  111. 'quantity': self.quantity,
  112. 'reseller': self.reseller,
  113. 'shipper': self.shipper,
  114. 'state': self.state,
  115. })
  116. if len(self.authors) > 0:
  117. attr['authors'] = self.authors
  118. return attr
  119. yaml.SafeDumper.add_representer(Article, lambda dumper, article: dumper.represent_dict(article.dict_repr()))
  120. class Transportation(Item):
  121. def __init__(self, departure_point = None, destination_point = None, **kwargs):
  122. super(Transportation, self).__init__(**kwargs)
  123. assert type(departure_point) is unicode
  124. self.departure_point = departure_point
  125. assert type(destination_point) is unicode
  126. self.destination_point = destination_point
  127. def dict_repr(self):
  128. attr = Item.dict_repr(self)
  129. attr.update({
  130. 'departure_point': self.departure_point,
  131. 'destination_point': self.destination_point,
  132. })
  133. return attr
  134. yaml.SafeDumper.add_representer(Transportation, lambda dumper, transportation: dumper.represent_dict(transportation.dict_repr()))
  135. class TaxiRide(Transportation):
  136. def __init__(self, driver = None, arrival_time = None, **kwargs):
  137. super(TaxiRide, self).__init__(name = u'Taxi Ride', **kwargs)
  138. assert type(driver) is unicode
  139. self.driver = driver
  140. assert type(arrival_time) is datetime.datetime
  141. self.arrival_time = arrival_time
  142. def dict_repr(self):
  143. attr = Transportation.dict_repr(self)
  144. attr.update({
  145. 'driver': self.driver,
  146. 'arrival_time': self.arrival_time.strftime('%Y-%m-%d %H:%M'),
  147. })
  148. return attr
  149. yaml.SafeDumper.add_representer(TaxiRide, lambda dumper, taxi_ride: dumper.represent_dict(taxi_ride.dict_repr()))
  150. def parse_amazon(msg):
  151. msg_text = msg.get_payload()[0].get_payload(decode = True).decode('utf-8')
  152. order_id = re.search(r'Bestellnummer #(.+)', msg_text).group(1)
  153. order_date_formatted = re.search(ur'Aufgegeben am (.+)', msg_text, re.UNICODE).group(1)
  154. locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')
  155. order_date = datetime.datetime.strptime(order_date_formatted.encode('utf-8'), '%d. %B %Y')
  156. order = Order(
  157. u'amazon.de',
  158. order_id,
  159. order_date
  160. )
  161. articles_text = msg_text.split('Bestellte(r) Artikel:')[1].split('_' * 10)[0].strip()
  162. for article_text in re.split(ur'\n\t*\n', articles_text):
  163. article_match = re.match(
  164. ur' *((?P<quantity>\d+) x )?(?P<name>.*)\n'
  165. + ur'( *von (?P<authors>.*)\n)?'
  166. + ur' *(?P<price_brutto_currency>[A-Z]+) (?P<price_brutto>\d+,\d+)\n'
  167. + ur'( *Zustand: (?P<state>.*)\n)?'
  168. + ur' *Verkauft von: (?P<reseller>.*)'
  169. + ur'(\n *Versand durch (?P<shipper>.*))?',
  170. article_text,
  171. re.MULTILINE | re.UNICODE
  172. )
  173. if article_match is None:
  174. sys.stderr.write(repr(article_text) + '\n')
  175. raise Exception('could not match article')
  176. article = article_match.groupdict()
  177. order.items.append(Article(
  178. name = article['name'],
  179. price_brutto = Sum(
  180. float(article['price_brutto'].replace(',', '.')),
  181. article['price_brutto_currency']
  182. ),
  183. quantity = int(article['quantity']) if article['quantity'] else 1,
  184. authors = article['authors'].split(',') if article['authors'] else [],
  185. state = article['state'],
  186. reseller = article['reseller'],
  187. shipper = article['shipper'],
  188. ))
  189. return order
  190. def parse_oebb(msg):
  191. msg_text = msg.get_payload()[0].get_payload(decode = True).decode('utf8')
  192. # msg_text = re.sub(
  193. # r'<[^>]+>',
  194. # '',
  195. # HTMLParser.HTMLParser().unescape(msg.get_payload(decode = True).decode('utf8'))
  196. # )
  197. order_match = re.search(
  198. ur'Booking code:\s+(?P<order_id>[\d ]+)\s+'
  199. + ur'Customer number:\s+(?P<customer_id>PV\d+)\s+'
  200. + ur'Booking date:\s+(?P<order_date>.* \d{4})\s',
  201. msg_text,
  202. re.MULTILINE | re.UNICODE
  203. )
  204. order_match_groups = order_match.groupdict()
  205. locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
  206. order_date = datetime.datetime.strptime(
  207. order_match_groups['order_date'],
  208. '%b %d, %Y'
  209. )
  210. order = Order(
  211. u'oebb',
  212. order_match_groups['order_id'],
  213. order_date,
  214. customer_id = order_match_groups['customer_id'],
  215. )
  216. item_match = re.search(
  217. ur'(?P<price_brutto_currency>.)(?P<price_brutto>\d+\.\d+)'
  218. + ur'[\W\w]+'
  219. + ur'Your Booking\s+'
  220. + ur'(?P<departure_point>.*)\s+>\s+(?P<destination_point>.*)',
  221. msg_text,
  222. re.MULTILINE | re.UNICODE
  223. )
  224. item = item_match.groupdict()
  225. order.items.append(Transportation(
  226. name = u'Train Ticket',
  227. price_brutto = Sum(
  228. float(item['price_brutto']),
  229. item['price_brutto_currency'],
  230. ),
  231. departure_point = item['departure_point'],
  232. destination_point = item['destination_point'],
  233. ))
  234. return order
  235. def parse_mytaxi(msg):
  236. if not 'mytaxi' in msg.get_payload()[0].get_payload()[0].get_payload(decode = True):
  237. raise Exception('no mytaxi mail')
  238. pdf_compressed = msg.get_payload()[1].get_payload(decode = True)
  239. pdftk = subprocess.Popen(
  240. ['pdftk - output - uncompress'],
  241. shell = True,
  242. stdin = subprocess.PIPE,
  243. stdout = subprocess.PIPE,
  244. )
  245. pdf_uncompressed = pdftk.communicate(
  246. input = pdf_compressed,
  247. )[0].decode('latin-1')
  248. assert type(pdf_uncompressed) is unicode
  249. order_match = re.search(
  250. ur'Rechnungsnummer:[^\(]+\((?P<order_id>\w+)\)',
  251. pdf_uncompressed,
  252. re.MULTILINE | re.UNICODE
  253. )
  254. order_id = order_match.groupdict()['order_id']
  255. ride_match_groups = re.search(
  256. ur'\(Bruttobetrag\)'
  257. + ur'[^\(]+'
  258. + ur'\((?P<price_brutto>\d+,\d+) (?P<price_brutto_currency>.+)\)'
  259. + ur'[\w\W]+'
  260. + ur'\((?P<driver>[^\(]+)\)'
  261. + ur'[^\(]+'
  262. + ur'\(\d+,\d+ .\)'
  263. + ur'[^\(]+'
  264. + ur'\((?P<name>Taxifahrt)'
  265. + ur'[^\(]+'
  266. + ur'\(von: (?P<departure_point>[^\)]+)'
  267. + ur'[^\(]+'
  268. + ur'\(nach: (?P<destination_point>[^\)]+)'
  269. + ur'[\w\W]+'
  270. + ur'Belegdatum \\\(Leistungszeitpunkt\\\):[^\(]+\((?P<arrival_time>\d\d.\d\d.\d\d \d\d:\d\d)\)',
  271. pdf_uncompressed,
  272. re.MULTILINE | re.UNICODE
  273. ).groupdict()
  274. arrival_time = datetime.datetime.strptime(
  275. ride_match_groups['arrival_time'],
  276. '%d.%m.%y %H:%M'
  277. )
  278. order = Order(
  279. u'mytaxi',
  280. order_id,
  281. arrival_time,
  282. )
  283. locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
  284. order.items.append(TaxiRide(
  285. price_brutto = Sum(
  286. float(ride_match_groups['price_brutto'].replace(',', '.')),
  287. # why 0x80 ?
  288. u'EUR' if (ride_match_groups['price_brutto_currency'] == u'\x80')
  289. else ride_match_groups['price_brutto_currency'],
  290. ),
  291. departure_point = ride_match_groups['departure_point'],
  292. destination_point = ride_match_groups['destination_point'],
  293. driver = ride_match_groups['driver'],
  294. arrival_time = arrival_time,
  295. ))
  296. return order
  297. def parse_yipbee(msg):
  298. html = msg.get_payload()[0].get_payload()[1].get_payload(decode = True)
  299. if not 'yipbee' in html:
  300. raise Exception('no yipbee confirmation')
  301. doc = BeautifulSoup.BeautifulSoup(html, convertEntities = BeautifulSoup.BeautifulSoup.HTML_ENTITIES)
  302. content_table = doc.find('table')
  303. order_match_groups = re.search(
  304. ur'Bestellung:(?P<order_id>\w+) vom (?P<order_time>\d\d.\d\d.\d{4} \d\d:\d\d:\d\d)',
  305. content_table.find('table').findAll('tr')[3].text,
  306. re.UNICODE
  307. ).groupdict()
  308. order = Order(
  309. u'yipbee',
  310. order_match_groups['order_id'],
  311. datetime.datetime.strptime(order_match_groups['order_time'], '%d.%m.%Y %H:%M:%S'),
  312. )
  313. articles_table = content_table.find('table').find('tbody').findAll('tr', recursive = False)[4].find('table')
  314. for article_row in articles_table.find('tbody').findAll('tr', recursive = False)[1:]:
  315. article_columns = article_row.findAll('td', recursive = False)
  316. (price, currency) = re.sub(ur'\s+', ' ', article_columns[2].text.replace(u',', u'.')).split(' ')
  317. order.items.append(Article(
  318. name = article_columns[1].text,
  319. price_brutto = Sum(float(price), currency),
  320. quantity = int(article_columns[3].text),
  321. reseller = u'yipbee',
  322. shipper = u'yipbee',
  323. ))
  324. discount_row = content_table.find('table').find('tbody').findAll('tr', recursive = False)[6]
  325. (discount_name, discount_value_with_currency) = [c.text for c in discount_row.findAll('td', recursive = False)]
  326. (discount_value, discount_currency) = discount_value_with_currency.split(' ')
  327. order.discounts.append(Discount(
  328. name = discount_name,
  329. amount = Sum(float(discount_value.replace(',', '.')) * -1, discount_currency)
  330. ))
  331. shipping_costs_table = content_table.find('tbody').findAll('tr', recursive = False)[3].findAll('table')[1]
  332. (shipping_price, shipping_currency) = shipping_costs_table.text.replace(',', '.').split(' ')
  333. order.items.append(Item(
  334. name = u'Delivery',
  335. price_brutto = Sum(float(shipping_price), shipping_currency),
  336. ))
  337. return order
  338. def parse(msg):
  339. tracebacks = {}
  340. try:
  341. return parse_amazon(msg)
  342. except:
  343. tracebacks['amazon'] = traceback.format_exc()
  344. try:
  345. return parse_oebb(msg)
  346. except:
  347. tracebacks['oebb'] = traceback.format_exc()
  348. try:
  349. return parse_mytaxi(msg)
  350. except:
  351. tracebacks['mytaxi'] = traceback.format_exc()
  352. try:
  353. return parse_yipbee(msg)
  354. except:
  355. tracebacks['yipbee'] = traceback.format_exc()
  356. for parser_name in tracebacks:
  357. sys.stderr.write('%s parser: \n%s\n' % (parser_name, tracebacks[parser_name]))
  358. raise Exception('failed to parse')
  359. def compute(register_path):
  360. msg = email.message_from_string(sys.stdin.read())
  361. order = parse(msg)
  362. if register_path:
  363. with open(register_path, 'r') as register:
  364. orders = yaml.load(register.read().decode('utf-8'))
  365. if not orders:
  366. orders = {}
  367. if order.platform not in orders:
  368. orders[order.platform] = {}
  369. if order.order_id in orders[order.platform]:
  370. raise Exception('already registered')
  371. orders[order.platform][order.order_id] = order
  372. with open(register_path, 'w') as register:
  373. register.write(yaml.safe_dump(orders, default_flow_style = False))
  374. else:
  375. print(yaml.safe_dump(order, default_flow_style = False))
  376. def _init_argparser():
  377. argparser = argparse.ArgumentParser(description = None)
  378. argparser.add_argument('--register', metavar = 'path', dest = 'register_path')
  379. return argparser
  380. def main(argv):
  381. argparser = _init_argparser()
  382. argcomplete.autocomplete(argparser)
  383. args = argparser.parse_args(argv)
  384. compute(**vars(args))
  385. return 0
  386. if __name__ == "__main__":
  387. sys.exit(main(sys.argv[1:]))