#!/usr/bin/env python
# -*- coding: utf-8 -*-
# PYTHON_ARGCOMPLETE_OK
# order-confirmation-mail-parser

import argparse
import datetime
import email
import email.utils
import HTMLParser
import locale
import os
import pprint
import random
import re
import shutil
import subprocess
import sys
import time
import traceback

import argcomplete
import BeautifulSoup
import yaml

import dingguo
import dingguo.parser.banggood
  22. def parse_amazon(msg):
  23. msg_text = msg.get_payload()[0].get_payload(decode = True).decode('utf-8')
  24. if not u'Amazon.de Bestellbestätigung' in msg_text:
  25. raise Exception('no amazon order confirmation')
  26. orders = []
  27. for order_text in re.split(ur'={32,}', msg_text)[1:-1]:
  28. order_id = re.search(r'Bestellnummer #(.+)', order_text).group(1)
  29. order_date_formatted = re.search(ur'Aufgegeben am (.+)', order_text, re.UNICODE).group(1)
  30. locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')
  31. order_date = datetime.datetime.strptime(order_date_formatted.encode('utf-8'), '%d. %B %Y')
  32. order = dingguo.Order(
  33. u'amazon.de',
  34. order_id,
  35. order_date
  36. )
  37. articles_text = order_text.split('Bestellte(r) Artikel:')[1].split('_' * 10)[0].strip()
  38. for article_text in re.split(ur'\n\t*\n', articles_text):
  39. article_match = re.match(
  40. ur' *((?P<quantity>\d+) x )?(?P<name>.*)\n'
  41. + ur'( *von (?P<authors>.*)\n)?'
  42. + ur' *(?P<price_brutto_currency>[A-Z]+) (?P<price_brutto>\d+,\d+)\n'
  43. + ur'( *Zustand: (?P<state>.*)\n)?'
  44. + ur' *Verkauft von: (?P<reseller>.*)'
  45. + ur'(\n *Versand durch (?P<shipper>.*))?',
  46. article_text,
  47. re.MULTILINE | re.UNICODE
  48. )
  49. if article_match is None:
  50. sys.stderr.write(repr(article_text) + '\n')
  51. raise Exception('could not match article')
  52. article = article_match.groupdict()
  53. order.items.append(dingguo.Article(
  54. name = article['name'],
  55. price_brutto = dingguo.Sum(
  56. float(article['price_brutto'].replace(',', '.')),
  57. article['price_brutto_currency']
  58. ),
  59. quantity = int(article['quantity']) if article['quantity'] else 1,
  60. authors = article['authors'].split(',') if article['authors'] else [],
  61. state = article['state'],
  62. reseller = article['reseller'],
  63. shipper = article['shipper'],
  64. ))
  65. orders.append(order)
  66. return orders
  67. def parse_oebb(msg):
  68. msg = msg.get_payload()[0]
  69. if type(msg.get_payload()) is list:
  70. msg = msg.get_payload()[0]
  71. msg_text = msg.get_payload(decode = True).decode('utf8')
  72. # msg_text = re.sub(
  73. # r'<[^>]+>',
  74. # '',
  75. # HTMLParser.HTMLParser().unescape(msg.get_payload(decode = True).decode('utf8'))
  76. # )
  77. order_match = re.search(
  78. ur'Booking code:\s+(?P<order_id>[\d ]+)\s+'
  79. + ur'Customer number:\s+(?P<customer_id>PV\d+)\s+'
  80. + ur'Booking date:\s+(?P<order_date>.* \d{4})\s',
  81. msg_text,
  82. re.MULTILINE | re.UNICODE
  83. )
  84. order_match_groups = order_match.groupdict()
  85. locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
  86. order_date = datetime.datetime.strptime(
  87. order_match_groups['order_date'],
  88. '%b %d, %Y'
  89. )
  90. order = dingguo.Order(
  91. u'oebb',
  92. order_match_groups['order_id'],
  93. order_date,
  94. customer_id = order_match_groups['customer_id'],
  95. )
  96. item_match = re.search(
  97. ur'(?P<price_brutto_currency>.)(?P<price_brutto>\d+\.\d+)'
  98. + ur'[\W\w]+'
  99. + ur'Your Booking\s+'
  100. + ur'(?P<departure_point>.*)\s+>\s+(?P<destination_point>.*)',
  101. msg_text,
  102. re.MULTILINE | re.UNICODE
  103. )
  104. item = item_match.groupdict()
  105. order.items.append(dingguo.Transportation(
  106. name = u'Train Ticket',
  107. price_brutto = dingguo.Sum(
  108. float(item['price_brutto']),
  109. item['price_brutto_currency'],
  110. ),
  111. departure_point = item['departure_point'],
  112. destination_point = item['destination_point'],
  113. ))
  114. return [order]
  115. def parse_mytaxi(msg):
  116. if not 'mytaxi' in msg.get_payload()[0].get_payload()[0].get_payload(decode = True):
  117. raise Exception('no mytaxi mail')
  118. pdf_compressed = msg.get_payload()[1].get_payload(decode = True)
  119. pdftk = subprocess.Popen(
  120. ['pdftk - output - uncompress'],
  121. shell = True,
  122. stdin = subprocess.PIPE,
  123. stdout = subprocess.PIPE,
  124. )
  125. pdf_uncompressed = pdftk.communicate(
  126. input = pdf_compressed,
  127. )[0].decode('latin-1')
  128. assert type(pdf_uncompressed) is unicode
  129. order_match = re.search(
  130. ur'Rechnungsnummer:[^\(]+\((?P<order_id>\w+)\)',
  131. pdf_uncompressed,
  132. re.MULTILINE | re.UNICODE
  133. )
  134. order_id = order_match.groupdict()['order_id']
  135. ride_match_groups = re.search(
  136. ur'\(Bruttobetrag\)'
  137. + ur'[^\(]+'
  138. + ur'\((?P<price_brutto>\d+,\d+) (?P<price_brutto_currency>.+)\)'
  139. + ur'[\w\W]+'
  140. + ur'\((?P<driver>[^\(]+)\)'
  141. + ur'[^\(]+'
  142. + ur'\(\d+,\d+ .\)'
  143. + ur'[^\(]+'
  144. + ur'\((?P<name>Taxifahrt)'
  145. + ur'[^\(]+'
  146. + ur'\(von: (?P<departure_point>[^\)]+)'
  147. + ur'[^\(]+'
  148. + ur'\(nach: (?P<destination_point>[^\)]+)'
  149. + ur'[\w\W]+'
  150. + ur'Belegdatum \\\(Leistungszeitpunkt\\\):[^\(]+\((?P<arrival_time>\d\d.\d\d.\d\d \d\d:\d\d)\)',
  151. pdf_uncompressed,
  152. re.MULTILINE | re.UNICODE
  153. ).groupdict()
  154. arrival_time = datetime.datetime.strptime(
  155. ride_match_groups['arrival_time'],
  156. '%d.%m.%y %H:%M'
  157. )
  158. order = dingguo.Order(
  159. u'mytaxi',
  160. order_id,
  161. arrival_time,
  162. )
  163. locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
  164. order.items.append(dingguo.TaxiRide(
  165. price_brutto = dingguo.Sum(
  166. float(ride_match_groups['price_brutto'].replace(',', '.')),
  167. # why 0x80 ?
  168. u'EUR' if (ride_match_groups['price_brutto_currency'] == u'\x80')
  169. else ride_match_groups['price_brutto_currency'],
  170. ),
  171. departure_point = ride_match_groups['departure_point'],
  172. destination_point = ride_match_groups['destination_point'],
  173. driver = ride_match_groups['driver'],
  174. arrival_time = arrival_time,
  175. ))
  176. return [order]
  177. def parse_uber(msg):
  178. html = msg.get_payload()[0].get_payload(decode = True)
  179. """ document in html2 has the same structure as the one in html.
  180. only difference is that hyperlink urls in html2 have been
  181. replaced by 'email.uber.com/wf/click?upn=.*' urls.
  182. """
  183. html2 = msg.get_payload()[1].get_payload()[0].get_payload(decode = True)
  184. route_map = msg.get_payload()[1].get_payload()[1].get_payload(decode = True)
  185. doc = BeautifulSoup.BeautifulSoup(
  186. html,
  187. convertEntities = BeautifulSoup.BeautifulSoup.HTML_ENTITIES,
  188. )
  189. # strptime
  190. locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
  191. trip_id = re.search(
  192. ur'[\da-f\-]{36}',
  193. doc.find(text = 'Visit the trip page').parent['href'],
  194. ).group(0)
  195. order = dingguo.Order(
  196. u'uber',
  197. trip_id,
  198. datetime.datetime.strptime(
  199. doc.find(attrs = {'class': 'date'}).text,
  200. '%B %d, %Y',
  201. ),
  202. )
  203. departure_time_tag = doc.find(attrs = {'class': 'from time'})
  204. departure_time = datetime.datetime.strptime(
  205. departure_time_tag.text,
  206. '%I:%M%p',
  207. ).time()
  208. arrival_time_tag = doc.find(attrs = {'class': 'to time'})
  209. arrival_time = datetime.datetime.strptime(
  210. arrival_time_tag.text,
  211. '%I:%M%p',
  212. ).time()
  213. distance = dingguo.Distance(
  214. float(doc.find(text = 'kilometers').parent.parent.find(attrs = {'class': 'data'}).text),
  215. u'km',
  216. )
  217. fare = doc.find(attrs = {'class': 'header-price'}).find(attrs = {'class': 'header-fare text-pad'}).text
  218. order.items.append(dingguo.TaxiRide(
  219. name = doc.find(text = 'CAR').parent.parent.find(attrs = {'class': 'data'}).text + ' Ride',
  220. price_brutto = dingguo.Sum(float(fare[1:]), fare[0]),
  221. arrival_time = datetime.datetime.combine(order.order_date, arrival_time),
  222. departure_time = datetime.datetime.combine(order.order_date, departure_time),
  223. departure_point = departure_time_tag.parent.find(attrs = {'class': 'address'}).text,
  224. destination_point = arrival_time_tag.parent.find(attrs = {'class': 'address'}).text,
  225. distance = distance,
  226. driver = doc.find(attrs = {'class': 'driver-info'}).text[len('You rode with '):],
  227. route_map = route_map,
  228. ))
  229. return [order]
  230. def parse_yipbee(msg):
  231. text = msg.get_payload()[0].get_payload()[0].get_payload(decode = True).decode('utf-8')
  232. if not u'Vielen Dank für deine Bestellung bei yipbee' in text:
  233. raise Exception('no yipbee confirmation')
  234. order_match_groups = re.search(
  235. ur'[\W\w]+'
  236. + ur'BESTELLUNG: (?P<order_id>\w+) vom (?P<order_time>\d\d.\d\d.\d{4} \d\d:\d\d:\d\d)'
  237. + ur'[\W\w]+'
  238. + ur'GESAMTPREIS\s+'
  239. + ur'(?P<articles_and_discount_text>[\W\w]+)'
  240. + ur'(?P<summary_text>ARTIKEL [\W\w]+)',
  241. text,
  242. re.UNICODE
  243. ).groupdict()
  244. order = dingguo.Order(
  245. u'yipbee',
  246. order_match_groups['order_id'],
  247. datetime.datetime.strptime(order_match_groups['order_time'], '%d.%m.%Y %H:%M:%S'),
  248. )
  249. for article_match in re.finditer(
  250. ur'(?P<name>[\w\-\.\:,%\(\) ]+ (Klasse \d|[\w\-\. ]+[^\d ]))'
  251. + ur'(?P<total_price>\d+,\d\d) €(?P<quantity>\d)(?P<total_price_2>\d+,\d\d) €',
  252. order_match_groups['articles_and_discount_text'].replace('\n', ' '),
  253. re.UNICODE,
  254. ):
  255. article_match_groups = article_match.groupdict()
  256. total_price = float(article_match_groups['total_price'].replace(',', '.'))
  257. total_price_2 = float(article_match_groups['total_price_2'].replace(',', '.'))
  258. assert abs(total_price - total_price_2) < 0.01, 'expected %f, received %f' % (total_price, total_price_2)
  259. quantity = int(article_match_groups['quantity'])
  260. order.items.append(dingguo.Article(
  261. name = article_match_groups['name'],
  262. price_brutto = dingguo.Sum(round(total_price / quantity, 2), u'EUR'),
  263. quantity = quantity,
  264. reseller = u'yipbee',
  265. shipper = u'yipbee',
  266. ))
  267. articles_price = float(text.split('RABATTE')[0].split('ARTIKEL')[-1].strip().split(' ')[0].replace(',', '.'))
  268. assert abs(articles_price - sum([a.price_brutto.value * a.quantity for a in order.items])) < 0.01
  269. discount_tag = BeautifulSoup.BeautifulSoup(
  270. order_match_groups['articles_and_discount_text'],
  271. convertEntities = BeautifulSoup.BeautifulSoup.HTML_ENTITIES,
  272. ).find('tr')
  273. if discount_tag:
  274. name_tag, value_tag = discount_tag.findAll('td', recursive = False)
  275. value, currency = value_tag.text.split(' ')
  276. order.discounts.append(dingguo.Discount(
  277. name = name_tag.text,
  278. amount = dingguo.Sum(float(value.replace(',', '.')) * -1, currency),
  279. ))
  280. delivery_price = order_match_groups['summary_text'].split('VERSAND')[1].split('STEUERN')[0].strip()
  281. delivery_price_value, delivery_price_currency = delivery_price.split(' ')
  282. order.items.append(dingguo.Item(
  283. name = u'Delivery',
  284. price_brutto = dingguo.Sum(float(delivery_price_value.replace(',', '.')), delivery_price_currency),
  285. ))
  286. return [order]
  287. def parse_yipbee_html(msg):
  288. html = msg.get_payload()[0].get_payload()[1].get_payload(decode = True)
  289. if not 'yipbee' in html:
  290. raise Exception('no yipbee confirmation')
  291. doc = BeautifulSoup.BeautifulSoup(html, convertEntities = BeautifulSoup.BeautifulSoup.HTML_ENTITIES)
  292. content_table = doc.find('table')
  293. order_match_groups = re.search(
  294. ur'Bestellung:(?P<order_id>\w+) vom (?P<order_time>\d\d.\d\d.\d{4} \d\d:\d\d:\d\d)',
  295. content_table.find('table').findAll('tr')[3].text,
  296. re.UNICODE
  297. ).groupdict()
  298. order = dingguo.Order(
  299. u'yipbee',
  300. order_match_groups['order_id'],
  301. datetime.datetime.strptime(order_match_groups['order_time'], '%d.%m.%Y %H:%M:%S'),
  302. )
  303. articles_table = content_table.find('table').find('tbody').findAll('tr', recursive = False)[4].find('table')
  304. for article_row in articles_table.find('tbody').findAll('tr', recursive = False)[1:]:
  305. article_columns = article_row.findAll('td', recursive = False)
  306. (price, currency) = re.sub(ur'\s+', ' ', article_columns[2].text.replace(u',', u'.')).split(' ')
  307. order.items.append(dingguo.Article(
  308. name = article_columns[1].text,
  309. price_brutto = dingguo.Sum(float(price), currency),
  310. quantity = int(article_columns[3].text),
  311. reseller = u'yipbee',
  312. shipper = u'yipbee',
  313. ))
  314. discount_row = content_table.find('table').find('tbody').findAll('tr', recursive = False)[6]
  315. (discount_name, discount_value_with_currency) = [c.text for c in discount_row.findAll('td', recursive = False)]
  316. (discount_value, discount_currency) = discount_value_with_currency.split(' ')
  317. order.discounts.append(dingguo.Discount(
  318. name = discount_name,
  319. amount = dingguo.Sum(float(discount_value.replace(',', '.')) * -1, discount_currency)
  320. ))
  321. shipping_costs_table = content_table.find('tbody').findAll('tr', recursive = False)[3].findAll('table')[1]
  322. (shipping_price, shipping_currency) = shipping_costs_table.text.replace(',', '.').split(' ')
  323. order.items.append(dingguo.Item(
  324. name = u'Delivery',
  325. price_brutto = dingguo.Sum(float(shipping_price), shipping_currency),
  326. ))
  327. return [order]
  328. def parse_lieferservice(msg):
  329. text = msg.get_payload()[0].get_payload(decode = True).decode('utf-8').replace('\r\n', '\n')
  330. assert type(text) is unicode
  331. if not 'Lieferservice.at' in text:
  332. raise Exception('no lieferservice.at confirmation')
  333. order_match = re.search(
  334. ur'(Your order|Ihre Bestellung) \(.+\) (at|bei) (?P<restaurant>.*)\s+'
  335. + ur'(Your order reference is|Ihre Bestellnummer lautet): (?P<order_id>.*)\s+'
  336. + ur'[\W\w]+'
  337. + ur'(Your order|Ihre Bestellung)\s+'
  338. + ur'(?P<orders_text>[\W\w]+)'
  339. + ur'(Delivery costs|Lieferung):\s+(?P<delivery_costs>.*)\s+',
  340. text,
  341. re.UNICODE,
  342. )
  343. order_match_groups = order_match.groupdict()
  344. import time
  345. import email.utils
  346. order_date = datetime.datetime.fromtimestamp(
  347. time.mktime(email.utils.parsedate(msg['Date']))
  348. )
  349. order = dingguo.Order(
  350. u'lieferservice.at',
  351. order_match_groups['order_id'].strip(),
  352. order_date
  353. )
  354. restaurant = order_match_groups['restaurant'].strip('"')
  355. for article_match in re.finditer(
  356. ur'(?P<quantity>\d+)x\s'
  357. + ur'(?P<name>.*)\s'
  358. + ur'(?P<currency>.) (?P<price>-?\d+,\d+)\s',
  359. order_match_groups['orders_text'],
  360. re.UNICODE,
  361. ):
  362. article_match_groups = article_match.groupdict()
  363. quantity = int(article_match_groups['quantity'])
  364. assert quantity == 1
  365. name = re.sub(ur' +', ' ', article_match_groups['name'])
  366. price = dingguo.Sum(
  367. float(article_match_groups['price'].replace(',', '.')),
  368. article_match_groups['currency'],
  369. )
  370. if price.value < 0:
  371. price.value *= -1
  372. order.discounts.append(dingguo.Discount(
  373. name = name,
  374. amount = price,
  375. ))
  376. else:
  377. order.items.append(dingguo.Article(
  378. name = name,
  379. quantity = 1,
  380. price_brutto = price,
  381. reseller = restaurant,
  382. shipper = restaurant,
  383. ))
  384. delivery_costs = order_match_groups['delivery_costs'].strip()
  385. if delivery_costs in ['FREE', 'GRATIS']:
  386. order.items.append(dingguo.Item(
  387. name = u'Delivery',
  388. price_brutto = dingguo.Sum(0.0, u'EUR'),
  389. ))
  390. else:
  391. unit, value = delivery_costs.split(' ')
  392. order.items.append(dingguo.Item(
  393. name = u'Delivery',
  394. price_brutto = dingguo.Sum(float(value.replace(',', '.')), unit),
  395. ))
  396. return [order]
  397. def parse(msg):
  398. tracebacks = {}
  399. try:
  400. return parse_amazon(msg)
  401. except:
  402. tracebacks['amazon'] = traceback.format_exc()
  403. try:
  404. return dingguo.parser.banggood.parse_order_confirmation_mail(msg)
  405. except:
  406. tracebacks['banggood'] = traceback.format_exc()
  407. try:
  408. return parse_oebb(msg)
  409. except:
  410. tracebacks['oebb'] = traceback.format_exc()
  411. try:
  412. return parse_lieferservice(msg)
  413. except:
  414. tracebacks['lieferservice'] = traceback.format_exc()
  415. try:
  416. return parse_mytaxi(msg)
  417. except:
  418. tracebacks['mytaxi'] = traceback.format_exc()
  419. try:
  420. return parse_uber(msg)
  421. except:
  422. tracebacks['uber'] = traceback.format_exc()
  423. try:
  424. return parse_yipbee(msg)
  425. except:
  426. tracebacks['yipbee'] = traceback.format_exc()
  427. for parser_name in tracebacks:
  428. sys.stderr.write('%s parser: \n%s\n' % (parser_name, tracebacks[parser_name]))
  429. raise Exception('failed to parse')
  430. def compute(mail_path, catalogue, register_path):
  431. orders = []
  432. if mail_path:
  433. for p in mail_path:
  434. with open(p, 'r') as mail:
  435. mail_orders = parse(email.message_from_file(mail))
  436. orders += mail_orders
  437. if catalogue:
  438. for order in mail_orders:
  439. order_dir_path = os.path.join(order.platform, order.order_id)
  440. if not os.path.isdir(order_dir_path):
  441. os.makedirs(order_dir_path)
  442. shutil.copyfile(p, os.path.join(order_dir_path, os.path.basename(p)))
  443. os.remove(p)
  444. else:
  445. msg = email.message_from_string(sys.stdin.read())
  446. orders += parse(msg)
  447. if register_path:
  448. with open(register_path, 'r') as register:
  449. registered_orders = yaml.load(register.read().decode('utf-8'))
  450. if not registered_orders:
  451. registered_orders = {}
  452. for order in orders:
  453. if order.platform not in registered_orders:
  454. registered_orders[order.platform] = {}
  455. if order.order_id in registered_orders[order.platform]:
  456. raise Exception('already registered')
  457. registered_orders[order.platform][order.order_id] = order
  458. with open(register_path, 'w') as register:
  459. register.write(yaml.safe_dump(registered_orders, default_flow_style = False))
  460. else:
  461. print(yaml.safe_dump(orders, default_flow_style = False))
  462. def _init_argparser():
  463. argparser = argparse.ArgumentParser(description = None)
  464. argparser.add_argument('--register', metavar = 'path', dest = 'register_path')
  465. argparser.add_argument('--catalogue', action='store_true')
  466. argparser.add_argument('mail_path', nargs = '*')
  467. return argparser
  468. def main(argv):
  469. argparser = _init_argparser()
  470. argcomplete.autocomplete(argparser)
  471. args = argparser.parse_args(argv)
  472. compute(**vars(args))
  473. return 0
  474. if __name__ == "__main__":
  475. sys.exit(main(sys.argv[1:]))