# -*- coding: utf-8 -*- import BeautifulSoup import datetime import dingguo import email.message import ioex import re def parse_order_confirmation_mail(mail): assert isinstance(mail, email.message.Message) html = mail.get_payload()[1].get_payload(decode = True).decode('utf-8') doc = BeautifulSoup.BeautifulSoup(html) """ Your Order#13998883(placed on Saturday 16 April, 2016)""" order_attr = re.search( ur'Your Order#(?P\d+)\(placed on (?P.*)\)', doc.find(text = re.compile(r'placed on')).parent.text, re.UNICODE, ).groupdict() with ioex.setlocale('en_US.UTF-8'): order = dingguo.Order( platform = u'banggood', order_id = order_attr['id'], order_date = datetime.datetime.strptime(order_attr['date'], '%A %d %B, %Y').date(), customer_id = None, ) """ 1 x 10X E27 LED Bulb 7W Warm White 36 SMD 5730 AC 220V Corn Light (10xPOA162664)€23.431 x 5X E14 7W Warm White 36 SMD 5730 LED Corn Light Lamp Bulbs AC 220V (5xPOA162668)€12.31 """ articles_text = doc.find(text = re.compile(r'Subtotal of Items')) \ .parent.parent.text.replace(' ', ' ') \ .split('Subtotal of Items:')[1].split('IMPORTANT NOTICE')[0] for article_match in re.finditer( ur'(?P\d+) x (?P[^\(]*) \((\d+x)?(?P[^\)]+)\)' + ur'(?P[^\d]+)(?P\d+.\d\d)(?P