ping.py 694 B

123456789101112131415161718192021222324
  1. import re
  2. from scrapy.linkextractors import LinkExtractor
  3. from scrapy.spiders import CrawlSpider, Rule
  4. class PingSpider(CrawlSpider):
  5. name = 'ping'
  6. custom_settings = {
  7. # WORKAROUND
  8. # [boto] ERROR: Unable to read instance data, giving up
  9. 'DOWNLOAD_HANDLERS': {'s3': None},
  10. }
  11. def __init__(self, start_url):
  12. self.start_urls = [start_url]
  13. self.rules = (
  14. Rule(LinkExtractor(allow='^'+re.escape(start_url))),
  15. Rule(LinkExtractor(tags='img', attrs='src', deny_extensions=set())),
  16. Rule(LinkExtractor(tags='link', attrs='href', deny_extensions=set())),
  17. )
  18. super(PingSpider, self).__init__()