ping.py 760 B

1234567891011121314151617181920212223242526
  1. import re
  2. from scrapy.linkextractors import LinkExtractor
  3. from scrapy.spiders import CrawlSpider, Rule
  4. class PingSpider(CrawlSpider):
  5. name = "ping"
  6. custom_settings = {
  7. # default: 3 min
  8. "DOWNLOAD_TIMEOUT": 30, # [s]
  9. # WORKAROUND
  10. # [boto] ERROR: Unable to read instance data, giving up
  11. "DOWNLOAD_HANDLERS": {"s3": None},
  12. }
  13. def __init__(self, start_url):
  14. self.start_urls = [start_url]
  15. self.rules = (
  16. Rule(LinkExtractor(allow="^" + re.escape(start_url))),
  17. Rule(LinkExtractor(tags="img", attrs="src", deny_extensions=set())),
  18. Rule(LinkExtractor(tags="link", attrs="href", deny_extensions=set())),
  19. )
  20. super(PingSpider, self).__init__()