"""Scrapy spider that "pings" a site by crawling it and its page assets."""

import re

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class PingSpider(CrawlSpider):
    """Crawl everything under ``start_url`` plus referenced images and links.

    None of the rules sets a callback, so responses are only downloaded and
    their links followed; nothing is scraped from them.
    """

    name = "ping"

    custom_settings = {
        # Scrapy's default timeout is 3 minutes; fail faster than that.
        "DOWNLOAD_TIMEOUT": 30,  # [s]
        # WORKAROUND: disable the S3 download handler to avoid
        # "[boto] ERROR: Unable to read instance data, giving up".
        "DOWNLOAD_HANDLERS": {"s3": None},
    }

    def __init__(self, start_url, *args, **kwargs):
        """Configure the crawl for a single entry point.

        Args:
            start_url: URL the crawl starts from. Page links are followed
                only when they begin with this exact string.
            *args: Extra positional arguments, forwarded to ``CrawlSpider``.
            **kwargs: Extra spider arguments (e.g. passed with ``-a``),
                forwarded to ``CrawlSpider``.
        """
        self.start_urls = [start_url]

        # ``deny_extensions=set()`` replaces the extractor's default
        # extension blacklist with an empty one, so asset URLs (images,
        # stylesheets, ...) are not filtered out by their file extension.
        self.rules = (
            # Page links: stay below the start URL (matched literally).
            Rule(LinkExtractor(allow="^" + re.escape(start_url))),
            # <img src="..."> resources.
            Rule(LinkExtractor(tags="img", attrs="src", deny_extensions=set())),
            # <link href="..."> resources.
            Rule(LinkExtractor(tags="link", attrs="href", deny_extensions=set())),
        )

        # Rules are assigned before calling the base constructor because
        # CrawlSpider compiles ``self.rules`` during its own ``__init__``.
        # Extra arguments are forwarded so additional spider arguments do
        # not raise ``TypeError``.
        super(PingSpider, self).__init__(*args, **kwargs)