import re

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class PingSpider(CrawlSpider):
    name = 'ping'

    custom_settings = {
        # default: 3 min
        'DOWNLOAD_TIMEOUT': 30,  # [s]
        # WORKAROUND
        # [boto] ERROR: Unable to read instance data, giving up
        'DOWNLOAD_HANDLERS': {'s3': None},
    }

    def __init__(self, start_url):
        self.start_urls = [start_url]
        self.rules = (
            # Follow only links that stay under the start URL.
            Rule(LinkExtractor(allow='^' + re.escape(start_url))),
            # Also request images and <link> resources (CSS, icons, ...),
            # without skipping any file extensions.
            Rule(LinkExtractor(tags='img', attrs='src', deny_extensions=set())),
            Rule(LinkExtractor(tags='link', attrs='href', deny_extensions=set())),
        )
        # Rules must be assigned before CrawlSpider.__init__, which compiles them.
        super(PingSpider, self).__init__()
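
# Minimal usage sketch (an assumption, not part of the original snippet):
# besides `scrapy crawl ping -a start_url=...`, the spider can be started from
# a plain Python script via CrawlerProcess. The URL below is a placeholder.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(PingSpider, start_url='https://example.com/')
    process.start()  # blocks until the crawl finishes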