1234567891011121314151617181920212223242526 |
- import re
- from scrapy.linkextractors import LinkExtractor
- from scrapy.spiders import CrawlSpider, Rule
class PingSpider(CrawlSpider):
    """Crawl every page under ``start_url`` and request the resources they
    reference (images, stylesheets, icons, ...) so broken links / assets
    surface as download errors in the crawl log.

    Usage: ``scrapy crawl ping -a start_url=https://example.com/``
    """

    name = "ping"

    custom_settings = {
        # Fail fast: Scrapy's default DOWNLOAD_TIMEOUT is 180 s (3 min).
        "DOWNLOAD_TIMEOUT": 30,  # [s]
        # WORKAROUND
        # [boto] ERROR: Unable to read instance data, giving up
        "DOWNLOAD_HANDLERS": {"s3": None},
    }

    def __init__(self, start_url, *args, **kwargs):
        """Build the crawl rules for *start_url*.

        :param start_url: URL the crawl starts from; only links whose URL
            begins with this exact string are followed.

        Extra positional/keyword arguments are forwarded to
        ``CrawlSpider.__init__`` so spider arguments supplied on the
        command line (``-a key=value``) keep working.
        """
        self.start_urls = [start_url]
        # self.rules must be assigned BEFORE calling super().__init__(),
        # because CrawlSpider.__init__ compiles the rules it finds there.
        self.rules = (
            # Follow only pages rooted at the start URL (escaped, anchored).
            Rule(LinkExtractor(allow="^" + re.escape(start_url))),
            # Also fetch <img src=...> and <link href=...> resources;
            # deny_extensions=set() disables the default extension filter
            # so no asset type (css, ico, png, ...) is skipped.
            Rule(LinkExtractor(tags="img", attrs="src", deny_extensions=set())),
            Rule(LinkExtractor(tags="link", attrs="href", deny_extensions=set())),
        )
        super().__init__(*args, **kwargs)
|