import re

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class PingSpider(CrawlSpider):
    """Crawl a site starting from ``start_url``, requesting every in-site
    page plus its image and ``<link>`` resources (useful for link-checking
    / cache-warming a whole site).

    Instantiate with ``scrapy crawl ping -a start_url=https://example.com/``.
    """

    name = 'ping'
    custom_settings = {
        # WORKAROUND
        # [boto] ERROR: Unable to read instance data, giving up
        'DOWNLOAD_HANDLERS': {'s3': None},
    }

    def __init__(self, start_url, *args, **kwargs):
        # start_urls and rules must be assigned BEFORE calling
        # CrawlSpider.__init__, which compiles self.rules.
        self.start_urls = [start_url]
        self.rules = (
            # Follow only links that stay under the start URL prefix.
            Rule(LinkExtractor(allow='^' + re.escape(start_url))),
            # Also fetch <img src> and <link href> resources; an empty
            # deny_extensions set keeps image/CSS extensions from being
            # filtered out by the extractor's defaults.
            Rule(LinkExtractor(tags='img', attrs='src', deny_extensions=set())),
            Rule(LinkExtractor(tags='link', attrs='href', deny_extensions=set())),
        )
        # Forward any extra Scrapy-supplied arguments (e.g. name/kwargs
        # from the crawler) instead of silently dropping them.
        super().__init__(*args, **kwargs)