Browse Source

follow links, images & stylesheets

Fabian Peter Hammerle 5 years ago
commit
9657976810
3 changed files with 28 additions and 0 deletions
  1. 6 0
      Dockerfile
  2. 5 0
      README.md
  3. 17 0
      ping.py

+ 6 - 0
Dockerfile

@@ -0,0 +1,6 @@
+FROM debian:stretch-slim
+
+RUN apt-get update && apt-get install --yes python-scrapy
+
+COPY ./ping.py /ping.py
+ENTRYPOINT ["scrapy", "runspider", "/ping.py"]

+ 5 - 0
README.md

@@ -0,0 +1,5 @@
+```sh
+sudo docker run --rm fphammerle/scrapy-ping:0.1-scrapy1.0.3-python2 \
+    --loglevel=DEBUG \
+    -a start_url=https://ipfs.fabian.hammerle.me
+```

+ 17 - 0
ping.py

@@ -0,0 +1,17 @@
+import re
+
+from scrapy.linkextractors import LinkExtractor
+from scrapy.spiders import CrawlSpider, Rule
+
+
+class PingSpider(CrawlSpider):  # crawl spider seeded with a single caller-supplied start URL
+    name = 'ping'
+
+    def __init__(self, start_url):  # start_url: crawl root; the README passes it via `-a start_url=...`
+        self.start_urls = [start_url]
+        self.rules = (
+            Rule(LinkExtractor(allow='^'+re.escape(start_url))),  # links whose URL matches the regex-escaped start_url anchored at the beginning
+            Rule(LinkExtractor(tags='img', attrs='src', deny_extensions=set())),  # <img src> targets; presumably the empty set disables the default extension filter -- TODO confirm
+            Rule(LinkExtractor(tags='link', attrs='href', deny_extensions=set())),  # <link href> targets (stylesheets, per the commit title)
+        )
+        super(PingSpider, self).__init__()  # NOTE(review): rules are assigned before the base initializer runs; presumably CrawlSpider reads self.rules there -- confirm