Skip to content

Instantly share code, notes, and snippets.

@nyov
Created August 19, 2015 19:41
Show Gist options
  • Save nyov/399747653bc70a75a8d0 to your computer and use it in GitHub Desktop.
Voat.co Spider
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Voat.co legacy-API spider: crawls the top-200 subverse listing, then each
# subverse front page, counting posts that carry a thumbnail image.
from __future__ import print_function
import logging
from scrapy.utils.log import configure_logging
from scrapy.spiders import Spider
from scrapy.exceptions import CloseSpider
from scrapy.http import Request
# Configure Scrapy's logging once at import time with a minimal format.
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
class VoatSpider(Spider):
    """Voat.co legacy-API Spider.

    Starts from the top-200 subverse listing, follows each subverse's
    front-page API endpoint, and counts entries that have a thumbnail.
    """
    name = 'voat'
    allowed_domains = [
        'voat.co',
    ]
    start_urls = [
        'https://voat.co/api/top200subverses',
        #'https://voat.co/api/frontpage',
    ]
    # link to thumbs
    voat_thumbs_url = 'https://cdn.voat.co/thumbs/'
    # api urls
    api_endpoint = 'https://voat.co/api'
    # Template: fill in the subverse name with `% subverse`.
    api_subversefrontpage = api_endpoint + '/subversefrontpage?subverse=%s'
    # Running total of thumbnail images found, incremented in parse_frontpage
    # and reported in closed().
    thumbnail_counter = 0

    def debug(self, response):
        """Drop into an interactive shell for this response, then stop the crawl."""
        # DEBUG: check response in a shell
        from scrapy.shell import inspect_response
        inspect_response(response, self)
        raise CloseSpider('debug stop')

    def parse(self, response):
        """Entry point: the single start URL is the top-200 subverse listing."""
        # Lazy %-args: the message is only formatted if the record is emitted.
        logging.info('Visited %s', response.url)
        return self.parse_top200subverses(response)

    def parse_top200subverses(self, response):
        """Extract subverse names from the XML listing and request each front page."""
        response.selector.remove_namespaces()
        subverses = response.xpath('/ArrayOfstring/string')
        # start=1 replaces the original idx+1 in the log line.
        for idx, subverse in enumerate(subverses.re(r'.*Name: (\S+),'), start=1):
            subverse = subverse.strip()
            logging.info('%3d Found subverse "%s"', idx, subverse)
            yield Request(url=self.api_subversefrontpage % subverse,
                          callback=self.parse_frontpage)

    def parse_frontpage(self, response):
        """Count front-page entries that carry a (non-nil) thumbnail."""
        logging.info('Visited %s [%s]', response.url, response.status)
        response.selector.remove_namespaces()
        error = response.xpath('/Error')
        if error:  # fixed: was the redundant double-negation `not not error`
            errmsg = error.xpath('./Message/text()').extract_first()
            logging.info('Page %s errored: %s', response.url, errmsg)
            return
        entries = response.xpath('/ArrayOfApiMessage')
        for idx, entry in enumerate(entries.xpath('./ApiMessage[Thumbnail[not(@nil)]]')):
            # Missing titles become the empty string rather than None.
            post = entry.xpath("./Title/text()").extract_first() or ''
            thumb = entry.xpath("./Thumbnail[not(@nil)]/text()").extract_first()
            if thumb:
                logging.info('%4d Found Entry %s with Thumbnail image: %s%s',
                             idx, post, self.voat_thumbs_url, thumb)
                self.thumbnail_counter += 1
        #self.debug(response)

    def closed(self, reason):
        """Report the final thumbnail total when the spider shuts down."""
        logging.info('*** Found %d thumbnail images total. ***', self.thumbnail_counter)
if __name__ == "__main__":
### for scrapy 1.0 ###
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner, Crawler
#from scrapy.settings import Settings
from scrapy.utils.project import get_project_settings
settings = get_project_settings()
settings.setdict({
# disable some cruft
'EXTENSIONS': {
'scrapy.telnet.TelnetConsole': None,
},
'DOWNLOAD_HANDLERS': {'s3': None},
# config
'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:17.0) Gecko/20100101 Firefox/17.0',
'AUTOTHROTTLE_ENABLED': True,
'CONCURRENT_REQUESTS_PER_DOMAIN': '1',
'DOWNLOAD_DELAY': 3, # 3 secs delay
'RETRY_ENABLED': False, # dont retry any errors now
#'COOKIES_ENABLED': False,
# a dumb cache that drops every visited page to /tmp/
# so script re-runs are from disk.
'HTTPCACHE_ENABLED': True,
'HTTPCACHE_DIR': '/tmp/scrapy-httpcache',
'HTTPCACHE_POLICY': 'scrapy.extensions.httpcache.DummyPolicy',
})
runner = CrawlerRunner(settings)
spider = VoatSpider()
d = runner.crawl(spider)
d.addBoth(lambda _: reactor.stop())
reactor.run() # the script will block here until the spider_closed signal was sent
# EOF
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment