Skip to content

Instantly share code, notes, and snippets.

@nyov
Created August 19, 2015 19:41
Show Gist options
  • Save nyov/399747653bc70a75a8d0 to your computer and use it in GitHub Desktop.
Voat.co Spider
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Voat.co legacy-API spider: crawls the top-200 subverse listing, then each
# subverse front page, counting posts that carry a thumbnail image.
from __future__ import print_function
import logging
from scrapy.utils.log import configure_logging
from scrapy.spiders import Spider
from scrapy.exceptions import CloseSpider
from scrapy.http import Request
# Configure Scrapy's logging once at import time with a minimal format.
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
class VoatSpider(Spider):
    """Voat.co legacy-API Spider.

    Starts from the top-200 subverse listing, follows each subverse's
    front-page API endpoint, and counts entries that have a thumbnail.
    """
    name = 'voat'
    allowed_domains = [
        'voat.co',
    ]
    start_urls = [
        'https://voat.co/api/top200subverses',
        #'https://voat.co/api/frontpage',
    ]
    # link to thumbs
    voat_thumbs_url = 'https://cdn.voat.co/thumbs/'
    # api urls
    api_endpoint = 'https://voat.co/api'
    # Template: fill in the subverse name with `% subverse`.
    api_subversefrontpage = api_endpoint + '/subversefrontpage?subverse=%s'
    # Running total of thumbnail images found, incremented in parse_frontpage
    # and reported in closed().
    thumbnail_counter = 0

    def debug(self, response):
        """Drop into an interactive shell for this response, then stop the crawl."""
        # DEBUG: check response in a shell
        from scrapy.shell import inspect_response
        inspect_response(response, self)
        raise CloseSpider('debug stop')

    def parse(self, response):
        """Entry point: the single start URL is the top-200 subverse listing."""
        # Lazy %-args: the message is only formatted if the record is emitted.
        logging.info('Visited %s', response.url)
        return self.parse_top200subverses(response)

    def parse_top200subverses(self, response):
        """Extract subverse names from the XML listing and request each front page."""
        response.selector.remove_namespaces()
        subverses = response.xpath('/ArrayOfstring/string')
        # start=1 replaces the original idx+1 in the log line.
        for idx, subverse in enumerate(subverses.re(r'.*Name: (\S+),'), start=1):
            subverse = subverse.strip()
            logging.info('%3d Found subverse "%s"', idx, subverse)
            yield Request(url=self.api_subversefrontpage % subverse,
                          callback=self.parse_frontpage)

    def parse_frontpage(self, response):
        """Count front-page entries that carry a (non-nil) thumbnail."""
        logging.info('Visited %s [%s]', response.url, response.status)
        response.selector.remove_namespaces()
        error = response.xpath('/Error')
        if error:  # fixed: was the redundant double-negation `not not error`
            errmsg = error.xpath('./Message/text()').extract_first()
            logging.info('Page %s errored: %s', response.url, errmsg)
            return
        entries = response.xpath('/ArrayOfApiMessage')
        for idx, entry in enumerate(entries.xpath('./ApiMessage[Thumbnail[not(@nil)]]')):
            # Missing titles become the empty string rather than None.
            post = entry.xpath("./Title/text()").extract_first() or ''
            thumb = entry.xpath("./Thumbnail[not(@nil)]/text()").extract_first()
            if thumb:
                logging.info('%4d Found Entry %s with Thumbnail image: %s%s',
                             idx, post, self.voat_thumbs_url, thumb)
                self.thumbnail_counter += 1
        #self.debug(response)

    def closed(self, reason):
        """Report the final thumbnail total when the spider shuts down."""
        logging.info('*** Found %d thumbnail images total. ***', self.thumbnail_counter)
if __name__ == "__main__":
### for scrapy 1.0 ###
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner, Crawler
#from scrapy.settings import Settings
from scrapy.utils.project import get_project_settings
settings = get_project_settings()
settings.setdict({
# disable some cruft
'EXTENSIONS': {
'scrapy.telnet.TelnetConsole': None,
},
'DOWNLOAD_HANDLERS': {'s3': None},
# config
'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:17.0) Gecko/20100101 Firefox/17.0',
'AUTOTHROTTLE_ENABLED': True,
'CONCURRENT_REQUESTS_PER_DOMAIN': '1',
'DOWNLOAD_DELAY': 3, # 3 secs delay
'RETRY_ENABLED': False, # dont retry any errors now
#'COOKIES_ENABLED': False,
# a dumb cache that drops every visited page to /tmp/
# so script re-runs are from disk.
'HTTPCACHE_ENABLED': True,
'HTTPCACHE_DIR': '/tmp/scrapy-httpcache',
'HTTPCACHE_POLICY': 'scrapy.extensions.httpcache.DummyPolicy',
})
runner = CrawlerRunner(settings)
spider = VoatSpider()
d = runner.crawl(spider)
d.addBoth(lambda _: reactor.stop())
reactor.run() # the script will block here until the spider_closed signal was sent
# EOF
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment