CloseSpider is a Scrapy extension that forces a spider to close after it reaches a dropped-items limit.
# -*- coding: utf-8 -*-
# MIT License (c) Lhassan Baazzi <baazzilhassan@gmail.com>
# extensions/closespider.py
from collections import defaultdict

from scrapy import signals
from scrapy.exceptions import NotConfigured


class CloseSpider(object):
    """Close the spider after it drops a configured number of items."""

    def __init__(self, crawler):
        self.crawler = crawler
        self.close_on = {
            'drop_item_count': crawler.settings.getint('CLOSESPIDER_DROP_ITEM_COUNT'),
        }
        # Disable the extension cleanly when no limit is configured.
        if not any(self.close_on.values()):
            raise NotConfigured
        self.counter = defaultdict(int)
        if self.close_on.get('drop_item_count'):
            crawler.signals.connect(self.drop_item_count, signal=signals.item_dropped)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def drop_item_count(self, item, response, exception, spider):
        # item_dropped fires each time an item pipeline raises DropItem.
        self.counter['drop_item_count'] += 1
        if self.counter['drop_item_count'] >= self.close_on['drop_item_count']:
            self.crawler.engine.close_spider(spider, 'closespider_drop_item_count')
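
The extension only counts drops, so something must raise DropItem for the item_dropped signal to fire. Below is a minimal sketch of such a pipeline; the pipeline name and the price field are made-up illustrations, not part of this gist.

# pipelines.py (hypothetical example)
from scrapy.exceptions import DropItem

class PriceValidationPipeline(object):
    def process_item(self, item, spider):
        # Raising DropItem sends the item_dropped signal, which the
        # CloseSpider extension above counts toward its limit.
        if not item.get('price'):
            raise DropItem("Missing price in %s" % item)
        return item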
# settings.py
# ...
# Close the spider after it drops 10 items
CLOSESPIDER_DROP_ITEM_COUNT = 10

# Replace project_name with the name of your Scrapy project; I put my
# closespider.py extension under the extensions folder.
EXTENSIONS = {
    # ...
    'project_name.extensions.closespider.CloseSpider': 500,
    # ...
}
# ...
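
When the limit is reached, the crawl ends with the reason closespider_drop_item_count, which Scrapy records in the finish_reason stat. Here is a quick way to try the extension end to end; MySpider and the import paths are placeholders for your own project.

# run.py (hypothetical example)
from scrapy.crawler import CrawlerProcess

from project_name.spiders.my_spider import MySpider  # placeholder spider class

process = CrawlerProcess(settings={
    'CLOSESPIDER_DROP_ITEM_COUNT': 10,
    'EXTENSIONS': {'project_name.extensions.closespider.CloseSpider': 500},
})
process.crawl(MySpider)
# Blocks until the crawl ends; the final stats dump should then show
# 'finish_reason': 'closespider_drop_item_count' if the limit was hit.
process.start()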