Created
December 5, 2017 17:30
-
-
Save raphapassini/9c7ab4149908cee986dafd245a7888d1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import scrapy | |
from scrapy import Request | |
from scrapy import signals | |
class G1Spider(scrapy.Spider):
    """Spider that feeds its start URLs to the engine one at a time.

    Only a single start URL is requested up front. Every time the engine
    signals that the spider is idle, the next pending URL is popped from
    ``start_urls`` and scheduled. Each URL is tagged with the current value
    of ``counter`` through ``meta['idx']`` so its follow-up requests can be
    traced in the log.
    """

    name = 'g1'
    allowed_domains = ['g1.globo.com']
    start_urls = [
        'https://g1.globo.com/mg/centro-oeste/noticia/criminosos-cercam-quartel-em-pompeu-e-policiais-sao-mortos.ghtml',
        'http://g1.globo.com/minas-gerais/interatividade/enquete/2017/12/4/escolha-o-tema-da-entrevista-no-estudio-do-mgtv-para-quinta-feira-f0445236-d8ee-11e7-851f-0242ac110003.html',
        'https://g1.globo.com/mg/zona-da-mata/noticia/trecho-interditado-da-br-116-na-zona-da-mata-deve-ser-liberado-ate-o-final-de-semana.ghtml'
    ]
    # Sequence number of the URL being processed; 0 for the first URL and
    # incremented by spider_idle() before each subsequent one.
    counter = 0

    def __init__(self, *args, **kwargs):
        """Initialise the spider with its own private queue of start URLs."""
        super(G1Spider, self).__init__(*args, **kwargs)
        # start_requests() pops from this list. Work on a per-instance copy
        # so the class-level list is not drained for other instances or for
        # later crawls in the same process.
        self.start_urls = list(self.start_urls)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider and subscribe it to the ``spider_idle`` signal."""
        spider = super(G1Spider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        return spider

    def spider_idle(self, spider):
        """Schedule the next pending start URL when the spider goes idle.

        Does nothing once ``start_urls`` is exhausted.
        """
        self.logger.info('Spider is IDLE')
        if not self.start_urls:
            return
        self.counter += 1
        for req in self.start_requests():
            # NOTE(review): newer Scrapy releases appear to deprecate the
            # ``spider`` argument of engine.crawl() -- confirm against the
            # installed Scrapy version before upgrading.
            self.crawler.engine.crawl(req, self)

    def start_requests(self):
        """Yield a request for one pending start URL, if any remain.

        URLs are taken from the end of ``start_urls``. The request carries
        the current ``counter`` value in ``meta['idx']``.
        """
        if not self.start_urls:
            # Nothing left to send; avoid IndexError from pop() on an
            # empty list.
            return
        self.logger.info('Sending URL #%s', self.counter)
        url = self.start_urls.pop()
        yield Request(url, meta={'idx': self.counter})

    def parse(self, response):
        """Log the received response and issue the step-2 follow-up request."""
        idx = response.meta['idx']
        self.logger.info('Received the response #%s', idx)
        # The same follow-up URL is requested once per start URL, so
        # dont_filter=True is needed to keep it from being filtered out.
        yield Request(
            'http://python.org', dont_filter=True,
            callback=self.parse_step2, meta={'idx': idx})

    def parse_step2(self, response):
        """Log that the follow-up request for a start URL has completed."""
        self.logger.info('Reached step 2 for URL #%s', response.meta['idx'])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment