@maurostorch
Created September 27, 2016 11:22
Job searcher crawler using Scrapy
# Job searcher crawler: scrapes job postings from bcjobs.ca and follows pagination.
import scrapy


class BlogSpider(scrapy.Spider):
    name = 'blogspider'
    start_urls = ['https://www.bcjobs.ca/search-jobs?q=cloud']
    pagecount = 2  # currently unused

    def parse(self, response):
        # Each search result is an <a class="list-item-wrapper"> element.
        for it in response.css('a.list-item-wrapper'):
            links = it.css('a::attr("href")').extract()
            titles = it.css('.list-item-title::text').extract()
            cias = it.css('div>div>div::text').extract()        # company names
            cities = it.css('div.text-right::text').extract()   # locations
            yield {
                'jobtitle': titles[0] if titles else 'no title',
                'cia': cias[0] if cias else '',
                'city': cities[0] if cities else '',
                'link': response.urljoin(links[0]) if links else '',
            }
        # Follow the "Next" button to crawl the remaining result pages.
        btn = response.css('a[title="Next"]::attr("href")').extract()
        if btn:
            yield scrapy.Request(response.urljoin(btn[0]))

    def parse_titles(self, response):
        # Extracts post titles from a listing page (not used by parse above).
        for post_title in response.css('div.entries > ul > li a::text').extract():
            yield {'title': post_title}
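To try the spider without creating a full Scrapy project, a minimal runner sketch using Scrapy's CrawlerProcess is shown below. The output file name jobs.json and the USER_AGENT value are assumptions for illustration, not part of the original gist.

# Minimal runner sketch: assumes the spider above is importable from this file.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    'FEED_URI': 'jobs.json',       # write scraped items to this file (arbitrary name)
    'FEED_FORMAT': 'json',         # export items as a JSON array
    'USER_AGENT': 'Mozilla/5.0',   # some job boards block the default Scrapy user agent
})
process.crawl(BlogSpider)
process.start()  # blocks until the crawl finishes

Alternatively, saving the spider as a standalone file and running it with scrapy runspider produces the same result without any project scaffolding.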