jluczak · August 3, 2017 13:37
diff --git a/items.py b/items.py
 import scrapy
 from scrapy.item import Item,Field

 class ExpertItem(scrapy.Item):
     """Data scraped from one expert profile page.

     Every attribute declares a Scrapy item field; the spider fills them
     by key (``item['name']`` and so on).
     """

     # Headline text taken from the profile header.
     name = Field()
     # NOTE(review): presumably meant "tagline"; the spelling is kept
     # because the spider stores the value under this exact key.
     tangline = Field()

     # Download bookkeeping: ``file_urls`` holds the URLs to fetch.
     # ``files`` is the field Scrapy's FilesPipeline documents as its
     # result slot; the spider itself never writes to it.
     file_urls = Field()
     files = Field()

     # Location and social-profile links.
     city = Field()
     gplus = Field()
     twitter = Field()
     linkedin = Field()

     # Free-text biography fragments and the list of skills.
     bio = Field()
     skills = Field()
diff --git a/settings.py b/settings.py
 # Name of the Scrapy project/bot.
 BOT_NAME = "expert"

 # Package searched for spiders, and the package new spiders are created in.
 NEWSPIDER_MODULE = "expert.spiders"
 SPIDER_MODULES = ["expert.spiders"]

 # Respect robots.txt rules of the crawled site.
 ROBOTSTXT_OBEY = True

 # Enable the stock files pipeline (the number is its order value) and
 # store whatever it downloads under the ``expert_photos`` directory.
 FILES_STORE = "expert_photos"
 ITEM_PIPELINES = {"scrapy.pipelines.files.FilesPipeline": 1}
diff --git a/spider_google.py b/spider_google.py
 from scrapy.spiders import CrawlSpider, Rule
 from expert.items import ExpertItem
 from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
 from scrapy.selector import Selector

 class MySpider(CrawlSpider):
     """Crawl the Google Developers Experts listing and scrape each profile.

     The crawl starts at the experts index page. Links located under the
     element with id ``experts-body`` are requested and passed to
     :meth:`parse_item`; links on those profile pages are not followed.
     """

     name = "expert"
     allowed_domains = ["developers.google.com"]
     start_urls = ["https://developers.google.com/experts/"]

     rules = (
         Rule(
             # Only anchors inside the experts listing are considered.
             LxmlLinkExtractor(
                 restrict_xpaths=".//*[@id='experts-body']//a",
             ),
             callback='parse_item',
             follow=False,
         ),
     )

     def parse_item(self, response):
         """Build one :class:`ExpertItem` from a profile page.

         Args:
             response: the downloaded profile page.

         Yields:
             A single ``ExpertItem``. ``name``, ``tangline``, ``file_urls``,
             ``bio`` and ``skills`` are lists of strings; ``city``,
             ``gplus``, ``twitter`` and ``linkedin`` are the first match
             or ``None`` when the selector matches nothing.
         """
         item = ExpertItem()

         item['name'] = response.css('.profile-header h2::text').extract()
         item['tangline'] = response.css('h2.main-title::text').extract()

         # The files pipeline needs absolute URLs; ``src`` attributes may be
         # relative or protocol-relative, so resolve them against the page.
         item['file_urls'] = [
             response.urljoin(src)
             for src in response.css('img.profile-img::attr(src)').extract()
         ]

         item['city'] = response.css('.location::text').extract_first()
         item['gplus'] = response.css('a.gplus::attr(href)').extract_first()
         item['twitter'] = response.css('a.twitter::attr(href)').extract_first()
         item['linkedin'] = response.css('a.linkedin::attr(href)').extract_first()

         # Keeps only the 3rd to 5th paragraph text nodes.
         # NOTE(review): presumably the earlier nodes are not part of the
         # bio; this positional slice depends on the page layout - confirm.
         item['bio'] = response.css('.profile-row p::text').extract()[2:5]
         item['skills'] = response.css('.profile-row li::text').extract()

         yield item
	import scrapy
	from scrapy.item import Item,Field

	class ExpertItem(scrapy.Item):
	    """Data scraped from one expert profile page.

	    Every attribute declares a Scrapy item field; the spider fills them
	    by key (``item['name']`` and so on).
	    """

	    # Headline text taken from the profile header.
	    name = Field()
	    # NOTE(review): presumably meant "tagline"; the spelling is kept
	    # because the spider stores the value under this exact key.
	    tangline = Field()

	    # Download bookkeeping: ``file_urls`` holds the URLs to fetch.
	    # ``files`` is the field Scrapy's FilesPipeline documents as its
	    # result slot; the spider itself never writes to it.
	    file_urls = Field()
	    files = Field()

	    # Location and social-profile links.
	    city = Field()
	    gplus = Field()
	    twitter = Field()
	    linkedin = Field()

	    # Free-text biography fragments and the list of skills.
	    bio = Field()
	    skills = Field()
	# Name of the Scrapy project/bot.
	BOT_NAME = "expert"

	# Package searched for spiders, and the package new spiders are created in.
	NEWSPIDER_MODULE = "expert.spiders"
	SPIDER_MODULES = ["expert.spiders"]

	# Respect robots.txt rules of the crawled site.
	ROBOTSTXT_OBEY = True

	# Enable the stock files pipeline (the number is its order value) and
	# store whatever it downloads under the ``expert_photos`` directory.
	FILES_STORE = "expert_photos"
	ITEM_PIPELINES = {"scrapy.pipelines.files.FilesPipeline": 1}
	from scrapy.spiders import CrawlSpider, Rule
	from expert.items import ExpertItem
	from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
	from scrapy.selector import Selector

	class MySpider(CrawlSpider):
	    """Crawl the Google Developers Experts listing and scrape each profile.

	    The crawl starts at the experts index page. Links located under the
	    element with id ``experts-body`` are requested and passed to
	    :meth:`parse_item`; links on those profile pages are not followed.
	    """

	    name = "expert"
	    allowed_domains = ["developers.google.com"]
	    start_urls = ["https://developers.google.com/experts/"]

	    rules = (
	        Rule(
	            # Only anchors inside the experts listing are considered.
	            LxmlLinkExtractor(
	                restrict_xpaths=".//*[@id='experts-body']//a",
	            ),
	            callback='parse_item',
	            follow=False,
	        ),
	    )

	    def parse_item(self, response):
	        """Build one :class:`ExpertItem` from a profile page.

	        Args:
	            response: the downloaded profile page.

	        Yields:
	            A single ``ExpertItem``. ``name``, ``tangline``, ``file_urls``,
	            ``bio`` and ``skills`` are lists of strings; ``city``,
	            ``gplus``, ``twitter`` and ``linkedin`` are the first match
	            or ``None`` when the selector matches nothing.
	        """
	        item = ExpertItem()

	        item['name'] = response.css('.profile-header h2::text').extract()
	        item['tangline'] = response.css('h2.main-title::text').extract()

	        # The files pipeline needs absolute URLs; ``src`` attributes may be
	        # relative or protocol-relative, so resolve them against the page.
	        item['file_urls'] = [
	            response.urljoin(src)
	            for src in response.css('img.profile-img::attr(src)').extract()
	        ]

	        item['city'] = response.css('.location::text').extract_first()
	        item['gplus'] = response.css('a.gplus::attr(href)').extract_first()
	        item['twitter'] = response.css('a.twitter::attr(href)').extract_first()
	        item['linkedin'] = response.css('a.linkedin::attr(href)').extract_first()

	        # Keeps only the 3rd to 5th paragraph text nodes.
	        # NOTE(review): presumably the earlier nodes are not part of the
	        # bio; this positional slice depends on the page layout - confirm.
	        item['bio'] = response.css('.profile-row p::text').extract()[2:5]
	        item['skills'] = response.css('.profile-row li::text').extract()

	        yield item